diff --git a/.gitignore b/.gitignore index 7f6a3c413..d7917b34d 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,8 @@ python/flexflow/core/flexflow_cffi_header.py *.pb.h *.o *.a +*.nsys-rep +*.nfs* # Byte-compiled / optimized / DLL files __pycache__/ @@ -188,3 +190,8 @@ python/flexflow/version.txt inference_tensors tests/inference/python_test_configs/*.json + +core.* +*.out +sharegpt.json +wildchat.json diff --git a/.gitmodules b/.gitmodules index c68582d4a..6b437e036 100644 --- a/.gitmodules +++ b/.gitmodules @@ -22,4 +22,10 @@ [submodule "deps/tokenizers-cpp"] path = deps/tokenizers-cpp url = https://github.com/mlc-ai/tokenizers-cpp.git - fetchRecurseSubmodules = true \ No newline at end of file + fetchRecurseSubmodules = true +[submodule "deps/flashinfer"] + path = deps/flashinfer + url = https://github.com/flashinfer-ai/flashinfer.git +[submodule "deps/raft"] + path = deps/raft + url = https://github.com/rapidsai/raft.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 43ce4f704..978d84de4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,12 @@ project(FlexFlow) include(ExternalProject) +enable_language(CXX) +enable_language(CUDA) +if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8) + message(FATAL_ERROR "Your C++ compiler is too old. Please upgrade to version 8 or higher.") +endif() + # Set policy CMP0074 to eliminate cmake warnings cmake_policy(SET CMP0074 NEW) cmake_policy(SET CMP0077 NEW) @@ -128,6 +134,9 @@ list(APPEND CC_FLAGS list(APPEND NVCC_FLAGS -std=c++17) +list(APPEND NVCC_FLAGS + --expt-relaxed-constexpr + --extended-lambda) add_compile_options(${CC_FLAGS}) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS}) @@ -201,6 +210,12 @@ if(NOT BUILD_LEGION_ONLY) # optional include(optional) + set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/deps/raft/cpp/build/install) + find_package(raft) + list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/raft/cpp/include) + + list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/flashinfer/include) + if (FF_GPU_BACKEND STREQUAL "cuda") list(APPEND FF_CC_FLAGS -DFF_USE_CUDA) @@ -290,6 +305,12 @@ if(NOT BUILD_LEGION_ONLY) LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/src/*.cu) + # tensorrt_llm custom allreduce + if(FF_USE_NCCL) + list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/tensorrt_llm) + list(APPEND FLEXFLOW_GPU_SRC ${CMAKE_CURRENT_SOURCE_DIR}/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu) + endif() + add_compile_definitions(FF_USE_CUDA) if(BUILD_SHARED_LIBS) @@ -397,6 +418,8 @@ if(NOT BUILD_LEGION_ONLY) target_link_libraries(flexflow ${LEGION_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional) endif() + target_link_libraries(flexflow raft::raft) + #library api version, bump from time to time set(SOVERSION 1) @@ -425,7 +448,7 @@ if(NOT BUILD_LEGION_ONLY) # generate the Legion Python bindings library. 
When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library add_custom_command(TARGET flexflow POST_BUILD - COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} + COMMAND CMAKE_BUILD_DIR=${Legion_BINARY_DIR}/runtime CMAKE_INSTALL_PREFIX=${Legion_BINARY_DIR} ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python ) # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead. @@ -557,7 +580,9 @@ if(NOT BUILD_LEGION_ONLY) if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(inference/spec_infer) + add_subdirectory(inference/simplified_infer) add_subdirectory(inference/incr_decoding) + add_subdirectory(inference/trace_generator) endif() diff --git a/FlexFlow.mk b/FlexFlow.mk index 14f32a763..fadcf4de3 100644 --- a/FlexFlow.mk +++ b/FlexFlow.mk @@ -95,9 +95,12 @@ ifneq ($(strip $(FF_USE_PYTHON)), 1) endif -INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/inference -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src +INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/inference -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src \ + -I${FF_HOME}/deps/raft/cpp/include -I${FF_HOME}/deps/rmm/include -I${FF_HOME}/deps/spdlog/include \ + -I${FF_HOME}/deps/flashinfer/include CC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 -NVCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 +NVCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 \ + --expt-relaxed-constexpr --extended-lambda HIPCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 GASNET_FLAGS += # For Point and Rect typedefs diff --git a/benchmarking/average_accepted_tokens.pdf b/benchmarking/average_accepted_tokens.pdf new file mode 100644 index 000000000..717e6e68a Binary files /dev/null and b/benchmarking/average_accepted_tokens.pdf differ diff --git a/benchmarking/benchmark_incr_dec.sh b/benchmarking/benchmark_incr_dec.sh new file mode 100755 index 000000000..3a75fa61d --- /dev/null +++ b/benchmarking/benchmark_incr_dec.sh @@ -0,0 +1,88 @@ +#! 
/usr/bin/env bash +set -x +set -e + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}/../build" + +# export BUILD_TYPE=Debug +# ../config/config.linux +make -j install + +model_name=meta-llama/Llama-3.1-70B-Instruct +NGPUS=8 +NCPUS=16 +FSIZE=36000 +ZSIZE=200000 +CSIZE=100000 + +# comment these lines in for debugging +# model_name=meta-llama/Llama-3.1-8B-Instruct +# NGPUS=8 +# FSIZE=36000 +# ZSIZE=30000 +# CSIZE=100000 + + + +MAX_SEQ_LEN=7000 +tokens_per_batch=1024 + +batch_sizes=( + 8 + 4 +) + +request_per_second_values=( + -1 + 1 + 2 + 4 + 8 +) + +dataset_name="sharegpt" +dataset_fp="../benchmarking/${dataset_name}.json" +partition_name="all" + +export LEGION_BACKTRACE=1 + +# python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='meta-llama/Llama-3.1-70B-Instruct', allow_patterns='*.safetensors', max_workers=30)" +# python ../inference/utils/download_hf_model.py --half-precision-only $model_name --refresh-cache + +for k in "${!request_per_second_values[@]}"; do +for j in "${!batch_sizes[@]}"; do + batch_size=${batch_sizes[$j]} + request_per_second=${request_per_second_values[$k]} + + echo "Running dataset ${dataset_fp} with model ${model_name}, batch size ${batch_size}, tokens per batch ${tokens_per_batch}, and request per second ${request_per_second}" + # create model name version where "/" is replaced with "-" + model_name_=$(echo $model_name | tr / -) + if [ $request_per_second -gt 0 ]; then + rate=$request_per_second + else + rate="offline" + fi + log_fp="/usr/FlexFlow/inference/output/incr_dec_llm_${model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.log" + output_fp="/usr/FlexFlow/inference/output/incr_dec_llm_${model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.json" + metrics_fp="/usr/FlexFlow/inference/output/incr_dec_llm_${model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.csv" + rm $metrics_fp $output_fp $log_fp || true + + time ./inference/simplified_infer/incr_dec \ + -ll:gpu $NGPUS -ll:cpu $NCPUS -ll:util $NCPUS \ + -tensor-parallelism-degree $NGPUS \ + -ll:fsize $FSIZE -ll:zsize $ZSIZE -ll:csize $CSIZE \ + --fusion \ + --max-sequence-length $MAX_SEQ_LEN \ + --max-requests-per-batch $batch_size \ + --max-tokens-per-batch $tokens_per_batch \ + --max-output-length 1024 \ + --request-per-second ${request_per_second} \ + -llm-model $model_name \ + -trace ${dataset_fp} \ + -trace-output-path ${output_fp} \ + -csv-output-path $metrics_fp \ + -target-partition ${partition_name} \ + 2>&1 | tee ${log_fp} +done +done \ No newline at end of file diff --git a/benchmarking/benchmark_specinfer.sh b/benchmarking/benchmark_specinfer.sh new file mode 100755 index 000000000..e0c8e39d7 --- /dev/null +++ b/benchmarking/benchmark_specinfer.sh @@ -0,0 +1,109 @@ +#! 
/usr/bin/env bash +set -x +set -e + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}/../build" + +# export BUILD_TYPE=Debug +# ../config/config.linux +make -j +source ./set_python_envs.sh +# reset + +model_name=meta-llama/Llama-3.1-70B-Instruct +NGPUS=8 +NCPUS=16 +FSIZE=36000 +ZSIZE=200000 +CSIZE=100000 + +# comment these lines in for debugging +# model_name=meta-llama/Llama-3.1-8B-Instruct +# NGPUS=8 +# FSIZE=36000 +# ZSIZE=30000 +# CSIZE=100000 +###################################### + +small_model_names=( + Zhuominc/Llama-3-330M + meta-llama/Llama-3.2-1B-Instruct + meta-llama/Llama-3.2-3B-Instruct + meta-llama/Llama-3.1-8B-Instruct +) + +MAX_SEQ_LEN=7000 +tokens_per_batch=1024 +max_tree_depth=8 +expansion_degree=3 + +batch_sizes=( + 8 + 4 +) + +request_per_second_values=( + -1 + 1 + 2 + 4 + 8 +) + +dataset_name="sharegpt" +dataset_fp="../benchmarking/${dataset_name}.json" +partition_name="all" + +export LEGION_BACKTRACE=1 + +# python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='meta-llama/Llama-3.1-70B-Instruct', allow_patterns='*.safetensors', max_workers=30)" +python ../inference/utils/download_hf_model.py --half-precision-only $model_name +for small_model_name in "${small_model_names[@]}"; do + python ../inference/utils/download_hf_model.py --half-precision-only $small_model_name +done + +for k in "${!request_per_second_values[@]}"; do +for j in "${!batch_sizes[@]}"; do +for i in "${!small_model_names[@]}"; do + small_model_name=${small_model_names[$i]} + batch_size=${batch_sizes[$j]} + request_per_second=${request_per_second_values[$k]} + + echo "Running dataset ${dataset_fp} with model ${model_name}, draft model ${small_model_name}, batch size ${batch_size}, tokens per batch ${tokens_per_batch}, and request per second ${request_per_second}" + # create model name version where "/" is replaced with "-" + model_name_=$(echo $model_name | tr / -) + small_model_name_=$(echo $small_model_name | tr / -) + if [ $request_per_second -gt 0 ]; then + rate=$request_per_second + else + rate="offline" + fi + log_fp="/usr/FlexFlow/inference/output/specinfer_llm_${model_name_}_ssm_${small_model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.log" + output_fp="/usr/FlexFlow/inference/output/specinfer_llm_${model_name_}_ssm_${small_model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.json" + metrics_fp="/usr/FlexFlow/inference/output/specinfer_llm_${model_name_}_ssm_${small_model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.csv" + rm $metrics_fp $output_fp $log_fp || true + + time ./inference/suffix_decoding/specinfer \ + -ll:gpu $NGPUS -ll:cpu $NCPUS -ll:util $NCPUS \ + -tensor-parallelism-degree $NGPUS \ + -ssm-tp-degree $NGPUS \ + -ll:fsize $FSIZE -ll:zsize $ZSIZE -ll:csize $CSIZE \ + --fusion \ + --max-sequence-length $MAX_SEQ_LEN \ + --max-requests-per-batch $batch_size \ + --max-tokens-per-batch $tokens_per_batch \ + --max-output-length 1024 \ + --max-tree-depth ${max_tree_depth} \ + --expansion-degree ${expansion_degree} \ + --request-per-second ${request_per_second} \ + -llm-model $model_name \ + -ssm-model $small_model_name \ + -trace ${dataset_fp} \ + -trace-output-path ${output_fp} \ + -csv-output-path $metrics_fp \ + -target-partition ${partition_name} \ + 2>&1 | tee ${log_fp} +done +done +done \ No newline at end of file diff --git a/benchmarking/get_sharegpt_trace.py b/benchmarking/get_sharegpt_trace.py new file mode 100644 index 000000000..dbe8f4d3b --- /dev/null +++ 
b/benchmarking/get_sharegpt_trace.py @@ -0,0 +1,206 @@ +from dataclasses import asdict, dataclass, field +import json +import os +import random +import requests +from tqdm.asyncio import tqdm +from typing import List, Optional +from collections import OrderedDict +from transformers import AutoTokenizer + +SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json" + +@dataclass +class TraceEntry: + prompt: str + response: str + prompt_length: int + response_length: int + +@dataclass +class TracePartition: + partition_name: str + model_name: str + num_warmup_requests: int + training_entries: List[TraceEntry] + eval_entries: List[TraceEntry] + +@dataclass +class TraceMetadata: + avg_entries_per_partition: float + max_prompt_length: int + min_prompt_length: int + avg_prompt_length: float + max_response_length: int + min_response_length: int + avg_response_length: float + max_total_length: int + +@dataclass +class Trace: + partitions: List[TracePartition] + metadata: TraceMetadata = field(default_factory=lambda: TraceMetadata(0, 0, 0, 0, 0, 0, 0,0)) + +def download_and_cache_file(url: str, filename: Optional[str] = None): + """Read and cache a file from a url.""" + if filename is None: + filename = os.path.join("/tmp", url.split("/")[-1]) + + # Check if the cache file already exists + if os.path.exists(filename): + return filename + + print(f"Downloading from {url} to {filename}") + + # Stream the response to show the progress bar + response = requests.get(url, stream=True) + response.raise_for_status() # Check for request errors + + # Total size of the file in bytes + total_size = int(response.headers.get("content-length", 0)) + chunk_size = 1024 # Download in chunks of 1KB + + # Use tqdm to display the progress bar + with open(filename, "wb") as f, tqdm( + desc=filename, + total=total_size, + unit="B", + unit_scale=True, + unit_divisor=1024, + ) as bar: + for chunk in response.iter_content(chunk_size=chunk_size): + f.write(chunk) + bar.update(len(chunk)) + + return filename + +def get_warmup_entries(model_name: str, num_warmup_requests: int) -> List[TraceEntry]: + """ + Get a list of warmup entries for a model. + + Args: + model_name (str): The name of the model. + num_warmup_requests (int): The number of warmup requests to generate. + + Returns: + List[TraceEntry]: A list of warmup entries. + """ + warmup_entries = [] + tokenizer = AutoTokenizer.from_pretrained(model_name) + for i in range(num_warmup_requests): + prompt = "Hello, how are you?" + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + response = "I'm doing well, thank you for asking." + prompt_length = len(tokenizer(prompt)["input_ids"]) + response_length = len(tokenizer(response)["input_ids"]) + warmup_entries.append(TraceEntry(prompt, response, prompt_length, response_length)) + return warmup_entries + +def build_trace(model_name: str, num_entries: int, num_warmup_requests: int, seed: int): + # Download sharegpt if necessary + dataset_path = download_and_cache_file(SHAREGPT_URL) + + # Load the dataset. + with open(dataset_path) as f: + dataset = json.load(f, object_pairs_hook=OrderedDict) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data["conversations"]) >= 2] + # Only keep the first two turns of each conversation. 
+ dataset = [ + (data["conversations"][0]["value"], data["conversations"][1]["value"]) + for data in dataset + if data["conversations"][0]["from"] == "human" and data["conversations"][1]["from"] == "gpt" + ] + + # Shuffle the dataset. + random.seed(seed) + random.shuffle(dataset) + + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trace = Trace(partitions=[]) + partition = TracePartition( + partition_name="all", + model_name=model_name, + num_warmup_requests=num_warmup_requests, + training_entries=[], + eval_entries=[], + ) + trace_metadata = TraceMetadata( + avg_entries_per_partition=0, + max_prompt_length=0, + min_prompt_length=float("inf"), + avg_prompt_length=0, + max_response_length=0, + min_response_length=float("inf"), + avg_response_length=0, + max_total_length=0, + ) + + partition.eval_entries += get_warmup_entries(model_name, num_warmup_requests) + + for i in tqdm(range(len(dataset))): + if len(partition.eval_entries) == num_entries: + break + + # Tokenize the prompts and completions. + prompt = dataset[i][0] + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + response = dataset[i][1] + prompt_length = len(tokenizer(prompt)["input_ids"]) + response_length = len(tokenizer(response)["input_ids"]) + new_entry = TraceEntry(prompt, response, prompt_length, response_length) + partition.eval_entries.append(new_entry) + trace_metadata.max_prompt_length = max(trace_metadata.max_prompt_length, prompt_length) + trace_metadata.min_prompt_length = min(trace_metadata.min_prompt_length, prompt_length) + trace_metadata.avg_prompt_length += prompt_length + trace_metadata.max_response_length = max(trace_metadata.max_response_length, response_length) + trace_metadata.min_response_length = min(trace_metadata.min_response_length, response_length) + trace_metadata.avg_response_length += response_length + trace_metadata.max_total_length = max(trace_metadata.max_total_length, prompt_length + response_length) + trace_metadata.avg_prompt_length /= len(partition.eval_entries) + trace_metadata.avg_response_length /= len(partition.eval_entries) + trace_metadata.avg_entries_per_partition = len(partition.eval_entries) + + trace.partitions.append(partition) + trace.metadata = trace_metadata + + return trace + +def save_trace(trace: Trace, output_path: str): + """ + Save a Trace instance to a JSON file. + + Args: + trace (Trace): The trace to save. + output_path (str): The path where the JSON file will be saved. 
+ """ + # Convert the Trace instance to a dictionary + trace_dict = asdict(trace) + + # Save the dictionary as a JSON file + with open(output_path, 'w') as f: + json.dump(trace_dict, f, indent=2) + + print(f"Trace saved to {output_path}") + +if __name__ == "__main__": + # Change directory to that holding this script + os.chdir(os.path.dirname(os.path.abspath(__file__))) + + num_entries=125 + num_warmup_requests=8 + seed=42 + + trace = build_trace("meta-llama/Llama-3.1-70B-Instruct", num_entries, num_warmup_requests, seed) + print(trace.metadata) + # Save prompts list to a json file + save_trace(trace, "sharegpt.json") \ No newline at end of file diff --git a/benchmarking/get_wildchat_trace.py b/benchmarking/get_wildchat_trace.py new file mode 100644 index 000000000..53ee46efb --- /dev/null +++ b/benchmarking/get_wildchat_trace.py @@ -0,0 +1,64 @@ +import datasets +from transformers import AutoTokenizer +from tqdm import tqdm +import json, os + +def build_trace(dataset: datasets.Dataset, model_name: str, num_entries: int, seed: int): + tokenizer = AutoTokenizer.from_pretrained(model_name) + + dataset = dataset["train"].filter( + lambda x: x["model"] == "gpt-4" and x["turn"] == 1 and x["language"] == "English" + ).shuffle(seed=seed).select(range(num_entries)) + pairs = [] + for row in dataset: + assert len(row["conversation"]) == 2 + assert row["conversation"][0]["role"] == "user" + assert row["conversation"][1]["role"] == "assistant" + pairs.append(( + row["conversation"][0]["content"], + row["conversation"][1]["content"], + )) + + prompts = [] + avg_prompt_length = 0 + min_prompt_length = float("inf") + max_prompt_length = 0 + avg_response_length = 0 + min_response_length = float("inf") + max_response_length = 0 + max_total_length = 0 + for prompt, response in tqdm(pairs, desc="Processing HF trace"): + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + prompt_length = len(tokenizer(prompt)["input_ids"]) + response_length = len(tokenizer(response)["input_ids"]) + prompts.append(prompt) + avg_prompt_length += prompt_length + avg_response_length += response_length + min_prompt_length = min(min_prompt_length, prompt_length) + min_response_length = min(min_response_length, response_length) + max_prompt_length = max(max_prompt_length, prompt_length) + max_response_length = max(max_response_length, response_length) + max_total_length = max(max_total_length, prompt_length + response_length) + avg_prompt_length /= len(prompts) + avg_response_length /= len(prompts) + + return prompts, max_prompt_length, max_response_length, avg_prompt_length, avg_response_length, min_prompt_length, min_response_length, max_total_length + +if __name__ == "__main__": + # Change directory to that holding this script + os.chdir(os.path.dirname(os.path.abspath(__file__))) + + dataset = datasets.load_dataset("allenai/WildChat") + prompts, max_prompt_length, max_response_length, avg_prompt_length, avg_response_length, min_prompt_length, min_response_length, max_total_length = build_trace(dataset, "meta-llama/Llama-3.1-70B-Instruct", 250, 42) + print(f"Number of prompts: {len(prompts)}") + print(f"Prompt lengths: [{min_prompt_length} -> {max_prompt_length}] (avg: {avg_prompt_length})") + print(f"Response lengths: [{min_response_length} -> {max_response_length}] (avg: {avg_response_length})") + print(f"Max total length: {max_total_length}") + # Save prompts list to a json file + + with open("wildchat.json", "w") as f: + json.dump(prompts, f, 
indent=2) \ No newline at end of file diff --git a/benchmarking/plot_results.ipynb b/benchmarking/plot_results.ipynb new file mode 100644 index 000000000..c7dcff18c --- /dev/null +++ b/benchmarking/plot_results.ipynb @@ -0,0 +1,776 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/FlexFlow/inference/output\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import os\n", + "os.chdir(\"/usr/FlexFlow/inference/output\")\n", + "print(os.getcwd())" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "small_model_names = [\n", + " \"Zhuominc/Llama-3-330M\",\n", + " \"meta-llama/Llama-3.2-1B-Instruct\",\n", + " # \"meta-llama/Llama-3.2-3B-Instruct\",\n", + " \"meta-llama/Llama-3.1-8B-Instruct\",\n", + "]\n", + "batch_sizes=[4,8]\n", + "arrival_rates=[\"offline\", \"1\", \"2\", \"4\", \"8\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def get_speculation_len(filepath):\n", + " df = pd.read_csv(filepath)\n", + " # remove entries where is_warmup_request is 1 or request_step_idx is < 0\n", + " df = df[(df[\"is_warmup_request\"] == 0) & (df[\"request_step_idx\"] >= 0)]\n", + " return df[\"num_speculated_tokens\"].mean()\n", + "\n", + "def get_accepted_len(filepath):\n", + " df = pd.read_csv(filepath)\n", + " # remove entries where is_warmup_request is 1 or request_step_idx is < 0\n", + " df = df[(df[\"is_warmup_request\"] == 0) & (df[\"request_step_idx\"] >= 0)]\n", + " return df[\"num_accepted_tokens\"].mean()\n", + "\n", + "def get_acceptance_rates(filepath):\n", + " df = pd.read_csv(filepath)\n", + " # remove entries where is_warmup_request is 1 or request_step_idx is < 0\n", + " df = df[(df[\"is_warmup_request\"] == 0) & (df[\"request_step_idx\"] >= 0)]\n", + " # group = df.groupby(\"request_guid\", as_index=False)\n", + " num_speculated_tokens = df[\"num_speculated_tokens\"].sum()\n", + " num_accepted_tokens = df[\"num_accepted_tokens\"].sum()\n", + " return num_accepted_tokens/num_speculated_tokens\n", + "\n", + "def get_tpot(filepath):\n", + " df = pd.read_csv(filepath)\n", + " # remove entries where is_warmup_request is 1 or request_step_idx is < 0\n", + " df = df[(df[\"is_warmup_request\"] == 0) & (df[\"request_step_idx\"] >= 0)]\n", + " group = df.groupby(\"request_guid\", as_index=False)\n", + " min_time = group[\"timestamp\"].min()[\"timestamp\"]\n", + " max_time = group[\"timestamp\"].max()[\"timestamp\"]\n", + " num_tokens = group[\"num_generated_tokens\"].sum()[\"num_generated_tokens\"]\n", + " tpots = (max_time - min_time) / num_tokens / 1000\n", + " return tpots.mean()\n", + "\n", + "def get_throughput(filepath):\n", + " df = pd.read_csv(filepath)\n", + " # remove entries where is_warmup_request is 1 or request_step_idx is < 0\n", + " df = df[(df[\"is_warmup_request\"] == 0) & (df[\"request_step_idx\"] >= 0)]\n", + " num_tokens = df[\"num_generated_tokens\"].sum()\n", + " total_time = df[\"timestamp\"].max() - df[\"timestamp\"].min() # in microseconds\n", + " total_time = total_time / 1000000 # convert to seconds\n", + " throughput = num_tokens / total_time # (tokens/sec)\n", + " return throughput\n", + "\n", + "def get_ttft(filepath):\n", + " df = pd.read_csv(filepath)\n", + " # remove entries where is_warmup_request is 1\n", + " df = df[(df[\"is_warmup_request\"] 
== 0)]\n", + " group = df.groupby(\"request_guid\", as_index=False)\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + " # convert to milliseconds from microseconds\n", + " return ttft.mean()[1] / 1000\n", + "\n", + "def get_queueing_time(filepath):\n", + " df = pd.read_csv(filepath)\n", + " # remove entries where is_warmup_request is 1\n", + " df = df[(df[\"is_warmup_request\"] == 0)]\n", + " group = df.groupby(\"request_guid\", as_index=False)\n", + " # in each group, find the difference between the timestampt at request_step_idx=-1 and the timestamp at request_step_idx=-2.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + " # convert to seconds from microseconds\n", + " return queueing_time.mean()[1] / 1000000\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA9wAAAPECAYAAABc1TPrAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACsUklEQVR4nOzdeXxM1//H8fdkMUlI7EEIiVC72rWU2PdYqlVULW1RS7X17V6K0qKtLqpFUZRYaqmuqrSxdaHW6rf2SilaSxGEiMz5/eGX+ZpOkIm5YvT1fDzyYM49c+9n7owb7zn3nmszxhgBAAAAAACv8svuAgAAAAAAuBURuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQC3hAYNGqhixYrZXUamzZo1S2XLllVgYKDy5MmT3eX4tAYNGqhBgwY3ZFvDhw+XzWbTsWPHbsj2AAC+jcANAP/w3nvvyWazqXbt2tldyk0nKipKNptNjz76qNuylStXymazaeHChdlQmW/ZsWOHevbsqZiYGE2ZMkXvv/9+pp739NNPy2az6b777rO4Qu/7/vvvNXz4cJ08efKGbzs9JF/r50aF9n+jzz77TLGxsQoPD1dISIhKliypTp066auvvnL2OXTokIYPH64tW7ZkX6EA4GUB2V0AANxs4uPjFRUVpfXr12vPnj0qVapUdpd005kyZYqee+45RUREZHcpPmnlypVyOBx6++23M/35MsZo7ty5ioqK0meffabTp08rNDTU4kq95/vvv9eIESPUs2fPGz6if/fdd7vs5zNnzqhfv37q0KGD7r77bmd7oUKFbmhd/xavv/66nnrqKcXGxuq5555TSEiI9uzZoxUrVmjevHlq0aKFpEuBe8SIEYqKilKVKlWyt2gA8BICNwBcZt++ffr++++1ePFi9e3bV/Hx8Ro2bNgNrcHhcOjChQsKCgq6odvNrAoVKmjnzp0aM2aMxo8fn93l3FDeem+OHDkiSR4Fz5UrV+qPP/7Qt99+q+bNm2vx4sXq0aPHddXxb1G5cmVVrlzZ+fjYsWPq16+fKleurG7dumVjZbcGY4zOnz+v4OBgt2UXL17UyJEj1bRpU3399dduy9P/LQDArYpTygHgMvHx8cqbN69at26te+65R/Hx8c5lqampypcvn3r16uX2vKSkJAUFBenJJ590tqWkpGjYsGEqVaqU7Ha7IiMj9fTTTyslJcXluTabTQMHDlR8fLwqVKggu93uPM3y9ddfV506dZQ/f34FBwerevXqGZ6yfe7cOQ0aNEgFChRQaGio2rZtq4MHD8pms2n48OEufQ8ePKgHH3xQhQoVkt1uV4UKFfTBBx9keh9FRUWpe/fumjJlig4dOnTVvj179lRUVJRbe/opvhnthwULFqh8+fIKDg7WnXfeqW3btkmSJk+erFKlSikoKEgNGjRQYmJihtvcuHGj6tSpo+DgYEVHR2vSpElufbzx3lzJe++95+wbERGhAQMGuJxGHRUV5fwSp2DBghm+RxmJj49X+fLl1bBhQzVp0sTls3m5gwcP6qGHHlJERITsdruio6PVr18/Xbhwwdnn5MmTeuKJJxQVFSW73a5ixYqpe/fuLtclZ2UflSlTRkFBQapevbpWr17t7DN8+HA99dRTkqTo6GjnKdyXv4ezZ89W9erVFRwcrHz58qlz5846cOCA2+t7//33FRMTo+DgYNWqVUtr1qy55r7LrG+//Vb16tVTzpw5lSdPHrVr107bt2+/5vN+//13lSpVShUrVtRff/0l6dI+fvzxxxUZGSm73a5SpUpp7NixcjgczuclJibKZrPp9ddfd74uu92umjVr6qeffnLZxp9//qlevXqpWLFistvtKlKkiNq1a3fFfwfpevbsqVy5cum3335T8+bNlTNnTkVEROill16SMcalr8Ph0FtvvaUKFSooKChIhQoVUt++fXXixAmXflFRUWrTpo2WLVumGjVqKDg4WJMnT85w+8eOHVNSUpLq1q2b4fLw8HBJl75QqlmzpiSpV69ezs/IjBkznH3XrVunFi1aKHfu3AoJCVFsbKy+++47l/WlH1t27NihTp06KSwsTPnz59djjz2m8+fPX3VfAYAlDADAqWzZsuahhx4yxhizevVqI8msX7/eufzBBx80efLkMSkpKS7PmzlzppFkfvrpJ2OMMWlpaaZZs2YmJCTEPP7442by5Mlm4MCBJiAgwLRr187luZJMuXLlTMGCBc2IESPMu+++azZv3myMMaZYsWKmf//+ZsKECeaNN94wtWrVMpLM559/7rKOTp06GUnmgQceMO+++67p1KmTuf32240kM2zYMGe/P//80xQrVsxERkaal156yUycONG0bdvWSDJvvvnmNfdPiRIlTOvWrc3evXtNQECAefTRR
53LEhISjCSzYMECZ1uPHj1MiRIl3NYzbNgw889fQZJM5cqVTWRkpBkzZowZM2aMyZ07tylevLiZMGGCKV++vBk3bpwZMmSIyZEjh2nYsKHL82NjY01ERIQJDw83AwcONOPHjzd33XWXkWSmTZvm7Oet9yYj6a+rSZMm5p133jEDBw40/v7+pmbNmubChQvGGGM+/vhj06FDByPJTJw40cyaNcts3br1qvv9/PnzJk+ePGbkyJHGGGM+/PBD4+/vbw4fPuzS7+DBgyYiIsL52iZNmmSGDh1qypUrZ06cOGGMMeb06dOmYsWKxt/f3/Tu3dtMnDjRjBw50tSsWdP52jzdRxUrVjQFChQwL730khk7dqwpUaKECQ4ONtu2bTPGGLN161bTpUsX5+ds1qxZZtasWebMmTPGGGNGjRplbDabue+++8x7771nRowYYQoUKGCioqKcdRtjzNSpU40kU6dOHTN+/Hjz+OOPmzx58piSJUua2NjYq+7Dyx09etTt38by5ctNQECAue2228yrr77qrCFv3rxm3759zn7p7/HRo0eNMcbs2bPHFC9e3FSpUsXZdvbsWVO5cmWTP39+8/zzz5tJkyaZ7t27G5vNZh577DHnuvbt22ckmapVq5pSpUqZsWPHmldffdUUKFDAFCtWzPmZMcaYOnXqmNy5c5shQ4aYqVOnmldeecU0bNjQrFq16qqvtUePHiYoKMiULl3aPPDAA2bChAmmTZs2RpIZOnSoS9+HH37YBAQEmN69e5tJkyaZZ555xuTMmdPl82vMpeNAqVKlTN68ec2zzz5rJk2aZBISEjLcflpamgkODjbVq1c3x48fv2Kdf/75p3nppZeMJNOnTx/nZ2Tv3r3GGGO++eYbkyNHDnPnnXeacePGmTfffNNUrlzZ5MiRw6xbt87t/alUqZKJi4szEyZMMN26dXMeHwHgRiNwA8D/27Bhg5Fkli9fbowxxuFwmGLFirn8B3nZsmVGkvnss89cntuqVStTsmRJ5+NZs2YZPz8/s2bNGpd+kyZNMpLMd99952yTZPz8/Mx///tft5qSk5NdHl+4cMFUrFjRNGrUyNm2ceNGI8k8/vjjLn179uzpFioeeughU6RIEXPs2DGXvp07dza5c+d2294/pQduY4zp1auXCQoKMocOHTLGeCdw2+12l3AzefJkI8kULlzYJCUlOdufe+45I8mlb2xsrJFkxo0b52xLSUkxVapUMeHh4c7A4K335p+OHDlicuTIYZo1a2bS0tKc7RMmTDCSzAcffOD2+tMD2rUsXLjQSDK7d+82xhiTlJRkgoKC3L4k6d69u/Hz83N+8XM5h8NhjDHmxRdfNJLM4sWLr9jH030kyWzYsMHZ9vvvv5ugoCDToUMHZ9trr73m9p4ZY0xiYqLx9/c3L7/8skv7tm3bTEBAgLP9woULJjw83FSpUsXlC6/333/fSLruwJ3+Obk8FG7dutX4+fmZ7t27O9suf++2b99uIiIiTM2aNc3ff//t7DNy5EiTM2dOs2vXLpftPvvss8bf39/s37/fGPO/wJ0/f36X53/yyScux5kTJ04YSea1117L9GtM16NHDyPJ5csxh8NhWrdubXLkyOH8DK5Zs8ZIMvHx8S7P/+qrr9zaS5QoYSSZr776KlM1pH/mcubMaVq2bGlefvlls3HjRrd+P/30k5Fkpk+f7tLucDhM6dKlTfPmzZ2fUWMuHR+jo6NN06ZNnW3p70/btm1d1tG/f38j6ZpfbgGAt3FKOQD8v/j4eBUqVEgNGzaUJOds0PPmzVNaWpokqVGjRipQoIDmz5/vfN6JEye0fPlyl5mjFyxYoHLlyqls2bI6duyY86dRo0aSpISEBJdtx8bGqnz58m41XX5N5IkTJ3Tq1CnVq1dPmzZtcrann+Lcv39/l+f+cyZxY4wWLVqkuLg4GWNc6mrevLlOnTrlst5rGTJkiC5evKgxY8Zk+jnX0rhxY5dT0NNniu/YsaPLBGHp7b/99pvL8wMCAtS3b1/n4xw5cqhv3746cuSINm7cKMl7780/rVixQhcuXNDjjz8uP7///Xrt3bu3wsLC9MUXX2RmF2QoPj5eNWrUcE78FRoaqtatW7ucVu5wOLRkyRLFxcWpRo0abutIP4V/0aJFuv3229WhQ4cr9vF0H915552qXr2683Hx4sXVrl07LVu2zPlv50oWL14sh8OhTp06uWyrcOHCKl26tHNbGzZs0JEjR/TII48oR44czuf37NlTuXPnvuo2ruXw4cPasmWLevbsqXz58jnbK1eurKZNm+rLL790e84vv/yi2NhYRUVFacWKFcqbN69z2YIFC1SvXj3lzZvX5TU1adJEaWlpLqfbS9J9993n8vx69epJ+t/nOzg4WDly5NDKlSvdTu/OrIEDBzr/nn4ZwIULF7RixQpnzblz51bTpk1daq5evbpy5crl9p5HR0erefPmmdr2iBEjNGfOHFWtWlXLli3TCy+8oOrVq6tatWqZOmV/y5Yt2r17t7p27arjx487azt79qwaN26s1atXu5yqL0kDBgxweZx+PMzovQQAKzFpGgBISktL07x589SwYUPt27fP2V67dm2NGzdO33zzjZo1a6aAgAB17NhRc+bMUUpKiux2uxYvXqzU1FSXwL17925t375dBQsWzHB7/5woKDo6OsN+n3/+uUaNGqUtW7a4XDt7+fXPv//+u/z8/NzW8c/Zr48ePaqTJ0/q/fffv+JtqDyZwKhkyZJ64IEH9P777+vZZ5/N9POupnjx4i6P04NUZGRkhu3/DB8RERHKmTOnS9ttt90m6dL1snfccYfX3pt/+v333yVJZcqUcWnPkSOHSpYs6VzuqZMnT+rLL7/UwIEDtWfPHmd73bp1tWjRIu3atUu33Xabjh49qqSkpGvei3zv3r3q2LHjVft4uo9Kly7t1ue2225TcnKyjh49qsKFC191W8aYDNchSYGBgZL+t3//2S8wMFAlS5a88ovJhCu9d5JUrlw5LVu2TGfPnnX5bMXFxalQoUJatmyZcuXK5fKc3bt36+eff870/vvn5z49fKd/vu12u8aOHav//Oc/KlSokO644w61adNG3bt3v+q+Tefn5+e2jy7/d5Fe86lTp5zXVF+r5sz+u0jXpUsXdenSRUlJSVq3bp1mzJihOXPmKC4uTr/88stVJyLcvXu3JF11ksBTp065fGnxz89JTEyM/Pz8rnnNOwB4G4EbAHRpsqTDhw9r3rx5mjdvntvy+Ph4NWvWTJLUuXNnTZ48WUuXLlX79u310UcfqWzZsrr99tud/R0OhypVqqQ33ngjw+39M0BmNLvvmjVr1LZtW9WvX1/vvfeeihQposDAQE2fPl1z5szx+DWmjwB169btiv9xvXwm58x44YUXNGvWLI0dO1bt27d3W/7PidHSXWnU09/f36N2849JnzLDG+/NjbRgwQKlpKRo3LhxGjdunNvy+Ph4jRgxwqvb9HQfXe+2bDabli5dmuH7/M8we7Po2LGjZs6cqfj4eJezKqRLr6lp06Z6+umnM3xu
ethNl5nP9+OPP664uDgtWbJEy5Yt09ChQzV69Gh9++23qlq16nW+mks1h4eHX3Eyvn9+eZDVfxdhYWFq2rSpmjZtqsDAQM2cOVPr1q1TbGzsVWuTpNdee+2Ktwu71ufkSsciALAagRsAdCm0hIeH691333VbtnjxYn388ceaNGmSgoODVb9+fRUpUkTz58/XXXfdpW+//VYvvPCCy3NiYmK0detWNW7cOMv/0Vu0aJGCgoK0bNky2e12Z/v06dNd+pUoUUIOh0P79u1zGdW5fDRUuvQf5tDQUKWlpalJkyZZqumfYmJi1K1bN02ePNl5mvfl8ubN6zJDd7qsjvZey6FDh9xGInft2iVJzlPVvfHeZKREiRKSpJ07d7qMJl64cEH79u3L8j6Pj49XxYoVM7w93eTJkzVnzhyNGDFCBQsWVFhYmH755Zerri8mJiZTfTzZR+kjkJfbtWuXQkJCnEHtSuuJiYmRMUbR0dFuQfRy6ft39+7dzlPbpUt3D9i3b5/LF16euvy9+6cdO3aoQIECbmdOvPbaawoICFD//v0VGhqqrl27urymM2fOeO3f2eXr/c9//qP//Oc/2r17t6pUqaJx48Zp9uzZV32ew+HQb7/95rJ/M/p3sWLFCtWtW/eGfclUo0YNzZw5U4cPH5Z09c+IdCmsZ3af7t6922UUfs+ePXI4HBneNQEArMQ13AD+9c6dO6fFixerTZs2uueee9x+Bg4cqNOnT+vTTz+VdOn0zHvuuUefffaZZs2apYsXL7qcTi5JnTp10sGDBzVlypQMt3f27Nlr1uXv7y+bzeYyGpyYmKglS5a49Eu/jvK9995zaX/nnXfc1texY0ctWrQow8B19OjRa9aUkSFDhig1NVWvvvqq27KYmBidOnVKP//8s7Pt8OHD+vjjj7O0rWu5ePGiy+2JLly4oMmTJ6tgwYLOa4y98d5kpEmTJsqRI4fGjx/vMjI5bdo0nTp1Sq1bt/Z4nQcOHNDq1avVqVOnDD+bvXr10p49e7Ru3Tr5+fmpffv2+uyzz7Rhwwa3daXX1LFjR23dujXD9yC9j6f76IcffnC5/v/AgQP65JNP1KxZM+fobXpg/ecXMHfffbf8/f01YsQItzMWjDE6fvy4pEvhrGDBgpo0aZLLLc5mzJiR4Zc6nihSpIiqVKmimTNnuqzrl19+0ddff61WrVq5Pcdms+n999/XPffcox49ejiPD9Kl/ffDDz9o2bJlbs87efKkLl686FF9ycnJbre0iomJUWhoqNtt2q5kwoQJzr8bYzRhwgQFBgaqcePGzprT0tI0cuRIt+devHgxy/s4OTlZP/zwQ4bLli5dKul/p/Jf6TNSvXp1xcTE6PXXX9eZM2fc1pPRseufX56mHw9btmzp2QsAgOvECDeAf71PP/1Up0+fVtu2bTNcfscdd6hgwYKKj493Buv77rtP77zzjoYNG6ZKlSqpXLlyLs954IEH9NFHH+mRRx5RQkKC6tatq7S0NO3YsUMfffSR8/61V9O6dWu98cYbatGihbp27aojR47o3XffValSpVwCbPXq1dWxY0e99dZbOn78uO644w6tWrXKOYJ1+ajRmDFjlJCQoNq1a6t3794qX768/v77b23atEkrVqzQ33//7fH+Sx/lnjlzptuyzp0765lnnlGHDh00aNAgJScna+LEibrttts8mqAtsyIiIjR27FglJibqtttu0/z587Vlyxa9//77zmuBvfHeZKRgwYJ67rnnNGLECLVo0UJt27bVzp079d5776lmzZrq1q2bx+ucM2eOjDFX/Gy2atVKAQEBio+PV+3atfXKK6/o66+/VmxsrPr06aNy5crp8OHDWrBggdauXas8efLoqaee0sKFC3XvvffqwQcfVPXq1fX333/r008/1aRJk3T77bd7vI8qVqyo5s2ba9CgQbLb7c4vfy4/1T39C48XXnhBnTt3VmBgoOLi4hQTE6NRo0bpueeeU2Jiotq3b6/Q0FDt27dPH3/8sfr06aMnn3xSgYGBGjVqlPr27atGjRrpvvvu0759+zR9+vTrvoZbujRi3bJlS91555166KGHdO7cOb3zzjvKnTv3Fe+T7ufnp9mzZ6t9+/bq1KmTvvzySzVq1EhPPfWUPv30U7Vp00Y9e/ZU9erVdfbsWW3btk0LFy5UYmKiChQokOnadu3apcaNG6tTp04qX768AgIC9PHHH+uvv/5S586dr/n8oKAgffXVV+rRo4dq166tpUuX6osvvtDzzz/vPAMhNjZWffv21ejRo7VlyxY1a9ZMgYGB2r17txYsWKC3335b99xzT6ZrTpecnKw6derojjvuUIsWLRQZGamTJ09qyZIlWrNmjdq3b+88JT4mJkZ58uTRpEmTFBoaqpw5c6p27dqKjo7W1KlT1bJlS1WoUEG9evVS0aJFdfDgQSUkJCgsLEyfffaZy3b37duntm3bqkWLFvrhhx80e/Zsde3a9brOhACALMmWudEB4CYSFxdngoKCzNmzZ6/Yp2fPniYwMNB5Oy2Hw2EiIyONJDNq1KgMn3PhwgUzduxYU6FCBWO3203evHlN9erVzYgRI8ypU6ec/SSZAQMGZLiOadOmmdKlSxu73W7Kli1rpk+fnuEttc6ePWsGDBhg8uXLZ3LlymXat29vdu7caSSZMWPGuPT966+/zIABA0xkZKQJDAw0hQsXNo0bNzbvv//+NffV5bcFu9zu3buNv7+/223BjDHm66+/NhUrVjQ5cuQwZcqUMbNnz77ibcH+uR/Sb5v0z9shZXQLstjYWFOhQgWzYcMGc+edd5qgoCBTokQJM2HCBLd6vfHeXMmECRNM2bJlTWBgoClUqJDp16+fy72kjcn8bcEqVapkihcvftU+DRo0MOHh4SY1NdUYc+mWXN27dzcFCxY0drvdlCxZ0gwYMMDlVlrHjx83AwcONEWLFjU5cuQwxYoVMz169HC5XZyn+2j27NnOz2rVqlUzvC/zyJEjTdGiRY2fn5/bLcIWLVpk7rrrLpMzZ06TM2dOU7ZsWTNgwACzc+dOl3W89957Jjo62tjtdlOjRg2zevVqExsbe923BTPGmBUrVpi6deua4OBgExYWZuLi4syvv/7q0iej9y45OdnExsaaXLlymR9//NEYc+l+588995wpVaqUyZEjhylQoICpU6eOef311523qLvS5zt9v6bXd+zYMTNgwABTtmxZkzNnTpM7d25Tu3Zt89FHH13ztfbo0cPkzJnT7N2713lv9UKFCplhw4a53L4u3fvvv2+qV69ugoODTWhoqKlUqZJ5+umnnbf/M+bKx4GMpKammilTppj27dubEiVKGLvdbkJCQkzVqlXNa6+95vK5NObSLdHKly9vAgIC3G4RtnnzZnP33Xeb/PnzG7vdbkqUKGE6depkvvnmG2ef9Pfn119/Nffcc48JDQ01efPmNQMHDjTnzp3LVM0A4E02Y7Iw4wwA4Ka3ZcsWVa1aVbNnz9b999+f3eXgFmWz2TRgwACXU5Zx8+jZs6cWLlyY4anYt6Lhw4drxIgROnr
0qEdnEQCAVbiGGwBuAefOnXNre+utt+Tn56f69etnQ0UAAADgGm4AuAW8+uqr2rhxoxo2bKiAgAAtXbpUS5cuVZ8+fbx6CycAAABkHoEbAG4BderU0fLlyzVy5EidOXNGxYsX1/Dhw91uVwYAAIAbh2u4AQAAAACwANdwAwAAAABgAQI3AAAAAAAWIHADAAAAAGABAjcA4KpmzJghm82mDRs2XLFPYmKibDabXn/99auuKyoqSjabTU2aNMlw+ZQpU2Sz2a65vasZPny4bDabjh07dsU+K1eulM1m08KFCzO93k6dOslms+mZZ5656jptNptmz56dYZ+6devKZrOpYsWKGS5PS0tTRESEbDabli5dmunaJOmJJ55QtWrVlC9fPoWEhKhcuXIaPnx4pu+/PHHiRN17770qXry4bDabevbs6dH209/bjH5Kly7t1n/atGkqV66cgoKCVLp0ab3zzjtufXr27OmynoCAAEVGRqpz58769ddfM1VXZj6/1+PXX3/V8OHDlZiYaMn6faUGAEDGmKUcAHBDBQUFKSEhQX/++acKFy7ssiw+Pl5BQUE6f/58NlWXsaSkJH322WeKiorS3LlzNWbMGNlstgz7BgUFac6cOerWrZtLe2Jior7//nsFBQVdcTvffvutDh8+rKioKMXHx6tly5aZrvGnn35SvXr11KtXLwUFBWnz5s0aM2aMVqxYodWrV8vP7+rfsY8dO1anT59WrVq1dPjw4UxvN91bb73lFu5///13DRkyRM2aNXNpnzx5sh555BF17NhRgwcP1po1azRo0CAlJye7faFht9s1depUSdLFixe1d+9eTZo0SV999ZV+/fVXRUREeFyrN/36668aMWKEGjRooKioqH9tDQCAjBG4AQA3VN26dfXTTz9p/vz5euyxx5ztf/zxh9asWaMOHTpo0aJF2Vihu0WLFiktLU0ffPCBGjVqpNWrVys2NjbDvq1atdKnn36qY8eOqUCBAs72OXPmqFChQipdurROnDiR4XNnz56tatWqqUePHnr++ed19uxZ5cyZM1M1rl271q0tJiZGTz75pNavX6877rjjqs9ftWqVc3Q7V65cmdrm5dq3b+/WNmrUKEnS/fff72w7d+6cXnjhBbVu3dp5hkHv3r3lcDg0cuRI9enTR3nz5nX2DwgIcPvy4o477lCbNm30xRdfqHfv3h7Xml2MMTp//ryCg4OzuxQAwA3CKeUAgBsqKChId999t+bMmePSPnfuXOXNm1fNmzd3e05qaqp27NiRpZFXb4iPj1fTpk3VsGFDlStXTvHx8Vfs265dO9ntdi1YsMClfc6cOerUqZP8/f0zfN65c+f08ccfq3PnzurUqZPOnTunTz755LrqTh/tPHny5DX7lihR4oqj9lk1Z84cRUdHq06dOs62hIQEHT9+XP3793fpO2DAAJ09e1ZffPHFNdebfmZEQEDWxg169uypXLly6eDBg2rfvr1y5cqlggUL6sknn1RaWppL33nz5ql69eoKDQ1VWFiYKlWqpLffflvSpdPV7733XklSw4YNnae+r1y5UtKl/d+mTRstW7ZMNWrUUHBwsCZPnuy8BGPGjBlutdlsNg0fPtyl7eDBg3rooYcUEREhu92u6Oho9evXTxcuXLhmDQCA7EXgBgDccF27dtX69eu1d+9eZ9ucOXN0zz33KDAw0K3/wYMHVa5cOT333HM3skxJ0qFDh5SQkKAuXbpIkrp06aKFCxfqwoULGfYPCQlRu3btNHfuXGfb1q1b9d///lddu3a94nY+/fRTnTlzRp07d1bhwoXVoEGDqwb7jFy8eFHHjh3ToUOH9PXXX2vIkCEKDQ1VrVq1PFqPN2zevFnbt293e82bN2+WJNWoUcOlvXr16vLz83Muv9yxY8d07Ngx/fXXX/rhhx/0xBNPKH/+/GrTpk2W60tLS1Pz5s2VP39+vf7664qNjdW4ceP0/vvvO/ssX75cXbp0Ud68eTV27FiNGTNGDRo00HfffSdJql+/vgYNGiRJev755zVr1izNmjVL5cqVc65j586d6tKli5o2baq3335bVapU8ajOQ4cOqVatWpo3b57uu+8+jR8/Xg888IBWrVql5OTkTNUAAMg+nFIOALjhGjVqpMKFC2vu3LkaMmSItm/fri1btujtt9/Wb7/9lt3luZg7d67sdrvatWsnSercubNefPFFffnllxmeRi1d+kIhLi5OBw4cUGRkpOLj41WyZMmrntY9e/Zs1alTR5GRkc7t9O/fX0ePHlXBggUzVeuGDRt05513Oh+XKVNGn376qfLly5fJV+s96V8WXH46uSQdPnxY/v7+Cg8Pd2nPkSOH8ufPr0OHDrm0nz171u31Fy1aVF9//XWm90tGzp8/r/vuu09Dhw6VJD3yyCOqVq2apk2bpn79+kmSvvjiC4WFhWnZsmUZnplQsmRJ1atXT+PHj1fTpk3VoEEDtz579uzRV1995XLmhieTmz333HP6888/tW7dOpcvKV566SUZY5QnT55r1gAAyD6McAMAbjh/f3916tTJOQocHx+vyMhI1atXL8P+UVFRMsZkeAqu1eLj49W6dWuFhoZKkkqXLq3q1atfdfS5WbNmypcvn+bNmydjjObNm+ccIc/I8ePHtWzZMpc+HTt2lM1m00cffZTpWsuXL6/ly5dryZIlevrpp5UzZ85Mz1LuTQ6HQ/PmzVPVqlXdRlrPnTunHDlyZPi8oKAgnTt3zq1t+fLlWr58uZYtW6bJkycrV65catWqlXbt2nVddT7yyCMuj+vVq+fyhU+ePHl09uxZLV++PMvbiI6OzvAyicxwOBxasmSJ4uLi3M4IkOT1SwAAAN7HCDcAIFt07dpV48eP19atWzVnzhx17tz5pgsQ27dv1+bNm9W9e3ft2bPH2d6gQQO9++67SkpKUlhYmNvzAgMDde+992rOnDmqVauWDhw4cNXTyefPn6/U1FRVrVrVZTu1a9dWfHy8BgwYIEn6+++/XU5lDw4OVu7cuZ2Pw8LCnLdca9eunebMmaN27dpp06ZNuv3227O+I/7fuXPndOrUKZe2f840L12agO3gwYN64okn3JYFBwdf8XT8jCYU8/f3d7uNXKtWrVS6dGk999xzzgntjh496tInX758Vwz20qUg/88R8rx587pMaNe/f3999NFHatmypYoWLapmzZqpU6dOatGixRXX+0/R0dGZ7vtPR48eVVJS0hVvIwcAuPkxwg0AyBa1a9dWTEyMHn/8ce3bt++qgTS7pN9P+4knnlDp0qWdP+PGjdP58+evOpt6165dtWXLFg0fPly33367ypcvf8W+6aPldevWddnO2rVr9cMPPzhHXe+++24VKVLE+XP5LO8ZufvuuyVdmvjLG+bPn++y/SJFilzx9fj5+WU4ql+kSBGlpaXpyJEjLu0XLlzQ8ePHM3Wbr2LFiqlMmTJavXq1JOnAgQNudX3//fdXXceVJq+7XHh4uLZs2aJPP/1Ubdu2VUJCglq2bKkePXpc87npMpqR/EpfLP1zwjYAgO9jhBsAkG
26dOmiUaNGqVy5ch5PJmU1Y4zmzJmjhg0bus2oLUkjR45UfHy8evXqleHz77rrLhUvXlwrV67U2LFjr7idffv26fvvv9fAgQPdbjXmcDj0wAMPaM6cORoyZIjGjRvnMgJ7rXCakpIih8PhNiqdVc2bN7/m6dUpKSlatGiRGjRokGF96e/zhg0b1KpVK2f7hg0b5HA4Mv05uHjxovN0+cKFC7vV5Y0RfenSteVxcXGKi4uTw+FQ//79NXnyZA0dOlSlSpXK0lkZ6bc9++fs8b///rvL44IFCyosLEy//PLLVdd3s50ZAgD4HwI3ACDbPPzww/L391ft2rWv2i81NVV79+5V7ty5rziq6m3fffedEhMT9dJLL+mee+5xW75r1y4NHTpUhw4dyjBY2mw2jR8/Xps3b9YDDzxwxe2kj24//fTTzgnTLjd16lTFx8dryJAhql69eobrOHnypHLmzOk2w/vUqVMluc4InpycrP3796tAgQIu9wnPjKuNaqf78ssvdfLkSbfJ0tI1atRI+fLl08SJE10C98SJExUSEqLWrVtfs45du3Zp586dzv0RFBTkdtq5Nxw/flz58+d3Pvbz81PlypUlXfpiQZLzPumZufVaurCwMBUoUECrV6/W448/7mx/7733XPr5+fmpffv2mj17tjZs2OB2HbcxRjabLUs1AABuDAI3ACBTPvjgA3311Vdu7Zef1vzNN9/o/Pnzbn3at2+f4XWoJUqUcLvncEbSbwvWo0ePTE+c9sYbbygkJMSlzc/PT88//7zz8aJFi7Rjxw635/bo0UPx8fHy9/e/YgBs27atXnjhBc2bN0+DBw/OsE+7du2cs5tfSXx8vKpUqZJh2E7fzqOPPqpNmzapWrVqGfZZuXKlBg0apHvuuUelS5fWhQsXtGbNGi1evFg1atRQt27dnH3Xr1+vhg0batiwYS77/rPPPtPWrVslXfqC4+eff9aoUaOcNaQHzWuJj4+X3W5Xx44dM1weHByskSNHasCAAbr33nvVvHlzrVmzRrNnz9bLL7/sNqP6xYsXnaf2OxwOJSYmatKkSXI4HBo2bFimasqqhx9+WH///bcaNWqkYsWK6ffff9c777yjKlWqOCeDq1Klivz9/TV27FidOnVKdrtdjRo1cpuFPaN1jxkzRg8//LBq1Kih1atXZzgJ3CuvvKKvv/5asbGx6tOnj8qVK6fDhw9rwYIFWrt2rfLkyZPlGgAA1iNwAwAyZeLEiRm29+zZ0/n3r776KsNQHhUVdcMnfho9erRbm7+/v0vgvtK1zbGxsVqwYIHq1KlzxVtqVaxYUdHR0Zo9e/YVA/e1bNq0STt27HDemiojcXFxevTRRzV79uwrBu5KlSqpYcOG+uSTT3T48GEZYxQTE6MXX3xRTz311FUnD0u3aNEizZw50/l48+bNzntiFytWLFOBOykpSV988YVat27tMpnbP/Xv31+BgYEaN26cPv30U0VGRurNN9/M8Jr0lJQUlzMEwsLCVLNmTc2aNUuNGze+Zk3Xo1u3bnr//ff13nvv6eTJkypcuLDuu+8+DR8+XH5+l6bBKVy4sCZNmqTRo0froYceUlpamhISEq4Zdl988UUdPXpUCxcudE7MtnTpUrfnFS1aVOvWrdPQoUMVHx+vpKQkFS1aVC1btnR+oZTVGgAA1rMZY0x2FwEAAAAAwK2GWcoBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAMhGM2bMkM1m04YNGyzfls1m0/Dhwy3fDgAAuITADQD410gPt5f/hIeHq2HDhlq6dGmW1/vKK69oyZIl3ivUQ2vXrlXLli1VtGhRBQUFqXjx4oqLi9OcOXOyrSZva9q0qWw2mwYOHHhd69m4caPatGmjwoULK1euXKpcubLGjx+vtLQ0L1UKAMD/BGR3AQAA3GgvvfSSoqOjZYzRX3/9pRkzZqhVq1b67LPP1KZNG4/X98orr+iee+5R+/btvV/sNSxYsED33XefqlSposcee0x58+bVvn37tHr1ak2ZMkVdu3Z19j137pwCAnzvV//ixYv1ww8/XPd6Nm7cqDp16qh06dJ65plnFBISoqVLl+qxxx7T3r179fbbb3uhWgAA/sf3fusCAHCdWrZsqRo1ajgfP/TQQypUqJDmzp2bpcCdnYYPH67y5cvrxx9/VI4cOVyWHTlyxOVxUFDQjSzNK86fP6///Oc/euaZZ/Tiiy9e17omT54sSVq9erXy5csnSerbt69iY2M1Y8YMAjcAwOs4pRwA8K+XJ08eBQcHu43+vv7666pTp47y58+v4OBgVa9eXQsXLnTpY7PZdPbsWc2cOdN5mnrPnj2dyw8ePKiHHnpIERERstvtio6OVr9+/XThwgWX9aSkpGjw4MEqWLCgcubMqQ4dOujo0aPXrH3v3r2qWbOmW9iWpPDwcLda06/hTkxMdDu9/vKfy61bt04tWrRQ7ty5FRISotjYWH333XcufU6fPq3HH39cUVFRstvtCg8PV9OmTbVp0yZnn+TkZO3YsUPHjh275utK9+qrr8rhcOjJJ5/M9HOuJCkpSUFBQcqTJ49Le5EiRRQcHHzd6wcA4J8Y4QYA/OucOnVKx44dkzFGR44c0TvvvKMzZ86oW7duLv3efvtttW3bVvfff78uXLigefPm6d5779Xnn3+u1q1bS5JmzZqlhx9+WLVq1VKfPn0kSTExMZKkQ4cOqVatWjp58qT69OmjsmXL6uDBg1q4cKGSk5NdQvKjjz6qvHnzatiwYUpMTNRbb72lgQMHav78+Vd9LSVKlNA333yjP/74Q8WKFcv0PihYsKBmzZrl0paamqonnnjCpa5vv/1WLVu2VPXq1TVs2DD5+flp+vTpatSokdasWaNatWpJkh555BEtXLhQAwcOVPny5XX8+HGtXbtW27dvV7Vq1SRJ69evV8OGDTVs2LBMTd62f/9+jRkzRh988IFXAnGDBg00f/589e3bV4MHD3aeUr548WK99tpr171+AADcGAAA/iWmT59uJLn92O12M2PGDLf+ycnJLo8vXLhgKlasaBo1auTSnjNnTtOjRw+353fv3t34+fmZn376yW2Zw+FwqalJkybONmOMeeKJJ4y/v785efLkVV/TtGnTjCSTI0cO07BhQzN06FCzZs0ak5aW5tZXkhk2bNgV19W/f3/j7+9vvv32W2eNpUuXNs2bN3epLTk52URHR5umTZs623Lnzm0GDBhw1VoTEhKuWcPl7rnnHlOnTh2X+q+1jau5ePGiGThwoAkMDHS+9/7+/mbixIlZXicAAFfDCDcA4F/n3Xff1W233SZJ+uuvvzR79mw9/PDDCg0N1d133+3sd/mo6okTJ5SWlqZ69epp7ty519yGw+HQkiVLFBcX53K9eLp/nrbdp08fl7Z69erpzTff1O+//67KlStfcTsPPvigihYtqjfeeEMJCQlKSEjQyJEjVbJkSc2aNUt16tS5Zq2S9OGHH+q9997TuHHj1LBhQ0nSli1btHv3b
g0ZMkTHjx936d+4cWPNmjVLDodDfn5+ypMnj9atW6dDhw4pIiIiw200aNBAxphM1ZOQkKBFixZp3bp1meqfGf7+/oqJiVHz5s117733KigoSHPnztWjjz6qwoULZ8ukdwCAWxuBGwDwr1OrVi2XENylSxdVrVpVAwcOVJs2bZynVH/++ecaNWqUtmzZopSUFGf/f4bljBw9elRJSUmqWLFipmoqXry4y+O8efNKuhT0r6V58+Zq3ry5kpOTtXHjRs2fP1+TJk1SmzZttGPHDrdruf9py5YteuSRR9SlSxcNHjzY2b57925JUo8ePa743FOnTilv3rx69dVX1aNHD0VGRqp69epq1aqVunfvrpIlS16z/n+6ePGiBg0apAceeEA1a9b0+PlXMmbMGL399tvavXu3cuXKJUnq1KmTGjZsqAEDBqhNmzY+OYs7AODmxaRpAIB/PT8/PzVs2FCHDx92hsw1a9aobdu2CgoK0nvvvacvv/xSy5cvV9euXTM9SusJf3//DNs92VZISIjq1aunCRMmaMiQITpx4sQ17y9+4sQJdezYUbfddpumTp3qsszhcEiSXnvtNS1fvjzDn8uD62+//aZ33nlHEREReu2111ShQoUs3d/8ww8/1M6dO9W3b18lJiY6f6RLk7MlJiYqOTnZ4/W+9957atSokbPmdG3bttWhQ4ec2wAAwFv4GhcAAF0aVZWkM2fOSJIWLVqkoKAgLVu2THa73dlv+vTpbs/NaMS7YMGCCgsL0y+//GJRxVeXPoJ/+PDhK/ZxOBy6//77dfLkSa1YsUIhISEuy9MnfwsLC1OTJk2uuc0iRYqof//+6t+/v44cOaJq1arp5ZdfVsuWLT2qff/+/UpNTVXdunXdln344Yf68MMP9fHHH3t8Cvhff/2ltLQ0t/bU1FRJ//sMAADgLYxwAwD+9VJTU/X1118rR44cKleunKRLI842m80loCUmJmrJkiVuz8+ZM6dOnjzp0ubn56f27dvrs88+04YNG9ye461R8m+++SbD9i+//FKSVKZMmSs+d8SIEVq2bJnmzp2r6Ohot+XVq1dXTEyMXn/9decXEZdLv21ZWlqaTp065bIsPDxcERERLqfiZ/a2YJ07d9bHH3/s9iNJrVq10scff6zatWtfdR0Zue2227R8+XKX69HT0tL00UcfKTQ01PkFAwAA3sIINwDgX2fp0qXasWOHJOnIkSOaM2eOdu/erWeffVZhYWGSpNatW+uNN95QixYt1LVrVx05ckTvvvuuSpUqpZ9//tllfdWrV9eKFSv0xhtvKCIiQtHR0apdu7ZeeeUVff3114qNjVWfPn1Urlw5HT58WAsWLNDatWvd7gedFe3atVN0dLTi4uIUExOjs2fPasWKFfrss89Us2ZNxcXFZfi8bdu2aeTIkapfv76OHDmi2bNnuyzv1q2b/Pz8NHXqVLVs2VIVKlRQr169VLRoUR08eFAJCQkKCwvTZ599ptOnT6tYsWK65557dPvttytXrlxasWKFfvrpJ40bN865zszeFqxs2bIqW7Zshsuio6PdRrYbNGigVatWXfNLjGeffVbdunVT7dq11adPHwUHB2vu3LnauHGjRo0apcDAwKs+HwAATxG4AQD/Oi+++KLz70FBQSpbtqwmTpyovn37OtsbNWqkadOmacyYMXr88ccVHR2tsWPHKjEx0S1wv/HGG+rTp4+GDBmic+fOqUePHqpdu7aKFi2qdevWaejQoYqPj1dSUpKKFi2qli1bup2+nVVTp07VJ598oo8++kiHDh2SMUYlS5bUCy+8oGeeeeaKk4AdP35cxhitWrVKq1atcluefk/yBg0a6IcfftDIkSM1YcIEnTlzRoULF1bt2rWd+yskJET9+/fX119/rcWLF8vhcKhUqVJ677331K9fP6+8zqtJr+la7r//fhUoUECjR4/Wa6+9pqSkJJUpU0aTJk1yee8BAPAWm7Fi5hcAAIAb4PTp08qXL5/eeustDRgwILvLAQDABddwAwAAn7V69WoVLVpUvXv3zu5SAABwwwg3AAAAAAAWYIQbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsIBP34fb4XDo0KFDCg0Nlc1my+5yAAAAAAC3OGOMTp8+rYiICPn5XX0M26cD96FDhxQZGZndZQAAAAAA/mUOHDigYsWKXbWPTwfu0NBQSZdeaFhYWDZXg1tZamqqvv76azVr1kyBgYHZXQ4AXDeOawBuNRzXcKMkJSUpMjLSmUevxqcDd/pp5GFhYQRuWCo1NVUhISEKCwvjAA7glsBxDcCthuMabrTMXNbMpGkAAAAAAFiAwA0AAAAAgAUI3AAAAAAAWMCnr+HOrLS0NKWmpmZ3GfBhqampCggI0Pnz55WWlpbl9QQGBsrf39+LlQEAAAC4Wd3SgdsYoz///FMnT57M7lLg44wxKly4sA4cOHDd93zPkyePChcuzL3jAQAAgFvcLR2408N2eHi4QkJCCDjIMofDoTNnzihXrlzXvLn9lRhjlJycrCNHjkiSihQp4s0SAQAAANxkbtnAnZaW5gzb+fPnz+5y4OMcDocuXLigoKCgLAduSQoODpYkHTlyROHh4ZxeDgAAANzCbtlJ09Kv2Q4JCcnmSgBX6Z9J5hUAAAAAbm23bOBOx2nkuNnwmQQAAAD+HW75wA3fEBUVpbfeeiu7y8iS4cOHq0qVKtldBgAAAICbDIH7JvbDDz/I399frVu3zu5SMnQjQ/LNEmr9/f21ZMmS7C4DAAAAgA+4ZSdNu5qoZ7+4YdtKHJP1sDxt2jQ9+uijmjZtmg4dOqSIiAgvVgYAAAAAsBIj3DepM2fOaP78+erXr59at26tGTNmuPX57LPPVLNmTQUFBalAgQLq0KGDc1lKSoqeeeYZRUZGym63q1SpUpo2bZpz+S+//KKWLVsqV65cKlSokB544AEdO3bMubxBgwYaOHCgBg4cqNy5c6tAgQIaOnSojDHO5b///rueeOIJ2Ww2l+uS165dq3r16ik4OFiRkZEaNGiQzp4961x+5MgRxcXFKTg4WNHR0YqPj7/u/XXgwAF16tRJefLkUb58+dSuXTslJiY6l/fs2VPt27fX66+/riJFiih//vwaMGCAy8Rlhw8fVuvWrZ11zZkzx2UUv3LlypKkDh06yGazKSoqyqWGWbNmKSoqSrlz51bnzp11+vTp635dAAAAAHwXgfsm9dFHH6ls2bIqU6aMunXrpg8++MAZdiXpiy++UIcOHdSqVStt3rxZ33zzjWrVquVc3r17d82dO1fjx4/X9u3bNXnyZOXKlUuSdPLkSTVq1EhVq1bVhg0b9NVXX+mvv/5Sp06dXGqYOXOmAgICtH79er399tt64403NHXqVEnS4sWLVaxYMb300ks6fPiwDh8+LEnau3evWrRooY4dO+rnn3/W/PnztXbtWg0cONC53p49e+rAgQNKSEjQwoUL9d577znvTZ0Vqampat68uUJDQ7VmzRp99913ypUrl1q0aKELFy44+yUkJGjv3r1KSEjQzJkz
NWPGDJcvMrp3765Dhw5p5cqVWrRokd5//32Xur799ltJ0vTp03X48GH99NNPzmV79+7VkiVL9Pnnn+vzzz/XqlWrNGbMmCy/JgAAAAC+7195SrkvmDZtmrp16yZJatGihU6dOqVVq1apQYMGkqSXX35ZnTt31ogRI5zPuf322yVJu3bt0kcffaTly5erSZMmkqSSJUs6+02YMEFVq1bVK6+84mz74IMPFBkZqV27dum2226TJEVGRurNN9+UzWZTmTJltG3bNr355pvq3bu38uXLJ39/f4WGhqpw4cLO9YwePVr333+/Hn/8cUlS6dKlNX78eMXGxmrixInav3+/li5dqvXr16tmzZrO11quXLks76v58+fL4XBo6tSpzpH26dOnK0+ePFq5cqWaNWsmScqbN68mTJggf39/lS1bVq1bt9Y333yj3r17a8eOHVqxYoV++ukn1ahRQ5I0depUlS5d2rmdAgUKSJLy5Mnj8pqlS/fpnjFjhkJDQyVJDzzwgL755hu9/PLLWX5dAAAAAHwbI9w3oZ07d2r9+vXq0qWLJCkgIED33XefyynhW7ZsUePGjTN8/pYtW+Tv76/Y2NgMl2/dulUJCQnKlSuX86ds2bKSLo3UprvjjjtcThW/8847tXv3bqWlpV2x9q1bt2rGjBku627evLkcDof27dun7du3KyAgQNWrV3c+p2zZssqTJ8+1d8xVtrlnzx6FhoY6t5kvXz6dP3/e5fVUqFBB/v7+zsdFihRxjmDv3LlTAQEBqlatmnN5qVKllDdv3kzVEBUV5Qzb/1w3AAAAgH8nRrhvQtOmTdPFixddJkkzxshut2vChAnKnTu3goODr/j8qy2TLl0fHhcXp7Fjx7otK1KkSNYL//919+3bV4MGDXJbVrx4ce3ateu61n+lbVavXj3Da8ELFizo/HtgYKDLMpvNJofD4ZUarFw3AAAAAN9E4L7JXLx4UR9++KHGjRvnPBU6Xfv27TV37lw98sgjqly5sr755hv16tXLbR2VKlWSw+HQqlWrnKeUX65atWpatGiRoqKiFBBw5Y/AunXrXB7/+OOPKl26tHOUOEeOHG6j3dWqVdOvv/6qUqVKZbjOsmXL6uLFi9q4caPzlPKdO3fq5MmTV6zjWqpVq6b58+crPDxcYWFhWVpHmTJldPHiRW3evNk5+r5nzx6dOHHCpV9gYOBVR/gBAAAAIB2nlN9kPv/8c504cUIPPfSQKlas6PLTsWNH52nlw4YN09y5czVs2DBt375d27Ztc45YR0VFqUePHnrwwQe1ZMkS7du3TytXrtRHH30kSRowYID+/vtvdenSRT/99JP27t2rZcuWqVevXi5hcv/+/Ro8eLB27typuXPn6p133tFjjz3mXB4VFaXVq1fr4MGDzhnOn3nmGX3//fcaOHCgtmzZot27d+uTTz5xTppWpkwZtWjRQn379tW6deu0ceNGPfzww9cclZekc+fOacuWLS4/e/fu1f33368CBQqoXbt2WrNmjfP1Dho0SH/88Uem9nvZsmXVpEkT9enTR+vXr9fmzZvVp08fBQcHu5xWHxUVpW+++UZ//vmnWxgHAAAAgMsRuG8y06ZNU5MmTZQ7d263ZR07dtSGDRv0888/q0GDBlqwYIE+/fRTValSRY0aNdL69eudfSdOnKh77rlH/fv3V9myZdW7d2/nrbkiIiL03XffKS0tTc2aNVOlSpX0+OOPK0+ePPLz+99Honv37jp37pxq1aqlAQMG6LHHHlOfPn2cy1966SUlJiYqJibGeep25cqVtWrVKu3atUv16tVT1apV9eKLL7qcHj99+nRFREQoNjZWd999t/r06aPw8PBr7ptdu3apatWqLj99+/ZVSEiIVq9ereLFi+vuu+9WuXLl9NBDD+n8+fMejXh/+OGHKlSokOrXr68OHTqod+/eCg0NVVBQkLPPa6+9puXLlysyMlJVq1bN9LoBAAAA/PvYzOX3mvIxSUlJyp07t06dOuUWrM6fP699+/YpOjraJTAhcxo0aKAqVao470H9b/THH38oMjJSK1asUMOGDZWUlKSwsDCXLyWygs8mgJtBamqqvvzyS7Vq1cptHgoA8EUc13CjXC2H/hPXcAP/79tvv9WZM2dUqVIlHT58WE8//bSioqJUv3797C4NAAAAgA8icAP/LzU1Vc8//7x+++03hYaGqk6dOoqPj1dgYCAzjgMAAADwGIEbGVq5cmV2l3DDNW/eXM2bN8/uMgAAAADcIpg0DQAAAAAAC2Rr4B4+fLhsNpvLT9myZbOzJAAAAAAAvCLbTymvUKGCVqxY4XwcEODdknx4EnbcovhMAgCAzIh69ovsLsGn2P2NXq0lVRy+TClptuwux6ckjmmd3SXcsrI9cAcEBKhw4cJeX2/6rQCSk5MVHBzs9fUDWZWcnCxJ3K4CAAAAuMVle+DevXu3IiIiFBQUpDvvvFOjR49W8eLFM+ybkpKilJQU5+OkpCRJl2aXTk1NdesfGhqqv/76Sw6HQyEhIbLZ+KYLWWOM0YULF3Tu3Lksf46MMUpOTtbRo0cVFhYmh8PB7OcAsk36782Mfn8CuDnY/TkrzhN2P+PyJzKP3wWe8WR/2Uw2nt+6dOlSnTlzRmXKlNHhw4c1YsQIHTx4UL/88otCQ0Pd+g8fPlwjRoxwa58zZ45CQkIy3EZoaKhCQ0Pl58f8cMh+DodDp0+f1unTp7O7FAAAAABZkJycrK5du+rUqVMKCwu7at9sDdz/dPLkSZUoUUJvvPGGHnroIbflGY1wR0ZG6tixY1d9oWlpabp48SLXziLLLl68qO+//1516tTJ8jwDNptNAQEB8vf393J1AOC51NRULV++XE2bNuUSF+AmVXH4suwuwafY/YxG1nBo6AY/pTg4s9UTvwzn1rieSEpKUoECBTIVuLP9lPLL5cmTR7fddpv27NmT4XK73S673e7WHhgYeNX/LPAfCVyv1NRUXbx4Ubly5eLzBOCWcq3foQCyDxN/ZU2Kw8a+8xC/Bzzjyf66qc6zPnPmjPbu3asiRYpkdykAAAAAAFyXbA3cTz75pFatWqXExER9//336tChg/z9/dWlS5fsLAsAAAAAgOuWraeU//HHH+rSpYuOHz+uggUL6q677tKPP/6oggULZmdZAAAAAABct2wN3PPmzcvOzQMAAAAAYJmb6hpuAAAAAABuFQRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAA
sQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACN03gHjNmjGw2mx5//PHsLgUAAAAAgOt2UwTun376SZMnT1blypWzuxQAAAAAALwi2wP3mTNndP/992vKlCnKmzdvdpcDAAAAAIBXZHvgHjBggFq3bq0mTZpkdykAAAAAAHhNQHZufN68edq0aZN++umnTPVPSUlRSkqK83FSUpIkKTU1VampqZbUCEhyfr74nAG4VXBcA25+dn+T3SX4FLufcfkTmcfvAs94sr+yLXAfOHBAjz32mJYvX66goKBMPWf06NEaMWKEW/vXX3+tkJAQb5cIuFm+fHl2lwAAXsVxDbh5vVoruyvwTSNrOLK7BJ/z5ZdfZncJPiU5OTnTfW3GmGz5CmjJkiXq0KGD/P39nW1paWmy2Wzy8/NTSkqKyzIp4xHuyMhIHTt2TGFhYTesdvz7pKamavny5WratKkCAwOzuxwAuG4c14CbX8Xhy7K7BJ9i9zMaWcOhoRv8lOKwZXc5PuWX4c2zuwSfkpSUpAIFCujUqVPXzKHZNsLduHFjbdu2zaWtV69eKlu2rJ555hm3sC1JdrtddrvdrT0wMJD/LOCG4LMG4FbDcQ24eaWkERqzIsVhY995iN8DnvFkf2Vb4A4NDVXFihVd2nLmzKn8+fO7tQMAAAAA4GuyfZZyAAAAAABuRdk6S/k/rVy5MrtLAAAAAADAKxjhBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxwU02aBgBAVkU9+0V2l+BT7P5Gr9aSKg5fxv1qPZQ4pnV2lwAA8BGMcAMAAAAAYAECNwAAAAAAFiBwAwAAAABgAQI3AAAAAAAWIHADAAAAAGABAjcAAAAAABYgcAMAAAAAYAECNwAAAAAAFiBwAwAAAABgAQI3AAAAAAAWIHADAAAAAGABAjcAAAAAABYgcAMAAAAAYAECNwAAAAAAFiBwAwAAAABgAQI3AAAAAAAWIHADAAAAAGABAjcAAAAAABYgcAMAAAAAYAECNwAAAAAAFiBwAwAAAABgAQI3AAAAAAAW8Dhwb9q0Sdu2bXM+/uSTT9S+fXs9//zzunDhgleLAwAAAADAV3kcuPv27atdu3ZJkn777Td17txZISEhWrBggZ5++mmvFwgAAAAAgC/yOHDv2rVLVapUkSQtWLBA9evX15w5czRjxgwtWrTI2/UBAAAAAOCTPA7cxhg5HA5J0ooVK9SqVStJUmRkpI4dO+bd6gAAAAAA8FEeB+4aNWpo1KhRmjVrllatWqXWrVtLkvbt26dChQp5vUAAAAAAAHyRx4H7rbfe0qZNmzRw4EC98MILKlWqlCRp4cKFqlOnjtcLBAAAAADAFwV4+oTKlSu7zFKe7rXXXpO/v79XigIAAAAAwNd5HLjTXbhwQUeOHHFez52uePHi110UAAAAAAC+zuPAvWvXLj300EP6/vvvXdqNMbLZbEpLS/NacQAAAAAA+CqPA3evXr0UEBCgzz//XEWKFJHNZrOiLgAAAAAAfJrHgXvLli3auHGjypYta0U9AAAAAADcEjyepbx8+fLcbxsAAAAAgGvwOHCPHTtWTz/9tFauXKnjx48rKSnJ5QcAAAAAAGThlPImTZpIkho3buzSzqRpAAAAAAD8j8eBOyEhwYo6AAAAAAC4pXgcuGNjY62oAwAAAACAW4rH13BL0po1a9StWzfVqVNHBw8elCTNmjVLa9eu9WpxAAAAAAD4Ko8D96JFi9S8eXMFBwdr06ZNSklJkSSdOnVKr7zyitcLBAAAAADAF3kcuEeNGqVJkyZpypQpCgwMdLbXrVtXmzZt8mpxAAAAAAD4Ko8D986dO1W/fn239ty5c+vkyZPeqAkAAAAAAJ/nceAuXLiw9uzZ49a+du1alSxZ0itFAQAAAADg6zwO3L1799Zjjz2mdevWyWaz6dChQ4qPj9eTTz6pfv36WVEjAAAAAAA+x+Pbgj377LNyOBxq3LixkpOTVb9+fdntdj355JN69NFHragRAAAAAACf43Hgvnjxol544QU99dRT2rNnj86cOaPy5csrV65cOnbsmAoUKGBFnQAAAAAA+BSPTynv3LmzjDHKkSOHypcvr1q1ailXrlz666+/1KBBAwtKBAAAAADA93gcuPfv36+HH37Ype3w4cNq0KCBypYt67XCAAAAAADwZR4H7i+//FLff/+9Bg8eLEk6dOiQGjRooEqVKumjjz7yeoEAAAAAAPgij6/hLliwoL7++mvdddddkqTPP/9c1apVU3x8vPz8PM7vAAAAAADckjwO3JIUGRmp5cuXq169emratKlmzZolm83m7doAAAAAAPBZmQrcefPmzTBQJycn67PPPlP+/PmdbX///bf3qgMAAAAAwEdlKnC/9dZbFpcBAAAAAMCtJVOBu0ePHlbXAQAAAADALSVL13CnpaVpyZIl2r59uySpQoUKatu2rfz9/b1aHAAAAAAAvsrjwL1nzx61atVKBw8eVJkyZSRJo0ePVmRkpL744gvFxMR4vUgAAAAAAHyNx/fxGjRokGJiYnTgwAFt2rRJmzZt0v79+xUdHa1BgwZZUSMAAAAAAD7H4xHuVatW6ccff1S+fPmcbfnz59eYMWNUt25drxYHAAAAAICv8niE22636/Tp027tZ86cUY4cObxSFAAAAAAAvi7TgXv16tVKTU1VmzZt1KdPH61bt07GGBlj9OOPP+qRRx5R27ZtrawVAAAAAACfkenA3bBhQ504cU
[... base64-encoded PNG data omitted: bar plot "Average Number of Accepted Tokens per Step" (LLM: LLAMA-3.1-70B-Instruct, batch sizes 4 and 8; x-axis: Model, y-axis: Average Number of Accepted Tokens) ...]",
+      "text/plain": [
+       "<Figure size 1200x800 with 1 Axes>"
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "accepted_lengths = []\n", + "\n", + "for ssm in small_model_names:\n", + " for batch_size in batch_sizes:\n", + " for arrival_rate in arrival_rates:\n", + " model_name = ssm.replace(\"/\", \"-\")\n", + " filepath = f\"/usr/FlexFlow/inference/output/specinfer_llm_meta-llama-Llama-3.1-70B-Instruct_ssm_{model_name}_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n", + " if os.path.exists(filepath):\n", + " accepted_lengths.append({\n", + " 'Model': model_name,\n", + " 'Batch Size': batch_size,\n", + " 'Arrival Rate': arrival_rate,\n", + " 'Accepted Length': get_accepted_len(filepath)\n", + " })\n", + "\n", + "accepted_df = pd.DataFrame(accepted_lengths)\n", + "\n", + "# # Create a bar plot\n", + "# fig, ax = plt.subplots(figsize=(12, 8))\n", + "# accepted_df.pivot_table(index=['Model', 'Batch Size'], columns='Arrival Rate', values='Accepted Length').plot(kind='bar', ax=ax)\n", + "# plt.title('Accepted Length by Model, Batch Size, and Arrival Rate')\n", + "# plt.ylabel('Accepted Length')\n", + "# plt.xlabel('Model and Batch Size')\n", + "# plt.legend(title='Arrival Rate')\n", + "# plt.show()\n", + "# Group by model and calculate the mean of accepted lengths\n", + "average_accepted_df = accepted_df.groupby('Model')['Accepted Length'].mean().reset_index()\n", + "\n", + "# Sort the dataframe by 'Accepted Length' in ascending order\n", + "average_accepted_df = average_accepted_df.sort_values(by='Accepted Length')\n", + "\n", + "# Create a bar plot\n", + "fig, ax = plt.subplots(figsize=(12, 8))\n", + "average_accepted_df.plot(x='Model', y='Accepted Length', kind='bar', ax=ax)\n", + "plt.title('Average Number of Accepted Tokens per Step\\nLLM: LLAMA-3.1-70B-Instruct\\nBatch Sizes: 4, 8')\n", + "plt.ylabel('Average Number of Accepted Tokens')\n", + "plt.xlabel('Model')\n", + "plt.grid(True) # Turn the grid on\n", + "\n", + "# Save the plot as a PDF\n", + "plt.savefig('/usr/FlexFlow/benchmarking/average_accepted_tokens.pdf', bbox_inches='tight')\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAB8UAAAHvCAYAAADNQw6XAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1gUV9sG8HupSwcVRBABkfiCXewNsCEidrELRmOPGjUajQXFGnvXaAKKEOy9koixJZbYYi8RNTZEpQhSd74/+HbisksVXBbv33VxwZ45M/PM7DqP8Mw5IxEEQQAREREREREREREREREREVEppKXuAIiIiIiIiIiIiIiIiIiIiIoLi+JERERERERERERERERERFRqsShORERERERERERERERERESlFoviRERERERERERERERERERUarEoTkREREREREREREREREREpRaL4kREREREREREREREREREVGqxKE5ERERERERERERERERERKUWi+JERERERERERERERERERFRqsShORERERERERERERERERESlFoviRERERERUYp04cQISiQQ7duxQdyiFEh0dDYlEgkWLFqk7FCoi7969w+DBg2FtbQ2JRIKxY8eqOyQlEokEgYGBCm0XLlxAkyZNYGRkBIlEgitXrgAAjhw5gtq1a0MqlUIikSAuLu6Tx6upAgMDIZFIinUfDg4OCAgIKNZ9EBERERERfQ5YFCciIiIiok9KIpHk6+vEiRPqDlXjHTp0SKk4qg7y4mFeXx4eHgCAgIAAhXZTU1PUqlULixcvRmpqqtL2z5w5gy5duqB8+fLQ19eHg4MDhg4disePH4t95Dco5OcrOjo6x2OZO3cuQkJCMHz4cISGhqJ///5FfboUODg4iHFpaWnB3NwcNWrUwJAhQ3Du3Ll8bSM9PR09evTAmzdvsHTpUoSGhsLe3h6vX7+Gn58fDAwMsHr1aoSGhsLIyKhYj6ewnj17hsDAQLGYXxBr1qyBRCJBw4YNiz6wEiL7Z9jU1BTu7u44ePBgobcZHh6OZcuWFV2QREREREREaqSj7gCIiIiIiOjzEhoaqvB68+bNiIyMVGp3cXHBrVu3PmVopc6hQ4ewevVqtRfGu3btiipVqoiv3717h+HDh6NLly7o2rWr2F6+fHnxZ319fWzcuBEAEBcXh507d2LChAm4cOECIiIixH4rV67EmDFjULlyZXz99deoUKECbt26hY0bN2Lr1q04dOgQmjRpAktLS6XP2OLFi/Hvv/9i6dKlCu2WlpY5Hsvx48fRqFEjzJgxo3AnoxBq166N8ePHAwASExNx69YtbN++HRs2bMA333yDJUuWKPR///49dHT++3X/wYMHePToETZs2IDBgweL7UeOHEFiYiKCgoLQunXrT3MwhfTs2TPMnDkTDg4OqF27doHWDQsLg4ODA86fP4/79+8rfBY/xtSpU/Hdd98VybaKQps2bTBgwAAIgoBHjx5h7dq18PX1xeHDh+Hl5VXg7YWHh+P69eslcjYEIiIiIiKigmJRnIiIiIiIPql+/fopvP7zzz8RGRmp1A7go4viycnJMDQ0/Kht0MerWbMmatasKb6OjY3F8OHDUbNmTZXvOwDo6OgoLBsxYgQaNmyIrVu3YsmSJbCxscGZM2cwduxYNGvWDEeOHFF4r4cPH46mTZuie/fuuHHjBiwsLJT2FRERgbdv3+YYgyoxMTFwdXXNd/+8ZGRkQCaTQU9PL8c+tra2SjEuWLAAffr0wdKlS+Hs7Izhw4eLy6RSqVLMAGBubp6v9o+RlJRUokabP3z4EGfPnsWuXbswdOhQhIWF5euGhtzeF/kx6ujoKNx8oG5ffPGFwuekW7ducHV1xfLlywtVFCciIiIiIipNOH06ERERERGVeDKZDHPmzEHFihUhlUrRqlUr3L9/X6GPh4cHqlevjr/++gstWrSAoaEhpkyZAiCr+Ddo0CCUL18eUqkUtWrVwqZNmxTWlz+/PPu07fJpt0NCQhTat2/fDldXV0ilUlSvXh27d+9GQEAAHBwcVB7Djz/+CCcnJ+jr66N+/fq4cOGCwvKAgAAYGxvjn3/+gZeXF4yMjGBjY4NZs2ZBEIQCxxkQEIDVq1cDUJxaOScdOnRA5cqVVS5r3Lgx6tWrJ76OjIxEs2bNYG5uDmNjY1StWlU818VFS0tLnF5dPr15UFAQJBIJNm3apHTzg5OTE3744Qc8f/4c69ev/+j9y8/7w4cPcfDgQaWp1vPzGfvwGfPLli0TPw83b94scDwGBgYIDQ1FmTJlMGfOHIXPyIfPFA8ICIC7uzsAoEePHuI09R4eHvD39wcA1K9fHxKJROHZ1efOnUO7du1gZmYGQ0NDuLu748yZMwoxyKfFv3nzJvr06QMLCws0a9ZMXL5lyxa4ubnBwMAAZcqUQa9evfDkyROFbcj/3d68eROenp4wNDSEra0tfvjhB4VzX79+fQDAwIEDxXOf/d+kKmFhYbCwsICPjw+6d++OsLAwpT65vS+5HWP2Z4pXr14dnp6eStuXyWSwtbVF9+7dxbZFixahSZMmKFu2LAwMDODm5oYdO3bkeTwF4eLignLlyuHBgwcK7Xv37oWPjw9sbGygr68PJycnBAUFITMzU+zj4eGBgwcP4tGjR+L5/vDalpqaihkzZqBKlSrQ19eHnZ0dJk6cqPR4A3VcK4iIiIiIiFQpObc0ExERERER5WD+/PnQ0tLChAkTEB8fjx9++AF9+/ZVeqby69ev4e3tjV69eqFfv34oX7483r9/Dw8PD9y/fx+jRo2Co6Mjtm/fjoCAAMTFxWHMmDEFjufgwYPo2bMnatSogXnz5uHt27cYNGgQbG1tVfYPDw9HYmIihg4dColEgh9++AFdu3bFP//8A11dXbFfZmYm2rVrh0aNGuGHH37AkSNHMGPGDGRkZGDWrFkFinHo0KF49uyZyqnpVenZsycGDBiACxcuiAVIAHj06BH+/PNPLFy4EABw48YNdOjQATVr1sSsWbOgr6+P+/fvKxVMi4O8uFe2bFkkJyfjt99+Q/PmzeHo6JjjMQ0ZMgQHDhz46GmuXVxcEBoaim+++QYVK1YUpzO3tLQs8GcsODgYKSkpGDJkCPT19VGmTJlCxWRsbIwuXbrgp59+ws2bN1GtWjWlPkOHDoWtrS3mzp2L0aNHo379+uI09VWrVsWPP/6IWbNmwdHREU5OTgCypoj39vaGm5sbZsyYAS0tLQQHB6Nly5Y4deoUGjRooLCPHj16wNnZGXPnzhWL83PmzMG0adPg5+eHwYMH49WrV1i5ciVatGiBy5cvK4xOf/v2Ldq1a4euXbvCz88PO3bswKRJk1CjRg14e3vDxcUFs2bNwvTp0zFkyBA0b94cANCkSZM8z1FYWBi6du0KPT099O7dG2vXrlX6jMvl9r6oOsbsevbsicDAQLx48QLW1tZi++nTp/Hs2TP06tVLbFu+fDk6duyIvn37Ii0tDREREejRowcOHDgAHx+fPI8rP+Lj4/H27VvxfZULCQmBsbExxo0bB2NjYx
w/fhzTp09HQkKC+O/8+++/R3x8vMLjBYyNjQFkFfk7duyI06dPY8iQIXBxccHff/+NpUuX4u7du9izZw8A9V4riIiIiIiIlAhERERERERqNHLkSCGnX02ioqIEAIKLi4uQmpoqti9fvlwAIPz9999im7u7uwBAWLduncI2li1bJgAQtmzZIralpaUJjRs3FoyNjYWEhASFfUVFRSms//DhQwGAEBwcLLbVqFFDqFixopCYmCi2nThxQgAg2NvbK61btmxZ4c2bN2L73r17BQDC/v37xTZ/f38BgPD111+LbTKZTPDx8RH09PSEV69eFTjO3M5tdvHx8YK+vr4wfvx4hfYffvhBkEgkwqNHjwRBEISlS5cKAMR4CuPVq1cCAGHGjBkql/v7+wtGRkbCq1evhFevXgn3798X5s6dK0gkEqFmzZqCIAjClStXBADCmDFjct1XzZo1hTJlyqhc5uPjo/B+5Ye9vb3g4+Oj0Jbfz5j8PTI1NRViYmIKvb8Pyd+PvXv3im3Zz638M7N9+3aFdYODgwUAwoULF8Q2mUwmODs7C15eXoJMJhPbk5OTBUdHR6FNmzZi24wZMwQAQu/evRW2Gx0dLWhrawtz5sxRaP/7778FHR0dhXb5v9vNmzeLbampqYK1tbXQrVs3se3ChQtKn++8XLx4UQAgREZGisdWsWJFpc9Mbu9LTsf44TK5O3fuCACElStXKvQbMWKEYGxsLCQnJ4ttH/4sCFmfl+rVqwstW7ZUaLe3txf8/f3zPFYAwqBBg4RXr14JMTExwsWLF4V27doJAISFCxcq9M2+b0EQhKFDhwqGhoZCSkqK2JbTv4/Q0FBBS0tLOHXqlEL7unXrBADCmTNnBEEommsFERERERFRUeH06UREREREVOINHDhQ4dm+8pGi//zzj0I/fX19DBw4UKHt0KFDsLa2Ru/evcU2XV1djB49Gu/evcPvv/9eoFiePXuGv//+GwMGDBBHTgKAu7s7atSooXKdnj17wsLCIs/4AWDUqFHizxKJBKNGjUJaWhp+/fXXAsVZUKampvD29sa2bdsURsJu3boVjRo1QqVKlQD89/zpvXv3QiaTFVs8SUlJsLS0hKWlJapUqYIpU6agcePG2L17NwAgMTERAGBiYpLrdkxMTJCQkFBscQIF/4x169YNlpaWRbJv+WdQfj4+1pUrV3Dv3j306dMHr1+/RmxsLGJjY5GUlIRWrVrh5MmTSu/7sGHDFF7v2rULMpkMfn5+4vqxsbGwtraGs7MzoqKilI7hw2dh6+npoUGDBir/fRREWFgYypcvL05pLpFI0LNnT0RERChMFS6X2/uS/RhV+eKLL1C7dm1s3bpVbMvMzMSOHTvg6+sLAwMDsf3Dn9++fYv4+Hg0b94cly5dyvfxZffTTz/B0tISVlZWqFevHn777TdMnDgR48aNU+j34b4TExMRGxuL5s2bIzk5Gbdv385zP9u3b4eLiwv+97//Kby/LVu2BADx/f1U1woiIiIiIqL8YFGciIiIiIhKPHlBVk5eYH779q1Cu62trULxHMia/tvZ2RlaWoq//ri4uIjLC0Lev0qVKkrLVLUB+Y9fS0tL6bneX3zxBYD/nqNdnHr27IknT57gjz/+AJA1Xflff/2Fnj17KvRp2rQpBg8ejPLly6NXr17Ytm1bkRe9pFIpIiMjERkZiZMnT+LJkyc4c+aMeH7kxfC8isGJiYl5Fs4/VkE/YzlN914Y7969A5D3zQH5de/ePQCAv7+/eFOC/Gvjxo1ITU1FfHy8wjrZj+fevXsQBAHOzs5K27h16xZiYmIU+lesWFHpefcWFhZK/z4KIjMzExEREfD09MTDhw9x//593L9/Hw0bNsTLly/x22+/Ka2T2/uS3/esZ8+eOHPmDJ4+fQog63noMTExCv+GAODAgQNo1KgRpFIpypQpA0tLS6xdu1bp3BZEp06dEBkZiYMHD4rPO09OTlb6XN64cQNdunSBmZkZTE1NYWlpKd6UkJ/937t3Dzdu3FB6b+XXKvn7+6muFURERERERPnBZ4oTEREREVGJp62trbJdyPZs3w9HQBZU9qKcnKoRpQWV3/jzozjj9PX1haGhIbZt24YmTZpg27Zt0NLSQo8ePcQ+BgYGOHnyJKKionDw4EEcOXIEW7duRcuWLXHs2LEcj7WgtLW10bp16xyXV6lSBTo6Orh27VqOfVJTU3Hnzh3Uq1evSGIqKh/zOc3u+vXrAHK+IaOg5AXLhQsXonbt2ir7fDhDAqB8PDKZDBKJBIcPH1b5eci+flH++5A7fvw4nj9/joiICERERCgtDwsLQ9u2bRXacntf8vue9ezZE5MnT8b27dsxduxYbNu2DWZmZmjXrp3Y59SpU+jYsSNatGiBNWvWoEKFCtDV1UVwcDDCw8PzeYTKKlasKP6bad++PcqVK4dRo0bB09MTXbt2BQDExcXB3d0dpqammDVrFpycnCCVSnHp0iVMmjQpXwVrmUyGGjVqYMmSJSqX29nZAfh01woiIiIiIqL8YFGciIiIiIhKNXt7e1y7dg0ymUxhxKR8mmB7e3sA/43ejouLU1g/+yhfef/79+8r7UtVW0HIZDL8888/4ohLALh79y4AwMHBoUBxAjkX0HNiZGSEDh06YPv27ViyZAm2bt2K5s2bw8bGRqGflpYWWrVqhVatWmHJkiWYO3cuvv/+e0RFReVayC5KRkZG8PT0xPHjx/Ho0SPxffnQtm3bkJqaig4dOhRrLPn9jBW1d+/eYffu3bCzsxNHpX8sJycnAFnT6Rf2vXRycoIgCHB0dFT4LH+Mgn6Ww8LCYGVlhdWrVyst27VrF3bv3o1169YV6Q0KQNaI8gYNGmDr1q0YNWoUdu3ahc6dO0NfX1/ss3PnTkilUhw9elShPTg4uEhjGTp0KJYuXYqpU6eiS5cukEgkOHHiBF6/fo1du3ahRYsWYt+HDx8qrZ/TOXdycsLVq1fRqlWrPN+XknCtICIiIiIiAjh9OhERERERlXLt27fHixcvFJ7zm5GRgZUrV8LY2Bju7u4AsgqX2traOHnypML6a9asUXhtY2OD6tWrY/PmzeLU1QDw+++/4++///7oeFetWiX+LAgCVq1aBV1dXbRq1apAcQJZhWNAuYCem549e+LZs2fYuHEjrl69qjTt85s3b5TWkY8oTk1Nzfd+isLUqVMhCAICAgLw/v17hWUPHz7ExIkTUaFCBQwdOrRY48jvZ6wovX//Hv3798ebN2/w/fffF7honBM3Nzc4OTlh0aJFCp9vuVevXuW5ja5du0JbWxszZ85UGu0tCAJev35d4LgK8ll+//49du3ahQ4dOqB79+5KX6NGjUJiYiL27dtX4Djyo2fPnvjzzz/x888/IzY2VunfkLa2NiQSicLsDtHR0dizZ0+RxqGjo4Px48fj1q1b2Lt3r7hvQHEUflpaWo7XD1XTqfv5+eHp06fYsGGD0rL3798jKSkJQMm6VhAREREREXGkOBERERERlWpDhgzB+vXrERAQgL/++gsODg7YsWMHzpw5g2XLl
onPYjYzM0OPHj2wcuVKSCQSODk54cCBA0rPPwaAuXPnolOnTmjatCkGDhyIt2/fYtWqVahevbrKQmJ+SaVSHDlyBP7+/mjYsCEOHz6MgwcPYsqUKbC0tCxwnG5ubgCA0aNHw8vLC9ra2ujVq1euMbRv3x4mJiaYMGECtLW10a1bN4Xls2bNwsmTJ+Hj4wN7e3vExMRgzZo1qFixIpo1a1boYy+MFi1aYNGiRRg3bhxq1qyJgIAAVKhQAbdv38aGDRsgk8lw6NAhcXR9ccnvZ6ywnj59ii1btgDIGh1+8+ZNbN++HS9evMD48eOLtOivpaWFjRs3wtvbG9WqVcPAgQNha2uLp0+fIioqCqampti/f3+u23BycsLs2bMxefJkREdHo3PnzjAxMcHDhw+xe/duDBkyBBMmTChQXE5OTjA3N8e6detgYmICIyMjNGzYUOWzvvft24fExER07NhR5bYaNWoES0tLhIWFKRWsi4Kfnx8mTJiACRMmoEyZMkojon18fLBkyRK0a9cOffr0QUxMDFavXo0qVark+jiAwggICMD06dOxYMECdO7cGU2aNIGFhQX8/f0xevRoSCQShIaGqpyq3s3NDVu3bsW4ceNQv359GBsbw9fXF/3798e2bdswbNgwREVFoWnTpsjMzMTt27exbds2HD16FPXq1StR1woiIiIiIiIWxYmIiIiIqFQzMDDAiRMn8N1332HTpk1ISEhA1apVERwcjICAAIW+K1euRHp6OtatWwd9fX34+flh4cKFqF69ukI/X19f/PLLLwgMDMR3330HZ2dnhISEYNOmTbhx40ahY9XW1saRI0cwfPhwfPvttzAxMcGMGTMwffr0QsXZtWtXfP3114iIiMCWLVsgCEKeRXGpVIqOHTsiLCwMrVu3hpWVlcLyjh07Ijo6WhwFW65cObi7u2PmzJkwMzMr9LEX1jfffIN69eph8eLFWLZsGeLj41GhQgX06NED33//fbFNXf6hgnzGCuPKlSvo378/JBIJTExMYGdnB19fXwwePBgNGjT4+APIxsPDA3/88QeCgoKwatUqvHv3DtbW1mjYsGG+C/DfffcdvvjiCyxduhQzZ84EkPWs6bZt2+ZYrM6Nrq4uNm3ahMmTJ2PYsGHIyMhAcHCwyqJ4WFgYpFIp2rRpo3JbWlpa8PHxQVhYWKFGreelYsWKaNKkCc6cOYPBgwdDV1dXYXnLli3x008/Yf78+Rg7diwcHR2xYMECREdHF3lR3MDAAKNGjUJgYCBOnDgBDw8PHDhwAOPHj8fUqVNhYWGBfv36oVWrVvDy8lJYd8SIEbhy5QqCg4OxdOlS2Nvbw9fXF1paWtizZw+WLl2KzZs3Y/fu3TA0NETlypUxZswYccr8knatICIiIiKiz5tEUHU7MBERERERERVY7dq1YWlpicjIyAKvGxAQgB07dnzUSHMiIiIiIiIiIlLGZ4oTEREREREVUHp6OjIyMhTaTpw4gatXr8LDw0M9QRERERERERERkUqcPp2IiIiIiKiAnj59itatW6Nfv36wsbHB7du3sW7dOlhbW2PYsGHqDo+IiIiIiIiIiD7AojgREREREVEBWVhYwM3NDRs3bsSrV69gZGQEHx8fzJ8/H2XLllV3eERERERERERE9AE+U5yIiIiIiIiIiIiIiIiIiEotPlOciIiIiIiIiIiIiIiIiIhKLRbFiYiIiIiIiIiIiIiIiIio1GJRnIiIiIiIiIiIiIiIiIiISi0WxYmIiIiIiEqokJAQSCQSXLx4Mcc+0dHRkEgkWLRoUa7bcnBwgEQiQevWrVUu37BhAyQSSZ77y01gYCAkEgliY2Nz7HPixAlIJBLs2LEj39v18/ODRCLBpEmTct2mRCLBli1bVPZp2rQpJBIJqlevrnJ5ZmYmbGxsIJFIcPjw4XzHBgDffPMN6tatizJlysDQ0BAuLi4IDAzEu3fv8rX+2rVr0aNHD1SqVAkSiQQBAQEF2r/8vVX15ezsrNT/p59+gouLC6RSKZydnbFy5UqlPgEBAQrb0dHRgZ2dHXr16oWbN2/mK678fH4/xs2bNxEYGIjo6Ohi2b6mxEBERERERER501F3AERERERERPRpSKVSREVF4cWLF7C2tlZYFhYWBqlUipSUFDVFp1pCQgL2798PBwcH/PLLL5g/fz4kEonKvlKpFOHh4ejXr59Ce3R0NM6ePQupVJrjfo4fP47nz5/DwcEBYWFh8Pb2zneMFy5cQPPmzTFw4EBIpVJcvnwZ8+fPx6+//oqTJ09CSyv3+9EXLFiAxMRENGjQAM+fP8/3fuWWLVumVIB/9OgRpk6dirZt2yq0r1+/HsOGDUO3bt0wbtw4nDp1CqNHj0ZycrLSTQf6+vrYuHEjACAjIwMPHjzAunXrcOTIEdy8eRM2NjYFjrUo3bx5EzNnzoSHhwccHBw+2xiIiIiIiIgobyyKExERERERfSaaNm2KCxcuYOvWrRgzZozY/u+//+LUqVPo0qULdu7cqcYIle3cuROZmZn4+eef0bJlS5w8eRLu7u4q+7Zv3x779u1DbGwsypUrJ7aHh4ejfPnycHZ2xtu3b1Wuu2XLFtStWxf+/v6YMmUKkpKSYGRklK8YT58+rdTm5OSECRMm4Pz582jUqFGu6//+++/iKHFjY+N87fNDnTt3VmqbPXs2AKBv375i2/v37/H999/Dx8dHHKn/1VdfQSaTISgoCEOGDIGFhYXYX0dHR+kGg0aNGqFDhw44ePAgvvrqqwLHqi6CICAlJQUGBgbqDoWIiIiIiIjUgNOnExERERERfSakUim6du2K8PBwhfZffvkFFhYW8PLyUlonPT0dt2/fLtQI5qIQFhaGNm3awNPTEy4uLggLC8uxb6dOnaCvr4/t27crtIeHh8PPzw/a2toq13v//j12796NXr16wc/PD+/fv8fevXs/Km75qOG4uLg8+9rb2+c4+r2wwsPD4ejoiCZNmohtUVFReP36NUaMGKHQd+TIkUhKSsLBgwfz3K58hgEdncLdYx8QEABjY2M8ffoUnTt3hrGxMSwtLTFhwgRkZmYq9I2IiICbmxtMTExgamqKGjVqYPny5QCypmbv0aMHAMDT01Oc5v3EiRMAss5/hw4dcPToUdSrVw8GBgZYv369+LiBkJAQpdgkEgkCAwMV2p4+fYpBgwbBxsYG+vr6cHR0xPDhw5GWlpZnDERERERERFRysChORERERET0GenTpw/Onz+PBw8eiG3h4eHo3r07dHV1lfo/ffoULi4umDx58qcMEwDw7NkzREVFoXfv3gCA3r17Y8eOHUhLS1PZ39DQEJ06dcIvv/witl29ehU3btxAnz59ctzPvn378O7dO/Tq1QvW1tbw8PDItfiuSkZGBmJjY/Hs2TMcO3YMU6dOhYmJCRo0aFCg7RSFy5cv49atW0rHfPnyZQBAvXr1FNrd3NygpaUlLv9QbGwsYmNj8fLlS/zxxx/45ptvULZsWXTo0KHQ8WVmZsLLywtly5bFokWL4O7ujsWLF+PHH38U+0RGRqJ3796wsLDA
ggULMH/+fHh4eODMmTMAgBYtWmD06NEAgClTpiA0NBShoaFwcXERt3Hnzh307t0bbdq0wfLly1G7du0Cxfns2TM0aNAAERER6NmzJ1asWIH+/fvj999/R3Jycr5iICIiIiIiopKB06cTERERERF9Rlq2bAlra2v88ssvmDp1Km7duoUrV65g+fLl+Oeff9QdnoJffvkF+vr66NSpEwCgV69emD59Og4dOqRyynAgq+jv6+uLJ0+ewM7ODmFhYahcuXKuU5hv2bIFTZo0gZ2dnbifESNG4NWrV7C0tMxXrBcvXkTjxo3F11WrVsW+fftQpkyZfB5t0ZEX9D+cOh0Anj9/Dm1tbVhZWSm06+npoWzZsnj27JlCe1JSktLx29ra4tixY/k+L6qkpKSgZ8+emDZtGgBg2LBhqFu3Ln766ScMHz4cAHDw4EGYmpri6NGjKkf4V65cGc2bN8eKFSvQpk0beHh4KPW5f/8+jhw5ojADQnR0dL7jnDx5Ml68eIFz584p3Egwa9YsCIIAc3PzPGMgIiIiIiKikoEjxYmIiIiIiD4j2tra8PPzE0dTh4WFwc7ODs2bN1fZ38HBAYIgqJxuuriFhYXBx8cHJiYmAABnZ2e4ubnlOoq7bdu2KFOmDCIiIiAIAiIiIsSR5qq8fv0aR48eVejTrVs3SCQSbNu2Ld+xurq6IjIyEnv27MHEiRNhZGSEd+/e5Xv9oiKTyRAREYE6deoojVh+//499PT0VK4nlUrx/v17pbbIyEhERkbi6NGjWL9+PYyNjdG+fXvcvXv3o+IcNmyYwuvmzZsr3JRhbm6OpKQkREZGFnofjo6OKh8JkB8ymQx79uyBr6+v0sh6AEU+3T0REREREREVL44UJyIiIiIi+sz06dMHK1aswNWrVxEeHo5evXqVuCLfrVu3cPnyZQwYMAD3798X2z08PLB69WokJCTA1NRUaT1dXV306NED4eHhaNCgAZ48eZLr1Olbt25Feno66tSpo7Cfhg0bIiwsDCNHjgQAvHnzRmHadgMDA5iZmYmvTU1N0bp1awBZzzYPDw9Hp06dcOnSJdSqVavwJ+L/vX//HvHx8Qpt8ud7f+j333/H06dP8c033ygtMzAwyHHq+ZSUFBgYGCi0aWtri8ck1759ezg7O2Py5MnYuXMnMjMz8erVK4U+ZcqUybH4DmQV27OPNLewsMDbt2/F1yNGjMC2bdvg7e0NW1tbtG3bFn5+fmjXrl2O283O0dEx332ze/XqFRISElC9evVCb4OIiIiIiIhKDo4UJyIiIiIi+sw0bNgQTk5OGDt2LB4+fJhr0VhdtmzZAgD45ptv4OzsLH4tXrwYKSkp2LlzZ47r9unTB1euXEFgYCBq1aoFV1fXHPvKR503bdpUYT+nT5/GH3/8IY5e7tq1KypUqCB+jRkzJtf4u3btCgCIiIgo0HHnZOvWrQr7r1ChQo7Ho6WlpXJ0fIUKFZCZmYmYmBiF9rS0NLx+/Ro2NjZ5xlGxYkVUrVoVJ0+eBAA8efJEKa6zZ8/mug1V06FnZ2VlhStXrmDfvn3o2LEjoqKi4O3tDX9//zzXlcte5AdyHuGdmZmZ7+0SERERERGR5uFIcSIiIiIios9Q7969MXv2bLi4uKB27drqDkeBIAgIDw+Hp6cnRowYobQ8KCgIYWFhGDhwoMr1mzVrhkqVKuHEiRNYsGBBjvt5+PAhzp49i1GjRsHd3V1hmUwmQ//+/REeHo6pU6di8eLFCiOZ8yogp6amQiaTKY3uLiwvL688pxJPTU3Fzp074eHhoTI++ft88eJFtG/fXmy/ePEiZDJZvj8HGRkZ4tTw1tbWSnEVxch4IOtZ576+vvD19YVMJsOIESOwfv16TJs2DVWqVCnU7AYWFhYAgLi4OIX2R48eKby2tLSEqakprl+/nuv2StoMC0RERERERKQai+JERERERESfocGDB0NbWxsNGzbMtV96ejoePHgAMzOzHEcnF7UzZ84gOjoas2bNQvfu3ZWW3717F9OmTcOzZ89UFn8lEglWrFiBy5cvo3///jnuRz5KfOLEibCzs1NavnHjRoSFhWHq1Klwc3NTuY24uDgYGRlBV1dXaV0ACs+jTk5OxuPHj1GuXDmUK1cux7hUyW10uNyhQ4cQFxeHvn37qlzesmVLlClTBmvXrlUoiq9duxaGhobw8fHJM467d+/izp074vmQSqVKU6wXhdevX6Ns2bLiay0tLdSsWRNAVvEfAIyMjAAoF7hzY2pqinLlyuHkyZMYO3as2L5mzRqFflpaWujcuTO2bNmCixcvKj1XXBAESCSSQsVAREREREREnx6L4kRERERERCXczz//jCNHjii1fziF92+//YaUlBSlPp07d1b5XGR7e3sEBgbmue+nT5/CxcUF/v7+CAkJyVe8S5YsgaGhoUKblpYWpkyZIr7euXMnbt++rbSuv78/wsLCoK2tnWORtmPHjvj+++8RERGBcePGqezTqVMndOrUKdc4w8LCULt2bZUFcfl+vv76a1y6dAl169ZV2efEiRMYPXo0unfvDmdnZ6SlpeHUqVPYtWsX6tWrh379+ol9z58/D09PT8yYMUPh3O/fvx9Xr14FkHUTwrVr1zB79mwxBnkxOC9hYWHQ19dHt27dVC43MDBAUFAQRo4ciR49esDLywunTp3Cli1bMGfOHJQpU0ahf0ZGhjiNvUwmQ3R0NNatWweZTIYZM2bkK6bCGjx4MN68eYOWLVuiYsWKePToEVauXInatWvDxcUFQNbId21tbSxYsADx8fHQ19dHy5YtYWVllee258+fj8GDB6NevXo4efIk7t69q9Rv7ty5OHbsGNzd3TFkyBC4uLjg+fPn2L59O06fPg1zc/NCx0BERERERESfFoviREREREREJdzatWtVtgcEBIg/HzlyRGXh3MHBQWVRvDjNmzdPqU1bW1uhKJ7Ts7bd3d2xfft2NGnSRKlIK1e9enU4Ojpiy5YtORbF83Lp0iXcvn0b06ZNy7GPr68vvv76a2zZsiXHoniNGjXg6emJvXv34vnz5xAEAU5OTpg+fTq+/fZb6Onp5RnLzp07sWnTJvH15cuXcfnyZQBZz/DOT1E8ISEBBw8ehI+PD8zMzHLsN2LECOjq6mLx4sXYt28f7OzssHTpUpXPSE9NTVUYaW9qaor69esjNDQUrVq1yjOmj9GvXz/8+OOPWLNmDeLi4mBtbY2ePXsiMDAQWlpaALKmbl+3bh3mzZuHQYMGITMzE1FRUXkWpKdPn45Xr15hx44d2LZtG7y9vXH48GGl9WxtbXHu3DlMmzYNYWFhSEhIgK2tLby9vcWbPgobAxEREREREX1aEkEQBHUHQUREREREREREREREREREVBy01B0AERERERERERERERERERFRcWFRnIiIiIiIiIiIiIiIiIiISi0WxYmIiIiIiIiIiIiIiIiIqNRiUZyIiIiIiIiIiIiIiIiIiEotFsWJiIiIiIiIiIiIiIiIiKjUYlGciIiIiIiIiIiIiIiIiIhKLRbFiYiIiIiI6LMUEhI
CiUSCixcvFvu+JBIJAgMDi30/RERERERERKSMRXEiIiIiIiIqdvIC9IdfVlZW8PT0xOHDhwu93blz52LPnj1FF2gBnT59Gt7e3rC1tYVUKkWlSpXg6+uL8PBwtcVU1Nq0aQOJRIJRo0Z91Hb++usvdOjQAdbW1jA2NkbNmjWxYsUKZGZmFlGkRERERERERKrpqDsAIiIiIiIi+nzMmjULjo6OEAQBL1++REhICNq3b4/9+/ejQ4cOBd7e3Llz0b17d3Tu3Lnog83D9u3b0bNnT9SuXRtjxoyBhYUFHj58iJMnT2LDhg3o06eP2Pf9+/fQ0dG8X8F37dqFP/7446O389dff6FJkyZwdnbGpEmTYGhoiMOHD2PMmDF48OABli9fXgTREhEREREREammeb+RExERERERkcby9vZGvXr1xNeDBg1C+fLl8csvvxSqKK5OgYGBcHV1xZ9//gk9PT2FZTExMQqvpVLppwytSKSkpGD8+PGYNGkSpk+f/lHbWr9+PQDg5MmTKFOmDABg6NChcHd3R0hICIviREREREREVKw4fToRERERERGpjbm5OQwMDJRGUS9atAhNmjRB2bJlYWBgADc3N+zYsUOhj0QiQVJSEjZt2iROyR4QECAuf/r0KQYNGgQbGxvo6+vD0dERw4cPR1pamsJ2UlNTMW7cOFhaWsLIyAhdunTBq1ev8oz9wYMHqF+/vlJBHACsrKyUYpU/Uzw6OlppKvkPvz507tw5tGvXDmZmZjA0NIS7uzvOnDmj0CcxMRFjx46Fg4MD9PX1YWVlhTZt2uDSpUtin+TkZNy+fRuxsbF5HpfcDz/8AJlMhgkTJuR7nZwkJCRAKpXC3Nxcob1ChQowMDD46O0TERERERER5YYjxYmIiIiIiOiTiY+PR2xsLARBQExMDFauXIl3796hX79+Cv2WL1+Ojh07om/fvkhLS0NERAR69OiBAwcOwMfHBwAQGhqKwYMHo0GDBhgyZAgAwMnJCQDw7NkzNGjQAHFxcRgyZAj+97//4enTp9ixYweSk5MVCtlff/01LCwsMGPGDERHR2PZsmUYNWoUtm7dmuux2Nvb47fffsO///6LihUr5vscWFpaIjQ0VKEtPT0d33zzjUJcx48fh7e3N9zc3DBjxgxoaWkhODgYLVu2xKlTp9CgQQMAwLBhw7Bjxw6MGjUKrq6ueP36NU6fPo1bt26hbt26AIDz58/D09MTM2bMEIvzuXn8+DHmz5+Pn3/+uUiK1h4eHti6dSuGDh2KcePGidOn79q1CwsXLvzo7RMRERERERHlhkVxIiIiIiIi+mRat26t8FpfXx8///wz2rRpo9B+9+5dhWLsqFGjULduXSxZskQsivfr1w/Dhg1D5cqVlYrqkydPxosXL3Du3DmF6dpnzZoFQRAU+pYtWxbHjh0TR2nLZDKsWLEC8fHxMDMzy/FYJk2ahEGDBsHJyQlNmzZFs2bN0LZtWzRp0gRaWjlPzGZkZKQU78iRI/Hu3TtERkYCAARBwLBhw+Dp6YnDhw+LsQ0dOhTVqlXD1KlTcezYMQDAwYMH8dVXX2Hx4sXi9iZOnJjj/vNj/PjxqFOnDnr16vVR25H76quvcOPGDaxfvx4bN24EAGhra2PVqlUYNmxYkeyDiIiIiIiIKCcsihMREREREdEns3r1anzxxRcAgJcvX2LLli0YPHgwTExM0LVrV7HfhwXxt2/fIjMzE82bN8cvv/yS5z5kMhn27NkDX19fhYK4XPYpyocMGaLQ1rx5cyxduhSPHj1CzZo1c9zPl19+CVtbWyxZsgRRUVGIiopCUFAQKleujNDQUDRp0iTPWAFg8+bNWLNmDRYvXgxPT08AwJUrV3Dv3j1MnToVr1+/VujfqlUrhIaGQiaTQUtLC+bm5jh37hyePXsGGxsblfvw8PBQuhkgJ1FRUdi5cyfOnTuXr/75oa2tDScnJ3h5eaFHjx6QSqX45Zdf8PXXX8Pa2hqdO3cusn0RERERERERZceiOBEREREREX0yDRo0UChU9+7dG3Xq1MGoUaPQoUMHcfrwAwcOYPbs2bhy5QpSU1PF/tkL2qq8evUKCQkJqF69er5iqlSpksJrCwsLAFnF+Lx4eXnBy8sLycnJ+Ouvv7B161asW7cOHTp0wO3bt5WeLZ7dlStXMGzYMPTu3Rvjxo0T2+/duwcA8Pf3z3Hd+Ph4WFhY4IcffoC/vz/s7Ozg5uaG9u3bY8CAAahcuXKe8WeXkZGB0aNHo3///qhfv36B18/J/PnzsXz5cty7dw/GxsYAAD8/P3h6emLkyJHo0KGD0nPliYiIiIiIiIpKzvO5ERERERERERUzLS0teHp64vnz52Ih+NSpU+jYsSOkUinWrFmDQ4cOITIyEn369Mn3aOeC0NbWVtlekH0ZGhqiefPmWLVqFaZOnYq3b9/i8OHDua7z9u1bdOvWDV988YU4pbicTCYDACxcuBCRkZEqvz4sLv/zzz9YuXIlbGxssHDhQlSrVi3P/auyefNm3LlzB0OHDkV0dLT4BQCJiYmIjo5GcnJygbe7Zs0atGzZUoxZrmPHjnj27Jm4DyIiIiIiIqLiwNuwiYiIiIiISK0yMjIAAO/evQMA7Ny5E1KpFEePHoW+vr7YLzg4WGldVSPHLS0tYWpqiuvXrxdTxLmTj4R//vx5jn1kMhn69u2LuLg4/PrrrzA0NFRY7uTkBAAwNTVVeg67KhUqVMCIESMwYsQIxMTEoG7dupgzZw68vb0LFPvjx4+Rnp6Opk2bKi3bvHkzNm/ejN27dxd4uvOXL18iMzNTqT09PR3Af58BIiIiIiIiouLAkeJERERERESkNunp6Th27Bj09PTg4uICIGvktkQiUSiiRkdHY8+ePUrrGxkZIS4uTqFNS0sLnTt3xv79+3Hx4kWldYpqtPlvv/2msv3QoUMAgKpVq+a47syZM3H06FH88ssvcHR0VFru5uYGJycnLFq0SLxZ4EOvXr0CAGRmZiI+Pl5hmZWVFWxsbBSmnU9OTsbt27cRGxub6zH16tULu3fvVvoCgPbt22P37t1o2LBhrttQ5YsvvkBkZKTC89EzMzOxbds2mJiYiDcBEBERERERERUHjhQnIiIiIiKiT+bw4cO4ffs2ACAmJgbh4eG4d+8evvvuO5iamgIAfHx8sGTJErRr1w59+vRBTEwMVq9ejSpVquDatWsK23Nzc8Ovv/6KJUuWwMbGBo6OjmjYsCHmzp2LY8eOwd3dHUOGDIGLiwueP3+O7du34/Tp0zA3N//oY+nUqRMcHR3h6+sLJycnJCUl4ddff8X+/ftRv359+Pr6qlzv77//RlBQEFq0aIGYmBhs2bJFYXm/fv2gpaWFjRs3wtvbG9WqVcPAgQNha2uLp0+fIioqCqampti/fz8SExNRsWJFdO/eHbVq1YKxsTF+/fVXXLhwAYsXLxa3ef78eXh6emLGjBkIDAzM8Zj+97//4X//+5/KZY6OjkojxD08PPD777/neaPBd999h379+qFhw4YYMmQIDAwM8Msvv+Cvv/7C7Nmzoa
urm+v6RERERERERB+DRXEiIiIiIiL6ZKZPny7+LJVK8b///Q9r167F0KFDxfaWLVvip59+wvz58zF27Fg4OjpiwYIFiI6OViqKL1myBEOGDMHUqVPx/v17+Pv7o2HDhrC1tcW5c+cwbdo0hIWFISEhAba2tvD29laaqrywNm7ciL1792Lbtm149uwZBEFA5cqV8f3332PSpEnQ0VH9K/fr168hCAJ+//13/P7770rL+/XrByCr4PzHH38gKCgIq1atwrt372BtbY2GDRuK58vQ0BAjRozAsWPHsGvXLshkMlSpUgVr1qzB8OHDi+Q4cyOPKS99+/ZFuXLlMG/ePCxcuBAJCQmoWrUq1q1bp/DeExERERERERUHiVBU88YRERERERER0WcjMTERZcqUwbJlyzBy5Eh1h0NERERERESUIz5TnIiIiIiIiIgK7OTJk7C1tcVXX32l7lCIiIiIiIiIcsWR4kREREREREREREREREREVGpxpDgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosilOpFxgYCIlEUqz7cHBwQEBAQLHuQxMtXLgQlStXhra2NmrXrg0AyMjIwMSJE2FnZwctLS107twZACCRSBAYGCiuGxISAolEgujo6E8eNxEVL16XiYgoO+YGIiLKjrmBiIg+xLxARB+LRXEqMdasWQOJRIKGDRuqO5RiI5FIFL5MTU3h7u6OgwcPFnqb4eHhWLZsWdEFWUSOHTuGiRMnomnTpggODsbcuXMBAD///DMWLlyI7t27Y9OmTfjmm2/UHCkR5YTX5cL51NflY8eOYdCgQahevTq0tbXh4ODwyfatCW7evInAwMCPvslq/PjxcHV1LZqgiDQYc0PhfMrckJycjNWrV6Nt27aoUKECTExMUKdOHaxduxaZmZmfJIaSjrmBqGgxNxSOOv+eExcXBysrK0gkEuzYsUMtMZQ0Z8+eRWBgIOLi4j5qO926dUP79u2LJigiDcW8UDifOi/IZDKsW7cOtWvXhrGxMcqXLw9vb2+cPXv2k8VQkjEvlE4silOJERYWBgcHB5w/fx73798vsu1OnToV79+/L7Ltfaw2bdogNDQUmzdvxsSJE3H//n34+vri6NGjhdpeSS2KHz9+HFpaWvjpp58wYMAA8cJ//Phx2NraYunSpejfvz/c3d1Vrt+/f3+8f/8e9vb2nzJsIvoAr8uacV0ODw9HeHg4zMzMYGNj88n2qylu3ryJmTNnfnTh4+DBg/Dx8SmaoIg0GHNDyc8N//zzD77++msIgoBx48Zh0aJFcHR0xIgRI/Dll19+khhKOuYGoqLF3FDyc0N206dPR3Jyslr2XVKdPXsWM2fO/KjiR3p6OiIjI5kb6LPHvKAZeeHbb7/F8OHDUaNGDSxZsgTjx4/H3bt34e7ujvPnz3+yOEoq5oXSiUVxKhEePnyIs2fPYsmSJbC0tERYWFi+1svIyEBaWprKZUlJSQAAHR0dSKXSIov1Y33xxRfo168f+vfvj6lTp+LXX3+FIAhYvny5ukMrUjExMTAwMICenp5Su7m5eZ7ra2trQyqVFvuUOESkGq/LmnNdnjt3LhISEnDmzBnUqlXro7eX23v4ufrnn39w584d/hJDnz3mBs3IDdbW1vj7778RGRmJb7/9FkOHDsWuXbswcOBAbN68uVB/mGRuUMbcQJSFuUEzcsOHrl+/jrVr12LSpEkftZ2UlBTIZLIiiqp0OHXqFBITE5kb6LPGvKAZeSEjIwNr165F9+7dERoaiiFDhmDixIn49ddfkZGRke/37UPMC8qYF0oeFsWpRAgLC4OFhQV8fHzQvXt3lRfd6OhoSCQSLFq0CMuWLYOTkxP09fXFqe8kEglu3ryJPn36wMLCAs2aNQOg/KyR6tWrw9PTU2n7MpkMtra26N69u9i2aNEiNGnSBGXLloWBgQHc3NyKfFopFxcXlCtXDg8ePFBo37t3L3x8fGBjYwN9fX04OTkhKChIYcpDDw8PHDx4EI8ePRKnavlw2tzU1FTMmDEDVapUgb6+Puzs7DBx4kSkpqYq7Cs2Nha3b9/O113CGRkZCAoKEs+/g4MDpkyZorBNiUSC4OBgJCUliXHJnxEeFRWFGzduiO0nTpxQuR9VzxR3cHBAhw4dcPr0aTRo0ABSqRSVK1fG5s2bldaPi4vD2LFjYWdnB319fVSpUgULFixgYibKJ16XNee6bGNjA11d3UIda27vIQDcvn0b3bt3R5kyZSCVSlGvXj3s27dPaTs3btxAy5YtYWBggIoVK2L27Nn4+eefla7jEokEgYGBSuuremZXfq/jERERcHNzg4mJCUxNTVGjRg3xF9CQkBD06NEDAODp6amUey5evAgvLy+UK1cOBgYGcHR0VDmK8uDBgzAzMxM/w4mJiRg7diwcHBygr68PKysrtGnTBpcuXVJY79y5c2jXrh3MzMxgaGgId3d3nDlzRmn7T58+xaBBg8TPlqOjI4YPH84CFJU4zA2akRvKlSuHatWqKbV36dIFAHDr1q1c12duYG4gKgjmBs3IDR8aM2YMunTpgubNm+d7nRMnTkAikSAiI
gJTp06Fra0tDA0NkZCQACD/17bTp0+jfv36kEqlcHJywvr165XeZ/nnJSQkRGl9VTnj6dOn+PLLL1G+fHno6+ujWrVq+Pnnn5XWXblyJapVqwZDQ0NYWFigXr16CA8PB5D1Wfv2228BAI6OjuJ7Is9XkZGRaNasGczNzWFsbIyqVatiypQpSvs4ePAgXF1dxffyxYsXGDhwICpWrAh9fX1UqFABnTp1Upqp5PDhw2jevDmMjIxgYmICHx8f3LhxQ2n7t2/fhp+fHywtLWFgYICqVavi+++/V+pHpE7MC5qRF9LT0/H+/XuUL19eod3KygpaWlowMDDIdX3mBeYFTaWj7gCIgKxk2bVrV+jp6aF3795Yu3YtLly4gPr16yv1DQ4ORkpKCoYMGQJ9fX2UKVNGXNajRw84Oztj7ty5EARB5b569uyJwMBAvHjxAtbW1mL76dOn8ezZM/Tq1UtsW758OTp27Ii+ffsiLS0NERER6NGjBw4cOFBkd/fEx8fj7du3cHJyUmgPCQmBsbExxo0bB2NjYxw/fhzTp09HQkICFi5cCAD4/vvvER8fj3///RdLly4FABgbGwPISv4dO3bE6dOnMWTIELi4uODvv//G0qVLcffuXezZs0fc16pVqzBz5kxERUXBw8Mj13gHDx6MTZs2oXv37hg/fjzOnTuHefPm4datW9i9ezcAIDQ0FD/++CPOnz+PjRs3AgDq1KmD0NBQzJkzB+/evcO8efMAZP1noSDu37+P7t27Y9CgQfD398fPP/+MgIAAuLm5iX/8S05Ohru7O54+fYqhQ4eiUqVKOHv2LCZPnoznz5+XyOnmiUoaXpc157pcFFS9hzdu3EDTpk1ha2uL7777DkZGRti2bRs6d+6MnTt3isWVFy9ewNPTExkZGWK/H3/8Mc9foHKT3+t4ZGQkevfujVatWmHBggUAsoo9Z86cwZgxY9CiRQuMHj0aK1aswJQpU8Sc4+LigpiYGLRt2xaWlpb47rvvYG5ujujoaOzatUspnkOHDqFNmzbQ0cn6r/OwYcOwY8cOjBo1Cq6urnj9+jVOnz6NW7duoW7dugCyHhfi7e0NNzc3zJgxA1paWggODkbLli1x6tQpNGjQAADw7NkzNGjQAHFxcRgyZAj+97//4enTp9ixYweSk5OVZlwhUifmBs3ODS9evACQVTTPD+YG5gai/GBu0KzcsH37dpw9exa3bt0q1CMkgoKCoKenhwkTJiA1NRV6enr5vrb9/fff4jU2MDAQGRkZmDFjhlJBpiBevnyJRo0aQSKRYNSoUbC0tMThw4cxaNAgJCQkYOzYsQCADRs2YPTo0ejevTvGjBmDlJQUXLt2DefOnUOfPn3QtWtX3L17F7/88guWLl0q5kpLS0vcuHEDHTp0QM2aNTFr1izo6+vj/v37Kos7hw4dQocOHcTX3bp1w40bN/D111/DwcEBMTExiIyMxOPHj8UCSWhoKPz9/eHl5YUFCxYgOTkZa9euRbNmzXD58mWx37Vr19C8eXPo6upiyJAhcHBwwIMHD7B//37MmTOn0OeQqKgxL2hGXjAwMEDDhg0REhKCxo0bo3nz5oiLi0NQUBAsLCwwZMiQfB0z8wLzgsYRiNTs4sWLAgAhMjJSEARBkMlkQsWKFYUxY8Yo9Hv48KEAQDA1NRViYmIUls2YMUMAIPTu3Vtp+/Jlcnfu3BEACCtXrlToN2LECMHY2FhITk4W2z78WRAEIS0tTahevbrQsmVLhXZ7e3vB398/z2MFIAwaNEh49eqVEBMTI1y8eFFo166dAEBYuHChQt/s+xYEQRg6dKhgaGgopKSkiG0+Pj6Cvb29Ut/Q0FBBS0tLOHXqlEL7unXrBADCmTNnxDb5OYqKiso1/itXrggAhMGDByu0T5gwQQAgHD9+XGzz9/cXjIyMlLbh7u4uVKtWTakdgDBjxgzxdXBwsABAePjwodhmb28vABBOnjwptsXExAj6+vrC+PHjxbagoCDByMhIuHv3rsI+vvvuO0FbW1t4/PhxrsdJ9LnjdVlzrsvZ5bTvnOT2HrZq1UqoUaOGwrHJZDKhSZMmgrOzs9g2duxYAYBw7tw5sS0mJkYwMzNTuo5nv9bLZX+/8nsdHzNmjGBqaipkZGTkeIzbt29XeS53794tABAuXLiQ47qCIAhJSUmCVCoVgoODxTYzMzNh5MiROa4jk8kEZ2dnwcvLS5DJZGJ7cnKy4OjoKLRp00ZsGzBggKClpaUyjg/XJVI35gbNzQ2CIAipqamCq6ur4OjoKKSnp+fal7mBuYEov5gbNCs3JCcnC5UqVRImT54sCIIgREVFCQCE7du357muvG/lypUVjq8g17bOnTsLUqlUePTokdh28+ZNQVtbW+F9ln9ePrzGymXPGYMGDRIqVKggxMbGKvTr1auXYGZmJsbaqVMnlX+L+tDChQuVcpQgCMLSpUsFAMKrV69yXf+ff/5ReC/evn2r8vPxocTERMHc3Fz46quvFNpfvHghmJmZKbS3aNFCMDExUTh/gsC8QCUL84Jm5YV79+4JdevWFQCIX5UrVxZu376d57rMC8wLmorTp5PahYWFoXz58uJUJxKJBD179kRERITCFCJy3bp1g6WlpcptDRs2LM/9ffHFF6hduza2bt0qtmVmZmLHjh3w9fVVGL3w4c9v375FfHw8mjdvrjQFXkH89NNPsLS0hJWVFerVq4fffvsNEydOxLhx4xT6fbjvxMRExMbGonnz5khOTsbt27fz3M/27dvh4uKC//3vf4iNjRW/WrZsCQCIiooS+wYGBkIQhDzvKj506BAAKMU6fvx4AFnTgRQ3V1dXhSm+LC0tUbVqVfzzzz9i2/bt29G8eXNYWFgoHHvr1q2RmZmJkydPFnucRJqM12XNuS4Xlezv4Zs3b3D8+HH4+fmJxxobG4vXr1/Dy8sL9+7dw9OnTwFk5YZGjRqJd/oCWdfmvn37Fjqe/F7Hzc3NkZSUhMjIyALvw9zcHABw4MABpKen59jv+PHjSE1Nhbe3t8K6586dw7Nnz1Suc+XKFdy7dw99+vTB69evxfiTkpLQqlUrnDx5EjKZDDKZDHv27IGvry/q1auntJ0PpwsjUjfmBs3ODaNGjcLNmzexatUqcWRzXpgbmBuI8sLcoFm5Yf78+UhPT1c5vWt++fv7Kxxffq9tmZmZOHr0KDp37oxKlSqJ67u4uMDLy6tQsQiCgJ07d8LX1xeCICicKy8vL8THx4vvt7m5Of79919cuHChwPuR54a9e/fm+ki+7I/VMDAwgJ6eHk6cOIG3b9+qXCcyMhJxcXHo3bu3Qvza2tpo2LCh+F6/evUKJ0+exJdffqlw/gDmBSpZmBc0Ky+YmJigWrVqGDlyJHbt2oU1a9YgIyMDnTt3RmxsbL7OAfMC84Km4fTppFaZmZmIiIiAp6cnHj58KLY3bNgQixcvxm+//Ya2bdsqrOPo6Jjj9nJb9qGePXtiypQpePr0KWxtbXHixAnExMSgZ8+eCv0OHDiA2bNn48qVK0rPzC6sTp06YdSoUUhLS8OF
Cxcwd+5cJCcnQ0tL8R6VGzduYOrUqTh+/Lj4LA65+Pj4PPdz79493Lp1K8f/WMTExBQ49kePHkFLSwtVqlRRaLe2toa5uTkePXpU4G0WVPaLPABYWFgoJJJ79+7h2rVrRXrsRJ8LXpc167pcVLK/T/fv34cgCJg2bRqmTZumcp2YmBjY2tri0aNHaNiwodLyqlWrFjqe/F7HR4wYgW3btsHb2xu2trZo27Yt/Pz80K5duzz34e7ujm7dumHmzJlYunQpPDw80LlzZ/Tp0wf6+vpiv4MHD6JevXoK03f98MMP8Pf3h52dHdzc3NC+fXsMGDAAlStXFuMHsn45zEl8fDzS0tKQkJCA6tWr531SiNSIuUGzc8PChQuxYcMGBAUFoX379vlej7mBuYEoN8wNmpUboqOjsXDhQqxevVqcjrcwsr9P+b22paam4v3793B2dlZaXrVqVXEQRkG8evUKcXFx+PHHH/Hjjz+q7CM/V5MmTcKvv/6KBg0aoEqVKmjbti369OmDpk2b5rmfnj17YuPGjRg8eDC+++47tGrVCl27dkX37t0V3vuDBw+ibdu24s1n+vr6WLBgAcaPH4/y5cujUaNG6NChAwYMGCBO8yw/f/LCVnampqYAIA4EYW6gkox5QbPyQkZGBlq3bg0PDw+sXLlSbG/dujWqVauGhQsXio8iyg3zAvOCpmFRnNTq+PHjeP78OSIiIhAREaG0PCwsTClZ5vYcuvw+o65nz56YPHkytm/fjrFjx2Lbtm0wMzNT+EPJqVOn0LFjR7Ro0QJr1qxBhQoVoKuri+DgYISHh+fzCJVVrFgRrVu3BgC0b98e5cqVw6hRo+Dp6YmuXbsCAOLi4uDu7g5TU1PMmjULTk5OkEqluHTpEiZNmpTrHUhyMpkMNWrUwJIlS1Qut7OzK/QxqPNuI21tbZXtwgfPlpHJZGjTpg0mTpyosu8XX3xRLLERlQa8LmvmdfljZX+f5MczYcKEHO/QzX6D1MfIfsd4fq/jVlZWuHLlCo4ePYrDhw/j8OHDCA4OxoABA7Bp06Zc9ymRSLBjxw78+eef2L9/P44ePYovv/wSixcvxp9//in+sfDQoUMYOHCgwrp+fn5o3rw5du/ejWPHjom/LO7atQve3t7i+Vu4cCFq166tcv/GxsZ48+ZNnueGqCRgbtDc3BASEoJJkyZh2LBhmDp1aoHWZW5gbiDKDXODZuWG6dOnw9bWFh4eHuKzxF+8eAEgq4gQHR2NSpUqKRVyssspN+R1bfuwAJWXnP7mpCovAEC/fv1yLL7UrFkTQNbIwzt37uDAgQM4cuQIdu7ciTVr1mD69OmYOXNmrvEYGBjg5MmTiIqKwsGDB3HkyBFs3boVLVu2xLFjx6CtrY3k5GScOHECa9euVVh37Nix8PX1xZ49e3D06FFMmzYN8+bNw/Hjx1GnTh3xGEJDQxWehyyX39ldiEoC5gXNygsnT57E9evXlbbp7OwMFxcXlc/HVoV5gXlB0/AMklqFhYXBysoKq1evVlq2a9cu7N69G+vWrct3EswvR0dHNGjQAFu3bsWoUaOwa9cudO7cWeHu/507d0IqleLo0aMK7cHBwUUay9ChQ7F06VJMnToVXbp0gUQiwYkTJ/D69Wvs2rULLVq0EPt+eJedXE5JwcnJCVevXkWrVq2KrIhtb28PmUyGe/fuwcXFRWx/+fIl4uLiYG9vXyT7+VhOTk549+6d+J8SIso/Xpc167pcXOSj2nR1dfO8ltrb24t3sn7ozp07Sm0WFhaIi4tTaEtLS8Pz588V2gpyHdfT04Ovry98fX0hk8kwYsQIrF+/HtOmTUOVKlXyPNeNGjVCo0aNMGfOHISHh6Nv376IiIjA4MGDcf36dTx+/Bg+Pj5K61WoUAEjRozAiBEjEBMTg7p162LOnDnw9vaGk5MTgKw7eHM7BktLS5iamuL69et5HieROjE3aGZu2Lt3LwYPHoyuXbuqfO8KirmBuYHoQ8wNmpUbHj9+jPv374vX8g+NGDECQNZ0wvIpYfOrINc2AwODfOUGCwsLAFDKDdlnJ7S0tISJiQkyMzPzlRuMjIzQs2dP9OzZE2lpaejatSvmzJmDyZMnQyqV5nqutbS00KpVK7Rq1QpLlizB3Llz8f333yMqKgqtW7dW+VgNOScnJ4wfPx7jx4/HvXv3ULt2bSxevBhbtmwRz5+VlVWuxyB/35gbqCRjXtCsvPDy5UsAyoVlAEhPT0dGRkahtsu8wLxQ0vGZ4qQ279+/x65du9ChQwd0795d6WvUqFFITEzEvn37imX/PXv2xJ9//omff/4ZsbGxSlOqaGtrQyKRKCSG6Oho7Nmzp0jj0NHRwfjx43Hr1i3s3btX3DegOPo5LS0Na9asUVrfyMhI5TQrfn5+ePr0KTZs2KC07P3790hKShJfx8bG4vbt20hOTs41Vvl0i8uWLVNol99RpuoPQ+rg5+eHP/74A0ePHlVaFhcXV+ikTlTa8bqcRZOuy8XFysoKHh4eWL9+vVJRAsgaTSLXvn17/Pnnnzh//rzC8rCwMKX1nJycxGe+yv34449Kv4Tl9zr++vVrhWVaWlriXb/yu46NjIzE9T709u1bhfcTgHgXs3zdQ4cOoXz58grPdM3MzFR6f62srGBjYyOu5+bmBicnJyxatAjv3r1TOgb5+dPS0kLnzp2xf/9+XLx4Ualf9viI1IG5IYum5YaTJ0+iV69eaNGiBcLCwvIc+ZcfzA3MDURyzA1ZNCk3zJ49G7t371b4CgoKAgBMnDgRu3fvFq+NBZHfa5u2tja8vLywZ88ePH78WFx+69Ytpeu6qakpypUrp5Qbsp9DbW1tdOvWDTt37lRZFPgwL2XPDXp6enB1dYUgCEhPTweQc25QNYOHqtyQ/bEaycnJSElJUVjPyckJJiYm4npeXl4wNTXF3LlzxThUHYOlpSVatGiBn3/+WeH8AcwLVDIwL2TRpLwgn2Up+6j+S5cu4c6dO6hTp06u6+eEeYF5oaTjSHFSm3379iExMREdO3ZUubxRo0awtLREWFiYUiIrCn5+fpgwYQImTJiAMmXKKN154+PjgyVLlqBdu3bo06cPYmJisHr1alSpUgXXrl0r0lgCAgIwffp0LFiwAJ07d0aTJk1gYWEBf39/jB49GhKJBKGhoSovaG5ubti6dSvGjRuH+vXrw9jYGL6+vujfvz+2bduGYcOGISoqCk2bNkVmZiZu376Nbdu24ejRo+IfclatWoWZM2ciKioKHh4eOcZZq1Yt+Pv748cffxSnfjl//jw2bdqEzp07w9PTs0jPS2F9++232LdvHzp06ICAgAC4ubkhKSkJf//9N3bs2IHo6GiUK1dO3WESlTi8Lv9HU67LAHDt2jXxF8v79+8jPj4es2fPBpB13fb19S3UOVi9ejWaNWuGGjVq4KuvvkLlypXx8uVL/PHHH/j3339x9epVAFl/RAsNDUW7du0wZswYGBkZ4ccff4S9vb3S+zJ48GAMGzYM3bp1Q5s2bXD16lUcPXpU6Zqc3+v44MGD8ebNG7Rs2RI
VK1bEo0ePsHLlStSuXVuc0aR27drQ1tbGggULEB8fD319fbRs2RLh4eFYs2YNunTpAicnJyQmJmLDhg0wNTUVbwI7ePAgvL29Fe4OTkxMRMWKFdG9e3fUqlULxsbG+PXXX3HhwgUsXrwYQFZBY+PGjfD29ka1atUwcOBA2Nra4unTp4iKioKpqSn2798PAJg7dy6OHTsGd3d3DBkyBC4uLnj+/Dm2b9+O06dPF3i0DlFRY274j6bkhkePHqFjx46QSCTo3r07tm/frrC8Zs2aYpG4oJgbmBuIAOaGD2lKbmjWrJlSm/xaUr9+fXTu3LlQx1+Qa9vMmTNx5MgRNG/eHCNGjEBGRgZWrlyJatWqqcwN8+fPx+DBg1GvXj2cPHkSd+/eVdr//PnzERUVhYYNG+Krr76Cq6sr3rx5g0uXLuHXX38VCxdt27aFtbU1mjZtivLly+PWrVtYtWoVfHx8YGJiAiDr/QCA77//Hr169YKuri58fX0xa9YsnDx5Ej4+PrC3t0dMTAzWrFmDihUriudV1WM17t69i1atWsHPzw+urq7Q0dHB7t278fLlS/Tq1QtAVqFn7dq16N+/P+rWrYtevXrB0tISjx8/xsGDB9G0aVOsWrUKALBixQo0a9YMdevWxZAhQ+Do6Ijo6GgcPHgQV65cKdT7R1RUmBf+oyl5wc3NDW3atMGmTZuQkJCAtm3b4vnz51i5ciUMDAwwduzYQh0/8wLzQoknEKmJr6+vIJVKhaSkpBz7BAQECLq6ukJsbKzw8OFDAYCwcOFCpX4zZswQAAivXr3KcZkqTZs2FQAIgwcPVrn8p59+EpydnQV9fX3hf//7nxAcHKxye/b29oK/v38uR5sFgDBy5EiVywIDAwUAQlRUlCAIgnDmzBmhUaNGgoGBgWBjYyNMnDhROHr0qEIfQRCEd+/eCX369BHMzc0FAIK9vb24LC0tTViwYIFQrVo1QV9fX7CwsBDc3NyEmTNnCvHx8Urn6MPt5iQ9PV2YOXOm4OjoKOjq6gp2dnbC5MmThZSUFIV+/v7+gpGRkdL67u7uQrVq1VSemxkzZoivg4ODBQDCw4cPxTZ7e3vBx8dH5Tbd3d0V2hITE4XJkycLVapUEfT09IRy5coJTZo0ERYtWiSkpaXleZxEnyNelxVpynVZfr1U9ZXXOcjtPRQEQXjw4IEwYMAAwdraWtDV1RVsbW2FDh06CDt27FDod+3aNcHd3V2QSqWCra2tEBQUJPz0009K1/HMzExh0qRJQrly5QRDQ0PBy8tLuH//vsr3Kz/X8R07dght27YVrKysBD09PaFSpUrC0KFDhefPnytsa8OGDULlypUFbW1t8bxeunRJ6N27t1CpUiVBX19fsLKyEjp06CBcvHhREARBiIuLE3R0dIRt27YpbCs1NVX49ttvhVq1agkmJiaCkZGRUKtWLWHNmjVK5+/y5ctC165dhbJlywr6+vqCvb294OfnJ/z2228K/R49eiQMGDBAsLS0FPT19YXKlSsLI0eOFFJTU3N+84g+EeYGRZqQG6KionLMC9n/z60KcwNzA1FemBsUaUJuUEWeL7Zv3/7RffN7bfv9998FNzc3QU9PT6hcubKwbt06le9LcnKyMGjQIMHMzEwwMTER/Pz8hJiYGJV57OXLl8LIkSMFOzs7QVdXV7C2thZatWol/Pjjj2Kf9evXCy1atBDjc3JyEr799luFcykIghAUFCTY2toKWlpaYr767bffhE6dOgk2NjaCnp6eYGNjI/Tu3Vu4e/euIAiCcP36dQGAcP78eYVtxcbGCiNHjhT+97//CUZGRoKZmZnQsGFDpRwiP79eXl6CmZmZIJVKBScnJyEgIEDMP3LXr18XunTpIpibmwtSqVSoWrWqMG3aNJXvCdGnxLygSFPyQnJysjBr1izB1dVVMDAwEMzMzIQOHToIly9fznNd5gXmBU0lEQSOpSciIiKiohUSEoKBAwfi4cOHcHBwUHc4BbZt2zb07dsXsbGxMDMzU3c4RESlAnMDERFlFxgYiJkzZ2rsdK8//PADlixZgufPnxfZs36JiD5nzAtUnPhMcSIiIiKibMzNzbFixQoWPYiISMTcQERE2Tk4OGDp0qUsfBAREQDmhZKOzxQnIiIiIsqmbdu26g6BiIhKGOYGIiLKzs/PT90hEBFRCcK8ULJxpDgREREREREREREREREREZVafKY4ERERERERERERERERERGVWhwpTkREREREREREREREREREpRaL4kREREREREREREREREREVGrpqDuAkkAmk+HZs2cwMTGBRCJRdzhERJ8FQRCQmJgIGxsbaGmVrHu0mBeIiNSDuYGIiD5UkvMCwNxARKQOzA1ERJRdfnMDi+IAnj17Bjs7O3WHQUT0WXry5AkqVqyo7jAUMC8QEakXcwMREX2oJOYFgLmBiEidmBuIiCi7vHIDi+IATExMAGSdLFNTUzVHk7f09HQcO3YMbdu2ha6urrrDKRBNjV1T4wYYuzpoatzAp409ISEBdnZ24jW4JJHH9PDhQ/zxxx8a915q6mdQU+MGNDd2xv1pMe68aUJu0JTfGeT4ufu0NDVuQHNjZ9yf1qeOuyTnBeC/3LBx40Z07txZo95LQHM/hwBjVxdNjV1T4wYYuyqakhs06fcGTf6cAZodvybHDjB+dWP8/8lvbmBRHBCnMTE1NdWIRJWeng5DQ0OYmppq3AddU2PX1LgBxq4Omho3oJ7YS+JUUvKYTExMNPK91NTPoKbGDWhu7Iz702Lc+VeSc4Om/M4gx8/dp6WpcQOaGzvj/rTUFXdJzAvAf3Fp4nsJaO7nEGDs6qKpsWtq3ABjz01Jzw2a9HuDJn/OAM2OX5NjBxi/ujF+ZXnlhpL30A0iIiIiIiIiIiIiIiIiIqIiwqI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcVnin/GMjMzkZ6e/kn3mZ6eDh0dHaSkpCAzM/OT7vtjaGrcAGNXB02NGyja2HV1daGtrV1EkRERERERERERERERERUOi+KfIUEQ8OLFC8TFxall39bW1njy5EmeD7wvSTQ1boCxq4Omxg0Ufezm5uawtrbWuPNARERERERERERERESlB4vinyF5QdzKygqGhoaftFglk8nw7t07GBsbQ0tLc2bv19S4AcauDpoaN1B0sQuCgOTkZMTExAAAKlSoUFQhEhERERERERERERERFQiL4sVNlgk8Ogu8ewkYlwfsmwBa6ptOODMzUyyIly1b9pPvXyaTIS0tDVKpVKOKhZoaN8DY1UFT4waKNnYDAwMAQExMDKysrDiVOhERERERERER0ccoYfUGIiJNwqJ4cbq5DzgyCUh49l+bqQ3QbgHg2lEtIcmfIW5oaKiW/RPR50V+rUlPT2dRnIiIKD/4Ry4iIiIiIlKlBNYbiIg0CYvixeXmPmDbAACCYnvC86x2v81qTVR8vi8RfQq81hARERUA/8hFRER54c1TRESfpxJebyAi0gSaNa+vppBlZv0xK3uCAv5rO/JdVj8iDX
bixAlIJBLExcXlex0HBwcsW7as2GIiIiIi0kjyP3J9WBAH/vsj18196omLiIhKjpv7gGXVgU0dgJ2Dsr4vq84cQURU2rHeQERUJFgULw6Pzir/MUuBACQ8zepHVIwCAgKgra2Nb775RmnZyJEjIZFIEBAQ8OkDIyIiIqL/8I9cRESUF948RUT0+WK9gYioSLAoXhzevSzafkQfwc7ODrt27cL79+/FtpSUFISHh6NSpUpqjIyIAGQVOB6eAv7ekfWdBQ8ios8P/8hFRES54c1TRESfN9YbiIiKBIvixcG4fNH2I/oIderUga2tLXbt2iW27dq1C5UqVUKdOnXEttTUVIwePRpWVlaQSqVo1qwZLly4oLCtQ4cO4YsvvoCBgQE8PT0RHR2ttL/Tp0+jefPmMDAwgJ2dHUaPHo2kpKRiOz4ijcbpD4mICOAfuYiIKHe8eYqI6PPGegMRUZFgUbw42DcBTG0ASHLoIAFMbbP6EX0C/fr1w6ZNm8TXP//8MwYOHKjQZ+LEidi5cyc2bdqES5cuoUqVKvDy8sKbN28AAE+ePEHXrl3h6+uLK1euYPDgwfjuu+8UtvHgwQO0a9cO3bp1w7Vr17B161acPn0ao0aNKv6DJNI0nP6QiIjk+EcuIiLKDW+eIiL6vLHeQERUJFgULw5a2kC7Bf//Inui+v/X7eZn9SP6BPz8/HD69Gk8evQIjx49wpkzZ9CvXz9xeVJSEtauXYuFCxfC29sbrq6u2LBhAwwMDPDTTz8BANauXQsnJycsXrwYVatWRd++fZWeRz5v3jz07dsXY8eOhbOzM5o0aYIVK1Zg8+bNSElJ+ZSHTFSycfpDIiL6EP/IRUREueHNU0REnzeFekN2rDcQEeUXi+LFxbUj4LcZMLFWbDe1yWp37aieuOizVK5cObRv3x4hISEIDg6Gj48PypUrJy5/8OAB0tPT0bRpU7FNV1cXDRo0wK1btwAAt27dQsOGDRW227hxY4XXV69eRUhICIyNjcUvLy8vyGQyPHz4sBiPkEjDcPpDIiL6EP/IRUREueHNU0REJK836BoqtrPeQESUbzrqDqBUc+0I2NYDlrpkvfbfD9g35R+zSC0GDhyI0aNHAwBWr15dLPt49+4dhg4dKu7nQ5UqVSqWfRJpJE5/SERE2cn/yLV7CJD+/r92U5usgjj/yEVE9PmS3zy1bYCKhbx5iojos+HaEbi+E7i5B6jZG6jTN+uGKF7/iYjyhSPFi1vG//9BS88EcGzBBEVq065dO6SlpSE9PR1eXl4Ky5ycnKCnp4czZ86Ibenp6bhw4QJcXV0BAC4uLjh//rzCen/++afC67p16+LmzZuoUqWK0peenl4xHRmRBuL0h0REpIprR8DBPevnOv0B/wPA2L9ZECciov9untLO9rs1RwgSEX1e0pOzvjs2Axybs95ARFQALIoXt7R3Wd/1jNQbB332tLW1cevWLdy8eRPa2or/WTIyMsLw4cPx7bff4siRI7h58ya++uorJCcnY9CgQQCAYcOG4d69e/j2229x584dhIeHIyQkRGE7kyZNwtmzZzFq1ChcuXIF9+7dw969ezFq1KhPdZhEmoHTHxIRUU7SErO+O7XkH7mIiEiRa0fA7P9nYWvxLW+eIiL6HKXK6w3G6o2DiEgDsShe3NKSsr7rM0mR+pmamsLU1FTlsvnz56Nbt27o378/6tati/v37+Po0aOwsLAAkDX9+c6dO7Fnzx7UqlUL69atw9y5cxW2UbNmTfz++++4e/cumjdvjjp16mD69OmwsbEp9mMj0igKz47NXhjn9IdERJ+1lISs71LV/2cjIqLP3Ps3Wd+rd+PNU0REnyP5TbSsNxARFRifKV7c5EVxjhQnNQgJCYFMJkNCQoLK5Xv27BF/lkqlWLFiBVasWJHj9jp06IAOHTootA0cOFDhdf369XHs2LEctxEdHZ134ESfA/n0h0cmAQnP/mvns2OJiD5vqfFZ3/XN1BsHERGVPLJM4P3brJ8Ny6o3FiIiUg9xpLiJeuMgItJAah0pPm/ePNSvXx8mJiawsrJC586dcefOHXH5mzdv8PXXX6Nq1aowMDBApUqVMHr0aMTHxytsRyKRKH1FRER86sNRLY3TmRARUQ5cOwLD//zvdd+dnP6QiOhzl/L/v+twpDgREWX3Pg6AkPWzgYU6IyEiInWR1xs4UpyIqMDUOlL8999/x8iRI1G/fn1kZGRgypQpaNu2LW7evAkjIyM8e/YMz549w6JFi+Dq6opHjx5h2LBhePbsGXbs2KGwreDgYLRr1058bW5u/omPJgepfKY4ERHlIvX/Z3LQ1gOqtAIkOT1nnIiISj1BAFLl0yGyKE5ERNkkv876LjUDtHXVGwsREakH6w1ERIWm1qL4kSNHFF6HhITAysoKf/31F1q0aIHq1atj586d4nInJyfMmTMH/fr1Q0ZGBnR0/gvf3Nwc1tbWnyz2fOP06URElBv59IcGFiyIExF97tLeAYIs62eOFCciouzkRXFOnU5E9HnKzAAy3mf9zOnTiYgKrEQ9U1w+LXqZMmVy7WNqaqpQEAeAkSNHYvDgwahcuTKGDRuGgQMHQpJDcSE1NRWpqania/nzltPT05Genv6xh6FAKyUB2gBkOobILKJty2MsTKzp6ekQBAEymQwymaxI4ikIQRDE7+rYf2FpatwAY1cHTY0bKPrYZTIZBEFAeno6tLW1FZYV9fX2Y+SWFz78Xhwk715BB4AgNUdGCcgT6qSpcQOaGzvj/rQYd/73VRJ8yt8ZRO/eQBeAoKWDDOgCRbAffu4+LU2NG9Dc2Bn3p/Wp4y5p5yen3AB8mlgliTHQASAzKFMkf2PS1M8hwNjVRVNj19S4Acae23ZLik/6e0NKAuTzhKRr6RfJ7wuAZn/OAM2OX5NjBxi/ujF+5W3lRSLIKyBqJpPJ0LFjR8TFxeH06dMq+8TGxsLNzQ39+vXDnDlzxPagoCC0bNkShoaGOHbsGGbMmIEffvgBo0ePVrmdwMBAzJw5U6k9PDwchoaGRXNA/8/l2XZ88XI/Hli2xfWK/Yp024Who6MDa2tr2NnZQU9PT93hEFEpl5aWhidPnuDFixfIyMhQWJacnIw+ffqINzup06fMC9lViLuABg9X4rXRFzj9xdRi3RcRUUn3uecGk/f/ouXtKUjVNsaRmmuKZR9ERJqkJOUFQL2/NwBApdgTqPPkZ7wwrYVzTuOLfX9ERCXR55wbpGlv4HVjLGQSbeyvHVyk2yYi0mT5zQ0lpig+fPhwHD58GKdPn0bFihWVlickJKBNmzYoU6YM9u3bB13dnJ+dNH36dAQHB+PJkycql6u6e8vOzg6xsbFFnki1jn4H7Ysbkdl0HGQeU4pkm+np6YiMjESbNm1yPQ+qpKSk4MmTJ3BwcIBUKi2SeApCEAQkJibCxMQkx5H8JZGmxg0wdnXQ1LiBoo89JSUF0dHRsLOzU7rmJCQkoFy5ciXil5ic8sLz589x7ty5Ql1v80tyeTN0Do2DzNkLmX5hRbLNj8kT6qSpcQOaGzvj/rQYd
940ITcUx+8McpJ/z0NnU3sI5g7IGHmxSLbJz92npalxA5obO+P+tD513CUpLwA554bw8HB06tSp2M+J1tnl0I4KgqxmL2T6rvro7Wnq5xBg7OqiqbFratwAY1dFU3JDsfzeEHsXuuubQDCwQMa4e0W2WU3+nAGaHb8mxw4wfnVj/P/Jb24oEdOnjxo1CgcOHMDJkydVFsQTExPRrl07mJiYYPfu3XmenIYNGyIoKAipqanQ19dXWq6vr6+yXVdXt+g/OP//jA9tqQm0i3jbhYk3MzMTEokEWlpa0NLSKtJ48kM+HbM8Bk2hqXEDjF0dNDVuoOhj19LSgkQiUXm9KkmJOre8IP9ebPGmZU2rpWVYFlolIE+UBJoaN6C5sTPuT4tx576PkuKT/s4gl54EAJBITYt8H/zcfVqaGjegubEz7k/rU8Vd0s5NTrkB+ETnJOUtAEDLqFyR/u6gqZ9DgLGri6bGrqlxA4w9+/ZKkk/6e0NmCgBAomdSLOdBkz9ngGbHr8mxA4xf3Rh//nODWqs1giBg1KhR2L17N44fPw5HR0elPgkJCWjbti309PSwb9++fI1uvnLlCiwsLHL8ReWTSnuX9V3PWL1xEBFRyfQ+6w9bMLBQbxxERKR+qf//bFqpmXrjICKikin5TdZ3w7LqjYOIiNQjLTHruz5rDUREhaHWkeIjR45EeHg49u7dCxMTE7x48QIAYGZmBgMDA7EgnpycjC1btiAhIQEJCVl/KLK0tIS2tjb279+Ply9folGjRpBKpYiMjMTcuXMxYcIEdR7af9KyRnswURERkUrv47K+syhOREQp8Vnf9dU/DSQREZVA71kUJyL6rKXKB+AZqTcOIiINpdaR4mvXrkV8fDw8PDxQoUIF8Wvr1q0AgEuXLuHcuXP4+++/UaVKFYU+8ueF6+rqYvXq1WjcuDFq166N9evXY8mSJZgxY4Y6D+0/8qI4E1WxO3HiBCQSCeLi4tQdikoeHh4YO3asWmNwcHDAsmXL1BoDEWUjjhQ3V2sYRERUAnCkOBER5Sb5ddZ3FsWJiD5PnJWWiOijqH36dFVfAQEBALKKiDn1cXBwAAC0a9cOly9fRmJiIt69e4crV65g6NChJec5vrx7q0jIC945fXl6eqo7xDzt2rULQUFBxbqPwMBA1K5du1j3UVzu3LkDT09PlC9fHlKpFJUrV8a0adOQnp6e63pr165FzZo1YWpqClNTUzRu3BiHDx9W6DN06FA4OTnBwMAAlpaW6NSpE27fvq3Q5/Hjx/Dx8YGhoSGsrKzw7bffIiMjQ1weEhICiUQCFxcXpRi2b98OiUQiXpeICoTTpxMRkVyKvCjOkeJERKQCi+JERJ83eVGcs9ISERWKWqdP/yyU4ru3MmUCzj98g5jEFFiZSNHAsQy0tSTFsq8mTZrg+fPnSu379u3DsGHDMGLEiGLZb1EqU6aMukMo0XR1dTFgwADUrVsX5ubmuHr1Kr766iu8f/8eixYtynG9ihUrYv78+XB2doYgCNi0aRM6deqEy5cvo1q1agAANzc39O3bF5UqVcKbN28QGBiItm3b4uHDh9DW1kZmZiZ8fHxgbW2Ns2fP4vnz5xgwYAB0dXUxd+5ccV9GRkaIiYnBH3/8gcaNG4vtP/30EypVqlR8J4dKN3H6dHN1RkFERCWBfKQ4p08nIiJVWBQnIvq8iQPwTNQbBxGRhiohw6lLsVI6ffqR68/RbMFx9N7wJ8ZEXEHvDX+i2YLjOHJduXBdFPT09GBtba3w9fbtW0yYMAFTpkxBjx49xL5//fUX6tWrB0NDQzRp0gR37twRlwUEBKBz584K2x47diw8PDzE16mpqRg9ejSsrKwglUrRrFkzXLhwQVwuH7V+9OhR1KlTBwYGBmjZsiViYmJw+PBhuLi4wNTUFH369EFycrK4Xvbp0x0cHDB37lx8+eWXMDExQaVKlfDjjz8qxPbvv/+id+/eKFOmDIyMjFCvXj2cO3fuI8/mf5YsWYIaNWrAyMgIdnZ2GDFiBN69eycuDwkJgbm5OQ4cOICqVavC0NAQ3bt3R3JyMjZt2gQHBwdYWFhg9OjRyMzMFNcLDQ1FvXr1YGJiAmtra/Tp0wcxMTG5xlK5cmUMHDgQtWrVgr29PTp27Ig+ffrgjz/+yHU9X19ftG/fHs7Ozvjiiy8wZ84cGBsb488//xT7DBkyBC1atICDgwPq1q2L2bNn48mTJ4iOjgYAHDt2DDdv3sSWLVtQu3ZteHt7IygoCKtXr0ZaWpq4HR0dHfTp0wc///yz2Pbvv//ixIkT6NOnT77OOZGSlLis7xwpTkRE8meKc6Q4ERFll5n+X55gUZyI6PPEkeJERB+FRfHiJhbFS0+iOnL9OYZvuYTn8SkK7S/iUzB8y6ViK4x/KC4uDp06dYKHh4fSlOTff/89Fi9ejIsXL0JHRwdffvllgbY9ceJE7Ny5E5s2bcKlS5dQpUoVeHt74+3btwr9AgMDsWrVKpw9exZPnjyBn58fli1bhvDwcBw8eBDHjh3DypUrc93X4sWLUa9ePVy+fBkjRozA8OHDxSL+u3fv4O7ujqdPn2Lfvn24evUqJk6cCJlMVqDjyY2WlhZWrFiBGzduYNOmTTh+/DgmTpyo0Cc5ORkrVqxAREQEjhw5ghMnTqBLly44dOgQDh06hNDQUKxfvx47duwQ10lPT0dQUBCuXr2KPXv2IDo6WnwsQn7dv38fR48eRdOmTfO9TmZmJiIiIpCUlKQwkvtDSUlJCA4OhqOjI+zs7AAAf/zxB2rUqIHy5cuL/by8vJCQkIAbN24orP/ll19i27Zt4g0PISEhaNeuncK6RAUinz5daq7WMIiIqARI4UhxIiLKgfz3Bkg4yxQR0ecqtfTOSktE9Clw+vTiJJMB6SW/KC4IAt6nZ+bdEVlTps/YdwOCqu0AkAAI3HcTTauUUzmVukwmw/u0TOikZUBLSwsGutqQSAo25bpMJkOfPn2go6ODsLAwpfXnzJkDd3d3AMB3330HHx8fpKSkQCqV5rntpKQkrF27FiEhIfD29gYAbNiwAZGRkQgNDcXUqVPFvrNnzxYLtoMGDcLkyZPx4MEDVK5cGQDQvXt3REVFYdKkSTnur3379uLU75MmTcLSpUsRFRWFqlWrIjw8HK9evcKFCxfEqderVKmS39OUL9lHrs+ePRvDhg3DmjVrxPb09HSsXbsWTk5O4nGFhobi5cuXMDY2hqurKzw9PREVFYWePXsCgMKNCJUrV8aKFStQv359vHv3DsbGuf9baNKkCS5duoTU1FR89dVXmDJlSp7H8ffff6Nx48ZISUmBsbExdu/eDVdXV4U+a9aswcSJE5GUlISqVasiMjISenp6AIAXL14oFbXlr1+8eKHQXqdOHVSuXBk7duxA//79ERISgiVLluCff/7JM04iJRlp/93ly5HiRESUymeKExFRDuRTpxtYAFra6o2FiIjUIy0x6ztHihMRFQqL4sVJXhAHSvT06e/TM+E6/WiRbEsA8CIhBTUCj+Wr/81ZXjDUK9jHcMqUKfjjjz9w
/vx5mJgoPz+lZs2a4s8VKlQAAMTExOTrmc8PHjxAenq6wuhkXV1d1K9fH3fv3s1xP+XLl4ehoaFYEJe3nT9/Ptf9fbgNiUQCa2trcZrxK1euoE6dOiqfRf748WOFou+UKVPyVTzO7tdff8W8efNw+/ZtJCQkICMjAykpKUhOToahoSEAwNDQUCyIy4/LwcFBobhdvnx5henR//rrLwQGBuLq1at4+/atOLpdHne1atXw6NEjAEDz5s1x+PBhcd2tW7ciMTERV69exbfffgtbW1tMmzYNp06dEm9UAID169ejb9++AICqVaviypUriI+Px44dO+Dv74/ff/9d4Rz17dsXbdq0wfPnz7Fo0SL4+fnhzJkz+bpZIrsvv/wSwcHBqFSpEpKSktC+fXusWrWqwNshEqdOhwSQmqkzEiIiKgnkI8WZE4iIKDs+T5yIiDhSnIjoo7AoXpzkU6dLtABdA/XGUkpERERg0aJFOHjwIJydnVX20dXVFX+WjyKXF2W1tLQgCIrj3NPT0wsVS/b9fPha3pbXVOe5rWNgkPNnxsbGBleuXBFfqyqc5yU6OhodOnTA8OHDMWfOHJQpUwanT5/GoEGDkJaWJhbFVcWYW9xJSUnw8vKCl5cXwsLCYGlpicePH8PLy0t8PvehQ4fE8579OOVTmru6uiI9PR3Dhg3DlClTUK9ePYVj/nBkt56enjiK3s3NDRcuXMDy5cuxfv16sY+ZmRnMzMzg7OyMRo0awcLCArt370bv3r1hbW2tdAPDy5cvAQDW1tZK565v376YOHEiAgMD0b9/f+jo8FJKhfQ+Luu71IyjPYiI6L+R4vosihMRUTYsihMRURqL4kREH4OVnOL04fPECzhF+KdkoKuNm7O88tX3/MM3CAi+kGe/kIH10cBRuVArk8mQmJAIE1MTcfr0/Lpy5QoGDRqE+fPnw8srf/FmZ2lpievXryttV17kdXJygp6eHs6cOQN7e3sAWUXzixcvYujQoYXaZ2HVrFkTGzduxJs3b5SK3jo6Oh89lfpff/0FmUyGxYsXQ0tLCwCwbdu2j9omANy+fRuvX7/G/PnzxQL3xYsXFfrIz21eZDIZ0tPTIZPJYGBgkO9jlslkSE1NzXG5IAgQBEHs07hxY8yZMwcxMTGwsrICAERGRsLU1FRpGnYg6yaEjh07Ytu2bVi3bl2+YiJSSf5cQD4TkIiIgA9GinP6dCIiykYsihf8pngiIiol5PUGTp9ORFQoLIoXJ/HOrZI7dTqQNco3v1OYN3e2RAUzKV7Ep6h8rrgEgLWZFM2dLXN8pniGnjYM9XTEQmx+xMbGonPnzvDw8EC/fv2UnvOsrZ2/4nrLli2xcOFCbN68GY0bN8aWLVtw/fp11KlTBwBgZGSE4cOH49tvv0WZMmVQqVIl/PDDD0hOTkb//v3zHW9R6N27N+bOnYvOnTtj3rx5qFChAi5fvgwbGxs0btw4x/Xev3+vMKJaJpNBIpGgVq1aCv2qVKmC9PR0rFy5Er6+vjhz5kyRFHgrVaoEPT09rFy5EsOGDcP169cRFBSU53phYWHQ1dVFjRo1oK+vj4sXL+L7779Hly5dlEamf2jy5Mnw9vZGpUqVkJiYiPDwcJw4cQJHj2Y9EuCff/7B1q1b0bZtW1haWuLff//F/PnzYWBggPbt2wMA2rZtC1dXV/Tv3x8//PADXrx4galTp2LkyJHQ19dXud+QkBCsWbMGZcvyLn36CGJRnM8TJyL67MlkH4wUZ1GciIiyYVGciIg4fToR0UdhUbw4iSPFS3ZRvCC0tSSY4euK4VsuQQIoFMblJfAZvq4qC+If4+DBg3j06BEePXokPif8Q/b29ggJCclzO15eXpg2bRomTpyIlJQUfPnllxgwYAD+/vtvsc/8+fMhk8nQv39/JCYmol69ejh8+DDMzc2L8Ijypqenh2PHjmH8+PFo3749MjIy4OrqitWrV+e63t27d8Uiv5y7uzuOHz+u0FarVi0sWbIECxYswOTJk9GiRQvMmzcPAwYM+Ki4LS0tERISgilTpmDFihWoW7cuFi1ahI4dO+a6no6ODhYsWIC7d+9CEATY29tj5MiR+PLLL3NdLyYmBgMGDMDz589hZmaGmjVr4ujRo2jTpg0AQCqV4tSpU1i2bBnevn2L8uXLo0WLFjh79qw4KlxbWxsHDhzA8OHD0bhxYxgZGcHf3x+zZs3Kcb8GBga5TnFPlC8sihMRkVxaIsT/XXOkOBERZZf8Jus7p08nIvp8pSVmfdc3UW8cREQaikXx4pSqGSPFC6pd9QpY268uZu6/iefxKWK7tZkUM3xd0a66ctH6Y/n7+8Pf3z/PftmfF167dm2ltpkzZ2LmzJk5bkMqlWLFihVYsWKF2CaTyZCQkDVyx8PDQ2mbAQEBCAgIUGgLDAxEYGCg+PrEiRMKy6Ojo5X2/eEIbyCr2L9jx44cY80u+z6zx559n9988w2++eYbhbYPR8Tn57gAKN2Q0Lt3b/Tu3VuhLfs5y65nz57o2bNnjrHn5Keffsp1uY2NDQ4dOpRrHyDrXOfWT9W5+NDYsWMxduzYPPdDpCAlLus7i+JERCSfOl1LF9CRqjcWIiIqefhMcSIi4khxIqKPUqii+OPHj/Ho0SMkJyfD0tIS1apVy3GK4c9aWulNUu2qV0AbV2ucf/gGMYkpsDKRooFjmSIfIU5ElB8am5fkI8Wl5moNg4ioNNK43CCfOl1qBkj4f2oioqKmcXkhOxbFiYiKnMblBnm9gc8UJyIqlHwXxaOjo7F27VpERETg33//VRj1qaenh+bNm2PIkCHo1q1bgZ4VXaqJ06eXziSlrSVBYyf+MkZE6lEq8hKnTyciKlIanRvkI8U5dToRUZHR6LyQHadPJyIqEhqbGzIzgIz/n7W1lNYbiIiKW76u6qNHj0atWrXw8OFDzJ49Gzdv3kR8fDzS0tLw4sULHDp0CM2aNcP06dNRs2ZNXLhwobjj1gyl8JniREQlQanJSyyKExEVGY3PDfKR4vosihMRFQWNzwvZcaQ4EdFH0+jcIH+eOMCiOBFRIeVrpLiRkRH++ecflC2r/B9vKysrtGzZEi1btsSMGTNw5MgRPHnyBPXr1y/yYDVOWul8pjgRkbqVmrz0Pi7rO4viREQfTeNzQ0p81neOFCciKhIanxey40hxIqKPptG5QT4AT1sP0NFTbyxERBoqX0XxefPm5XuD7dq1K3QwpU4pfqY4EZE6lZq8JI4UN1drGEREpYHG5wZ5UZwjxYmIioTG54UPZaT+N0LQsIx6YyEi0mAanRtSWWsgIvpYBX4oxsOHD3Hv3j2l9nv37iE6OrooYio9OH06EVGx0+i8xOnTiYiKhUbmBvn06VJztYZBRFQaaWRe+JB8lLhEG9A3U28sRESlhMblBvkAPH0WxYmICqvARfGAgACcPXtWqf3cuXMICAgoiphKD3lRnImKiKjYaHReYlGciKhYaGRuSJEXxTlSnIioqGlkXviQ+DzxMoBWgf+UR0REKmhcbkj9/xlD9EzUGwcRkQYr8P+kL1++jKZ
Nmyq1N2rUCFeuXCmKmEoPTp9ORFTsNDYvyWRASlzWzyyKExEVKY3MDfKR4pw+nYioyGlkXviQWBTn88SJiIqKxuUGjhQnIvpoBS6KSyQSJCYmKrXHx8cjMzOzSIIqNTh9OhFRsdPYvJSWCAiyrJ85VS4RUZHSyNzAkeJERMVGI/PCh1gUJyIqchqXG8RnirPWQERUWAUuirdo0QLz5s1TSAyZmZmYN28emjVrVqTBaTwmqs+Kh4cHxo4dK752cHDAsmXL1BbPxwgICEDnzp3VHQZRvmhsXpJPna5jAOhK1RsLEVEpo5G5ISU+6ztHihMRFTmNzAsf+nD6dCIiKhIalxs4Ky0R0UfTKegKCxYsQIsWLVC1alU0b94cAHDq1CkkJCTg+PHjRR6gRivtI8VlmcCjs8C7l4BxecC+CaClre6oCsTDwwO1a9fW2OJ1YZ04cQKenp54+/YtzM3N1R1OgXXs2BFXrlxBTEwMLCws0Lp1ayxYsAA2NjYq+7958wYzZszAsWPH8PjxY1haWqJz584ICgqCmZlZjvtJSUnBsGHD8Ndff+HWrVvo0KED9uzZo9AnJCQEAwcOFF8bGRmhatWqGDt2LPr27ZvrcTg4OGDs2LEKN1MUVnR0NBwdHXH58mXUrl37o7eXl8DAQOzZs6dETCelsXmJzxMnIio2GpkbUjlSnIiouGhkXvhQ8pus7wYsihMRFRWNyw3i9Ol8pjgRUWEVeKS4q6srrl27Bj8/P8TExCAxMREDBgzA7du3Ub169eKIUXOJd2+VwkR1cx+wrDqwqQOwc1DW92XVs9qJipmnpye2bduGO3fuYOfOnXjw4AG6d++eY/9nz57h2bNnWLRoEa5fv46QkBAcOXIEgwYNynU/mZmZMDAwwOjRo9G6desc+5mamuL58+d4/vw5Ll++jLZt22LgwIG4c+dOoY+xuKSlpak7hCKnsXnpfVzWdxbFiYiKnEbmBnH69Jxv2CMiosLRyLzwIU6fTkRU5DQuN6RypDgR0ccqcFEcAGxsbDB37lwcPHgQO3bswPTp01GmDO9WVVJaR4rf3AdsGwAkPFNsT3ie1V5MhXEPDw98/fXXGDt2LCwsLFC+fHls2LABSUlJGDhwIExMTFClShUcPnxYXOf69evw9vaGsbExypcvj/79+yM2NhZA1hThv//+O5YvXw6JRAKJRILo6GhkZmZi0KBBcHR0hIGBAapWrYoVK1Z8dPxLlixBjRo1YGRkBDs7O4wYMQLv3r0Tl4eEhMDc3BwHDhxA1apVYWhoiO7duyM5ORmbNm2Cg4MDLCwsMHr0aIVpfUJDQ1GvXj2YmJjA2toaffr0QUxMzEfH+6EjR46gWbNmMDc3R9myZdGhQwc8ePBAXB4dHQ2JRIJt27ahefPmMDAwQP369XH37l1cuHAB9erVg7GxMby9vfHq1StxvQsXLqBNmzYoV64czMzM4O7ujkuXLuUZzzfffINGjRrB3t4eTZo0wXfffYc///wT6enpKvtXr14dO3fuhK+vL5ycnNCyZUvMmTMH+/fvR0ZGRo77MTIywtq1a/HVV1/B2to6x34SiQTW1tawtraGs7MzgoKCoKWlhWvXruV5LNm3s3HjRnTp0gWGhoZwdnbGvn3//Xt6+/Yt+vbtC0tLSxgYGMDZ2RnBwcEAAEdHRwBAnTp1IJFI4OHhAeC/qfDnzJkDGxsbVK1aVdxX9lHv5ubmCAkJEV//+++/6N27N8qUKQMjIyPUq1cP586dQ0hICGbOnImrV6+K/3Y+XE8dNDIviSPFzdUaBhFRaaVxuUE+UpzTpxMRFQuNywsfYlGciKhYaFRuEEeKsyhORFRYhSqKnzp1Cv369UOTJk3w9OlTAFmFudOnTxdpcBpPU4rigpAVa36+UhKAwxMBCKo2lPXtyKSsfjltIz35v58FVdvJ2aZNm1CuXDmcP38eX3/9NYYPH44ePXqgSZMmuHTpEtq2bYv+/fsjOTkZcXFxaNmyJerUqYOLFy/iyJEjePnyJfz8/AAAy5cvR+PGjfHVV1+Jo3zt7Owgk8lQsWJFbN++HTdv3sT06dPx/fffY/fu3R91mrW0tLBixQrcuHEDmzZtwvHjxzFx4kSFPsnJyVixYgUiIiJw5MgRnDhxAl26dMGhQ4dw6NAhhIaGYv369dixY4e4Tnp6OoKCgnD16lXs2bMH0dHRCAgI+KhYs0tKSsK4ceNw8eJF/Pbbb9DS0kKXLl0gk8kU+s2YMQNTp07FpUuXoKOjgz59+mDixIlYvnw5Tp06hfv372P69Oli/8TERPj7++P06dP4888/4ezsjPbt2yMxMTHfsb158wZhYWFo0qQJdHV1871efHw8TE1NoaNT4KdI5CozMxObNm0CANStW7fA68+cORN+fn64du0a2rdvj759+/4fe/cd31S9/gH8k6Tp3ruVljIKpVCwyFYRZIMoilcEfwrKRcUNF+cFBQeiV1HR68A9QK5e50VFAcGBbASBslcZHbSlMx1pcn5/pOfQTdImOeebft6vl6/QND15Oj+ePOf7fFFQYBuVN2/ePGRkZOCHH37Avn378MYbbyAyMhIAsGXLFgDAmjVrkJWVhS+//FI55tq1a3HgwAGsXr0aK1eutKuO0tJSXHHFFTh9+jS+/fZb7Nq1Cw899BCsVismTZqEf/zjH+jevbvyuzNp0iSHP1dnEjKXOD6diMilhMsGrhQnInIp4XKhNjbFiYhcQqhs4EpxIqJWc7gb9MUXX+Dmm2/GTTfdhB07dqCyshKArcG0cOFCfP/9904vUkhWK2CWm+IaDyqzCVjY+F7MjpNsK8gXJTT6Xj2A0Np3PHbGoYsGevXqhblz5wIAHn30USxatAiRkZGYMWMGAODxxx/HG2+8gb/++gtr1qxBeno6Fi5cqHz8e++9h4SEBBw8eBBdunSBt7c3/P3966wCNhgMWLBggfJ2hw4d8Mcff+Drr7/G1KlT7a61vtr7RiclJeHpp5/GnXfeiddff12532w244033kCnTp0AANdffz0+/vhj5OTkIDAwEKmpqRg6dCjWrVunNCFvu+025eM7duyIJUuWoG/fvigtLUVgoHN+9iZOnFjn7ffeew9RUVHIyMioM05ozpw5GDVqFADg/vvvx+TJk7F27VpceumlAIDp06fXWVF85ZVX1jnu0qVLERoail9++QVXXXVVszU9/PDDeO2112AymTBgwAC7m70AkJeXh6eeegq333673R/TnKKiIuVrXV5eDqPRiJdffln5Pjpi2rRpmDx5MgBg4cKFWLJkCbZs2YLRo0cjMzMT6enp6NOnDwDbz5EsKioKABAREdFgVXtAQADeeecdeHt7213H8uXLcfbsWWzdulW5QrZz587K+wMDA+Hl5dXsCnp3ETaXuFKciMhlhMsGqwWoqrkokCvFiYicTrhcqK+8Zk9xNsWJiJxGuGxQzhc8cKtWIiI3cXil+NNPP40333wTb7/9dp1VmZdeeqldY4/bDLkhDmh/pbhAevbsqfzbYDAgIiICaWlpyn0xMTEAgNzcXOzatQvr1q
1DYGCg8l9KSgoA1Bn93Zh///vfuOSSSxAVFYXAwEC8/fbbOHXqFADbFYS1j7ls2TK7al+zZg2GDRuGiy66CEFBQbj55puRn58Pk8mkPMbf379OIzUmJgZJSUl1mtsxMTF1xqNv374d48ePR2JiIoKCgnDFFVcAADIzMwEAaWlpaNeuHYKDgzFmzBi7aq3v0KFDmDx5Mjp27Ijg4GClGSs/h6z290f+XtT//tSuPScnBzNmzEBycjJCQkIQHByM0tJS5bgzZ85Uaq/f4H/wwQfx559/4qeffoLBYMAtt9wCyY7JA8XFxRg3bhxSU1Mxf/585f7u3bsr31NHv05BQUHYuXMndu7ciT///BPPPPMMZs+ejf/9738AbM3t2j8z9b9utdX+GgYEBCA4OFj5ms2cORMrVqzAxRdfjIceegh//PGHXfWlpaU51BAHgF27diE9PV27I6NqETaXKgptt1wpTkTkdMJlgzw6HQB82RQnInI24XKhPhOb4kREziZcNigrxdlrICJqKYdXih84cACDBw9ucH9ISAgKCwudUZNnkEen6/SA0U/dWi7E6G9bsW2PE38Ay66/8ONu+i/QflCDu61WK4pLShAcFAS9Xm97bkdKrTceW6fT1blPp9Mpz1NaWorx48fjueeea3CcuLi4Jp9jxYoVmDNnDl588UUMHDgQQUFBeP7557Fx40YAQJ8+fbBz507l8XLztznHjx/HVVddhZkzZ+KZZ55BeHg4fv/9d0yfPh1VVVXw9/e36/OT75PHlpeVlWHUqFEYNWoUli1bhqioKGRmZmLUqFGoqqoCAKxcuRLnzp1DYGAgAgJa9j9N48ePR/v27fH2228jPj4eVqsVPXr0UJ5D1tj3ov59tUeuT506Ffn5+XjllVfQvn17+Pj4YODAgcpxFyxYgDvuuAOBgYG2n5daIiMjERkZiS5duqBbt25ISEjApk2bMHDgwCY/j5KSEowePRpBQUH46quv6tT2/fffK3uS+/k59jur1+vrrKLu0aMHfvjhB/zrX//CNddcgzvvvFMZ2w/Y9itqSnPf7zFjxuDEiRP4/vvvsXr1agwbNgx33303XnjhhWbra+z7rtPpGlxEUHtPdke/BmoSNpfkleK+oaqWQUTkiYTLBnl0usEH8PJRtxYiIg8kXC7Up4xP1/5Fy0REohAuG6o4Pp2IqLUcborHxsbi8OHDdcb2AsDvv/+Ojh07Oqsu8VXVGp1e0xzULJ3O/ivMOl0JBMcDxVlofF9xne39na4E9IaG77ZaAaPF9nz6Fm1pb7fevXvjiy++QFJSUpP7Rnt7e8NisdS5b8OGDRg0aBDuuusu5b6jR48q//bz86vTALXH9u3bYbVa8eKLLyrN3c8++8yhYzRm//79yM/Px6JFi5CQYBtZv23btjqPad++PcLCwhAcHNygsWyP/Px8HDhwAG+//TYuv/xyAHDavjobNmzA66+/jrFjxwIATp48iby8POX90dHR8PX1vWDtctNYHnPUmOLiYowaNQo+Pj749ttv4evrW+f97du3b82n0oBer0d5eTkAIDw83GkrrqOiojB16lRMnToVl19+OR588EG88MILykrw+j/PzR0nKytLefvQoUN1phakpaXh3XffRUFBQaO1N/a7oxZhc6m80HbLleJERE4nXDZUcj9xIiJXEi4Xaqsy2ba9A7hSnIjIiYTLBrnf4MOmOBFRSzncIZsxYwbuv/9+bN68GTqdDmfOnMGyZcswZ84czJw50xU1iqnKQ8eZ6A3AaHnldf1mf83boxc13hB3s7vvvhsFBQWYPHkytm7diiNHjuDHH3/ErbfeqjTzkpKSsHnzZhw/fhx5eXmwWq1ITk7Gtm3b8OOPP+LgwYOYN28etm7d2qpaOnfuDLPZjFdffRVHjx7Fxx9/jDfffLPVn2NiYiK8vb2V43777bd46qmn7P743bt3K2O/d+7ciV27djV4TFhYGCIiIrB06VIcPnwYP//8M2bPnt3q2gEgOTkZH3/8Mfbt24fNmzfjpptuuuAK5c2bN+O1117Dzp07ceLECfz888+YPHkyOnXqpKwSP336NFJSUrBlyxYAtob4yJEjUVZWhnfffRfFxcXIzs5Gdnb2BRu7GRkZ2LlzJwoKClBUVKR8rWqTJEk53rFjx7B06VL8/PPPuPrqq1v+xWnE448/jm+++QaHDx/G3r17sXLlSnTr1g2A7QICPz8/rFq1Cjk5OSgqKmr2WFdeeSVee+01/Pnnn9i2bRvuvPPOOqvUJ0+ejNjYWEyYMAEbNmzA0aNH8cUXXygTE5KSknDs2DHs3LkTeXl5zV6Q4GrC5pKypzib4kREziZcNsgrxTk6nYjIJYTLhdrk/cT1Ru4jS0TkRMJlQ2XNnuLezAIiopZyuCn+yCOPYMqUKRg2bBhKS0sxePBg/P3vf8cdd9yBe++916FjPfvss+jbty+CgoIQHR2NCRMm4MCBA3UeU1FRgbvvvhsREREIDAzExIkTkZOTU+cxmZmZGDduHPz9/REdHY0HH3wQ1dXVjn5qzuXJe3ykXg3c8BEQXG8EeXC87f5U5zYCWyo+Ph4bNmyAxWLByJEjkZaWhgceeAChoaHKquM5c+bAYDAgNTVVGT1+xx134LrrrsOkSZPQv39/5Ofnt/p/hHr16oXFixfjueeeQ48ePbBs2TI8++yzrf4co6Ki8MEHH+Dzzz9HamoqFi1adMFR2rUNHjwY6enpyn+XXHJJg8fo9XqsWLEC27dvR48ePTBr1iz861//anXtAPDuu+/i3Llz6N27N26++Wbcd999iI6ObvZj/P398eWXX2LYsGHo2rUrpk+fjp49e+KXX36Bj49t3KjZbMaBAweUlc87duzA5s2bsXv3bnTu3BlxcXHKfydPnmz2+caOHYv09HT873//w/r165WvVW3FxcXK8bp164aXXnoJjz76KB577LFWfHUa8vb2xqOPPoqePXti8ODBMBgMWLFiBQDAy8sLS5YswVtvvYX4+Hhcc801zR7rxRdfREJCAi6//HJMmTIFc+bMUcb4y8/1008/ITo6GmPHjkVaWhoWLVoEg8F2wcvEiRMxevRoDB06FFFRUfj000+d+rk6wpm55FZcKU5E5DLCZYO8UtyHTXEiIlcQLhdqU0anR2h/EiERkUCEywZ5ER5XihMRtZjD49N1Oh3++c9/4sEHH8Thw4dRWlqK1NRUBAY6/sf4l19+wd13342+ffuiuroajz32GEaOHImMjAxlD9xZs2bhu+++w+eff46QkBDcc889uO6667BhwwYAtlHB48aNQ2xsLP744w9kZWXhlltugdFoxMKFCx2uyWmU8eke2BQHbI3vlHG2PcZLc4DAGNse4i5cIb5+/foG9x0/frzBfbX3SU5OTsaXX37Z5DG7dOmirHyt7f3338f777+vvG21WvHII484VF/92mbNmoVZs2bVue/mm29W/j1t2jRMmzatzvvnz5+P+fPn17nvgw8+qPP25MmTMXny5Dr31d8rur4hQ4Y0+5j6zzF8+HBkZGQ0+RxJSUkNjtfYc
9T/HNPT0xuswr/++ub3rE9LS8PPP//c7GPq13Ohz7c5jf2M1dbY981qtaK4uFhpINt77MZqrL2H0dy5czF37twmj/f3v/8df//73+vcV/97KYuPj8ePP/7Y4Lnk2gHbSPn//ve/jX68j49Pk+9zN2fmklspK8VDVS2DiMgTCZcNXClORORSwuVCbbWb4kRE5DTCZUMl9xQnImoth5viMm9vb6SmpqK4uBhr1qxB165dlTG+9lq1alWdtz/44ANER0dj+/btGDx4MIqKivDuu+9i+fLluPLKKwHYmpXdunXDpk2bMGDAAPz000/IyMjAmjVrEBMTg4svvhhPPfUUHn74YcyfP1/ZZ9ftlPHpHjzORG8AOlyudhVERACck0tuxfHpREQuJ0w2VNRse8KV4kRELiVMLtRmqhmf7h+ubh1ERB5KiGywmAFLzdaFXClORNRiDjfFb7jhBgwePBj33HMPysvL0bdvXxw7dgySJGHFihWYOHFii4uR98AND7f9j/727dthNpsxfPhw5TEpKSlITEzExo0bMWDAAGzcuBFpaWmIiYlRHjNq1CjMnDkTe/fubTDmGAAqKyvr7H8rr4o0m80wm80trr82XXkxvABYjX6wOOmYMrnGltRqNpshSRKsViusVqtT67KHvBJWrkEUotYNsHY1iFo34PzarVYrJEmC2WxusHLeWX9vnZFLzeWCM2tVVFfAWF1uO7ZXEKChnFCTqHUD4tbOut2Lddv/XK3l6mxw9tdCbzoHAwCrd5Cmzh3UxLrdT9TaWbd7ubtuLeUC0HQ2OLPW+vQlubaM8AtnRtTC2tUhau2i1g2w9uaO21quzgannjeUn4Ox5p9mnQ9fR6pH5PpFrh1g/Wpj/Q2PdSE6ycGZwrGxsfjxxx/Rq1cvLF++HE888QR27dqFDz/8EEuXLsWff/7ZooKtViuuvvpqFBYW4vfffwcALF++HLfeemudUAGAfv36YejQoXjuuedw++2348SJE3XGAJtMJgQEBOD777/HmDFjGjzX/PnzsWDBggb3L1++vM6euq3RMfcnpJ3+BKdC+2N7h7udckxn8PLyQmxsLBISEtRbRU9EbUZVVRVOnjyJ7OxsVFdX13mfyWTClClTUFRUhODglq+Mc0YuuSMXavMxF2L0nvsgQYdvL34f0Omd/hxERCJqq9mQenoFknO/x+Go0djbbopTj01EJDIt5QLg/vMGAOia9SVSsr/Gscgr8VfCNJc8BxGRSNpiNvhV5WHk3tmw6IxYefG7TjkmEZEnsTcbHF4pXlRUpKzkXrVqFSZOnAh/f3+MGzcODz74YIsLvvvuu7Fnzx6lIe5Kjz76KGbPnq28XVxcjISEBIwcObJVQVqb/vf9wGkgvn1nxIwd65RjysxmM1avXo0RI0bAaDRe+ANqqaiowMmTJxEYGAhfX1+n1mUPSZJQUlKCoKAg6HQ6tz9/S4laN8Da1SBq3YDza6+oqICfnx8GDx7c4G9O7VUVreGMXGoqF4YOHYrNmze36O9ts87uB/YA8AvD2HFXOe+4NVqTE2oStW5A3NpZt3ux7gsTIRucec4g03+/BsgFOnTrhfaXa+fcQU2s2/1ErZ11u5e769ZSLgBNZwMAl31N9KvWA9lAYtd0tBvCjJCxdnWIWruodQOsvTGiZINTzxvO7gf2Anq/YIx1cq8BEPvnDBC7fpFrB1i/2lj/efZmg8NN8YSEBGzcuBHh4eFYtWoVVqxYAQA4d+5ci5us99xzD1auXIlff/0V7dq1U+6PjY1FVVUVCgsLERoaqtyfk5OD2NhY5TFbtmypc7ycnBzlfY3x8fGBj49Pg/uNRqPzfnAstrG4et9g6F30w9iSei0WC3Q6HfR6PfR6969MlMcxyzWIQtS6AdauBlHrBpxfu16vh06na/TvlbP+3jojl5rLBfnWqf9jYS4BAOj8wlz6PyxOr9tNRK0bELd21u1erLv553AGV2eD078OVaUAAIN/GAwaOnfQAtbtfqLWzrrdy111aykXgKazQa7VJV+TinMAAENQFDOiEaxdHaLWLmrdAGuvfzxncHU2OLfXUAEA0HkH8nWkZohcv8i1A6xfbazf/mxwuOPxwAMP4KabbkK7du0QHx+PIUOGAAB+/fVXpKWlOXQsSZJwzz334KuvvsLPP/+MDh061Hn/JZdcAqPRiLVr1yr3HThwAJmZmRg4cCAAYODAgdi9ezdyc3OVx6xevRrBwcFITU119NNznqoy261PoHo1EBG1Ac7MJbcpt72wBb8wdesgIvJQwmVDZc0VzT7OXYFOREQ2wuVCbaZ8261/hLp1EBF5GKGyocq2uAI+QerWQUQkOIdXit91113o168fTp48iREjRigrCTt27Iinn37aoWPdfffdWL58Ob755hsEBQUhOzsbABASEgI/Pz+EhIRg+vTpmD17NsLDwxEcHIx7770XAwcOxIABAwAAI0eORGpqKm6++WY8//zzyM7Oxty5c3H33Xc3efWuW8hNce8A9WogImoDnJlLblNeaLv1C1WzCiIijyVcNlQU2W592RQnInIF4XKhNlOB7dY/XN06iIg8jFDZUGmbLAXvtrMAz2K1YEfuDpw1nUWUfxR6R/eGQW9QuywiEpzdTfHLL78c11xzDa655hr06dMHffr0qfP+cePGOfzkb7zxBgAoV2HJ3n//fUybNg0A8NJLL0Gv12PixImorKzEqFGj8PrrryuPNRgMWLlyJWbOnImBAwciICAAU6dOxZNPPulwPU5VWXP1VhsKKiIid3JFLrlNG1opzpMYInInYbOhou2sFGcuEJE7CZsLtSlNca4UJyJyBiGzoWa7pbYylXbNiTVYtGURckw5yn0x/jF4pN8jGN5+uIqVEZHo7G6Kz5gxA9988w0WLFiAdu3a4eqrr8bVV1+NQYMGQafTtejJJUm64GN8fX3x73//G//+97+bfEz79u3x/ffft6gGl+FKcSIil3JFLrlNG2mK8ySGiNxN2GyQx6f7hqhbh4sxF4jI3YTNBZkktanx6bxwiojcQchsUFaKe36vYc2JNZi9fjYk1O0d5ZpyMXv9bCwespjnDkTUYnbvKX7LLbfgiy++QF5eHl588UUUFhbib3/7G2JjY3Hbbbfh66+/Rnl5uStrFQub4pAkCWXmMhRVFqHMXGbXRRAiGzJkCB544AHl7aSkJLz88suq1dMa06ZNw4QJE9Qug6hZQudSG2iKyycxtRsfwPmTmDUn1qhUGRF5MmGzQV4p7sHj05kLRKQGYXNBVlUGWCpt//bwpviaE2sw6otRuO3H2/Dwbw/jth9vw6gvRjEfiMjphMwGeU9xb8/eU9xitWDRlkUNGuIAlPue2/IcLFaLu0sjIg9hd1Nc5uPjg7Fjx+Ktt97CmTNn8O233yIuLg7z5s1DREQErrrqKmzYsMEVtYpFaYp77kgTi9WCrdlb8f3R77E1e2udMCquLMbBcwdxvOg4TpWcwvGi4zh47iBKzCUqVtxQ/UZ2W7F+/XrodDoUFhaqXUqLXH311UhMTISvry/i4uJw880348yZ
M81+zNKlSzFkyBAEBwc79Llv3boVw4YNQ2hoKMLCwjBq1Cjs2rVLeb/8tZT/8/PzQ1paGj744IMLHtvZP386nQ5ff/21047XnA8++AChoaFuea4LETKXKgptt76halbhMjyJISK1CZUNlmrAXHPu4OOZK8WZC0SkNqFyoTZ5lbiXL2D0V7cWF+KFU0SkBqGyQe41ePj49B25OxpkQW0SJGSbsrEjd4cbqyIiT+JwU7y+/v3745lnnsHu3buxe/duDBs2DFlZWc6oTWzyPh8e2hRv7gre4spinCw5iWprdZ2PqbZW41TJKZRbNXalHQln6NCh+Oyzz3DgwAF88cUXOHLkCK6//vpmP8ZkMmH06NF47LHH7H6e0tJSjB49GomJidi8eTN+//13BAUFYdSoUTCbzXUee+DAAWRlZSEjIwO33347/vGPf2Dt2rUt+vxcqaqqSu0SXE6IXPLwleI8iSEirdF0Nsij0wGPXSnOXCAirdF0LtRWe3S6Vkf6thIvnCIirdB0NlR6dq9BdtZ01qmPIyKqz+Gm+MmTJ3Hq1Cnl7S1btuCBBx7A0qVL0alTJ8yaNeuCzak2ocpz9/m40BW8Xx3+qtmPL7IWteh5hwwZgnvvvRcPPPAAwsLCEBMTg7fffhtlZWW49dZbERQUhM6dO+OHH35QPmbPnj0YM2YMAgMDERMTg5tvvhl5eXkAbCPCf/nlF7zyyivKKt/jx4/DYrFg+vTp6NChA/z8/NC1a1csWbKkRTXXtnjxYqSlpSEgIAAJCQm46667UFpaqrxfXnm7cuVKdO3aFf7+/rj++uthMpnw4YcfIikpCWFhYbjvvvtgsZw/Gfz444/Rp08fBAUFITY2FlOmTEFubm6r661t1apVuOyyyxAaGqpcKXnkyBHl/cePH4dOp8Nnn32Gyy+/HH5+fujbty8OHjyIrVu3ok+fPggMDMSYMWNw9uz5/2nZunUrRowYgcjISISEhOCKK67Ajh0XfjF01qxZGDBgANq3b49BgwbhkUcewaZNmxo0qmt74IEH8Mgjj2DAgAF2f9779+9HQUEBnnzySXTt2hXdu3fHE088gZycHJw4caLOY6OjoxEbG4sOHTrg3nvvRfv27fHnn3/a/VyAbeT+woULcdtttyEoKAiJiYlYunSp8v6qqircc889iIuLg6+vL9q3b49nn31W+VgAuPbaa6HT6ZS358+fj4svvhjvvPMOOnToAF9fX+Xx9cf7X3zxxViwYIHydmFhIe644w7ExMTA19cXPXr0wMqVK7F+/XrceuutKCoqUn535s+f79Dn6kxC5pKHN8V5EkNEahMqGypq/t/Yyw8wGNWtxUWYC0SkNqFyoTZTge3WP1zdOlyIF04RkVqEyga51+DhK8Wj/KOc+jgiovocbopPmTIF69atAwBkZ2dj+PDh2LJlC/75z3/iySefdHqBwhJoT3FJkmAym+z6r6SyBM9uebbJK3glSHhvz3swmU2oqK5o9L+y6jLkV+TDZDY5vM/4hx9+iMjISGzZsgX33nsvZs6cib/97W8YNGgQduzYgZEjR+Lmm2+GyWRCYWEhrrzySqSnp2Pbtm1YtWoVcnJycMMNNwAAXnnlFQwcOBAzZsxAVlYWsrKykJCQAKvVinbt2uHzzz9HRkYGHn/8cfzzn//EV1813+y/EL1ejyVLlmDv3r348MMP8fPPP+Ohhx6q8xiTyYQlS5ZgxYoVWLVqFdavX49rr70W33//Pb7//nt8/PHHeOutt/Df//5X+Riz2YynnnoKu3btwtdff43jx49j2rRpraq1vrKyMsyePRvbtm3D2rVrodfrce2118JqtdZ53BNPPIG5c+dix44d8PLywpQpU/DQQw/hlVdewW+//YbDhw/j8ccfVx5fUlKCqVOn4vfff8emTZuQnJyMsWPHoqTE/jH7BQUFWLZsGQYNGgSj0bkvJHft2hURERF49913UVVVhfLycrz77rvo1q2b0nSuT5IkrFq1CqdOnUK/fv0cfs4XX3wRffr0wZ9//om77roLM2fOxIEDBwAAS5Yswbfffquskl+2bJlSx9atWwEA77//PrKyspS3AeDw4cP44osv8OWXX2Lnzp121WG1WjFmzBhs2LABn3zyCTIyMrBo0SIYDAYMGjQIL7/8MoKDg5XfnTlz5jj8uTqLkLnk4U1xnsQQkdqEyoZKz99PnLlARGoTKhdqq71S3EPxwikiUotQ2VAp7ynu2U3x3tG9EeMfAx0an46igw6x/rHoHd3bzZURkafwcvQD9uzZozR6PvvsM6SlpWHDhg346aefcOedd9ZpeLVZVgtgNtn+LUBQlVeXo//y/k47XkFFAW776Ta7Hrt5ymb4O7AvVq9evTB37lwAwKOPPopFixYhMjISM2bMAAA8/vjjeOONN/DXX39hzZo1SE9Px8KFC5WPf++995CQkICDBw+iS5cu8Pb2hr+/P2JjY5XHGAyGOqtlO3TogD/++ANff/01pk6danet9dXeOzopKQlPP/007rzzTrz++uvK/WazGW+88QY6deoEALj++uvx8ccfIycnB4GBgUhNTcXQoUOxbt06TJo0CQBw223nv9YdO3bEkiVL0LdvX5SWliIw0Dk/fxMnTqzz9nvvvYeoqChkZGSgR48eyv1z5szBqFGjAAD3338/Jk+ejLVr1+LSSy8FAEyfPr3OXttXXnllneMuXboUoaGh+OWXX3DVVVc1W9PDDz+M1157DSaTCQMGDMDKlStb8yk2KigoCOvXr8eECRPw1FNPAQCSk5Px448/wsur7p/Pdu3aAQAqKythtVrx6KOPYvDgwQ4/59ixY3HXXXcBsH2OL730EtatW4euXbsiMzMTycnJuOyyy6DT6dC+fXvl46KibC8ih4aG1vl5BmwrzD/66CPlMfZYs2YNtmzZgn379qFLly4AbD9fspCQEOh0ugbPpQYhc6m80HbrF6pmFS4jn8TkmnIbvYhKBx1i/GN4EkNELiNUNlTITXHP3E8cYC4QkfqEyoXa2kBTnBdOEZFahMoGZaV4kLp1uJhBb8Aj/R7B7PWzG7xPbpQ/3O9hGPQGd5dGRB7C4ZXiZrMZPj4+AGxNk6uvvhoAkJKSop09NtQmN8QBjx9p4m49e/ZU/m0wGBAREYG0tDTlvpiYGABAbm4udu3ahXXr1iEwMFD5LyUlBQDqjP5uzL///W9ccskliIqKQmBgIN5++21lnM5vv/1W55jLli2zq/Y1a9Zg2LBhuOiiixAUFISbb74Z+fn5MJnO/7z4+/srDXH580lKSqrT3I6JiakzHn379u0YP348EhMTERQUhCuuuAIAkJmZCQBIS0tDu3btEBwcjDFjxthVa32HDh3C5MmT0bFjRwQHByurk+XnkNX+/sjfi/rfn9q15+TkYMaMGUhOTkZISAiCg4NRWlqqHHfmzJlK7fUb/A8++CD+/PNP/PTTTzAYDLjlllscnjxQmzxmPzAwEN27dwcAlJeXY/r06bj00kuxadMmbNiwAT169MC4ceNQXl5e5+N/++037Ny5Ezt37sTSpUv
x0ksv4Y033gAALFu2rM7PzG+//dZkHbW/hnLTWf6aTZs2DTt37kTXrl1x33334aeffrLrc2vfvr1DDXEA2LVrF9q1a6c0xLVMuFyyWs6PyvXQleLySUxjeBJDRO4gVDbIK8V9PHelOHOBiNQmVC7U1gaa4lwVSERqESobKj13q9b6hrcfjsVDFsOorzsRNMY/BouHLMbw9sNVqoyIPIHDK8W7d++ON998E+PGjcPq1auV1ZNnzpxBRITn/k+6Q+TR6To94OWrbi128PPyw+Ypm+167Pac7bhr7V0XfNzDfR9Gt/Bujb7PAAM6h3WGXq+Hn5efQ7XWH4+t0+nq3KfT2U6irFYrSktLMX78eDz33HMNjhMXF9fkc6xYsQJz5szBiy++iIEDByIoKAjPP/88Nm7cCADo06dPnRHUcvO3OcePH8dVV12FmTNn4plnnkF4eDh+//13TJ8+HVVVVfD397fr85Pvk8eWl5WVYdSoURg1ahSWLVuGqKgoZGZmYtSoUaiqqgIArFy5EufOnUNgYCACAlr2P07jx49H+/bt8fbbbyM+Ph5WqxU9evRQnkPW2Pei/n21R65PnToV+fn5eOWVV9C+fXv4+Phg4MCBynEXLFiAO+64A4GBgdDr617DExkZicjISHTp0gXdunVDQkICNm3ahIEDB7boc3znnXeURrdc8/Lly3H8+HFs3LhRef7ly5cjLCwM33zzDW688Ubl4zt06IDQ0FAAQLdu3fD777/j2Wefxd13342rr74a/fufn8Zw0UUXNVlHc9/v3r1749ixY/jhhx+wZs0a3HDDDRg+fHidcfqNaez7rtfrG1xEUHtPdj8/x3431SRcLlUUAfIqOd9QNStxKfkkZs4vc2CRLMr9Mf4xeLjfwzyJISKXEiobKjx/fDpgy4WH+z6MRVsX1bmfuUBE7iBULtQmN8X9PHdPca4KJCK1CJUN8kpxAabSOsMVCVdAX7Oe8x99/oHuEd3RO7o3s4CIWs3hpvhzzz2Ha6+9Fv/6178wdepU9OrVCwDw7bfftmj/XI9UWSukdI1f6aolOp3O7hHmg+IHXXD0YZR/FHpF9YJe1/gggnB9OPyN/g2anM7Wu3dvfPHFF0hKSmow6lrm7e0Ni8VS574NGzZg0KBByghrADh69Kjybz8/P3Tu3NmhWrZv3w6r1YoXX3xR+bw/++wzh47RmP379yM/Px+LFi1CQkICAGDbtm11HtO+fXuEhYUhODi4RV/z/Px8HDhwAG+//TYuv/xyAMDvv//e6toB29f69ddfx9ixYwEAJ0+eRF5envL+6Oho+Pr6XrB2uWlcWVnZ4loaa1SbTCbo9XqlwQ9Aebv+fur1GQwGpckeFBSEoCDnjDcKDg7GpEmTMGnSJFx//fUYPXo0CgoKEB4eDqPR2ODnuSlRUVF1rnotLi7GsWPHlLfT0tJw6tQpZauB+hr73VGLcLkk7yfuHQh4eatbi4tdetGlsEq235XH+j2GzmGdeRJDRG4hVDa0gZXisqCacY/Jocn4e9rfEeUfxVwgIrcQKhdqawMrxYHzF9T+8/d/wlR9fpoeL5wiIlcSKhvayPh02ZHCI6i0ViLQGIhbUm9pss9AROQoh5viQ4YMQV5eHoqLixEWdn7s6+23366sdm3zqjx3nEntK3h10NVpjMtX8D7a71HEB8Yjuyy7zsca9UbEBMRAKm/5iGtH3H333Xj77bcxefJkPPTQQwgPD8fhw4exYsUKvPPOOzAYDEhKSsLmzZtx/PhxBAYGIjw8HMnJyfjoo4/w448/okOHDvj444+xdetWJCYmtriWzp07w2w249VXX8X48eOxYcMGvPnmm63+HBMTE+Ht7Y1XX30Vd955J/bs2aNc1WiP3bt312nW6nQ65X8AZWFhYYiIiMDSpUsRFxeHzMxMPPJI4+MvHZWcnIyPP/4Yffr0QXFxMR588MELrlDevHkztm7dissuuwxhYWE4cuQI5s2bh06dOimrxE+fPo1hw4bho48+Uv4nNjs7G9nZ2Th8+HCdzz0xMRHh4Y1fdT9ixAg8+OCDuPvuu3HvvffCarVi0aJF8PLywtChQ+s8Njc3FxUVFaisrMSmTZvw2WefNdiLvbUWL16MuLg4pKenQ6/X4/PPP0dsbKyyQj0pKUnZw93Hx6fO3+j6rrzySnzwwQcYP348QkND8fjjj8NgOP+C9BVXXIHBgwdj4sSJWLx4MTp37oz9+/dDp9Nh9OjRSEpKQmlpKdauXYtevXrB399ftQwQLpcqCm23HrxKXHbw3EFIkBDhG4EbU26sc4EJEZErCZUN8pYaHr5SHAD25O0BAAyIH4CxHceqXA0RtSVC5UJtpgLbrb/nrhSXDW8/HB/t/Qh/nv0Tf+vyN4zpMIYXThGRSwmVDZVta6W4fN7QPbI7G+JE5FQt+otiMBgaNFuSkpIQHR3tlKKEJ49P99CQkq/gjfav+/1ubF8PXy9ftAtqh6SQJCSHJSPI6L6r2eLj47FhwwZYLBaMHDkSaWlpeOCBBxAaGqqsOp4zZw4MBgNSU1OV0eN33HEHrrvuOkyaNAn9+/dHfn4+Zs6c2apaevXqhcWLF+O5555Djx49sGzZMjz77LOt/hyjoqLwwQcf4PPPP0dqaioWLVqEF154we6PHzx4MNLT05X/LrnkkgaP0ev1WLFiBbZv344ePXpg1qxZ+Ne//tXq2gHg3Xffxblz59C7d2/cfPPNuO+++y74d8Tf3x9ffvklhg0bhq5du2L69Ono2bMnfvnlF2UfILPZjAMHDtTZr/3NN99Eeno6ZsyYUedz//bbb5t8rpSUFPzvf//DX3/9hYEDB+Lyyy/HmTNnsGrVqgYj+Lt27Yq4uDh07twZjz76KKZOnYolS5a09EvTKHmUf58+fdC3b18cP34c33//vfLz/OKLL2L16tVISEhAenp6s8d69NFHccUVV+Cqq67CuHHjMGHChDr72QPAF198gb59+2Ly5MlITU3FQw89pKwOHzRoEO68805MmjQJUVFReP755536uTpKqFySV4p76H7itWXkZwAAukV0Y0OciNxOmGxQmuIh6tbhBvKLWz0ieqhcCRG1RcLkQm1tZKU4AFisFuw/tx8AcFO3m9A3ti8b4kTkcsJkg7JS3DP7DfXxvIGIXMXhleI5OTmYM2cO1q5di9zc3AZ70mplnK6qlKa4560Ulw1vPxxDE4ZiR+4OnDWdbTD6sMxs+xoEewcjxOf8C3z1f14csX79+gb3HT9+vMF9tZ8jOTkZX375ZZPH7NKli7JXeG3vv/8+3n//feVtq9V6wdXR9eurX9usWbMwa9asOvfdfPPNyr+nTZuGadOm1Xn//PnzMX/+/Dr3ffDBB3Xenjx5MiZPnlznvgt9nYcMGdLsY+o/x/Dhw5GRkdHkcyQlJTU4XmPPUf9zTE9Px9atW+s85vrrr2+29rS0NPz888/NPqaxehr7WtpjxIgRGDFiRJPvb+zztFqtKC4uvuAe7hf6mQFQZ//6GTNmKE39xowfPx7jx4+vc19Tn3dwcDBWrFhR576pU6
cqtQNAeHg43nvvvSaf74033sAbb7zR5PvdRbhcKi+03fqFqlmFW8hN8dSIVJUrIaK2RqhsUMane3ZT3GwxY3+BrdnRI5IvbhGRewmVC7WVyyvFPb8pfrToKMqry+Hv5Y+k4CS1yyGiNkCYbKiuAixVtn976CK8+uSmeFpkmsqVEJGncbgpPm3aNGRmZmLevHmIi4vjyq/GVLWNcSYGvQF9Y/s2uF+SJKUpHujhXwMiUp9wudSGVorvy98HgE1xInI/obKhoqYp7uHj0w8WHoTZakawdzASghLULoeI2hihckEmSW1qpbjcAEmNSOUKcSJyC2GyQe41AB7fbwAAk9mEw4W27S95MS0ROZvDTfHff/8dv/32Gy6++GIXlOMhPHhPcXuUV5fDKllh0Bvga/BVuxwi8nDC5VIbWSleaanEkcIjAIDuEd1VroaI2hqhskFZKe7ZTfG9eXsB2F7Y0uwLjkTksYTKBVllMWCttv27DewpzlWBRORuwmRDZYnt1ssXMDjczhHO/oL9sEgWRPlFISYgRu1yiMjDOLyneEJCQqtGYLcJbWB8enNKzbaLAgKMAXzBi4hcTrhcaiMrxQ8WHES1VI1w33DE+PMkhojcS6hsaCMrxeVmBy+UIiI1CJULMnmVuDEAMPqpW4sb7M7bDQDoHsmcICL3ECYb2shUWpmynzhXiRORCzjcFH/55ZfxyCOPNLr3LdWQg8qnbQRVffLo9ABj27wogIjcS7hcaiNNcXk/8W7h3XiBFBG5nVDZUFFku/XwleJys4MvbhGRGoTKBZmp7ewnXmmpxKFzhwBwpTgRuY8w2SAvwGsjvQY2xYnIlRyetzFp0iSYTCZ06tQJ/v7+MBqNdd5fUFDgtOKEpawUbxtBVZvFakF5dTkAINDY9j5/InI/4XKpotB26+FN8X0F3E+ciNQjVDbI49N9Q9Stw4VMZhOOFh0FwGYHEalDqFyQKfuJe/7o9AMFB5QpU3EBcWqXQ0RthDDZII9P9w5Stw432ZPPpjgRuY7DTfGXX37ZBWV4mDY8Pt1UbYIkSTDqjTDqjRf+ACKiVhIul+SV4r6hqpbhavJKcTbFiUgNQmVDGxifvq9gH6ySFdH+0Yjyj1K7HCJqg4TKBZnSFPf8leK1p4lwyhQRuYsw2dCGptIWVhTiZMlJANx2iYhcw+Gm+NSpU11Rh2eplPf5aHtNcWV0ujf3Eyci9xAul9rA+PQqSxUOFdrGH7IpTkRqECYbLGagZsqSJ49PV0YgRnC1BxGpQ5hcqK0NNcX35u0FwJwgIvcSJhsq286e4nvzbXnQPrg9Qnw8d5IWEanH4T3FAeDIkSOYO3cuJk+ejNzcXADADz/8gL179zq1OGFVtZ2gqq/UbPvcOTqdiNxJqFxqA03xQ+cOodpajRCfEI4/JCLVCJEN8ipxoG00xTkCkYhUJEQu1NaGmuLySvHukVwVSETuJUQ2tKGV4rUnhxARuYLDTfFffvkFaWlp2Lx5M7788kuUltr+KO/atQtPPPGE0wsUUhvdU9xsNaOyuhIAEGBse6vkiUgdQuWSJAHlhbZ/e3BTPKOgZnR6eCqnhhCRKoTJhsoi260xADA4PMRLGHJTnM0OIlKLMLlQWxvZU7ykqgTHi48DYBOEiNxLmGxQ9hT3/NfbOWGKiFzN4ab4I488gqeffhqrV6+Gt7e3cv+VV16JTZs2ObU4YbXRPcVNZhMAwNfLF156z31Rj4i0RahcMpcDFtvFQ/ALVbUUV+J+4kSkNmGyoaKmKe7B+4kXVhTiVOkpANwXkIjUI0wu1GYqsN16eFNcHpV7UeBFCPf17M+ViLRFmGxQptIGqVuHi0mSxAlTRORyDjfFd+/ejWuvvbbB/dHR0cjLy3NKUcJrI01xyWJB2eYtKFr5Hco2b0FJzYt6XCXufGvXrkW3bt1gsVgAAPPnz8fFF1+sak0ffPABQkNDlbddUdOqVatw+eWXw2q1OvW45FmEyiV5dLrey6OnibApTkRqEyYb5PHpvp67Xx73BSQiLRAmF2prI+PT2QAhIrUIkw1yr8HDx6dnl2UjvyIfXjovpISnqF0OEXkoh5vioaGhyMrKanD/n3/+iYsuusgpRQmvSh5p4rlBVfzTTzg8bDgyp07FmTlzkDl1KkrGTwF+2eSypvi0adMwYcIElxy7OcePH4fBYEBYWBgMBgOCgoLQvXt33H333Th06JBbanjooYcwd+5cGAwGtzxfS8yZMwdr16516jFHjx4No9GIZcuWOfW45FmEyqXa+4l76Fhxs8WMQ+dsfxu7RXRTuRoiaquEyYbKmqZ4G9hPnKvEiUhNwuRCbW2kKb43z3bxFEflEpG7CZMNlfJKcc/tNQDAnnzbeUNyWDJ8vXxVroaIPJXDTfEbb7wRDz/8MLKzs6HT6WC1WrFhwwbMmTMHt9xyiytqFI+HrxQv/uknnL7/AVRnZ9e5X8rNh+6fL8CyboNKlbWOxWJpdkXy119/jdOnT2PXrl1YuHAh9u3bh169ejm9EVzf77//jiNHjmDixIkufZ7WCgwMRESE80/WJ0+ejNdee83pxyXPIVQuVRTabj14P/HDhYdhtpoR7B2MdoHt1C6HiNooYbJBWSnu+U1xrgAkIjUJkwu1tZGm+O683QCYE0TkfsJkg7wAz8NXist50D2SF9MSkes43BRfuHAhUlJSkJCQgNLSUqSmpmLw4MEYNGgQ5s6d64oaxSPYSBNJkmA1mez6z1JSgpynnwEkqcFxdDX/nV24CJaSkqaPU16u/Ftq5Dj2GjJkCO677z489NBDCA8PR2xsLObPn1/nMYWFhbjjjjsQExMDX19f9OjRAytXrgRwfvz3t99+i9TUVPj4+CAzM7PJ55Ofo2PHjrjmmmuwZs0a9O/fH9OnT1fGmgPAN998g969e8PX1xcdO3bEggULUF1dbVdNjVmxYgVGjBgBX9+GV8i99dZbSEhIgL+/P2644QYUFRUp79u6dStGjBiB6OhoJCYmYujQodixY4fyfkmSMH/+fCQmJsLHxwfx8fG47777lPdXVlZizpw5uOiiixAQEID+/ftj/fr1TdZZf3y6vLL/hRdeQFxcHCIiInD33XfDbDY79ByjR4/Gtm3bcOTIkSafm9o2oXJJXinuG6pqGa4kj07vFtENOg9dDU9E2idMNnj4SnFJkpQVH2mRaSpXQ0RtmTC5ILNaz587eHBT/KzpLHJMOdDr9Nx6iYjcTphsqGwbe4rLk0N43kBEruTl6Ad4e3vj7bffxuOPP47du3ejtLQU6enpSE5ORnl5Ofz8/FxRpzisFsBssv1bkJEmUnk5DvS+xGnHq87JwcG+/Zp9TE7Nbdcd26Hz92/xc3344YeYPXs2Nm/ejI0bN2LatGm49NJLMWLECFitVowZMwYlJSX45JNP0KlTJ2RkZNQZQW4ymfDcc8/hnXfeQUREBKKjo+1+br1ej/vvvx/XXnsttm/fjn79+uG3337DLbfcgiVLluDyyy/HkSNHcPvttwMAnnjiCbtqqu+3337DlClTGtx/+PBhfPbZZ/jf//6H4
uJiTJ8+HXfddZcyarykpARTp07FK6+8gpKSEixduhRjx47FoUOHEBQUhC+++AIvvfQSVqxYge7duyM7Oxu7du1Sjn/PPfcgIyMDK1asQHx8PL766iuMHj0au3fvRnJysl1fo3Xr1iEuLg7r1q3D4cOHMWnSJFx88cWYMWOG3c+RkJCAmJgY/Pbbb+jUqZN93xxqU4TKpdrj0z0U9xMnIi0QJhs8fKV4jikHeeV5MOgM6BreVe1yiKgNEyYXZBWFgFQzyc4vXNVSXEmeJtIxpCP8jS1/bYiIqCWEyYaqmqa4IAvwWsIqWbE3v2Y7DU4OISIXcrgpft9992HJkiVISEhAQkKCcn9ZWRmuuuoqrFu3zqkFCkduiAMeOz5dS3r27IknnngCAJCcnIzXXnsNa9euxYgRI7BmzRps2bIF+/btQ5cuXQAAHTt2rPPxZrMZr7/+Onr16tWi509JSQFg23e8X79+WLBgAR555BFMnTpVeb6nnnoKDz30EJ544gm7aqrvxIkTiI+Pb3B/RUUFPvroI2WPm1dffRXjxo3Diy++iNjYWFx55ZUAAKvViuLiYrz11lsIDw/HL7/8gquuugqZmZmIjY3F8OHDYTQakZiYiH79bBczZGZm4v3330dmZqby3HPmzMGqVavw/vvvY+HChXZ9fcLCwvDaa6/BYDAgJSUF48aNw9q1azFjxgyHniM+Ph4nTpyw6zmp7REql9pSUzycTXEiUo8w2VBRM+XHQ1eKy6s9Ood2hp+XRl5UJKI2SZhckJkKbLc+wYCXt7q1uJA8TYQNECJSgzDZoKwU99xew/Gi4ygzl8HPyw8dQ5p/rZyIqDUcbop/9913CAsLw4IFC5T7ysrKMHr0aKcWJiw5pHR6wKvhuGst0vn5oeuO7XY91rRtG07efscFH5ew9C349+nT4H6r1YrikhIEBwVBr9dD18or7nr27Fnn7bi4OOTm5gIAdu7ciXbt2inN58Z4e3s3OIYj5PHv8ojgXbt2YcOGDXjmmWeUx1gsFlRUVMBkMtlVU33l5eWNjk5PTExUGuIAMHDgQFitVhw4cACxsbHIycnB3LlzsX79euTk5MBqtcJkMikj4v/2t7/h5ZdfRseOHTF69GiMHTsW48ePh5eXF3bv3g2LxdKgzsrKSof2De/evXudVfBxcXHYvdu2P4wjz+Hn5weTyQSixgiVS+WFtlsPbYqbrWYcPHcQAFeKE5G6hMmGypqmuG+IunW4CPeJJSKtECYXZMp+4p67Shw4v1K8RwRzgojcT5hsqPL88enyeUO38G7w0jvcsiIispvDf2F++uknXH755QgLC8MDDzyAkpISjBo1Cl5eXvjhhx8cOtavv/6Kf/3rX9i+fTuysrLw1VdfYcKECcr7m9qL9Pnnn8eDDz4IAEhKSmqwgvTZZ5/FI4884tgn5izyfuLegYAge6nqdDq7R5gHXHopvGJjUZ2T0+i+4tDp4BUTg4BLL4WusZHgViv01dXQ+/tDr3d4S/sGjEZjvafXwWq1jRizZ8SNn59fq/a83bdvHwCgQ4cOAIDS0lIsWLAA1113XYPH+vr6tmjsTmRkJM6dO+fwx02dOhX5+fl46aWXEBERgYiICFx66aWoqqoCYBtLfuDAAaxZswarV6/GXXfdhX/961/45ZdfUFpaCoPBgO3btzcY7R4YaP+onua+P448R0FBAaKioux+XmpbnJlLLqesFA9VtQxXOVp4FFXWKgQZg5AQlHDhDyAichFhskEZn+6ZTXF5BWD3yO4qV0JEbZ0wuSBTmuKeu5+4JEnnR+VGsSlORO4nTDa0gfHpvJiWiNzF4aZ4p06dsGrVKgwdOhR6vR6ffvopfHx88N133yEgwLERHmVlZejVqxduu+22RpuIWVlZdd7+4YcfMH36dEycOLHO/U8++aSyRzEABAWpeNWUcuWWZ4aUzmBAzGOP4vT9D9ia/rUb4zW95ZjHHm28Ie5mPXv2xKlTp3Dw4EGHVmbby2q1YsmSJejQoQPS09MBAL1798aBAwfQuXNnp9WUnp6OjIyMBvdnZmbizJkzyujxTZs2Qa/Xo2tX236NGzZswOuvv46xY8eiuLgYRUVFyMvLq3MMPz8/jB8/HuPHj8fdd9+NlJQU7N69G+np6bBYLMjNzcXll19u99fEEfY+R0VFBY4cOaJ8jYnqc2YuuZyHj0+XR6d3i+jWqguOiIhaS5hsqKxpinvg+HSrZEVGni0X0iLTVK6GiNo6YXJB1gaa4qdKTqGosghGvRFdQp3/mg0R0YUIkw2Vnt1vAM5vu8TzBiJytRbNoujZsydWrlyJESNGoH///li5cmWLVsCOGTMGY8aMafL9sbGxdd7+5ptvMHTo0AZ7MAcFBTV4rGqUleIaCk4nCx45EnjlZeQsfBbV2dnK/YaYGMQ+9pjt/RpwxRVXYPDgwZg4cSIWL16Mzp07Y//+/dDpdC0ag1NQUIDs7GxUVFRgz549ePnll7FlyxZ89913ykrnxx9/HFdddRUSExNx/fXXQ6/XY9euXdizZw+efvrpFtU0atQofPjhhw3u9/X1xdSpU/HCCy+guLgY9913H2644QbldyE5ORkff/wxevfujaysLDz55JN1fk8/+OADWCwW9O/fH/7+/vjkk0/g5+eH9u3bIyIiAjfddBNuueUWvPjii0hPT8fZs2exdu1a9OzZE+PGjXP461dfly5d7HqObdu2wcfHBwMHDmz1c5LnclYuuZyHN8XllR7dwrupXAkRkSDZoKwU97ymeGZxJkrMJfAx+KBTaCe1yyEiEiMXZG2gKS6vCkwJT4HRYLzAo4mIXEPz2VBdCVjNtn976ErxKksV9p/bD4ATpojI9exqiqenpze64svHxwdnzpzBpZdeqty3Y8cO51VXS05ODr777rtGm4OLFi3CU089hcTEREyZMgWzZs2Cl1fTn1plZSUqKyuVt4uLbS9Gmc1mmM3mVtWpKy+CFwCr0R+WVh6rKXKNLanVbDZDkiRYrVZljHVLBA4fjoChQ1GwZQNyMw/AEBWBTldcA53B0Oxx5T245RocIUlSg49r7O3a933++ed48MEHMXnyZJSVlaFz585YuHBhnc//QnXI75dH+/v7+6N9+/YYMmQI3nzzTXTu3Fl5zIgRI/Dtt9/i6aefxnPPPQej0YiUlBTcdtttdtXUmMmTJ+Ohhx7Cvn37lFXgkiShc+fOmDBhAsaOHYuCggKMGzcOr732mnKct99+G3feeSf69OmDiy66CAsXLsRDDz2kfH2Cg4Px/PPPY/bs2bBYLEhLS8M333yDsLAwWK1WvPvuu3jmmWfwj3/8A6dPn0ZkZCT69++PsWPHNvr1k7+3td9u7PtT+zEXeg5JkvDf//4XU6ZMga+vb6t+Zt2pNT/nanN27fL30Ww2NxiT35q/t87OpeZyobW11uZVfg46ANXGIEguygjA+XXbS76yt2to1xZnVO1bkYhaO+t2L9Zt/3O1hDuzwWm5UFFkywUvf5flglo/d7tydgEAuoZ1BSyA2eLY8/P3xf1ErZ11u5e769ZSLgBN
Z0Nra61PX3oWBgAW31BYPfC8AQD+yv0LAJAanspzB4GIWruodQOsvbnjtoQ7s8Ep5w2mc5AvGzLrfAAPO2cAbFMHq63VCPUJRYxPDDNBMKxfXay/4bEuRCdJjW0MXdeCBQvsfuInnnjC7sfWKUSna7CneG3PP/88Fi1ahDNnzsDX11e5f/HixejduzfCw8Pxxx9/4NFHH8Wtt96KxYsXN/lc8+fPb/RzWr58Ofzt3Fu7KfHnNqHv8deRF5iCDcmPtepYruDl5YXY2FgkJCTA29u71ccrshah1FoKf50/wgyeufJRC+bNm4eSkhK8/PLLapfiVvn5+ejbty/WrVuH9u3bq10OtUBVVRVOnjyJ7OxsVFdX13mfyWTClClTUFRUhOBgx1bIOTuXXJkLtQ3fOxsBVXn4tcvjOBfQ+DYLorJIFjxV9BSqUY0Hgh5ApCFS7ZKISEBtLRtG7b4HvtXFWNf1aRT7JzrlmFrxnek7bKzaiIHeAzHOv/VThoiobdJSLgDuO2+4+MTbaF/wGzLi/oZDseOddlwtWVqyFJmWTEz0n4h0b26XRkT2a0vZ4F95FiMy/oFqvTe+6/VOq46lVZsqN2Fl+UokeyVjauBUtcshIkHZmw12NcXd4UJN8ZSUFIwYMQKvvvpqs8d57733cMcdd6C0tBQ+Pj6NPqaxq7cSEhKQl5fncJDWp9v5Cby+ewDWziNgmfRpq47VFLPZjNWrV2PEiBEwGh0bMVVRUYGTJ08iKSmpzsUFLXWs+BgqqisQHxiPEO+QCz5ekiSUlJQgKChIqP1m1a67sLAQb7zxBh5++GHo9XqHPlbt2ltj69at2Lt3L6ZOnSpU7SJ/zZ1de0VFBY4fP46EhIQGf3OKi4sRGRnZopMYZ2sqF7KysrB58+YW/b1tjNcLHaCrLIH5zk1AhOua4q3JiZY6VHgIk76fhACvAPzyt1+g1zn2twpQp25nEbV21u1erPvCRMgGZ5wzyLwWXQSdpRLme/4EQhKccsz61Pq5m/bTNPyV9xeeGvgUxnVwvCnO3xf3E7V21u1e7q5bS7kANJ0Ny5cvxzXXXOO0r4nhs5ugP/Qjqse8CKm36xoEav0cVlurMfjzwaiwVOCLcV+gQ0gHh48h6u8QwNrVIGrdAGtvjCjZ4JTzhpy9ML5zBaSAKFQ/sK+VlTZNzZ+zJzY+gf8d+x9m9JiBmT1ntugY/D1RD+tXF+s/z95saNGe4gCwfft27Ntn+0PcvXt3pKe77qrO3377DQcOHMB//vOfCz62f//+qK6uxvHjx5VR0/X5+Pg02jA3Go2t/8GxVAAA9L7B0Lv4h7Al9VosFuh0Ouj1eoebq/VVW6tRUW37fAONgXYdTx7HLNcgCrXrDg8Pxz//+c8WfazatbdG37590bVrV+FqF/lr7uza9Xo9dDpdo3+vnB3Urcml5nJBvm19PlQDlSW24wVFA274HxWn1G2ng0UHAQApESnw8W78ojR7ubNuZxO1dtbtXqy7+edwJldlg1PqrK4ELLYXz4wB4S7PBXf+3JmtZhw4dwAAcHHMxa16Xv6+uJ+otbNu93JX3VrKBaDpbACc/DUpPwcA8PLA8wYAOFpwFBWWCgQaA9E5onOLLqiVifo7BLB2NYhaN8Da6x/PmVyVDU75vK2219513oFuy113/5ztLbBtxdfa8waAvydqYv3qYv32Z4PDTfHc3FzceOONWL9+PUJDQwHYVrEOHToUK1asQFRUlKOHvKB3330Xl1xyCXr16nXBx+7cuRN6vR7R0dFOr8MuVaW2W+8AdZ7fjcrMZQAAHy8fGA3i/sIRkdjUyKUWqSg6/2/fC0/WEE1GfgYAIDUiVeVKiIgEyYaK8/vQwidIvTpc4EjhEVRaKhFkDEJisGeNhSciMQmRC7WZ8m23/hHq1uEie/L2AAC6R3RvVUOciKg1hMgGudfgE6huHS5SWlWKY0XHANgygYjI1Rz+P897770XJSUl2Lt3LwoKClBQUIA9e/aguLgY9913n0PHKi0txc6dO7Fz504AwLFjx7Bz505kZmYqjykuLsbnn3+Ov//97w0+fuPGjXj55Zexa9cuHD16FMuWLcOsWbPwf//3fwgLU2l/6ypboxjenhlUtclN8QCj518AQETa5cxccqma1R7wCQYMLR7Uoln78m1XVbMpTkRaIEQ2VNY0xb2DAL1B3VqcbHfebgBAamQqmx1EpAlC5EJtnt4Uz69pikeyAUJE6hEiGyrlBXiedRGtLCM/AxIkxAfEI8LPMzOPiLTF4VflV61ahTVr1qBbt27Kfampqfj3v/+NkSNHOnSsbdu2YejQocrbs2fPBgBMnToVH3zwAQBgxYoVkCQJkydPbvDxPj4+WLFiBebPn4/Kykp06NABs2bNUo6jikoxVoo7Yyt5uSkeaPT8CwCIqGWc8bfmQpyZSy4lN8X9QlUtwxUsVosyJpdNcSLSAiGyQZ4g4qv+PojOtjfPNgKxR0QPlSshIrIRIhdklurzGeGpTfGaleI9IpkTRKQeIbLBw1eKyxfT8iIpInIXh5viVqu10dnsRqNR2YvWXkOGDLlgw+T222/H7bff3uj7evfujU2bNjn0nC6nrBTXZlNc/t6ZTCb4+fm1+DhVlipUWaoAAP5e/k6pjYg8j8lkAuD8/Z5qc2YuuZTSFFdpkokLHSs6hvLqcvh7+SMpOEntcoiIxMgGpSnueVtqyM2OtMg0lSshIrIRIhdkFYUAal4r88Bzh4rqChw6dwgAc4KI1CVENigL8DyzKb4333YxLfOAiNzF4ab4lVdeifvvvx+ffvop4uPjAQCnT5/GrFmzMGzYMKcXKJwqbQeVwWBAaGgocnNzAQD+/v7Q6XQOH6eosghWsxV+Xn4wV5lhhtmuj7NaraiqqkJFRQX0enFGKYpaN8Da1SBq3YDzapckCSaTCbm5uQgNDYXB4LqxsMLkUkWh7dYDX9jKKLDtJ54SnsIxuUSkCUJkgzw+3cezVoqXV5fjcOFhAFzxQUTaIUQuyOTR6b6hHrnt0v6C/bBIFkT4RiDGP0btcoioDRMiG6pKbLcevlKck0OIyF0c/r/r1157DVdffTWSkpKQkJAAADh58iR69OiBTz75xOkFCkeAPcVjY2MBQGmMt8S5inMory5HkHcQzN72NcQBW6OsvLwcfn5+LWrGq0XUugHWrgZR6wacX3toaKjyN8dVhMkleaW4b6iqZbgC9xMnIq0RIhsqapriHjY+XW52RPpFstlBRJohRC7IPH0/8VrTREQ7XyYizyJENnjwSvG88jxkl2VDBx1fTyIit3G4KZ6QkIAdO3ZgzZo12L9/PwCgW7duGD58uNOLE1KV9vcU1+l0iIuLQ3R0NMxm+xvaMqtkxbzv56G4qhjPXv4sOkR2sPtjzWYzfv31VwwePNil45SdTdS6AdauBlHrBpxbu9FodOkKcZkwueTB49Mz8m0rxXkSQ0RaIUQ2eOhKcWWf2IgebHYQkWYIkQsyT2+K59tygtNEiEhtQmSDxqfStoZ83tAptBMCjNrtpRCRZ3G4Kf7RRx9h0qR
JGDFiBEaMGKHcX1VVhRUrVuCWW25xaoHC0fie4rUZDIYWNaz2F+zHgdID8PPyQ8+4njAa7G+cGQwGVFdXw9fXV6hmoah1A6xdDaLWDYhZuzC55KFNcYvVgn0FXClORNoiRDZ46Epx+cUtNjuISEuEyAWZpzfFa60UJyJSkxDZIPcaPHB8unLeEMHzBiJyH4c3/rz11ltRVFTU4P6SkhLceuutTilKaB589ZZs05lNAIA+MX0caogTEbmCMLlUXmi79bCm+IniEyivLoeflx+SgpPULoeICIAg2eChK8X35u8FwGYHEWmLELkg8+CmeFFlEU4UnwDAJggRqU+IbKis2VPcA3sNvEiKiNTgcFNckqRGx+CdOnUKISEhTilKaB589ZZsU5atKT4gboDKlRARCZRLykrxUFXLcLaMAtvo9K5hXWHQu35cPhGRPYTIhoqaF+B8NVKPE7DZQURaJUQuyEwFtlv/cHXrcAH5wql2ge0Q6huqbjFE1OYJkQ3yAjyfIHXrcDJJkpTtNHpE9lC5GiJqS+wen56eng6dTgedTodhw4bBy+v8h1osFhw7dgyjR492SZFCEWh8ektUWaqwPWc7AGBAPJviRKQe4XLJQ8encz9xItISobJBaYp7zkpxNjuISGuEygWZB68U35vHaSJEpD6hsqHSM6fSnio5haLKIhj1RnQJ66J2OUTUhtjdFJ8wYQIAYOfOnRg1ahQCA8//Ifb29kZSUhImTpzo9AKFYrUAZpPt3x4WVLJdZ3ehwlKBcN9wJIcmq10OEbVhwuUSm+JERC4nVDYo49M1sgrFCeRmB1d7EJFWCJULMqUp7nkrxXfn7QYAdI/kNBEiUo9Q2aCsFPesXoOcBynhKdyelYjcyu6m+BNPPAEASEpKwqRJk+Dr6+uyooQlrxIHPHal+MYzGwHYRqc3Nl6GiMhdhMulikLbrQc1xa2SFfsL9gMAukV0U7kaIiLBsqGipinuQSvF5X0B2RQnIq0QKhdkXClORORSQmWDh64U5+h0IlKLXU3x2vtrTJ061aUFCU1uiuv0gJeGw7QVNmdtBsD9xIlIXcLlkiSdXynuQeNkM4szUWYug6/BFx1DOqpdDhG1ccJlg7JS3IOa4nxxi4g0RLhckHloUzynLAe55bnQ6/RICU9RuxwiaqOEy4aqEtutpzXFeTEtEalEb8+DunfvjhUrVqCqqqrZxx06dAgzZ87EokWLnFKccJT9xIMAD1xFXVxVrLzQxaY4EalJuFyqKgWs1bZ/e9BKcXl0epfwLvDS2z18hojIJYTLBg9bKZ5rykWuydbs6BbO6SFEpD7hckFmKrDdelhTXH49qXNoZ/gb/VWuhojaKqGyQZLO9xs8aHx6tbUa+/L3AWBTnIjcz65XsF999VU8/PDDuOuuuzBixAj06dMH8fHx8PX1xblz55CRkYHff/8de/fuxT333IOZM2e6um5tkvf48NDR6Vuzt8IqWZEUnIS4wDi1yyGiNky4XJJXiRt8AKOfurU4kbKfeDj3Eyci9QmVDZIEVBTZ/u3rGXuKy6s9OoZ0ZLODiDRBqFyQWcznJ4l4WFNcHp3OBggRqUmobKiuPL/AwoNWih8pPIIKSwUCjYFICk5SuxwiamPsaooPGzYM27Ztw++//47//Oc/WLZsGU6cOIHy8nJERkYiPT0dt9xyC2666SaEhXnOCjiHeXhTfNOZTQCA/nH9Va6EiNo64XKpvNB26xfmUZNEMgpqmuIRbIoTkfqEyobqCsBqtv3bQ8ancwQiEWmNULkgk1eJ6/Qec9GUbHfebgBA94juKldCRG2ZUNkg9xoAj2qK184Dvc6uQcZERE7j0KzTyy67DJdddpmrahGfMj7dM5vim7Nt+4kPjBuociVERDbC5JK8UtwvVNUynMkqWZVxV2yKE5GWCJEN8uh06DzmBa69+bYVgGmRaSpXQkRUlxC5IJP3E/cLA/QGdWtxIqtkZU4QkaYIkQ2VNfuJGwMAvec0j3kxLRGpyXP+mmqBfPWWT5C6dbhAdlk2jhUdg16nR5/YPmqXQ0QkFqUprpEVKE5wquQUSs2l8NZ7o2NoR7XLISISizwa1yfYI17gkiRJeXGreyRXABIRtZjcFPew0emZxZkoqSqBj8EHncM6q10OEZEYlF6DZ1xEK2NTnIjUJP4rMFriwSvFN2fZVol3j+iOEB/PGuFFRORyHtgUl/cT7xreFUa9UeVqiIgEI68U9/WM0eknS06iuKoYRr0RXUK7qF0OEZG4PLQpviff1gBJCU/huQMRkb0q5a1aPacpXl5djsOFhwGwKU5E6mBT3Jk8uCm+Kcu2n/iAuAEqV0JEJKCKQtutBzbFu4V3U7kSIiIBVRbZbj1sP/GU8BQYDWx2EBG1mIc2xffm2UanswFCROQAeaW4B/Ua9hfsh0WyINIvEjH+MWqXQ0RtEJvizlTpeUEF2MYhsilORNQK8kpx31BVy3CmjAJbU5z7iRMRtUBFTVPc1zMmMMkrALtHcHQ6EVGrmApst/7h6tbhZLvzdgNgThAROcQDt2qtPTpdp9OpXA0RtUVsijtTleeNNAGAI4VHkFeeB1+DL3pF91K7HCIi8XjY+HRJkrAvfx8ANsWJiFrEw8anyysA06LSVK6EiEhwHrhS3Gw1Y3/BfgBAWiRzgojIbh44Pl2+SIp5QERqcbgpbjAYkJub2+D+/Px8GAwGpxQlLGV8uucEFXB+dHrvmN7wMfioXA0RUV1C5JLSFA9VtQxnOVV6Stk7tnNoZ7XLISJqQPPZUFnTFPeA8enV1mplS40eERyLS0TapPlckHlgU/zwucOotFQiyBiExOBEtcshIlJoPhuUleKe02tQttPgeQMRqcThprgkSY3eX1lZCW9v71YXJDQP3VOco9OJSMuEyKXyQtuth6wUl5sfyWHJ3DuWiDRJ89ngQSvFjxQeQYWlAgHGACSFJKldDhFRozSfCzIPbIorW2xEdodex4GVRKQdms8GD1spXlRZhMySTAC2TCAiUoOXvQ9csmQJAECn0+Gdd95BYOD5P8YWiwW//vorUlJSnF+hSKo8b09xs9WMrdlbAbApTkTaIlQuKU3xUDWrcBq5Kc7R6USkNcJkgwetFN+bb1vtkRqRymYHEWmOMLkgk5vifp6zp7iyKjCSqwKJSBuEyYaqEtuth+wpLu8nnhiUiBCfEJWrIaK2yu6m+EsvvQTAdgXVm2++WWeEiLe3N5KSkvDmm286v0KReOCe4nvy9sBUbUKoTyi6hndVuxwiIoVQueRhe4pzP3Ei0iphssGDVorLL26x2UFEWiRMLshMBbZbD1opLu8fy5wgIq0QJhs8bKU4zxuISAvsboofO3YMADB06FB8+eWXCAvzjBf2ncoDx6dvOmMbnd4/rj9XfhCRpgiVSx7UFJckCRkFXClORNokTDZUFNlufcVfIaG8uMV9AYlIg4TJBZkyPt0zVoqbzCYcKTwCgDlBRNohTDZ42FRaNsWJSAvsborL1q1b54o6PIPcFPfxjKu3AO4nTkTap/lcqq4CzDX54AFN8TNlZ1
BUWQQvvReSQ5PVLoeIqFGazwYPGZ9eaanEoXOHAPDFLSLSNs3nAgCYK86fN3jISvH9BfthkSyI8otCTECM2uUQEdWh+WzwoF6DJEnK5JC0yDSVqyGitszhpvhtt93W7Pvfe++9FhcjPA8bn15mLsNfZ/8CwKY4EWmX5nOporDmHzrAA/ZMkvcTTw5NhrfBW+VqiIgap/1s8IyV4vsL9qNaqka4bzjiAuLULoeIqEmazwUAKK8Zna4zCJ8PMq4KJCIt03w2VNbsKe4t/p7iOaYc5Ffkw6AzcItWIlKVw03xc+fO1XnbbDZjz549KCwsxJVXXum0woTkYePTt+dsR7VUjXaB7dAuqJ3a5RARNUrzuSSPTvcNAfTib0PB/cSJSASazwYPWSkuNzu6R3SHTqdTuRoioqZpPheAWqPTIwAP+ZvKpjgRaZnms0FegOcBK8XlPEgOS4afl5/K1RBRW+ZwU/yrr75qcJ/VasXMmTPRqVMnpxQlrErP2udj45mNAIAB8VwlTkTapflc8qD9xIHzK8XZFCciLdN8NlTUNMV9xW6K783bC4AjEIlI+zSfC0DdpriH2JPPpjgRaZfms6HSc6bSyqPTmQdEpDanLFnT6/WYPXs2XnrpJWccTkxWC1Bdbvu3BwQVwP3EiUhcmsql8kLbrQc0xSVJYlOciISlmWyQJI9ZKS6/uNU9srvKlRAROU4zuSDzsKZ4UWURTpacBGCbKEJEJAJNZYMHrhTvEcGmOBGpy2lzXI8cOYLq6mqHPubXX3/F+PHjER8fD51Oh6+//rrO+6dNmwadTlfnv9GjR9d5TEFBAW666SYEBwcjNDQU06dPR2lpaWs/HcfJo9MBj2iK55Xn4XDhYQBAv9h+KldDROS4luSSSygrxUNVLcMZssuyca7yHLx0XkgOS1a7HCIih2kiG8wmwFpTg8B7xpZUleB48XEAXPFBROLSRC7ITDV7ivuHq1uHk8gNkPbB7RHiI27eEVHbo5lsUFaKi72nuFWyYm++bcIUzxuISG0Oj0+fPXt2nbclSUJWVha+++47TJ061aFjlZWVoVevXrjttttw3XXXNfqY0aNH4/3331fe9vHxqfP+m266CVlZWVi9ejXMZjNuvfVW3H777Vi+fLlDtbSa3BTXGQAvn+YfKwB5lXi38G4I8xV/dSMReS5n5pJLeND49IwC2yrxTqGd4GMQP+uIyHNpOhvk0ek6g9DbLsmTQ+ID4hHu6xkNHCLyXJrOBZmHrRSXm+JcJU5EWqXpbJAkoKrE9m+BzxkA4HjRcZSZy+Br8EWnUA2MpSeiNs3hpviff/5Z5229Xo+oqCi8+OKLuO222xw61pgxYzBmzJhmH+Pj44PY2NhG37dv3z6sWrUKW7duRZ8+fQAAr776KsaOHYsXXngB8fHxDtXTKnJT3DsQ0Onc97wusukMR6cTkRicmUsu4UlNcY5OJyJBaDoblNHpQUKfNygjELnag4gEoOlckHloU5w5QURapelsqK4AJKvt34KPT9+Tb8uD1IhUeOkdbkcRETmVw3+F1q1b54o6mrR+/XpER0cjLCwMV155JZ5++mlERNhOEDZu3IjQ0FClIQ4Aw4cPh16vx+bNm3Httdc2eszKykpUVlYqbxcX216YMpvNMJvNLSvUdA5GAJK3P6pbegw7yTW2uNYLkCRJWSneJ7qPU5/H1bW7iqh1A6xdDaLWDbi3dmc9hzNyqblcqH3bEnpTAQwALN4hsLrpZ8JV38e9Z23jrrqGdnXJzwh/d9yPdbsX67b/uVrL1dnQmjp1ZQXwAiD5BLv8vAFw3fdv91nbfuLdwroxE2oRtW5A3NpZt3u5u24t5QLQdDYAra/VUHoWegAW31DhzxskScLuvJqcCGVO1Mfa3U/UugHW3txxW8vV2dCq84YyW68BAMw6b0DgfsOunF0AXHfeAPD3RE2sX12sv+GxLkQnSZLUkifIzc3FgQMHAABdu3ZFdHR0Sw5zvhCdDl999RUmTJig3LdixQr4+/ujQ4cOOHLkCB577DEEBgZi48aNMBgMWLhwIT788EOlDll0dDQWLFiAmTNnNvpc8+fPx4IFCxrcv3z5cvj7+7eo/oiS/bjs8EKU+MTh59TnWnQMrThrOYtXSl6BAQb8M+Sf8NZ5q10SEXkgk8mEKVOmoKioCMHBwa0+XmtyyRW5IOt9/A0knNuIPRdNxpHo5qejaJkkSVhUvAhlUhnuCLwDCV4JapdERB6oLWRDVPFfGHTkBRT5JWJ9ytMtPo7a/lX0LxRJRbgt4DZ0NHZUuxwi8lBaygXAtecNAw8/h+iSvdje/g6cCr+0VcdSW6G1EC8UvwA99JgbMpevKxGRU7WFbPCvzMGIjAdRrffFd72WtugYWvFmyZs4ZTmFG/xvQE/vnmqXQ0Qeyt5scLgpXlxcjLvvvhuffvoprFbbCA+DwYBJkybh3//+N0JCQlpUcGNN8fqOHj2KTp06Yc2aNRg2bFiLm+KNXb2VkJCAvLy8Fgep7tBP8PpsCqxxF8Ny25oWHcNeZrMZq1evxogRI2A0Gi/8AQ76z8H/4Lltz6FPTB8sHebc0HV17a4iat0Aa1eDqHUD7q29uLgYkZGRrT6JcUYuNZULWVlZ2Lx5c6u+HoZPJ0F/dC2qr3oVUq/JLTqGo1zxfcw15WL016Nh0Bnw299+g6+Xr1OOWxt/d9yPdbsX674wEbKhNecMAKDL+BpeX/0d1sRBsNz8bYuPYy9XfP/yy/Mx4qsR0EGHX//2KwKMzt/nkL8v7idq7azbvdxdt5ZyAWg6G5YvX45rrrmmVV8Tr3eGQpezG9WTVkDqPLzFx3GEq76fa0+uxYO/PYiuYV3x6ZhPnXbc2kT9HQJYuxpErRtg7Y0RJRtadd6QvRvGd4dCCoxB9f17W3YMB7jqe2W2mHHZ55fBbDXj26u/RbvAdk47dp3n4e+Jali/ulj/efZmg8Pj02fMmIE///wT3333HQYOHAjANsb8/vvvxx133IEVK1a0vOoL6NixIyIjI3H48GEMGzYMsbGxyM3NrfOY6upqFBQUNLkPOWDbp9zHx6fB/UajseVfeGsFAEDvEwS9m374WlVvM7bmbAUADIof5LJfJFfV7mqi1g2wdjWIWjfgntqddXxn5FJzuSDftrjeykIAgFdgJODmnwdnfh8PFh0EAHQM7YggvyCnHLMp/N1xP9btXqy7+edwBldnQ6vqNJcCAPR+oW47bwCc+/07kG27KLlDSAeE+oc65ZhN4e+L+4laO+t2L3fVraVcAJrOBrnWVtVbXgAA8AqOFvq8AQD2n9sPwLafuDv+30HE3yGAtatB1LoB1l7/eM7g6mxwRq9B5xPk1u+7s79XB4sOwmw1I9QnFEmhSdDpdE47dmP4e6Ie1q8u1m9/NjjcFF+5ciV+/PFHXHbZZcp9o0aNwttvv43Ro0c7ejiHnDp1Cvn5+YiLiwMADBw4EIWFhdi+fTsuueQSAMDPP/8Mq9WK/v37u7SWBqrKbLfezl8l4U7V1mpszbY1xQfEDVC5GiKiC1Mzl+xSXmi79QtTt
YzWyijIAACkhqeqXAkR0YVpOhsqa/af9Wn9qEe17MnfA8DW7CAiEoGmcwEAJAkw5dv+7R+hbi1OsCfPlhNpkWkqV0JE1DRNZ0Ol7UJaeAeqW0cr7c7bDQDoHtnd5Q1xIiJ7ONwUj4iIaHR0SEhICMLCHHvBv7S0FIcPH1bePnbsGHbu3Inw8HCEh4djwYIFmDhxImJjY3HkyBE89NBD6Ny5M0aNGgUA6NatG0aPHo0ZM2bgzTffhNlsxj333IMbb7wR8fHxjn5qraMEldhN8Yz8DJSYSxBkDEJqBBsfRKR9zswllyg/Z7v1C1W1jNbKyLc1xbtFdFO5EiKiC9N0NlTUNMV9xW2Kyy9usSlORKLQdC4AgNkEVNtWBYreFLdKVuzNt436ZU4QkZZpOhuqSmy3HtIU7xHBPCAibdA7+gFz587F7NmzkZ2drdyXnZ2NBx98EPPmzXPoWNu2bUN6ejrS09MBALNnz0Z6ejoef/xxGAwG/PXXX7j66qvRpUsXTJ8+HZdccgl+++23OuNIli1bhpSUFAwbNgxjx47FZZddhqVLnbsPtl08ZKX4pqxNAIB+cf1g0BtUroaI6MKcmUtOZ7UCFYW2fwu+Unxf/j4AQPeI7ipXQkR0YZrOBsFXikuShL15Nc0OvrhFRILQdC4AgMk2Oh0Gb+EbIMeLj6PUXApfgy86hXZSuxwioiZpOhvkXoOP2JkgnzdwcggRaYXDK8XfeOMNHD58GImJiUhMTAQAZGZmwsfHB2fPnsVbb72lPHbHjh3NHmvIkCGQJKnJ9//4448XrCc8PBzLly+3s3oXqpJXirt2n1VX25y1GQBHpxOROJyZS05XWQxIVtu/fUPd+9xOdNZ0FmfLz0Kv06NreFe1yyEiuiBNZ4PgK8VPl55GYWUhvPRezAQiEoamcwGoOzpd8PGycgOkW0Q3eOkdftmRiMhtNJ0NHjA+vcxchqNFRwHYxqcTEWmBw/93OmHCBBeU4QE8YKV4eXU5/sz9EwCb4kQkDk3nkrxK3MsPMPqqWkpryKPTO4Z0hJ+Xn8rVEBFdmKazQV4p7ttwVKMI5P3Eu4R1gbfBW+VqiIjso+lcADxqP3Fl/1hOmCIijdN0NsgL8AReKZ6RnwEJEuIC4hDpF6l2OUREAFrQFH/iiSdcUYf4PKAp/mfOnzBbzYgNiEX74PZql0NEZBdN55Kyn7jYo9OV/cTDuZ84EYlB09lQUWS7FXR8OkcgEpGINJ0LwPnx6f7h6tbhBMwJIhKFprOhUvw9xZX9xCO55RIRaUeL5xhVVVUhNzcXVqu1zv3yqJE2p0oOKnGb4vJ+4gPiBkAn+LguImp7NJlLntIUL7A1xVMjUlWuhIjIMZrMBsHHp3MFIBGJTJO5AHjMSnGzxYz9BfsBsAlCROLQZDYoK8XF3ap1T55twhTzgIi0xOGm+MGDBzF9+nT88ccfde6XJAk6nQ4Wi8VpxQlFWSku7tVbtZviRESi0HQueUpTPJ9NcSISi6azoVJeKS7e+HSL1aJkAl/cIiKRaDoXAI9pih8sPIgqaxWCvYOREJSgdjlERM3SdDZ4wJ7iclOck0OISEscborfeuut8PLywsqVKxEXF8cVxTK5KS7oPh/nKs5hX8E+AED/uP4qV0NEZD9N51J5oe3WL1TNKlolrzwPuaZc6KBDSniK2uUQEdlF09kg8ErxY0XHUF5dDj8vP3QM6ah2OUREdtN0LgAe0xSXR6f3iOyhva8xEVE9ms4GeaW4oFNp88rzkFWWBR10XGBBRJricFN8586d2L59O1JS+MJ4HYLvKb45ezMAIDksGZF+kSpXQ0RkP03nkrJSPFTVMlpDXhGYFJIEf6O/ytUQEdlHs9kgSef3BxRwT/E9+bbVHqkRqTDoDSpXQ0RkP83mgsxDmuLcYoOIRKLpbBB8fLp8kVTHkI4IMIrZLyEiz6R39ANSU1ORl5fnilrEJr+4JehIk01nODqdiMSk6VzygPHp+/JtU0R4ZS8RiUSz2VBVBkg1Yxh9xRufruwLGMHR6UQkFs3mgsxDmuIclUtEItF0Ngg+Pl25SCqSF0kRkbbY1RQvLi5W/nvuuefw0EMPYf369cjPz6/zvuLiYlfXq12CrxTnfuJEJBJhckkZny5uU1zZTzycTXEi0jYhsqGiZj9xvRdg9FOvjhZSmuLcT5yIBCBELshMBbZb/3B162gFk9mEo0VHATAniEi7hMkGZaW4mE1xecIUL5IiIq2xa3x6aGhonT01JEnCsGHD6jxGkiTodDpYLBbnVigKgZviJ0tO4nTpaXjpvNAnpo/a5RARXZAwuVRRaLv1DVWvhlbKKKhpinOlOBFpnBDZUFnz4ppPMKClPQvtUGWpwoFzBwBwxQcRiUGIXJDJK8X9xG2KZ+RnwCpZEeMfgyj/KLXLISJqlDDZIPBKcUmSeDEtEWmWXU3xdevWuboOsVktQHW57d/e4u3zIa8S7xnVk/vFEpEQhMklwcenF1QUILssGwCQEq7BPbaIiGoRIhsqaprivuLtJ37w3EFUW6sR6hOKdoHt1C6HiOiChMgFAJAkjxifvjfftn8sGyBEpGXCZIPAe4qfKj2FosoiGPVGdAnronY5RER12NUUv+KKK1xdh9jkVeKAkCvFlf3E4zk6nYjEIEwuCd4Ul/cTTwpOQqCAVycTUdsiRDbUXikumNr7AuoEW+VORG2TELkAAJUlgNVs+7fATXE5J9gUJyItEyIbJOl8U1zA12LkVeIp4SnwNnirXA0RUV12NcVr++uvvxq9X6fTwdfXF4mJifDx8Wl1YUKRQ0pnALzE+tytkhWbszcDAAbGDVS5GiIix2k6lwRvisv7iXeL6KZyJUREjtFsNsh7ivuGuP+5W0kZgRjBZgcRiUezuQCcXyXu5Qd4izu9j6NyiUg0ms0GswmQrLZ/C7gAT86D7hHccomItMfhpvjFF1/c7MoAo9GISZMm4a233oKvr2+rihOGsp94oHB7A+4v2I+iyiIEGAO4NyARCUnTuVReaLv1C3Xv8zqJ3BTniQwRiUaz2SCvFBewKb43j2NxiUhcms0FACgvsN0KvEq8oKIAp0tPAwBSI1JVroaIyD6azQZlKq1O6KZ4WlSaypUQETWkd/QDvvrqKyQnJ2Pp0qXYuXMndu7ciaVLl6Jr165Yvnw53n33Xfz888+YO3euK+rVJmWciXghJe8n3jemL4x6o8rVEBE5TrO5ZC4Hqstt/xZ8pThf2CIi0Wg2G+SV4oKNTy8zl+Fo0VEAbIoTkZg0mwsAYJKb4uHuf24nkS+cSgpOQrC3WBlHRG2XZrOhssR2K+ACvGprtfJaEidMEZEWObxS/JlnnsErr7yCUaNGKfelpaWhXbt2mDdvHrZs2YKAgAD84x//wAsvvODUYjVLvnrLR7w9PuT9xPvH9Ve5EiKiltFsLsmrxHUG4ZofAFBYUYgzZWcA2PaBIiISiWazoUJeKS5WLmTkZ0CChNiAWET6RapdDhGRwzSbC8D58ekCrxTfk8/R6UQkHs1mg7wAT8Bew5HCI6iwVCDAGICkkCS1
[... remainder of base64-encoded PNG data for this figure output omitted (throughput vs. TPOT plot; see the plotting cell source below) ...]",
+      "text/plain": [
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create a list to store the throughput and tpot data\n", + "throughput_tpot_data = []\n", + "\n", + "# Iterate over the models, batch sizes, and arrival rates to calculate throughput and tpot\n", + "for ssm in small_model_names:\n", + " for batch_size in batch_sizes:\n", + " for arrival_rate in arrival_rates:\n", + " model_name = ssm.replace(\"/\", \"-\")\n", + " filepath = f\"/usr/FlexFlow/inference/output/specinfer_llm_meta-llama-Llama-3.1-70B-Instruct_ssm_{model_name}_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n", + " if os.path.exists(filepath):\n", + " throughput = get_throughput(filepath)\n", + " tpot = get_tpot(filepath)\n", + " throughput_tpot_data.append({\n", + " 'Model': model_name,\n", + " 'Batch Size': batch_size,\n", + " 'Arrival Rate': arrival_rate,\n", + " 'Throughput': throughput,\n", + " 'TPOT': tpot\n", + " })\n", + "\n", + "# add incremental decoding entry\n", + "for batch_size in batch_sizes:\n", + " for arrival_rate in arrival_rates:\n", + " model_name = ssm.replace(\"/\", \"-\")\n", + " filepath = f\"/usr/FlexFlow/inference/output/incr_dec_llm_meta-llama-Llama-3.1-70B-Instruct_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n", + " if os.path.exists(filepath):\n", + " throughput = get_throughput(filepath)\n", + " tpot = get_tpot(filepath)\n", + " throughput_tpot_data.append({\n", + " 'Model': \"Incr Dec (baseline)\",\n", + " 'Batch Size': batch_size,\n", + " 'Arrival Rate': arrival_rate,\n", + " 'Throughput': throughput,\n", + " 'TPOT': tpot\n", + " })\n", + "\n", + "# Convert the list to a DataFrame\n", + "throughput_tpot_df = pd.DataFrame(throughput_tpot_data)\n", + "\n", + "# Plot the data\n", + "fig, axes = plt.subplots(nrows=1, ncols=len(arrival_rates), figsize=(20, 5), sharey=True)\n", + "\n", + "for i, arrival_rate in enumerate(arrival_rates):\n", + " ax = axes[i]\n", + " for model_name in throughput_tpot_df['Model'].unique():\n", + " model_data = throughput_tpot_df[(throughput_tpot_df['Model'] == model_name) & (throughput_tpot_df['Arrival Rate'] == arrival_rate)]\n", + " ax.plot(model_data['TPOT'], model_data['Throughput'], marker='o', label=model_name)\n", + " ax.set_title(f'Arrival Rate: {arrival_rate} {\"requests/sec\" if arrival_rate != \"offline\" else \"\"}')\n", + " ax.set_xlabel('TPOT (ms/token)')\n", + " ax.set_ylabel('Output Throughput (tokens/sec)')\n", + " ax.grid(True)\n", + " if i == 0:\n", + " ax.legend(title='Model')\n", + "\n", + "plt.suptitle('Throughput vs TPOT for Different Arrival Rates\\nLLM: LLAMA-3.1-70B-Instruct\\nBatch Sizes: 4, 8')\n", + "plt.tight_layout(rect=[0, 0, 1, 0.96])\n", + "\n", + "# Save the plot as a PDF\n", + "plt.savefig('/usr/FlexFlow/benchmarking/throughput_vs_tpot.pdf')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. 
Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return ttft.mean()[1] / 1000\n", + "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return ttft.mean()[1] / 1000\n", + "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return ttft.mean()[1] / 1000\n", + "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return ttft.mean()[1] / 1000\n", + "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. 
Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return ttft.mean()[1] / 1000\n", + "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return ttft.mean()[1] / 1000\n", + "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return ttft.mean()[1] / 1000\n", + "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
To access a value by position, use `ser.iloc[pos]`\n", + " return ttft.mean()[1] / 1000\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Model Batch Size Arrival Rate TTFT\n", + "0 Zhuominc-Llama-3-330M 4 offline 236.037453\n", + "1 Zhuominc-Llama-3-330M 4 1 239.494513\n", + "2 Zhuominc-Llama-3-330M 4 2 236.035863\n", + "3 Zhuominc-Llama-3-330M 4 4 237.153932\n", + "4 Zhuominc-Llama-3-330M 4 8 237.309231\n" + ] + }, + { + "data": { + "image/png": "<base64-encoded Matplotlib 3.9.2 PNG figure omitted>"
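Note on the pandas warnings captured above: they point at the notebook's TTFT helper (the `ttft = group.apply(...)` line and the `return ttft.mean()[1] / 1000` line). A minimal warning-free sketch is shown below; it assumes the per-step log is grouped by a request identifier column (named `request_guid` here purely as an illustration, since the actual grouping key and the reason for the positional `[1]` index are not visible in the captured output), and it keeps the original `/ 1000` scaling.

    import pandas as pd

    def mean_ttft(df: pd.DataFrame) -> float:
        # Group by the request identifier; "request_guid" is an assumed column name.
        grouped = df.groupby("request_guid")
        # Per request: timestamp of the first decode step (request_step_idx == 0)
        # minus the arrival timestamp (request_step_idx == -1).
        # include_groups=False (pandas >= 2.2) silences the DataFrameGroupBy.apply
        # deprecation warning about operating on the grouping columns.
        ttft = grouped.apply(
            lambda x: x.loc[x["request_step_idx"] == 0, "timestamp"].iloc[0]
            - x.loc[x["request_step_idx"] == -1, "timestamp"].iloc[0],
            include_groups=False,
        )
        # ttft is a plain Series of per-request values, so its mean is a scalar and
        # no positional indexing (the old ttft.mean()[1]) is needed.
        return ttft.mean() / 1000

With `include_groups=False` the lambda only receives the non-grouping columns, which both removes the deprecation warning and avoids redundant work per group.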
MDBQ06dPl5+fn+rUqaM///xTo0aNUmBgoCV4le4t82DE/SNwBQAAAAAAmauccQFQZvD19dVzzz2nFStWWF4e9aTGjx+vwoULa/LkyTp16pTc3Nz03HPPWV7OU6xYMY0dO1YjRoxQr1691L17d4WFhSkiIkKDBg1S1apVVaFCBX366adP/Kj6ffdDurp166pQoUIaPny4YmJiMtSndO+lU6dOnVLTpk3l6Oiovn37qnXr1rp+/fpjjw0PD1d4eLjVtvHjx1tmy97Xr18/HThwQB06dJDJZFKnTp3Uv39/rVu3LkO1v/baaxo6dKgGDhyo+Ph4tWzZUqNGjVJoaGiqx928eVP9+/fXX3/9JQcHB1WsWFFLly5Vhw4dLG2io6OTrTHr5+dn+f2+ffsUHh4ub2/vVGcaf/bZZxo1apT69++vS5cuycvLS/369dPo0aOt2s2fP1/z58+XJBUoUEDVqlXT2rVrVaFChTTejcdzdHTUf//7Xy1atEj//POPihYtqgEDBlj+nLRr107ff/+9GjRooGvXrmnhwoXq2bNnin25u7tr6dKlGjZsmObPn6+GDRsqNDRUffv2tbSpX7++vvnmG40fP15TpkyRi4uLXn75Zcv+cePGqV+/fpbg+/7sVJPJlOq5R44cKZPJpJEjR+rvv/9W4cKFLcH2fbdv39YPP/yg9evXZ/CuSSbzw4sn5EIxMTFydXXV9evX5eLiktXl5AoJCQlau3atKl+rLFvZPrZ96bp/pu8Ez/g3c+QOjHPkBoxz5AaMc+QGjPOnLyf+HHr79m2dPn1apUqVeuQLqJ5la9as0bBhw3TkyJFkjx4DeHadPn1a5cuX19GjR1WuXLkn7mfu3LlauXKlfvrppxT3p+fvOGa4AgAAAACAXK9ly5Y6ceKE/v77b5UoUSKrywGQRmvXrlXfvn0zFLZK914e9tlnnxlSE4ErAAAAAACApCFDhmR1CQDSacCAAYb089ZbbxnSjyQxRx4AAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAADI9f755x95eHjozJkzWV1Kmm3evFkmk0nXrl2TJIWFhcnNzS1La3pSZ86ckclk0sGDB7O6FORCd+7ckY+Pj/bu3WtIf3kM6QUAAAAAAOARmv+276mda121mk903MSJE9WqVSv5+PgYW9BDNm/erAYNGujq1avZNhx9UvXr11eNGjU0c+bMrC4l3b7//ntNmjRJf/75pxISElSuXDm999576tatW6rHzJ07VwcPHlR8fLyqVKmi0NBQNW3a9LHn+uKLL7Rv3z5duXJFBw4cUI0aNaza+Pj46OzZs5IkGxsbeXp6qnnz5po6daoKFCjwyL7DwsI0ZMgQS0ifUT179tS1a9f0ww8/GNLf45hMJq1cuVKtW7d+4j6mTJmikJAQDR482DIW8+bNq+DgYA0fPlyRkZEZrpMZrgAAAAAAIFeLi4vTggUL1Lt376wuBc8od3d3ffjhh9q1a5d+++039erVS7169dKGDRseeczWrVvVuHFjrV27Vvv27VODBg0UGBioAwcOpHqumzdv6sUXX9RHH32Uartx48YpOjpaUVFRWrZsmbZu3apBgwY90fVltoSEhKwuQZK0Z88e/etf/1K1atWS7evSpYu2b9+u33//PcPnIXAFAAAAAAC52tq1a2Vvb68XXnjBsu3+4/obNmyQn5+fHBwc9Morr+jSpUtat26dKlWqJBcXF3Xu3FlxcXGW45KSkjR58mSVKlVKDg4Oql69ur799ltJ9x6bb9CggSSpQIECMplM6tmzpyRp/fr1evHFF+Xm5qaCBQvq1Vdf1cmTJzN0XSdPnlSrVq3k6ekpJycn1a5dWz///LNVGx8fH02YMEHdu3eXk5OTvL29tWrVKl2+fFmtWrWSk5OTqlWrZvWo9T///KNOnTqpWLFicnR0lK+vr77++usM1fqwxMRE9e7d23IfK1SooFmzZlm16dmzp1q3bq1JkybJ09NTbm5uGjdunO7evathw4bJ3d1dxYsX18KFC62OGz58uMqXLy9HR0eVLl1ao0aNemwgWL9+fbVp00aVKlVSmTJlNHjwYFWrVk3bt29/5DEzZ87U+++/r9q1a6tcuXKaNGmSypUrpx9//DHVc3Xr1k2jR49Wo0aNUm3n7OysIkWKqFixYmrQoIF69Oih/fv3p3rMw0JDQ1WjRg0tWbJEPj4+cnV1VceOHXXjxg1Lm2+//Va+vr5ycHBQwYIF1ahRI928eVOhoaFatGiR/v3vf8tkMslkMmnz5s2W5SGWL1+uevXqKV++fFq2bJnlXA/fo4dnlX/11VeqUqWK7O3tVbRoUQ0cOFCSLO3atGkjk8mU7tnosbGx6tKli+bPn5/iLOACBQooICBAERER6eo3JQSuAAAAAAAgV9u2bZtq1kx5KYLQ0FDNnj1bO3fu1Llz5/TGG29o5syZCg8P15o1a/TTTz/ps88+s7SfPHmyFi9erC+++EK///67hg4dqq5du2rLli0qUaKEvvvuO0nS8ePHFR0dbQkRb968qaCgIO3du1eRkZGysbFRmzZtlJSU9MTXFRsbqxYtWigyMlIHDhxQs2bNFBgYqKioKKt2M2bMUEBAgA4cOKCWLVuqW7du6t69u7p27ar9+/erTJky6t69u8xmsyTp9u3bqlmzptasWaMjR46ob9++6tatm3799dcnrvVhSUlJKl68uL755hsdPXpUo0eP1gcffKAVK1ZYtfvll190/vx5bd26VdOnT9eYMWP06quvqkCBAtq9e7fefvtt9evXT3/99ZflGGdnZ4WFheno0aOaNWuW5s+frxkzZqS5NrPZrMjISB0/flwvv/xyuq7pxo0bcnd3T/MxafX333/rxx9/VJ06ddJ97MmTJ/XDDz9o9erVWr16tbZs2aIpU6ZIkqKjo9WpUye9+eabOnbsmDZv3qy2bdvKbDYrODhYb7zxhpo1a6bo6GhFR0erbt26ln5HjBihwYMH69ixY49dRuG+uXPnasCAAerbt68OHz6sVatWqWzZspLuzU6VpIULFyo6OtryOa0GDBigli1bphpkP//889q2bVu6+k0Ja7gCAAAAAIBc7ezZs/Ly8kpx34QJExQQECBJ6t27t0JCQnTy5EmVLl1aktS+fXtt2rRJw4cPV3x8vCZNmqSff/5Z/v7+kqTSpUtr+/bt+te//qV69epZwjYPDw+rNVzbtWtndd6vvvpKhQsX1tGjR1W1atUnuq7q1aurevXqls/jx4/XypUrtWrVKsusQUlq0aKF+vXrJ0kaPXq05s6dq9q1a+v111+XdG9GqL+/vy5evGiZURkcHGw5/t1339WGDRu0YsUKPf/8809U68Ps7Ow0duxYy+dSpUpp165dWrFihd544w3Ldnd3d3366aeysbFRhQoV9PHHHysuLk4ffPCBJCkkJERTpkzR9u3b1bFjR0nSyJEjLcf7+PgoODhYERERev/991O
t6fr16ypWrJji4+Nla2urOXPmqHHjxmm+pqlTpyo2Ntaq/owYPny4Ro4cqcTERN2+fVt16tTR9OnT091PUlKSwsLC5OzsLOneDNvIyEhNnDhR0dHRunv3rtq2bStvb29Jkq+vr+VYBwcHxcfHq0iRIsn6HTJkiNq2bZuuWiZMmKD33ntPgwcPtmyrXbu2JKlw4cKSJDc3txTPl5qIiAjt37//sSGtl5eXZW3cjGCGKwAAAAAAyNVu3bqlfPnypbjvwbUePT09LY+hP7jt0qVLkqQ///xTcXFxaty4sZycnCy/Fi9e/NjlAU6cOKFOnTqpdOnScnFxsTwufX82avPmzS39ValSJU3XFRsbq+DgYFWqVElubm5ycnLSsWPHks1wffgaJetQ7f62+9eZmJio8ePHy9fXV+7u7nJyctKGDRss/S5btszq+p90xuDnn3+umjVrqnDhwnJyctK8efOS1V6lShXZ2PxfvOXp6WlVu62trQoWLGipXZKWL1+ugIAAFSlSRE5OTho5cqSl36ioKKvaJ02aZDnO2dlZBw8e1J49ezRx4kQFBQVp8+bNabqW8PBwjR07VitWrJCHh4ekjN+nYcOG6eDBg/rtt98sL3pq2bKlEhMTJcmq77fffvuR/fj4+FjCVkkqWrSo5X5Vr15dDRs2lK+vr15//XXNnz9fV69eTVN9tWrVStf1XLp0SefPn1fDhg3TddzjnDt3ToMHD9ayZcse+ef8PgcHB6slQp4UM1wBAAAAAECuVqhQoUeGSHZ2dpbfm0wmq8/3t91/7D82NlaStGbNGhUrVsyqnb29fao1BAYGytvbW/Pnz5eXl5eSkpJUtWpV3blzR5L05Zdf6tatW8lqSk1wcLA2btyoqVOnqmzZsnJwcFD79u0tfT7qGh+17f51fvLJJ5o1a5ZmzpwpX19f5c+fX0OGDLH0+9prr1k92v7wvUiLiIgIBQcHa9q0afL395ezs7M++eQT7d69+5G13681ta/Rrl271KVLF40dO1ZNmzaVq6urIiIiNG3aNEn3ZjgePHjQcuyDj//b2NhYHm+vUaOGjh07psmTJ6t+/fqPvZa33npL33zzjdXj7Bm9T4UKFbLUU65cOc2cOVP+/v7atGmTGjVqZHUdLi4uj+wntftla2urjRs3aufOnZblMz788EPt3r1bpUqVSrW+/PnzW322sbGxLEtx34Nr5zo4OKTa35Pat2+fLl26pOeee86yLTExUVu3btXs2bMtM5Yl6cqVK5aZtBlB4AoAAAAAAHI1Pz8/LV26NMP9VK5cWfb29oqKilK9evVSbJM3b15JssxClO69hOr48eOaP3++XnrpJUlK9jKmJwktd+zYoZ49e6pNmzaS7gXCZ86cSXc/KfXbqlUrde3aVdK9IPaPP/5Q5cqVJd2bCfrgjMknPUfdunXVv39/y7aMvkRMknbu3Clvb299+OGHlm0PPkKeJ08eS4j5OElJSYqPj0+1zddff60333xTERERatmypdU+I+7Tg+6HhveD+bRex+OYTCYFBAQoICBAo0ePlre3t1auXKmgoCDlzZvXaiynpnDhwrpw4YLMZrMlxH8wFHZ2dpaPj48iIyMtL5d7mJ2dXZrPd1/Dhg11+PBhq229evVSxYoVNXz4cMt9k6QjR47Iz88vXf2nhMAVAAAAAADkak2bNlVISIiuXr2a4tvL08rZ2VnBwcEaOnSokpKS9OKLL+r69evasWOHXFxc1KNHD3l7e8tkMmn16tVq0aKFHBwcVKBAARUsWFDz5s1T0aJFFRUVpREjRmT4usqVK6fvv/9egYGBMplMGjVqVIZewvVgv99++6127typAgUKaPr06bp48aIlcE3N5cuXrUI26d4j7CmdY/HixdqwYYNKlSqlJUuWaM+ePY+dVZmW2qOiohQREaHatWtrzZo1Wrly5WOPmzx5smrVqqUyZcooPj5ea9eu1ZIlSzR37lxLm5CQEP39999avHixpHvLCPTo0UOzZs1SnTp1dOHCBUn3ZnK6uro+8lxXrlxRVFSUzp8/L+neC9YkqUiRIlZrl964ccMSYJ47d07vv/++ChcubPXiqozavXu3IiMj1aRJE3l4eGj37t26fPmyKlWqJOnecgQbNmzQ8ePHVbBgwVSvq379+rp8+bI+/vhjtW/fXuvXr9e6deusZt+Ghobq7bffloeHh5o3b64bN25ox44devfddy3ni4yMVEBAgOzt7dP059XZ2TnZOsj58+dXwYIFk23ftm2bxo8fn+b78ygErgAAAAAAIFOtq1Yzq0tIla+vr5577jmtWLHC8vKoJzV+/HgVLlxYkydP1qlTp+Tm5qbnnnvO8hKnYsWKaezYsRoxYoR69eql7t27KywsTBERERo0aJCqVq2qChUq6NNPP33so+qPM336dL355puqW7euChUqpOHDhysmJiZDfUr3Xjp16tQpNW3aVI6Ojurbt69at26t69evP/bY8PBwhYeHW20bP368Zbbsff369dOBAwfUoUMHmUwmderUSf3799e6desyVPtrr72moUOHauDAgYqPj1fLli01atQohYaGpnrczZs31b9/f/31119ycHBQxYoVtXTpUnXo0MHSJjo62mqN2Xnz5unu3bsaMGCABgwYYNneo0cPhYWFPfJcq1atUq9evSyf77/sa8yYMVZ1jh49WqNHj5Z0b/Zo7dq19dNPP6lgwYJpuRVp4uLioq1bt2rmzJmKiYmRt7e3pk2bpubNm0uS+vTpo82bN6tWrVqKjY3Vpk2bLOsPP6xSpUqaM2eOJk2apPHjx6tdu3YKDg7WvHnzLG169Oih27dva8aMGQoODlahQoXUvn17y/5p06YpKChI8+fPV7FixXTmzBmdOXNGpUqV0qZNmzL0Z2bXrl26fv261fmelMn88OIJuVBMTIxcXV11/fr1VNe0gHESEhK0du1aVb5WWbayfWz70nX/TN8JyjV5wsoA4zDOkRswzpEbMM6RGzDOn76c+HPo7du3dfr0aZUqVeqxL6Z5Fq1Zs0bDhg3TkSNHrF7CBODZtmnTJrVt21anTp3K0Az1Dh06qHr16pZ/HHlYev6OY4YrAAAAAADI9Vq2bKkTJ07o77//VokSJbK6HABptHbtWn3wwQcZClvv3LkjX19fDR061JCaCFwBAAAAAAAkDRkyJKtLAJBOn3zySYb7yJs3r0aOHGlANfcwRx4AAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAADI9f755x95eHjozJkzWV1Kmm3evFkmk0nXrl2TJIWFhcnNzS1La3pSZ86ckclk0sGDB7O6FORSL7zwgr777jtD+spjSC8AAAAAAACPkDD2vad2Lrsx057ouIkTJ6pVq1by8fExtqCHbN68WQ0aNNDVq1ezbTj6pOrXr68aNWpo5syZWV1Kun3//feaNGmS/vzzTyUkJKhcuXJ677331K1bt1SPmTt3rg4ePKj4+H
hVqVJFoaGhatq06WPP9cUXX2jfvn26cuWKDhw4oBo1ali18fHx0dmzZyVJNjY28vT0VPPmzTV16lQVKFDgkX2HhYVpyJAhlpA+o3r27Klr167phx9+MKS/xzGZTFq5cqVat26druMSExMVGhqqpUuX6sKFC/Ly8lLPnj01cuRImUwmSdLIkSM1dOhQtWnTRjY2GZujygxXAAAAAACQq8XFxWnBggXq3bt3VpeCZ5S7u7s+/PBD7dq1S7/99pt69eqlXr16acOGDY88ZuvWrWrcuLHWrl2rffv2qUGDBgoMDNSBAwdSPdfNmzf14osv6qOPPkq13bhx4xQdHa2oqCgtW7ZMW7du1aBBg57o+jJbQkJClp7/o48+0ty5czV79mwdO3ZMH330kT7++GN99tlnljbNmzfXjRs3tG7dugyfj8AVAAAAAADkamvXrpW9vb1eeOEFy7b7j+tv2LBBfn5+cnBw0CuvvKJLly5p3bp1qlSpklxcXNS5c2fFxcVZjktKStLkyZNVqlQpOTg4qHr16vr2228l3XtsvkGDBpKkAgUKyGQyqWfPnpKk9evX68UXX5Sbm5sKFiyoV199VSdPnszQdZ08eVKtWrWSp6ennJycVLt2bf38889WbXx8fDRhwgR1795dTk5O8vb21qpVq3T58mW1atVKTk5Oqlatmvbu3Ws55p9//lGnTp1UrFgxOTo6ytfXV19//XWGan1YYmKievfubbmPFSpU0KxZs6za9OzZU61bt9akSZPk6ekpNzc3jRs3Tnfv3tWwYcPk7u6u4sWLa+HChVbHDR8+XOXLl5ejo6NKly6tUaNGPTYQrF+/vtq0aaNKlSqpTJkyGjx4sKpVq6bt27c/8piZM2fq/fffV+3atVWuXDlNmjRJ5cqV048//pjqubp166bRo0erUaNGqbZzdnZWkSJFVKxYMTVo0EA9evTQ/v37Uz3mYaGhoapRo4aWLFkiHx8fubq6qmPHjrpx44alzbfffitfX185ODioYMGCatSokW7evKnQ0FAtWrRI//73v2UymWQymbR582bL8hDLly9XvXr1lC9fPi1btsxyrofv0cOzyr/66itVqVJF9vb2Klq0qAYOHChJlnZt2rSRyWRK12z0nTt3qlWrVmrZsqV8fHzUvn17NWnSRL/++qulja2trVq0aKGIiIh03cOUELgCAAAAAIBcbdu2bapZs2aK+0JDQzV79mzt3LlT586d0xtvvKGZM2cqPDxca9as0U8//WQ1S27y5MlavHixvvjiC/3+++8aOnSounbtqi1btqhEiRKWNSKPHz+u6OhoS4h48+ZNBQUFae/evYqMjJSNjY3atGmjpKSkJ76u2NhYtWjRQpGRkTpw4ICaNWumwMBARUVFWbWbMWOGAgICdODAAbVs2VLdunVT9+7d1bVrV+3fv19lypRR9+7dZTabJUm3b99WzZo1tWbNGh05ckR9+/ZVt27drMKrjEpKSlLx4sX1zTff6OjRoxo9erQ++OADrVixwqrdL7/8ovPnz2vr1q2aPn26xowZo1dffVUFChTQ7t279fbbb6tfv37666+/LMc4OzsrLCxMR48e1axZszR//nzNmDEjzbWZzWZFRkbq+PHjevnll9N1TTdu3JC7u3uaj0mrv//+Wz/++KPq1KmT7mNPnjypH374QatXr9bq1au1ZcsWTZkyRZIUHR2tTp066c0339SxY8e0efNmtW3bVmazWcHBwXrjjTfUrFkzRUdHKzo6WnXr1rX0O2LECA0ePFjHjh177DIK982dO1cDBgxQ3759dfjwYa1atUply5aVJO3Zs0eStHDhQkVHR1s+p0XdunUVGRmpP/74Q5J06NAhbd++Xc2bN7dq9/zzz2vbtm1p7vdRWMMVAADkGOlZH+5J13cDAAA5z9mzZ+Xl5ZXivgkTJiggIECS1Lt3b4WEhOjkyZMqXbq0JKl9+/batGmThg8frvj4eE2aNEk///yz/P39JUmlS5fW9u3b9a9//Uv16tWzhG0eHh5Wa7i2a9fO6rxfffWVChcurKNHj6pq1apPdF3Vq1dX9erVLZ/Hjx+vlStXatWqVZZZg5LUokUL9evXT5I0evRozZ07V7Vr19brr78u6d6MUH9/f128eNEyozI4ONhy/LvvvqsNGzZoxYoVev7555+o1ofZ2dlp7Nixls+lSpXSrl27tGLFCr3xxhuW7e7u7vr0009lY2OjChUq6OOPP1ZcXJw++OADSVJISIimTJmi7du3q2PHjpLurdV5n4+Pj4KDgxUREaH3338/1ZquX7+uYsWKKT4+Xra2tpozZ44aN26c5muaOnWqYmNjrerPiOHDh2vkyJFKTEzU7du3VadOHU2fPj3d/SQlJSksLEzOzs6S7s2wjYyM1MSJExUdHa27d++qbdu28vb2liT5+vpajnVwcFB8fLyKFCmSrN8hQ4aobdu26aplwoQJeu+99zR48GDLttq1a0uSChcuLElyc3NL8XypGTFihGJiYlSxYkXZ2toqMTFREydOVJcuXazaeXl56dy5c0pKSsrQOq7McAUAAAAAALnarVu3lC9fvhT3VatWzfJ7T09Py2PoD267dOmSJOnPP/9UXFycGjduLCcnJ8uvxYsXP3Z5gBMnTqhTp04qXbq0XFxcLI9L35+N2rx5c0t/VapUSdN1xcbGKjg4WJUqVZKbm5ucnJx07NixZDNcH75GyTpUu7/t/nUmJiZq/Pjx8vX1lbu7u5ycnLRhwwZLv8uWLbO6/iedMfj555+rZs2aKly4sJycnDRv3rxktVepUsUqGPP09LSq3dbWVgULFrTULknLly9XQECAihQpIicnJ40cOdLSb1RUlFXtkyZNshzn7OysgwcPas+ePZo4caKCgoK0efPmNF1LeHi4xo4dqxUrVsjDw0NSxu/TsGHDdPDgQf3222+KjIyUJLVs2VKJiYmSZNX322+//ch+fHx8LGGrJBUtWtRyv6pXr66GDRvK19dXr7/+uubPn6+rV6+mqb5atWql63ouXbqk8+fPq2HDhuk6Li1WrFihZcuWKTw8XPv379eiRYs0depULVq0yKqdg4ODkpKSFB8fn6HzMcMVAAAAAADkaoUKFXpkiGRnZ2f5vclksvp8f9v9x/5jY2MlSWvWrFGxYsWs2tnb26daQ2BgoLy9vTV//nx5eXkpKSlJVatW1Z07dyRJX375pW7dupWsptQEBwdr48aNmjp1qsqWLSsHBwe1b9/e0uejrvFR2+5f5yeffKJZs2Zp5syZ8vX1Vf78+TVkyBBLv6+99prVo+0P34u0iIiIUHBwsKZNmyZ/f385Ozvrk08+0e7dux9Z+/1aU/sa7dq1S126dNHYsWPVtGlTubq6KiIiQtOm3Xv6ycvLSwcPHrQc++Dj/zY2NpbH22vUqKFjx45p8uTJql+//mOv5a233tI333xjtS5rRu9ToUKFLPWUK1dOM2fOlL+/vzZt2qRGjRpZXYeLi8sj+0ntftna2mrjxo3auXOnZfmMDz/8ULt371apUqVSrS9//vxWn21sbCzLU
tz34Nq5Dg4OqfaXEcOGDdOIESMss5x9fX119uxZTZ48WT169LC0u3LlivLnz5/hWghcAQAAgGyEpTMAwHh+fn5aunRphvupXLmy7O3tFRUVpXr16qXYJm/evJJkmYUo3XsJ1fHjxzV//ny99NJLkpTsZUxPElru2LFDPXv2VJs2bSTdC4TPnDmT7n5S6rdVq1bq2rWrpHtB7B9//KHKlStLujcT9MEZk096jrp166p///6WbRl9iZh07+VJ3t7e+vDDDy3bzp49a/l9njx5LCHm46RlJuTXX3+tN998UxEREWrZsqXVPiPu04NsbW0lyRLMp/U6HsdkMikgIEABAQEaPXq0vL29tXLlSgUFBSlv3rxWYzk1hQsX1oULF2Q2my0h/oOhsLOzs3x8fBQZGWl5udzD7Ozs0ny+B8XFxSVbIsDW1jbZGslHjhyRn59fuvt/GIErAAAAAADI1Zo2baqQkBBdvXpVBQoUeOJ+nJ2dFRwcrKFDhyopKUkvvviirl+/rh07dsjFxUU9evSQt7e3TCaTVq9erRYtWsjBwUEFChRQwYIFNW/ePBUtWlRRUVEaMWJEhq+rXLly+v777xUYGCiTyaRRo0Zl6CVcD/b77bffaufOnSpQoICmT5+uixcvWgLX1Fy+fNkqZJPuPcKe0jkWL16sDRs2qFSpUlqyZIn27Nnz2FmVaak9KipKERERql27ttasWaOVK1c+9rjJkyerVq1aKlOmjOLj47V27VotWbJEc+fOtbQJCQnR33//rcWLF0u6t4xAjx49NGvWLNWpU0cXLlyQdG8mp6ur6yPPdeXKFUVFRen8+fOS7r1gTZKKFClitXbpjRs3LAHmuXPn9P7776tw4cJWL67KqN27dysyMlJNmjSRh4eHdu/ercuXL6tSpUqS7i1HsGHDBh0/flwFCxZM9brq16+vy5cv6+OPP1b79u21fv16rVu3zmr2bWhoqN5++215eHioefPmunHjhnbs2KF3333Xcr7IyEgFBATI3t4+zX9eAwMDNXHiRJUsWVJVqlTRgQMHNH36dL355ptW7bZt26YmTZqk9zYlQ+AKAAAAAAAy1bM+497X11fPPfecVqxYYXl51JMaP368ChcurMmTJ+vUqVNyc3PTc889Z3mJU7FixTR27FiNGDFCvXr1Uvfu3RUWFqaIiAgNGjRIVatWVYUKFfTpp58+9lH1x7kfKNWtW1eFChXS8OHDFRMTk6E+pXsvnTp16pSaNm0qR0dH9e3bV61bt9b169cfe2x4eLjCw8Otto0fP94yW/a+fv366cCBA+rQoYNMJpM6deqk/v37a926dRmq/bXXXtPQoUM1cOBAxcfHq2XLlho1apRCQ0NTPe7mzZvq37+//vrrLzk4OKhixYpaunSpOnToYGkTHR1ttcbsvHnzdPfuXQ0YMEADBgywbO/Ro4fCwsIeea5Vq1apV69els/3H4MfM2aMVZ2jR4/W6NGjJd2bPVq7dm399NNPKliwYFpuRZq4uLho69atmjlzpmJiYuTt7a1p06apefPmkqQ+ffpo8+bNqlWrlmJjY7Vp0ybL+sMPq1SpkubMmaNJkyZp/PjxateunYKDgzVv3jxLmx49euj27duaMWOGgoODVahQIbVv396yf9q0aQoKCtL8+fNVrFgxnTlzRmfOnFGpUqW0adOmR/6Z+eyzzzRq1Cj1799fly5dkpeXl/r162e5f5L0999/a+fOnYbMdjeZH148IReKiYmRq6urrl+/nuqaFjBOQkKC1q5dq8rXKstWto9tX7run+k7QbmM/2sEkFGMc+QGz9o451FrZAbGOXKDZ22c5wY58efQ27dv6/Tp0ypVqtQjX0D1LFuzZo2GDRumI0eOZOjt5ACerk2bNqlt27Y6depUhmaoDx8+XFevXrUKgB+Unr/jmOEKAAAAAAByvZYtW+rEiRP6+++/VaJEiawuB0AarV27Vh988EGGwlZJ8vDwUFBQkCE1EbgCAAAAAABIGjJkSFaXACCdPvnkE0P6ee+9tD9F9DjMkQcAAAAAAAAAgzDDFQAAAADwTGGtYgBAdsYMVwAAAAAAYBjezQ0gJ0rP320ErgAAAAAAIMPs7OwkSXFxcVlcCQAY786dO5IkW1vbx7ZlSQEAyKaa/7YvXe3XVauZSZUAmSe943xVJtUBAAAez9bWVm5ubrp06ZIkydHRUSaTKYurAoCMS0pK0uXLl+Xo6Kg8eR4fpxK4AkAuwVpoAAAAyGxFihSRJEvoCgA5hY2NjUqWLJmmf0gicAUAAAAAAIYwmUwqWrSoPDw8lJCQkNXlAIBh8ubNKxubtK3OSuAKiJl/AAAAAGAkW1vbNK1zCAA5ES/NAgAAAAAAAACDMMMVAAAAyEK8HA4AACBnYYYrAAAAAAAAABiEwBUAAAAAAAAADMKSAgAAAACATMXSGQCA3IQZrgAAAAAAAABgEAJXAAAAAAAAADAISwogR+KRJQAAAAAAAGQFZrgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIFkauM6dO1fVqlWTi4uLXFxc5O/vr3Xr1ln23759WwMGDFDBggXl5OSkdu3a6eLFi1Z9REVFqWXLlnJ0dJSHh4eGDRumu3fvPu1LAQAAAAAAAICsDVyLFy+uKVOmaN++fdq7d69eeeUVtWrVSr///rskaejQofrxxx/1zTffaMuWLTp//rzatm1rOT4xMVEtW7bUnTt3tHPnTi1atEhhYWEaPXp0Vl0SAAAAAAAAgFwsT1aePDAw0OrzxIkTNXfuXP3nP/9R8eLFtWDBAoWHh+uVV16RJC1cuFCVKlXSf/7zH73wwgv66aefdPToUf3888/y9PRUjRo1NH78eA0fPlyhoaHKmzdvVlwWAAAAAAAAgFwqSwPXByUmJuqbb77RzZs35e/vr3379ikhIUGNGjWytKlYsaJKliypXbt26YUXXtCuXbvk6+srT09PS5umTZvqnXfe0e+//y4/P78UzxUfH6/4+HjL55iYGElSQkKCEhISMukK8aD79zlRiWlrn2hOV/92SUnpq8fGNh2NGSNIG8Y5cgPGOXIDxjlyA8b508fPngCQc5nMZnP6vlMa7PDhw/L399ft27fl5OSk8PBwtWjRQuHh4erVq5dVMCpJzz//vBo0aKCPPvpIffv21dmzZ7VhwwbL/ri4OOXPn19r165V8+bNUzxnaGioxo4dm2x7eHi4HB0djb1AAAAAAAAeEhcXp86dO+v69etycXHJ6nIAAAbK8hmuFSpU0MGDB3X9+nV9++236tGjh7Zs2ZKp5wwJCVFQUJDlc0xMjEqUKKEmTZrwje4pSUhI0MaNG1XhWgXZ6vH/eu1T51S6+m932z1d7SN+/CbNbe1GTExX38i9GOfIDRjnyA0Y58gNGOdP3/0nLQEAOU+WB6558+ZV2bJlJUk1a9bUnj17NGvW
LHXo0EF37tzRtWvX5ObmZml/8eJFFSlSRJJUpEgR/frrr1b9Xbx40bLvUezt7WVvb59su52dnezs7DJ6SUgH2///3+PY2ZrS1W+CTfreB2eXlLZHpyQxRpBujHPkBoxz5AaMc+QGjPOnJ7vWDQB4vPR913sKkpKSFB8fr5o1a8rOzk6RkZGWfcePH1dUVJT8/f0lSf7+/jp8+LAuXbpkabNx40a5uLiocuXKT712AAAAAAAAALlbls5wDQkJUfPmzVWyZEnduHFD4eHh2rx5szZs2CBXV1f17t1bQUFBcnd3l4uLi9599135+/vrhRdekCQ1adJElStXVrdu3fTxxx/rwoULGjlypAYMGJDiDFYAAAAAAAAAyExZGrheunRJ3bt3V3R0tFxdXVWtWjVt2LBBjRs3liTNmDFDNjY2ateuneLj49W0aVPNmTPHcrytra1Wr16td955R/7+/sqfP7969OihcePGZdUlAQAAAAAAAMjFsjRwXbBgQar78+XLp88//1yff/75I9t4e3tr7dq1RpcGAAAAAAAAAOn2zK3hCgAAAAAAAADZFYErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIPkSe8BSUlJ2rJli7Zt26azZ88qLi5OhQsXlp+fnxo1aqQSJUpkRp0AAAAAAAAA8MxL8wzXW7duacKECSpRooRatGihdevW6dq1a7K1tdWff/6pMWPGqFSpUmrRooX+85//ZGbNAAAAAAAAAPBMSvMM1/Lly8vf31/z589X48aNZWdnl6zN2bNnFR4ero4dO+rDDz9Unz59DC0WAAAAAAAAAJ5laQ5cf/rpJ1WqVCnVNt7e3goJCVFwcLCioqIyXBwAAAAAAAAAZCdpXlLgcWHrg+zs7FSmTJknKggAAAAAAAAAsqs0B64PWr9+vbZv3275/Pnnn6tGjRrq3Lmzrl69alhxAAAAAAAAAJCdPFHgOmzYMMXExEiSDh8+rPfee08tWrTQ6dOnFRQUZGiBAAAAAAAAAJBdpHkN1wedPn1alStXliR99913evXVVzVp0iTt379fLVq0MLRAAAAAAAAAAMgunmiGa968eRUXFydJ+vnnn9WkSRNJkru7u2XmKwAAAAAAAADkNk80w/XFF19UUFCQAgIC9Ouvv2r58uWSpD/++EPFixc3tEAAAAAAAAAAyC6eaIbr7NmzlSdPHn377beaO3euihUrJklat26dmjVrZmiBAAAAAAAAAJBdPNEM15IlS2r16tXJts+YMSPDBQEAAAAAAABAdvVEget9ly5d0qVLl5SUlGS1vVq1ahkqCgAAAAAAAACyoycKXPft26cePXro2LFjMpvNkiSTySSz2SyTyaTExERDiwQAAAAAAACA7OCJAtc333xT5cuX14IFC+Tp6SmTyWR0XQAAAAAAAACQ7TxR4Hrq1Cl99913Klu2rNH1AAAAAAAAAEC2ZfMkBzVs2FCHDh0yuhYAAAAAAAAAyNaeaIbrl19+qR49eujIkSOqWrWq7OzsrPa/9tprhhQHAAAAAAAAANnJEwWuu3bt0o4dO7Ru3bpk+3hpFgAAAAAAAIDc6okC13fffVddu3bVqFGj5OnpaXRNAAAAAICM+m86Xm6c6CDp60wrBQCA3OSJAtd//vlHQ4cOJWwFAABA9kQQBQAAgEzyRC/Natu2rTZt2mR0LQAAAAAAAACQrT3RDNfy5csrJCRE27dvl6+vb7KXZg0aNMiQ4gDgmcOMKOQGjHMAAAAAeGJPFLh++eWXcnJy0pYtW7RlyxarfSaTicA1t+IHdAAAAAAAAORyTxS4nj592ug6AAAAAAAAACDbe6I1XAEAAAAAAAAAyaU5cJ0yZYpu3bqVpra7d+/WmjVrnrgoAAAAAAAAAMiO0rykwNGjR1WyZEm9/vrrCgwMVK1atVS4cGFJ0t27d3X06FFt375dS5cu1fnz57V48eJMKxpPx/WxY9PV3rVDJhUCAAAAAAAAZBNpDlwXL16sQ4cOafbs2ercubNiYmJka2sre3t7xcXFSZL8/Pz01ltvqWfPnsqXL1+mFQ0AAAAAAAAAz6J0vTSrevXqmj9/vv71r3/pt99+09mzZ3Xr1i0VKlRINWrUUKFChTKrTgAAAOCReDIHAAAAz4p0Ba732djYqEaNGqpRo4bB5QAAAKMRRAEAAADA0/NEgSsA5BQEUQAAILvg/1sAAMgebLK6AAAAAAAAAADIKQhcAQAAAAAAAMAgBK4AAAAAAAAAYJB0Ba62tra6dOlSZtUCAAAAAAAAANlaugJXs9mcWXUAAAAAAAAAQLbHkgIAAAAAAAAAYJA86T3gyy+/lJOTU6ptBg0a9MQFAQAAAAAAAEB2le7A9YsvvpCtre0j95tMJgJXAAAAAAAAALlSugPXvXv3ysPDIzNqAQAAAAAAAIBsLV1ruJpMpsyqAwAAAAAAAACyvXQFrmazObPqAAAAAAAAAIBsL12B65gxYx77wiwAAAAAAAAAyK3SFbguWLBAt27dsnyePXu2YmJiDC8KAAAAAAAAALKjdAWuf/31lxITEy2fP/jgA/3vf/8zvCgAAAAAAAAAyI7SFbg+jDVdAQAAAAAAAOD/ZChwBQAAAAAAAAD8nzzpPeDLL7+0vDjr7t27CgsLU6FChazaDBo0yJjqAAAAAAAAACAbSVfgWrJkSc2fP9/yuUiRIlqyZIlVG5PJROAKAAAAAAAAIFdKV+B65syZTCoDAAAAAAAAALK/dK3h+sorr+jatWuZVAoAAAAAAAAAZG/pClw3b96sO3fuZFYtAAAAAAAAAJCtpStwBQAAAAAAAAA8WrrWcJWko0eP6sKFC6m2qVat2hMXBAAAAAAAAADZVboD14YNG8psNifbbjKZZDabZTKZlJiYaEhxAAAAAAAAAJCdpDtw3b17twoXLpwZtQAAAAAAAABAtpbuwLVkyZLy8PDIjFo
AAAAAAAAAIFvjpVkAAAAAAAAAYJB0Ba716tXTnTt3MqsWAAAAAAAAAMjW0hW4bt26VXnz5s2sWgAAAAAAAAAgW0tX4Go2mzOrDgAAAAAAAADI9tK9hqvJZMqMOgAAAAAAAAAg28uT3gPKly//2ND1ypUrT1wQAAAAAAAAAGRX6Q5cx44dK1dX18yoBQAAAAAAAACytXQHrh07dpSHh0dm1AIAAAAAAAAA2Vq61nBl/VYAAAAAAAAAeLR0Ba5mszmz6gAAAAAAAACAbC9dSwokJSVlVh0AAAAAAAAAkO2la4YrAAAAAAAAAODRCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMkqWB6+TJk1W7dm05OzvLw8NDrVu31vHjx63a3L59WwMGDFDBggXl5OSkdu3a6eLFi1ZtoqKi1LJlSzk6OsrDw0PDhg3T3bt3n+alAAAAAAAAAEDWBq5btmzRgAED9J///EcbN25UQkKCmjRpops3b1raDB06VD/++KO++eYbbdmyRefPn1fbtm0t+xMTE9WyZUvduXNHO3fu1KJFixQWFqbRo0dnxSUBAAAAAAAAyMXyZOXJ169fb/U5LCxMHh4e2rdvn15++WVdv35dCxYsUHh4uF555RVJ0sKFC1WpUiX95z//0QsvvKCffvpJR48e1c8//yxPT0/VqFFD48eP1/DhwxUaGqq8efMmO298fLzi4+Mtn2NiYiRJCQkJSkhIyMQrzl7u2qQvj09IdEh726R7bROVmMa+zemqxS4pKV3tE2xs09GYMZKTMM4f1ZhxnpMwzh/VmHGekzDOH9WYcZ6TMM4f1Th7jnN+9gSAnMtkNpvT950yE/35558qV66cDh8+rKpVq+qXX35Rw4YNdfXqVbm5uVnaeXt7a8iQIRo6dKhGjx6tVatW6eDBg5b9p0+fVunSpbV//375+fklO09oaKjGjh2bbHt4eLgcHR0z49IAAAAAALCIi4tT586ddf36dbm4uGR1OQAAA2XpDNcHJSUlaciQIQoICFDVqlUlSRcuXFDevHmtwlZJ8vT01IULFyxtPD09k+2/vy8lISEhCgoKsnyOiYlRiRIl1KRJE77RPSBmypR0tXdpOznNbROSHLTx1FeqcK2CbPX4f732qXMqXbW0u+2ervYRP36T5rZ2Iyamq2882xjnKWOc5yyM85QxznMWxnnKGOc5C+M8Zdl1nN9/0hIAkPM8M4HrgAEDdOTIEW3fvj3Tz2Vvby97e/tk2+3s7GRnZ5fp588u8qTzsR8721vpPoft///v8X2b0tVvQjoft7JLStujU5IYIzkM4/wRbRnnOQrj/BFtGec5CuP8EW0Z5zkK4/wRbbPpOM+udQMAHi9LX5p138CBA7V69Wpt2rRJxYsXt2wvUqSI7ty5o2vXrlm1v3jxoooUKWJpc/HixWT77+8DAAAAAAAAgKclSwNXs9msgQMHauXKlfrll19UqlQpq/01a9aUnZ2dIiMjLduOHz+uqKgo+fv7S5L8/f11+PBhXbp0ydJm48aNcnFxUeXKlZ/OhQAAAAAAAACAsnhJgQEDBig8PFz//ve/5ezsbFlz1dXVVQ4ODnJ1dVXv3r0VFBQkd3d3ubi46N1335W/v79eeOEFSVKTJk1UuXJldevWTR9//LEuXLigkSNHasCAASkuGwAAAAAAAAAAmSVLA9e5c+dKkurXr2+1feHCherZs6ckacaMGbKxsVG7du0UHx+vpk2bas6cOZa2tra2Wr16td555x35+/srf/786tGjh8aNG/e0LgMAAAAAAAAAJGVx4Go2mx/bJl++fPr888/1+eefP7KNt7e31q5da2RpAAAAAAAAAJBuz8RLswAAAAAAAAAgJyBwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACD5MnqAgAAAAAgt5py4H9pbvtOJtYBAACMQ+AKIMfhBxfkBoxzAAAAAHg2saQAAAAAAAAAABiEGa4AAAB4JjGTGwAAANkRgWsuww8uAAAAAAAAQOZhSQEAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABuGlWQCeebOuzkrnEV0ypQ4AAAAAAIDHIXDN5giiAAAAAAAAgGcHSwoAAAAAAAAAgEGY4QoAwDOAJxYAIGfg73MAAEDgCgAAgKeCIAoAAAC5AUsKAAAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAA
AAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAbJ0sB169atCgwMlJeXl0wmk3744Qer/WazWaNHj1bRokXl4OCgRo0a6cSJE1Ztrly5oi5dusjFxUVubm7q3bu3YmNjn+JVAAAAAAAAAMA9WRq43rx5U9WrV9fnn3+e4v6PP/5Yn376qb744gvt3r1b+fPnV9OmTXX79m1Lmy5duuj333/Xxo0btXr1am3dulV9+/Z9WpcAAAAAAAAAABZ5svLkzZs3V/PmzVPcZzabNXPmTI0cOVKtWrWSJC1evFienp764Ycf1LFjRx07dkzr16/Xnj17VKtWLUnSZ599phYtWmjq1Kny8vJ6atcCAAAAAAAAAFkauKbm9OnTunDhgho1amTZ5urqqjp16mjXrl3q2LGjdu3aJTc3N0vYKkmNGjWSjY2Ndu/erTZt2qTYd3x8vOLj4y2fY2JiJEkJCQlKSEjIpCvKHKa7pnS1t0m6m+a2d23SNwE6IdEh7W2T7rVNVGIa+zanqxa7pKR0tU+wsU1H4+w1RnICxnnKGOc5C+M8ZYzznIVxnjLGec7COE8Z4zy57PazJwAg7Uxmszl93ykziclk0sqVK9W6dWtJ0s6dOxUQEKDz58+raNGilnZvvPGGTCaTli9frkmTJmnRokU6fvy4VV8eHh4aO3as3nnnnRTPFRoaqrFjxybbHh4eLkdHR+MuCgAAAACAFMTFxalz5866fv26XFxcsrocAICBntkZrpkpJCREQUFBls8xMTEqUaKEmjRpku2+0c29Njdd7W9HvZHmtr3Xzk9X3y5tJ6e5bUKSgzae+koVrlWQrR7/r9c+dU6lq5Z2t93T1T7ix2/S3NZuxMR09Y2MY5ynjHGeszDOU8Y4z1kY5yljnOcsjPOUMc6Tu/+kJQAg53lmA9ciRYpIki5evGg1w/XixYuqUaOGpc2lS5esjrt7966uXLliOT4l9vb2sre3T7bdzs5OdnZ2BlT/9JjzpG+CcpJN2r/kedL52I+d7a10tZck2///3+P7Tt+jWQnpfNzKLiltj05JynZjJCdgnKeMcZ6zMM5TxjjPWRjnKWOc5yyM85QxzpPLrnUDAB4vfd/1nqJSpUqpSJEiioyMtGyLiYnR7t275e/vL0ny9/fXtWvXtG/fPkubX375RUlJSapTp85TrxkAAAAAAABA7palM1xjY2P1559/Wj6fPn1aBw8elLu7u0qWLKkhQ4ZowoQJKleunEqVKqVRo0bJy8vLss5rpUqV1KxZM/Xp00dffPGFEhISNHDgQHXs2FFeXl5ZdFUAAAAAAAAAcqssDVz37t2rBg0aWD7fX1e1R48eCgsL0/vvv6+bN2+qb9++unbtml588UWtX79e+fLlsxyzbNkyDRw4UA0bNpSNjY3atWunTz/99KlfCwAAAAAAAABkaeBav359mc2PXuPIZDJp3LhxGjdu3CPbuLu7Kzw8PDPKAwAAAAAAAIB0eWbXcAUAAAAAAACA7IbAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADBIjglcP//8c/n4+ChfvnyqU6eOfv3116wuCQAAAAAAAEAukyMC1+XLlysoKEhjxozR/v37Vb16dTVt2lSXLl3K6tIAAAAAAAAA5CJ5sroAI0yfPl19+vRRr169JElffPGF1qxZo6+++kojRoxI1j4+Pl7x8fGWz9evX5ckXblyRQkJCU+naIPEX49/fKMH3Im5mua2V+/cSVffd6/nS3PbhKR8iouL0/Vb1
2Ur28e2/+d6bLpq0W27dDX/587dNLe1++ef9NWCDGOcPwLjPEdhnD8C4zxHYZw/AuM8R2GcPwLjPJkbN25IksxmcxZXAgAwmsmczf92v3PnjhwdHfXtt9+qdevWlu09evTQtWvX9O9//zvZMaGhoRo7duxTrBIAAAAAgOTOnTun4sWLZ3UZAAADZfsZrv/73/+UmJgoT09Pq+2enp7673//m+IxISEhCgoKsnxOSkrSlStXVLBgQZlMpkytF/fExMSoRIkSOnfunFxcXLK6HCBTMM6RGzDOkRswzpEbMM6fPrPZrBs3bsjLyyurSwEAGCzbB65Pwt7eXvb29lbb3NzcsqaYXM7FxYX/oUOOxzhHbsA4R27AOEduwDh/ulxdXbO6BABAJsj2L80qVKiQbG1tdfHiRavtFy9eVJEiRbKoKgAAAAAAAAC5UbYPXPPmzauaNWsqMjLSsi0pKUmRkZHy9/fPwsoAAAAAAAAA5DY5YkmBoKAg9ejRQ7Vq1dLzzz+vmTNn6ubNm+rVq1dWl4ZHsLe315gxY5It7QDkJIxz5AaMc+QGjHPkBoxzAACMYzKbzeasLsIIs2fP1ieffKILFy6oRo0a+vTTT1WnTp2sLgsAAAAAAABALpJjAlcAAAAAAAAAyGrZfg1XAAAAAAAAAHhWELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAPDEePcmAAAAAFgjcAUAAE/M3t5ex44dy+oyAABPYNu2beratav8/f31999/S5KWLFmi7du3Z3FlAABkb3myugDg3LlzGjNmjL766qusLgXIkFu3bmnfvn1yd3dX5cqVrfbdvn1bK1asUPfu3bOoOiBjgoKCUtyemJioKVOmqGDBgpKk6dOnP82ygEwxe/Zs/frrr2rRooU6duyoJUuWaPLkyUpKSlLbtm01btw45cnD/0Yje/vuu+/UrVs3denSRQcOHFB8fLwk6fr165o0aZLWrl2bxRUCAJB9mcw8C4gsdujQIT333HNKTEzM6lKAJ/bHH3+oSZMmioqKkslk0osvvqiIiAgVLVpUknTx4kV5eXkxzpFt2djYqHr16nJzc7PavmXLFtWqVUv58+eXyWTSL7/8kjUFAgaZMGGCPv74YzVp0kQ7duzQkCFD9Mknn2jo0KGysbHRjBkz9M4772js2LFZXSqQIX5+fho6dKi6d+8uZ2dnHTp0SKVLl9aBAwfUvHlzXbhwIatLBAAg2+Kf5pHpVq1aler+U6dOPaVKgMwzfPhwVa1aVXv37tW1a9c0ZMgQBQQEaPPmzSpZsmRWlwdk2KRJkzRv3jxNmzZNr7zyimW7nZ2dwsLCks3qBrKrsLAwhYWFqW3btjp06JBq1qypRYsWqUuXLpKkihUr6v333ydwRbZ3/Phxvfzyy8m2u7q66tq1a0+/IAAAchACV2S61q1by2QypfpiFZPJ9BQrAoy3c+dO/fzzzypUqJAKFSqkH3/8Uf3799dLL72kTZs2KX/+/FldIpAhI0aMUMOGDdW1a1cFBgZq8uTJsrOzy+qyAMOdP39etWrVkiRVr15dNjY2qlGjhmX/c889p/Pnz2dRdYBxihQpoj///FM+Pj5W27dv367SpUtnTVEAAOQQvDQLma5o0aL6/vvvlZSUlOKv/fv3Z3WJQIbdunXLaj0/k8mkuXPnKjAwUPXq1dMff/yRhdUBxqhdu7b27duny5cvq1atWjpy5Aj/YIYcp0iRIjp69Kgk6cSJE0pMTLR8lqTff/9dHh4eWVUeYJg+ffpo8ODB2r17t0wmk86fP69ly5YpODhY77zzTlaXBwBAtsYMV2S6mjVrat++fWrVqlWK+x83+xXIDipWrKi9e/eqUqVKVttnz54tSXrttdeyoizAcE5OTlq0aJEiIiLUqFEj1iVGjtOlSxd1795drVq1UmRkpN5//30FBwfrn3/+kclk0sSJE9W+ffusLhPIsBEjRigpKUkNGzZUXFycXn75Zdnb2ys4OFjvvvtuVpcHAEC2xkuzkOm2bdummzdvqlmzZinuv3nzpvbu3at69eo95coA40yePFnbtm175Bt9+/fvry+++EJJSUlPuTIg8/z111/at2+fGjVqxLIZyDGSkpI0ZcoU7dq1S3Xr1tWIESO0fPlyvf/++4qLi1NgYKBmz57NmEeOcefOHf3555+KjY1V5cqV5eTklNUlAQCQ7RG4AgAAAAAAAIBBWFIAAAAAAHKZmzdvasqUKYqMjNSlS5eSPYVz6tSpLKoMAIDsj8AVAAAAAHKZt956S1u2bFG3bt1UtGhRXoIIAICBWFIAAAAAAHIZNzc3rVmzRgEBAVldCgAAOY5NVhcAAAAAAHi6ChQoIHd396wuAwCAHInAFQAAAABymfHjx2v06NGKi4vL6lIAAMhxWFIAAAAAAHIZPz8/nTx5UmazWT4+PrKzs7Pav3///iyqDACA7I+XZgEAAABALtO6deusLgEAgByLGa4AAAAAAAAAYBDWcAUAAAAAAAAAg7CkAAAAAADkAu7u7vrjjz9UqFAhFShQQCaT6ZFtr1y58hQrAwAgZyFwBQAAAIBcYMaMGXJ2dpYkzZw5M2uLAQAgB2MNVwAAAAAAAAAwCDNcAQAAACAXiImJSXNbFxeXTKwEAICcjRmuAAAAAJAL2NjYpLpuqySZzWaZTCYlJiY+paoAAMh5mOEKAAAAALnApk2bsroEAAByBWa4AgAAAEAu0LZtW4WFhcnFxUWLFy9Whw4dZG9vn9VlAQCQ4xC4AgAAAEAukDdvXp09e1ZFixaVra2toqOj5eHhkdVlAQCQ47CkAAAAAADkAhUrVlRISIgaNGggs9msFStWPPLlWN27d3/K1QEAkHMwwxUAAAAAcoEdO3bovffe08mTJ3XlyhU5Ozun+BItk8mkK1euZEGFAADkDASuAAAAAJDL2NjY6MKFCywpAABAJrDJ6gIAAAAAAJmvbdu2iomJkSQtXLhQzs7OWVwRAAA5EzNcAQAAACAX4KVZAAA8Hbw0CwAAAAByAV6aBQDA08EMVwAAAADIBXbu3KmgoCBemgUAQCYjcAUAAACAXMbGxkbR0dHy9PTM6lIAAMhxCFwBAAAAIJc5e/asXFxc9NVXX+nYsWOSpCpVqqh3796PXGYAAACkDYErAAAAAOQye/fuVdOmTeXg4KDnn39ekrRnzx7dunVLGzZsUM2aNbO4QgAAsi8CVwAAAADIZV566SWVLVtW8+fPV548996lfPfuXb311ls6deqUtm7dmsUVAgCQfRG4AgAAAEAu4+DgoAMHDqhixYpW248ePapatWopLi4uiyoDACD7s8nqAgAAAAAAT5eLi4uioqKSbT937pycnZ2zoCIAAHIOAlcAAAAAyGU6dOig3r17a/ny5Tp37pzOnTuniIgIvfXWW+rUqVNWlwcAQLaWJ6sLAAAAAAA8XVOnTpXJZFL37t119+5dSZKdnZ3eeecdTZkyJYurAwAge2MNVwAAAADIpeLi4nTy5ElJUpkyZeTo6JjFFQEAkP0RuAIAAAAAAACA
QVjDFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAEgDk8mkH374wbD+Nm/eLJPJpGvXrhnW59Nw584dlS1bVjt37szqUnKF9evXq0aNGkpKSsrqUgAAAACkEYErACDH2rVrl2xtbdWyZcsM9xUdHa3mzZsbUFXa+fj4yGQyyWQyydHRUb6+vvryyy/T3Y+RYfEXX3yhUqVKqW7duob0l9V8fHw0c+bMdB939uxZOTg4KDY21viiHtCsWTPZ2dlp2bJlmXoeAAAAAMYhcAUA5FgLFizQu+++q61bt+r8+fOptjWbzbp7926y7Xfu3JEkFSlSRPb29plSZ2rGjRun6OhoHTlyRF27dlWfPn20bt26p16HdO8ezZ49W7179061XUJCwlOqKOv8+9//VoMGDeTk5JTp5+rZs6c+/fTTTD8PAAAAAGMQuAIAcqTY2FgtX75c77zzjlq2bKmwsDCr/fcf6V+3bp1q1qwpe3t7bd++XfXr19fAgQM1ZMgQFSpUSE2bNpVkPUu0bt26Gj58uFV/ly9flp2dnbZu3SpJWrJkiWrVqiVnZ2cVKVJEnTt31qVLl9J9HfePL126tIYPHy53d3dt3LjRsn/Pnj1q3LixChUqJFdXV9WrV0/79++37Pfx8ZEktWnTRiaTyfJZuhcaPvfcc8qXL59Kly6tsWPHphg637dv3z6dPHnSasbwmTNnZDKZtHz5ctWrV0/58uWzzMb88ssvValSJeXLl08VK1bUnDlzrPr79ddf5efnp3z58qlWrVpauXKlTCaTDh48KEkKCwuTm5ub1TE//PCDTCaT1bbUrsNsNis0NFQlS5aUvb29vLy8NGjQIElS/fr1dfbsWQ0dOtQyk1i6N3s1MDBQBQoUUP78+VWlShWtXbs22Tlfe+01SffG0vPPP6/8+fPLzc1NAQEBOnv2bJrv87Vr19SvXz95enoqX758qlq1qlavXm3ZHxgYqL179+rkyZOP/NoAAAAAeHbkyeoCAADIDCtWrFDFihVVoUIFde3aVUOGDFFISEiysG7EiBGaOnWqSpcurQIFCkiSFi1apHfeeUc7duxIse8uXbro448/1pQpUyz9LV++XF5eXnrppZck3ZvlOX78eFWoUEGXLl1SUFCQevbsmSy4S6ukpCStXLlSV69eVd68eS3bb9y4oR49euizzz6T2WzWtGnT1KJFC504cULOzs7as2ePPDw8tHDhQjVr1ky2traSpG3btql79+769NNP9dJLL+nkyZPq27evJGnMmDEp1rBt2zaVL19ezs7OyfaNGDFC06ZNswSoy5Yt0+jRozV79mz5+fnpwIED6tOnj/Lnz68ePXooNjZWr776qho3bqylS5fq9OnTGjx4cLrvy+Ou47vvvtOMGTMUERGhKlWq6MKFCzp06JAk6fvvv1f16tXVt29f9enTx9LngAEDdOfOHW3dulX58+fX0aNHrWayXrt2Tdu3b9eSJUt09+5dtW7dWn369NHXX3+tO3fu6Ndff7WMi8fVl5SUpObNm+vGjRtaunSpypQpo6NHj1q+TpJUsmRJeXp6atu2bSpTpky67xEAAACAp8wMAEAOVLduXfPMmTPNZrPZnJCQYC5UqJB506ZNlv2bNm0ySzL/8MMPVsfVq1fP7Ofnl6w/SeaVK1eazWaz+dKlS+Y8efKYt27datnv7+9vHj58+CPr2bNnj1mS+caNG1bnv3r16iOP8fb2NufNm9ecP39+c548ecySzO7u7uYTJ0488pjExESzs7Oz+ccff0yx9vsaNmxonjRpktW2JUuWmIsWLfrIvgcPHmx+5ZVXrLadPn3aLMlyr+8rU6aMOTw83Grb+PHjzf7+/maz2Wz+17/+ZS5YsKD51q1blv1z5841SzIfOHDAbDabzQsXLjS7urpa9bFy5Urzg//78rjrmDZtmrl8+fLmO3fupHhN3t7e5hkzZlht8/X1NYeGhqbY3mw2m5ctW2auVauW2Ww2m//55x+zJPPmzZtTbPu4+jZs2GC2sbExHz9+/JHnM5vNZj8/v1RrAgAAAPDsYEkBAECOc/z4cf3666/q1KmTJClPnjzq0KGDFixYkKxtrVq1km2rWbNmqv0XLlxYTZo0sTw6f/r0ae3atUtdunSxtNm3b58CAwNVsmRJOTs7q169epKkqKiodF3LsGHDdPDgQf3yyy+qU6eOZsyYobJly1r2X7x4UX369FG5cuXk6uoqFxcXxcbGPvY8hw4d0rhx4+Tk5GT51adPH0VHRysuLi7FY27duqV8+fKluO/B+3jz5k2dPHlSvXv3tup/woQJlsfijx07pmrVqln15+/vn+b7ktbreP3113Xr1i2VLl1affr00cqVK1NdNkGSBg0apAkTJiggIEBjxozRb7/9ZrX/weUE3N3d1bNnTzVt2lSBgYGaNWuWoqOj01zfwYMHVbx4cZUvXz7VmhwcHB75dQEAAADwbCFwBQDkOAsWLNDdu3fl5eWlPHnyKE+ePJo7d66+++47Xb9+3apt/vz5kx2f0raHdenSRd9++60SEhIUHh4uX19f+fr6SroXODZt2lQuLi5atmyZ9uzZo5UrV0r6v5dwpVWhQoVUtmxZvfTSS/rmm280aNAgHT161LK/R48eOnjwoGbNmqWdO3fq4MGDKliw4GPPExsbq7Fjx+rgwYOWX4cPH9aJEyceGaoWKlRIV69eTXHfg/csNjZWkjR//nyr/o8cOaL//Oc/ab52Gxsbmc1mq20Pv5DrcddRokQJHT9+XHPmzJGDg4P69++vl19+OdUXe7311ls6deqUunXrpsOHD6tWrVr67LPPJN37+q1fv94SuErSwoULtWvXLtWtW1fLly9X+fLlLdf5uPocHBzSdC+uXLmiwoULp6ktAAAAgKxF4AoAyFHu3r2rxYsXa9q0aVYh16FDh+Tl5aWvv/7akPO0atVKt2/f1vr16xUeHm41u/W///2v/vnnH02ZMkUvvfSSKlas+EQvzHpYiRIl1KFDB4WEhFi27dixQ4MGDVKLFi1UpUoV2dvb63//+5/VcXZ2dkpMTLTa9txzz+n48eMqW7Zssl82Nin/74Gfn5/++9//JgtBH+bp6SkvLy+dOnUqWd+lSpWSJFWqVEm//fabbt++bTnu4TC2cOHCunHjhm7evGnZdv+FWum5DgcHBwUGBurTTz/V5s2btWvXLh0+fFiSlDdv3mT3Rrp3r99++219//33eu+99zR//nxJ916QVaBAAVWvXj3ZvQkJCdHOnTtVtWpVhYeHp6m+atWq6a+//tIff/zxyPt5+/ZtnTx5Un5+fqnedwAAAADPBl6aBQDIUVavXq2rV6+qd+/ecnV1tdrXrl07LViwQG+//XaGz5M/f361bt1ao0aN0rFjxyzLF0j3XnKUN29effbZZ3r77bd15MgRjR8/PsPnlKTBgweratWq2rt3r2rVqqVy5cppyZIlqlWrlmJiYjRs2LBksyZ9fHwUGRmpgIAA2dvbq0CBAho9erReffVVlSxZUu3bt5eNjY0
OHTqkI0eOaMKECSmeu0GDBoqNjdXvv/+uqlWrplrn2LFjNWjQILm6uqpZs2aKj4/X3r17dfXqVQUFBalz58768MMP1adPH4WEhOjMmTOaOnWqVR916tSRo6OjPvjgAw0aNEi7d+9WWFiYVZvHXUdYWJgSExMtfS1dulQODg7y9va23JutW7eqY8eOsre3V6FChTRkyBA1b95c5cuX19WrV7Vp0yZVqlRJkrRq1Sqr2a2nT5/WvHnz9Nprr8nLy0vHjx/XiRMn1L179zTVV69ePb388stq166dpk+frrJly+q///2vTCaTmjVrJuleEG1vb/9ESy4AAAAAePqY4QoAyFEWLFigRo0aJQtbpXuB6969e5OtyfmkunTpokOHDumll15SyZIlLdsLFy6ssLAwffPNN6pcubKmTJmSLEx8UpUrV1aTJk00evRoSfeu9+rVq3ruuefUrVs3DRo0SB4eHlbHTJs2TRs3blSJEiUssySbNm2q1atX66efflLt2rX1wgsvaMaMGZYgMiUFCxZUmzZtLGvXpuatt97Sl19+qYULF8rX11f16tVTWFiYZYark5OTfvzxRx0+fFh+fn768MMP9dFHH1n14e7urqVLl2rt2rXy9fXV119/rdDQUKs2j7sONzc3zZ8/XwEBAapWrZp+/vln/fjjjypYsKAkady4cTpz5ozKlCljeWQ/MTFRAwYMUKVKldSsWTOVL19ec+bMkZQ8cHV0dNR///tftWvXTuXLl1ffvn01YMAA9evXL833+bvvvlPt2rXVqVMnVa5cWe+//77VrNuvv/5aXbp0kaOj42PvOwAAAICsZzI/7rlAAACA/++3335T48aNdfLkSTk5ORna95kzZ1SqVCkdOHBANWrUMLRvI+zfv1+vvPKKLl++LDs7u6dyzv/973+qUKGC9u7dawmrAQAAADzbmOEKAADSrFq1avroo490+vTprC7lqbt7964+++yzpxa2SvdC6Dlz5hC2AgAAANkIM1wBAMAz4Vmf4QoAAAAAaUHgCgAAAAAAAAAGYUkBAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgkP8H5wq5TazU+vsAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create a list to store the TTFT data\n", + "ttft_data = []\n", + "\n", + "# Iterate over the models, batch sizes, and arrival rates to calculate TTFT\n", + "for ssm in small_model_names:\n", + " for batch_size in batch_sizes:\n", + " for arrival_rate in arrival_rates:\n", + " model_name = ssm.replace(\"/\", \"-\")\n", + " filepath = f\"/usr/FlexFlow/inference/output/specinfer_llm_meta-llama-Llama-3.1-70B-Instruct_ssm_{model_name}_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n", + " if os.path.exists(filepath):\n", + " ttft = get_ttft(filepath)\n", + " ttft_data.append({\n", + " 'Model': model_name,\n", + " 'Batch Size': batch_size,\n", + " 'Arrival Rate': arrival_rate,\n", + " 'TTFT': ttft\n", + " })\n", + "# add incremental decoding entry\n", + "for batch_size in batch_sizes:\n", + " for arrival_rate in arrival_rates:\n", + " model_name = ssm.replace(\"/\", \"-\")\n", + " filepath = f\"/usr/FlexFlow/inference/output/incr_dec_llm_meta-llama-Llama-3.1-70B-Instruct_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n", + " if os.path.exists(filepath):\n", + " ttft = get_ttft(filepath)\n", + " ttft_data.append({\n", + " 'Model': \"Incr Dec (baseline)\",\n", + " 'Batch Size': batch_size,\n", + " 'Arrival Rate': arrival_rate,\n", + " 'TTFT': ttft\n", + " })\n", + "\n", + "# Convert the list to a DataFrame\n", + "ttft_df = pd.DataFrame(ttft_data)\n", + "print(ttft_df.head())\n", + "\n", + "# Pivot the dataframe to have models and batch sizes as columns\n", + "pivot_df = ttft_df.pivot_table(index='Arrival Rate', columns=['Model', 'Batch Size'], values='TTFT')\n", + "\n", + "# Plot the data\n", + "fig, ax = plt.subplots(figsize=(12, 8))\n", + "\n", + "colors = ['lightgreen', 'skyblue', 'lightcoral', 'gold', 'plum', 'peachpuff', 'mediumturquoise', 'salmon']\n", + "pivot_df.plot(kind='bar', ax=ax, color=colors)\n", + "\n", + "ax.set_title('TTFT vs Arrival Rate for Different Models and Batch Sizes\\nLLM: LLAMA-3.1-70B-Instruct')\n", + "ax.set_xlabel('Arrival Rate (requests/sec)')\n", + "ax.set_ylabel('TTFT (ms)')\n", + "ax.grid(True)\n", + "ax.legend(title='Model and Batch Size', bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "\n", + "# Save the plot as a PDF\n", + "plt.savefig('/usr/FlexFlow/benchmarking/ttft_vs_arrival_rate.pdf', bbox_inches='tight')\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                   Model Batch Size Arrival Rate  Queueing Time\n",
+      "0  Zhuominc-Llama-3-330M          4      offline     376.053818\n",
+      "1  Zhuominc-Llama-3-330M          4            1     319.585296\n",
+      "2  Zhuominc-Llama-3-330M          4            2     346.747481\n",
+      "3  Zhuominc-Llama-3-330M          4            4     360.138720\n",
+      "4  Zhuominc-Llama-3-330M          4            8     368.694877\n"
+     ]
+    },
+    {
+     "data": {
+      "image/png": "[... base64-encoded matplotlib PNG figure data omitted ...]
27ZNnp6emjFjhs6dO2fxKP7t/P3331lCtuweT69cubIWL16stWvXKjg4WEuWLNHOnTsVHByc59rj4uIUHR2thg0bavXq1VqxYsUdj5syZYoaNGigihUrKiUlRTExMVqyZInmzfv/P2gbPXq0Tp8+rcWLF5u3ZV5rUlKS+dqLFy+e473q0KGDJk2apPLly6tGjRras2ePZsyYkeXFU8nJyeYnvM+dO6eIiAiVKFFCrVu3tuWW5Oj48eOaP3++OnbsqICAAB06dEiHDx9W3759Jd1YA/f48ePau3evypYtK1dX19vO5m7UqJGcnZ31xhtv6OWXX9aOHTsUGRlp0Wb8+PFq0aKFKlasqO7du+v69euKiYnRyJEjzefbvHmzunfvLkdHxzsuESBJJUqUyLIOcub6ybdu37JliyH3r8AD11vl9NOBzEGa+Y0YEhKiSZMm6fz58/L19ZV04xEYNzc3q77JAQAAAABA/qsQWuHOjQpQrVq19OCDD2rZsmUWa4/mRkREhHx8fDRlyhQdO3ZMHh4eevDBB83rgJYpU0YTJkzQqFGj1L9/f/Xt21eRkZGKjo7Wyy+/rJo1a6pKlSp67733cv2oeqbMkK5x48YqVaqURo4cacjs6jFjxujYsWNq06aNnJ2dNXDgQHXu3FkJCQl3PDYqKkpRUVEW2yIiIsyzZTMNGjRIe/bs0TPPPCOTyaQePXpo8ODBt12H1FodO3bU8OHDNXToUKWkpKh9+/YaO3aswsPDczzuypUrGjx4sE6dOiUnJydVrVpVn332mZ555hlzm/j4+Czr4948W3LXrl2KiopSYGBgjjON33//fY0dO1aDBw/W+fPnFRAQoEGDBmncuHEW7RYsWKAFCxZIkjw9PVW7dm3FxMSoSpUqVt6NO3N2dtYff/yhRYsW6Z9//lHp0qU1ZMgQ8/dJt27dtHz5cjVv3lyXLl3SwoUL1a9fv2z78vLy0meffabXXntNCxYsUIsWLRQeHm7xhHuzZs305ZdfKiIiQlOnTpWbm5seeeQR8/6JEydq0KBB5uA7c4kCk8mU47mtcfr0aW3bts2Q2e6mjFsXT7iLRo8erbZt26p8+fK6fPmyoqKi9Pbbb2vt2rWqUKGCoqKi1K5dO3l7e2vfvn0aPny4ypYtq02bNkm6Mf2+bt26CggI0LRp03T27Fn16dNHzz33nCZPnmx1HYmJiXJ3d1dCQkKhW1Jg9sXZNrV/xfOVfKrENqmpqYqJiVG7du34yS7uiHGOoiA/x3lCNuuW58R9/Hir2zLOYQvGOYoCxjmsVZj/HXo7V69e1fHjxxUcHHzbF1Ddy1avXq3XXntN+/fvz9PbyQHcXcePH9cDDzygAwcOqHLlyrnuZ+TIkbp48aLmz5+f7X5b/owr0Bmu58+fV9++fRUfHy93d3fVrl1ba9euVatWrfTXX39p/fr1mjVrlq5cuaJy5cqpW7duGjNmjPl4e3t7rVq1Si+++KJCQkJUsmRJhYaGauLEiQV4VQAKk/z8hwsAAACAwqN9+/Y6fPiwTp8+bbGmI4B7W0xMjAYOHJinsFWSfH19Ld75lBcFGrh+8sknt91Xrlw580zWnAQGBiomF2/fBAAAAAAAuNmwYcMKugQANhoyZIgh/bz66quG9CNJzJEHAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAirx//vlHvr6+OnHiREGXYrWNGzfKZDLp0qVLkqTIyEh5eHgUaE25deLECZlMJu3du7egS0ER1b17d02fPt2QvghcAQAAAABA/jr8/d37lUuTJk1Sp06dFBQUZNx1Z+PWkLQoadasmYYNG1bQZeTK8uXL1aBBA3l4eKhkyZKqW7eulixZkuMx8fHx6tmzpx544AHZ2dlZfe1paWkaO3asgoOD5eTkpIoVKyoiIkIZGRnmNs2aNZPJZDL/8vPz01NPPaWTJ0/m2LfR4y88PFx169Y1pC9rBAUFadasWbk6dtasWapSpYqcnJxUrlw5DR8+XFevXjXvHzNmjCZNmqSEhIQ810ngCgAAAAAAirTk5GR98sknGjBgQEGXgnuUl5eX3nzzTW3fvl379u1T//791b9/f61du/a2x6SkpMjHx0djxoxRnTp1rD7X22+/rXnz5umDDz7QwYMH9fbbb2vatGl6//33Ldo9//zzio+P15kzZ/Tf//5Xf/31l3r37p3ra8xPqampBXr+qKgojRo1SuPHj9fBgwf1ySef6IsvvtAbb7xhblOzZk1VrFhRn332WZ7PR+AKAAAAAACKtJiYGDk6Ourhhx82b8ucCbh27VrVq1dPTk5Oeuyxx3T+/HmtWbNG1apVk5ubm3r27Knk5GTzcenp6ZoyZYp5dmKdOnX01VdfSbrx2Hzz5s0lSZ6enjKZTOrXr58k6bvvvlPTpk3l4eEhb29vPfHEEzp69Gieruvo0aPq1KmT/Pz85OLiooYNG2r9+vUWbYKCgvTWW2+pb9++cnFxUWBgoFauXKm///5bnTp1kouLi2rXrq1ffvnFfMw///yjHj16qEyZMnJ2dlatWrX0+eef56nWW6WlpWnAgAHm+1ilShXNnj3bok2/fv3UuXNnTZ48WX5+fvLw8NDEiRN1/fp1vfbaa/Ly8lLZsmW1cOFCi+NGjhypBx54QM7OzqpQoYLGjh17x0CwWbNm6tKli6pVq6aKFSvqlVdeUe3atbV169bbHhMUFKTZs2erb9++cnd3t/rat23bpk6dOql9+/YKCgrSk08+qdatW+vnn3+2aOfs7Cx/f3+VLl1aDz/8sIYOHardu3dbfR7p/y9DsXbtWlWrVk0uLi56/PHHFR8fb26zceNGPfTQQypZsqQ8PDzUpEkTnTx5UpGRkZowYYJ+/fVX80zbyMhISZLJZNK8efPUsWNHlSxZUpMmTcp2yYtvvvlGJpPJYtu3336rhg0bqkSJEipVqpS6dOki6cbX4OTJkxo+fLj5fNbatm2bmjRpop49eyooKEitW7dWjx49stzTDh06KDo62oY7mD0CVwAAAAAAUKRt2bJF9evXz3ZfeHi4PvjgA23btk1//fWXnn76ac2aNUtRUVFavXq1vv/+e4uZh1OmTNHixYv14Ycf6vfff9fw4cPVu3dvbdq0SeXKldPXX38tSTp06JDi4+PNIeKVK1c0YsQI/fLLL4qNjZWdnZ26dOmi9PT0XF9XUlKS2rVrp9jYWO3Zs0ePP/64OnTooLi4OIt2M2fOVJMmTbRnzx61b99effr0Ud++fdW7d2/t3r1bFStWVN++fc2PtF+9elX169fX6tWrtX//fg0cOFB9+vTJEl7lRXp6usqWLasvv/xSBw4c0Lhx4/TGG29o2bJlFu1++OEHnTlzRps3b9aMGTM0fvx4PfHEE/L09NSOHTv0wgsvaNCgQTp16pT5GFdXV0VGRurAgQOaPXu2FixYoJkzZ1pdW0ZGhmJjY3Xo0CE98sgjhl1zpsaNGys2NlZ
//vmnJOnXX3/V1q1b1bZt29sec+HCBS1btkyNGjWy+XzJycl69913tWTJEm3evFlxcXEKCwuTJF2/fl2dO3fWo48+qn379mn79u0aOHCgTCaTnnnmGb366quqUaOG4uPjFR8fr2eeecbcb3h4uLp06aLffvtNzz77rFW1rF69Wl26dFG7du20Z88excbG6qGHHpJ0Y1mHsmXLauLEiebzWatx48batWuXeYweO3ZMMTExateunUW7hx56SD///LNSUlKs7js7xfJ0NAAAAAAAQCF38uRJBQQEZLvvrbfeUpMmTSRJAwYM0OjRo3X06FFVqFBBkvTkk09qw4YNGjlypFJSUjR58mStX79eISEhkqQKFSpo69at+uijj/Too4/Ky8tLkuTr62sx269bt24W5/3000/l4+OjAwcOqGbNmrm6rjp16lg8yh4REaEVK1Zo5cqVGjp0qHl7u3btNGjQIEnSuHHjNG/ePDVs2FBPPfWUpBszQkNCQnTu3Dn5+/urTJky5kBOkl566SWtXbtWy5YtM4djeeXg4KAJEyaYPwcHB2v79u1atmyZnn76afN2Ly8vvffee7Kzs1OVKlU0bdo0JScnmx8VHz16tKZOnaqtW7eqe/fukm6s1ZkpKChIYWFhio6O1uuvv55jTQkJCSpTpoxSUlJkb2+vuXPnqlWrVoZc781GjRqlxMREVa1aVfb29kpLS9OkSZPUq1cvi3Zz587Vxx9/rIyMDCUnJ+uBBx7IcYmD20lNTdWHH36oihUrSpKGDh2qiRMnSpISExOVkJCgJ554wry/WrVq5mNdXFxUrFgx+fv7Z+m3Z8+e6t+/v021TJo0Sd27d7f42meOYS8vL9nb28vV1TXb8+WkZ8+e+t///qemTZsqIyND169f1wsvvGCxpIAkBQQE6Nq1azp79qwCAwNtOsfNmOEKAAAAAACKtH///VclSpTIdl/t2rXNv/fz8zM/hn7ztvPnz0uSjhw5ouTkZLVq1UouLi7mX4sXL77j8gCHDx9Wjx49VKFCBbm5uZlf3pU5G7Vt27bm/mrUqGHVdSUlJSksLEzVqlWTh4eHXFxcdPDgwSwzXG+9RkmqVatWlm2Z15mWlqaIiAjVqlVLXl5ecnFx0dq1a839Ll261OL6t2zZYlW9t5ozZ47q168vHx8fubi4aP78+Vlqr1Gjhuzs/n+85efnZ1G7vb29vL29zbVL0hdffKEmTZrI399fLi4uGjNmjLnfuLg4i9onT55sPs7V1VV79+7Vzp07NWnSJI0YMUIbN27M1bVJN2ZW33yupUuXSpKWLVumpUuXKioqSrt379aiRYv07rvvatGiRRbH9+rVS3v37jXPgK1UqZJat26ty5cvm+9NZt85zY51dnY2h6mSVLp0afP98vLyUr9+/dSmTRt16NBBs2fPtnpmaYMGDWy6H5K0d+9etWjRwubj7mTjxo2aPHmy5s6dq927d2v58uVavXq1IiIiLNo5OTlJksUyIbnBDFcAAAAAAFCklSpVShcvXsx2n4ODg/n3JpPJ4nPmtszH/pOSkiTdeCy6TJkyFu0cHR1zrKFDhw4KDAzUggULFBAQoPT0dNWsWVPXrl2TJH388cf6999/s9SUk7CwMK1bt07vvvuuKlWqJCcnJz355JPmPm93jbfblnmd77zzjmbPnq1Zs2apVq1aKlmypIYNG2but2PHjhaPtt96L6wRHR2tsLAwTZ8+XSEhIXJ1ddU777yjHTt23Lb2zFpz+hpt375dvXr10oQJE9SmTRu5u7srOjpa06dPl3RjhuPevXvNx2bOSJYkOzs7VapUSZJUt25dHTx4UFOmTFGzZs1svj7pRiB587kyg+3XXntNo0aNMs/IrVWrlk6ePKkpU6YoNDTU3N7d3d1cT6VKlfTJJ5+odOnS+uKLL/Tcc88pJibGvDZtZpCYnezuV+byEZK0cOFCvfzyy/ruu+/0xRdfaMyYMVq3bp3FmsfZKVmypMVnOzs7i36lrC/TyqnOvBg7dqz69Omj5557TtKNe3rlyhUNHDhQb775pjm0v3DhgiTJx8cnT+cjcAUAAAAAAEVavXr1DHkzefXq1eXo6Ki4uDg9+uij2bYpXry4pBuzRDP9888/OnTokBYsWKD//Oc/kpTlZUy5CS1//PFH9evXz/zSoaSkJJ04ccLmfrLrt1OnTurdu7ekG0Hsn3/+qerVq0u6MRPU1dU1z+do3LixBg8ebN6W15eISTdenhQYGKg333zTvO3kyZPm3xcrVswcYt5Jenp6ntb6dHJyyvZcycnJFrN2pRszde+0nq+9vb0kmYP5vDwSf6t69eqpXr16Gj16tEJCQhQVFaWHH35YxYsXtxjLOfHx8dHly5d15coVcxh7c+As3ZhtHRsbe9ulCGw5381ud08lWYTA+/fvV9myZVWqVCmbz3EzAlcAAAAAAFCktWnTRqNHj9bFixfl6emZ635cXV0VFham4cOHKz09XU2bNlVCQoJ+/PFHubm5KTQ0VIGBgTKZTFq1apXatWsnJycneXp6ytvbW/Pnz1fp0qUVFxenUaNG5fm6KleurOXLl6tDhw4ymUwaO3Zsnl7CdXO/X331lbZt2yZPT0/NmDFD586dMweuOfn777+zhGylS5fO9hyLFy/W2rVrFRwcrCVLlmjnzp0KDg7Oc+1xcXGKjo5Ww4YNtXr1aq1YseKOx02ZMkUNGjRQxYoVlZKSopiYGC1ZskTz5s0ztxk9erROnz6txYsXm7dlXmtSUpL52osXL57jverQoYMmTZqk8uXLq0aNGtqzZ49mzJiR5cVTycnJOnv2rCTp3LlzioiIUIkSJdS6dWtbbkmOjh8/rvnz56tjx44KCAjQoUOHdPjwYfXt21fSjTVwjx8/rr1796ps2bJydXW97WzuRo0aydnZWW+88YZefvll7dixQ5GRkRZtxo8frxYtWqhixYrq3r27rl+/rpiYGI0cOdJ8vs2bN6t79+5ydHS0Ohjt0KGDZsyYoXr16qlRo0Y6cuSIxo4dqw4dOpiDV+nGMg9G3D8CVwAAAAAAkL8qGxcA5YdatWrpwQcf1LJly8wvj8qtiIgI+fj4aMqUKTp27Jg8PDz04IMPml/OU6ZMGU2YMEGjRo1S//791bdvX0VGRio6Olovv/yyatasqSpVqui9997L9aPqmTJDusaNG6tUqVIaOXKkEhMT89SndOOlU8eOHVObNm3k7OysgQMHqnPnzkpISLjjsVFRUYqKirLYFhERYZ4tm2nQoEHas2ePnnnmGZlMJvXo0UODBw/WmjVr8lR7x44dNXz4cA0dOlQpKSlq3769xo4dq/Dw8ByPu3LligYPHqxTp07JyclJVatW1WeffaZnnnnG3CY+Pj7LGrP16tUz/37Xrl2KiopSYGBgjjON33//fY0dO1aDBw/W+fPnFRAQoEGDBmncuHEW7RYsWKAFCxZIkjw9PVW7dm3FxMSoSpUqVt6NO3N2dtYff/yhRYsW6Z9//lHp0qU1ZMgQ8/dJt27dtHz5cjVv3lyXLl3SwoUL1a9fv2z78v
Ly0meffabXXntNCxYsUIsWLRQeHq6BAwea2zRr1kxffvmlIiIiNHXqVLm5uemRRx4x7584caIGDRpkDr4zZ6eaTKYczz1mzBiZTCaNGTNGp0+flo+PjznYznT16lV98803+u677/J41yRTxq2LJxRBiYmJcnd3V0JCgtzc3Aq6HJvMvjjbpvaveL6ST5XYJjU1VTExMWrXrp3Va8+g6MrPcZ5w05sPreE+frzVbRnnsAXjHEUB4xxFAeMc1irM/w69natXr+r48eMKDg6+7Quo7mWrV6/Wa6+9pv3792d59BjAvev48eN64IEHdODAAVWuXDnX/cybN08rVqzQ999/n+1+W/6MY4YrAAAAAAAo8tq3b6/Dhw/r9OnTKleuXEGXA8BKMTExGjhwYJ7CVunGy8Pef/99Q2oicAUAAAAAAJA0bNiwgi4BgI2GDBliSD/PPfecIf1IEnPkAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAgCLvn3/+ka+vr06cOFHQpVht48aNMplMunTpkiQpMjJSHh4eBVpTbp04cUImk0l79+4t6FJQBF27dk1BQUH65ZdfDOmvmCG9AAAAAAAA3Ebbfbvu2rnW1K6fq+MmTZqkTp06KSgoyNiCbrFx40Y1b95cFy9eLLThaG41a9ZMdevW1axZswq6FJstX75ckydP1pEjR5SamqrKlSvr1VdfVZ8+fXI8Zt68edq7d69SUlJUo0YNhYeHq02bNnc814cffqhdu3bpwoUL2rNnj+rWrWvRJigoSCdPnpQk2dnZyc/PT23bttW7774rT0/P2/YdGRmpYcOGmUP6vOrXr58uXbqkb775xpD+7sRkMmnFihXq3LlzrvuYOnWqRo8erVdeecU8FosXL66wsDCNHDlSsbGxea6TGa4AAAAAAKBIS05O1ieffKIBAwYUdCm4R3l5eenNN9/U9u3btW/fPvXv31/9+/fX2rVrb3vM5s2b1apVK8XExGjXrl1q3ry5OnTooD179uR4ritXrqhp06Z6++23c2w3ceJExcfHKy4uTkuXLtXmzZv18ssv5+r68ltqampBlyBJ2rlzpz766CPVrl07y75evXpp69at+v333/N8HgJXAAAAAABQpMXExMjR0VEPP/yweVvm4/pr165VvXr15OTkpMcee0znz5/XmjVrVK1aNbm5ualnz55KTk42H5eenq4pU6YoODhYTk5OqlOnjr766itJNx6bb968uSTJ09NTJpNJ/fr1kyR99913atq0qTw8POTt7a0nnnhCR48ezdN1HT16VJ06dZKfn59cXFzUsGFDrV+/3qJNUFCQ3nrrLfXt21cuLi4KDAzUypUr9ffff6tTp05ycXFR7dq1LR61/ueff9SjRw+VKVNGzs7OqlWrlj7//PM81XqrtLQ0DRgwwHwfq1SpotmzZ1u06devnzp37qzJkyfLz89PHh4emjhxoq5fv67XXntNXl5eKlu2rBYuXGhx3MiRI/XAAw/I2dlZFSpU0NixY+8YCDZr1kxdunRRtWrVVLFiRb3yyiuqXbu2tm7dettjZs2apddff10NGzZU5cqVNXnyZFWuXFnffvttjufq06ePxo0bp5YtW+bYztXVVf7+/ipTpoyaN2+u0NBQ7d69O8djbhUeHq66detqyZIlCgoKkru7u7p3767Lly+b23z11VeqVauWnJyc5O3trZYtW+rKlSsKDw/XokWL9N///lcmk0kmk0kbN240Lw/xxRdf6NFHH1WJEiW0dOlS87luvUe3zir/9NNPVaNGDTk6Oqp06dIaOnSoJJnbdenSRSaTyebZ6ElJSerVq5cWLFiQ7SxgT09PNWnSRNHR0Tb1mx0CVwAAAAAAUKRt2bJF9etnvxRBeHi4PvjgA23btk1//fWXnn76ac2aNUtRUVFavXq1vv/+e73//vvm9lOmTNHixYv14Ycf6vfff9fw4cPVu3dvbdq0SeXKldPXX38tSTp06JDi4+PNIeKVK1c0YsQI/fLLL4qNjZWdnZ26dOmi9PT0XF9XUlKS2rVrp9jYWO3Zs0ePP/64OnTooLi4OIt2M2fOVJMmTbRnzx61b99effr0Ud++fdW7d2/t3r1bFStWVN++fZWRkSFJunr1qurXr6/Vq1dr//79GjhwoPr06aOff/4517XeKj09XWXLltWXX36pAwcOaNy4cXrjjTe0bNkyi3Y//PCDzpw5o82bN2vGjBkaP368nnjiCXl6emrHjh164YUXNGjQIJ06dcp8jKurqyIjI3XgwAHNnj1bCxYs0MyZM62uLSMjQ7GxsTp06JAeeeQRm67p8uXL8vLysvoYa50+fVrffvutGjVqZPOxR48e1TfffKNVq1Zp1apV2rRpk6ZOnSpJio+PV48ePfTss8/q4MGD2rhxo7p27aqMjAyFhYXp6aef1uOPP674+HjFx8ercePG5n5HjRqlV155RQcPHrzjMgqZ5s2bpyFDhmjgwIH67bfftHLlSlWqVEnSjdmpkrRw4ULFx8ebP1tryJAhat++fY5B9kMPPaQtW7bY1G92WMMVAAAAAAAUaSdPnlRAQEC2+9566y01adJEkjRgwACNHj1aR48eVYUKFSRJTz75pDZs2KCRI0cqJSVFkydP1vr16xUSEiJJqlChgrZu3aqPPvpIjz76qDls8/X1tVjDtVu3bhbn/fTTT+Xj46MDBw6oZs2aubquOnXqqE6dOubPERERWrFihVauXGmeNShJ7dq106BBgyRJ48aN07x589SwYUM99dRTkm7MCA0JCdG5c+fMMyrDwsLMx7/00ktau3atli1bpoceeihXtd7KwcFBEyZMMH8ODg7W9u3btWzZMj399NPm7V5eXnrvvfdkZ2enKlWqaNq0aUpOTtYbb7whSRo9erSmTp2qrVu3qnv37pKkMWPGmI8PCgpSWFiYoqOj9frrr+dYU0JCgsqUKaOUlBTZ29tr7ty5atWqldXX9O677yopKcmi/rwYOXKkxowZo7S0NF29elWNGjXSjBkzbO4nPT1dkZGRcnV1lXRjhm1sbKwmTZqk+Ph4Xb9+XV27dlVgYKAkqVatWuZjnZyclJKSIn9//yz9Dhs2TF27drWplrfeekuvvvqqXnnlFfO2hg0bSpJ8fHwkSR4eHtmeLyfR0dHavXv3HUPagIAA89q4ecEMVwAAAAAAUKT9+++/KlGiRLb7bl7r0c/Pz/wY+s3bzp8/L0k6cuSIkpOT1apVK7m4uJh/LV68+I7LAxw+fFg9evRQhQoV5ObmZn5cOnM2atu2bc391ahRw6rrSkpKUlhYmKpVqyYPDw+5uLjo4MGDWWa43nqNkmWolrkt8zrT0tIUERGhWrVqycvLSy4uLlq7dq2536VLl1pcf25nDM6ZM0f169eXj4+PXFxcNH/+/Cy116hRQ3Z2/z/e8vPzs6jd3t5e3t7e5tol6YsvvlCTJk3k7+8vFxcXjRkzxtxvXFycR
e2TJ082H+fq6qq9e/dq586dmjRpkkaMGKGNGzdadS1RUVGaMGGCli1bJl9fX0l5v0+vvfaa9u7dq3379plf9NS+fXulpaVJkkXfL7zwwm37CQoKMoetklS6dGnz/apTp45atGihWrVq6amnntKCBQt08eJFq+pr0KCBTddz/vx5nTlzRi1atLDpuDv566+/9Morr2jp0qW3/T7P5OTkZLFESG4xwxUAAAAAABRppUqVum2I5ODgYP69yWSy+Jy5LfOx/6SkJEnS6tWrVaZMGYt2jo6OOdbQoUMHBQYGasGCBQoICFB6erpq1qypa9euSZI+/vhj/fvvv1lqyklYWJjWrVund999V5UqVZKTk5OefPJJc5+3u8bbbcu8znfeeUezZ8/WrFmzVKtWLZUsWVLDhg0z99uxY0eLR9tvvRfWiI6OVlhYmKZPn66QkBC5urrqnXfe0Y4dO25be2atOX2Ntm/frl69emnChAlq06aN3N3dFR0drenTp0u6McNx79695mNvfvzfzs7O/Hh73bp1dfDgQU2ZMkXNmjW747U899xz+vLLLy0eZ8/rfSpVqpS5nsqVK2vWrFkKCQnRhg0b1LJlS4vrcHNzu20/Od0ve3t7rVu3Ttu2bTMvn/Hmm29qx44dCg4OzrG+kiVLWny2s7MzL0uR6ea1c52cnHLsL7d27dql8+fP68EHHzRvS0tL0+bNm/XBBx+YZyxL0oULF8wzafOCwBW3lXDT1H1ruI8fn0+VAAAAAACQf+rVq6fPPvssz/1Ur15djo6OiouL06OPPpptm+LFi0uSeRaidOMlVIcOHdKCBQv0n//8R5KyvIwpN6Hljz/+qH79+qlLly6SbgTCJ06csLmf7Prt1KmTevfuLelGEPvnn3+qevXqkm7MBL15xmRuz9G4cWMNHjzYvC2vLxGTpG3btikwMFBvvvmmedvNj5AXK1bMHGLeSXp6ulJSUnJs8/nnn+vZZ59VdHS02rdvb7HPiPt0s8zQMDOYt/Y67sRkMqlJkyZq0qSJxo0bp8DAQK1YsUIjRoxQ8eLFLcZyTnx8fHT27FllZGSYQ/ybQ2FXV1cFBQUpNjbW/HK5Wzk4OFh9vkwtWrTQb7/9ZrGtf//+qlq1qkaOHGm+b5K0f/9+1atXz6b+s0PgCgAAAAAAirQ2bdpo9OjRunjxYrZvL7eWq6urwsLCNHz4cKWnp6tp06ZKSEjQjz/+KDc3N4WGhiowMFAmk0mrVq1Su3bt5OTkJE9PT3l7e2v+/PkqXbq04uLiNGrUqDxfV+XKlbV8+XJ16NBBJpNJY8eOzdNLuG7u96uvvtK2bdvk6empGTNm6Ny5c+bANSd///23Rcgm3XiEPbtzLF68WGvXrlVwcLCWLFminTt33nFWpTW1x8XFKTo6Wg0bNtTq1au1YsWKOx43ZcoUNWjQQBUrVlRKSopiYmK0ZMkSzZs3z9xm9OjROn36tBYvXizpxjICoaGhmj17tho1aqSzZ89KujGT093d/bbnunDhguLi4nTmzBlJN16wJkn+/v4Wa5devnzZHGD+9ddfev311+Xj42Px4qq82rFjh2JjY9W6dWv5+vpqx44d+vvvv1WtWjVJN5YjWLt2rQ4dOiRvb+8cr6tZs2b6+++/NW3aND355JP67rvvtGbNGovZt+Hh4XrhhRfk6+urtm3b6vLly/rxxx/10ksvmc8XGxurJk2ayNHR0arvV1dX1yzrIJcsWVLe3t5Ztm/ZskURERFW35/bIXAFAAAAAAD5ak3t+gVdQo5q1aqlBx98UMuWLTO/PCq3IiIi5OPjoylTpujYsWPy8PDQgw8+aH6JU5kyZTRhwgSNGjVK/fv3V9++fRUZGano6Gi9/PLLqlmzpqpUqaL33nvvjo+q38mMGTP07LPPqnHjxipVqpRGjhypxMTEPPUp3Xjp1LFjx9SmTRs5Oztr4MCB6ty5sxISEu54bFRUlKKioiy2RUREmGfLZho0aJD27NmjZ555RiaTST169NDgwYO1Zs2aPNXesWNHDR8+XEOHDlVKSorat2+vsWPHKjw8PMfjrly5osGDB+vUqVNycnJS1apV9dlnn+mZZ54xt4mPj7dYY3b+/Pm6fv26hgwZoiFDhpi3h4aGKjIy8rbnWrlypfr372/+nPmyr/Hjx1vUOW7cOI0bN07SjdmjDRs21Pfffy9vb29rboVV3NzctHnzZs2aNUuJiYkKDAzU9OnT1bZtW0nS888/r40bN6pBgwZKSkrShg0bzOsP36patWqaO3euJk+erIiICHXr1k1hYWGaP3++uU1oaKiuXr2qmTNnKiwsTKVKldKTTz5p3j99+nSNGDFCCxYsUJkyZXTixAmdOHFCwcHB2rBhQ56+Z7Zv366EhASL8+WWKePWxROKoMTERLm7uyshISHHNS3uRbMvzrap/Suer9y50f/JzyUFUlNTFRMTo3bt2lm99gyKLsY5igLGOYoCxjmKAsY5rFWY/x16O1evXtXx48cVHBx8xxfT3ItWr16t1157Tfv377d4CROAe9uGDRvUtWtXHTt2LE8z1J955hnVqVPH/MORW9nyZxwzXAEAAAAAQJHXvn17HT58WKdPn1a5cuUKuhwAVoqJidEbb7yRp7D12rVrqlWrloYPH25ITQSuAAAAAAAAkoYNG1bQJQCw0TvvvJPnPooXL64xY8YYUM0NzJEHAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAirx//vlHvr6+OnHiREGXYrWNGzfKZDLp0qVLkqTIyEh5eHgUaE25deLECZlMJu3du7egS0ER9fDDD+vrr782pK9ihvQCAAAAAABwG6kTXr1r53IYPz1Xx02aNEmdOnVSUFCQsQXdYuPGjWrevLkuXrxYaMPR3GrWrJnq1q2rWbNmFXQpNlu+fLkmT56sI0eOKDU1VZUrV9arr76qPn365HjMvHnztHfvXqWkpKhGjRoKDw9XmzZt7niuDz/8ULt27dKFCxe0Z88e1a1b16JNUFCQTp48KUmys7OTn5+f2rZtq3fffVeenp637TsyMlLDhg0zh/R51a9fP126dEnffPONIf3diclk0ooVK9S5c2ebjktLS1N4eLg+++wznT17VgEBAerXr5/GjBkjk8kkSRozZoyGDx+uLl26yM4ub3NUmeEKAAAAAACKtOTkZH3yyScaMGBAQZeCe5SXl5fefPNNbd++Xfv27VP//v3Vv39/rV279rbHbN68Wa1atVJMTIx27dql5s2bq0OHDtqzZ0+O57py5YqaNm2qt99+O8d2EydOVHx8vOLi4rR06VJt3rxZL7/8cq6uL7+lpqYW6PnffvttzZs3Tx988IEOHjyot99+W9OmTdP7779vbtO2bVtdvnxZa9asyfP5CFwBAAAAAECRFhMTI0dHRz388MPmbZmP669du1b16tWTk5OTHnvsMZ0/f15r1qxRtWrV5Obmpp49eyo5Odl8XHp6
uqZMmaLg4GA5OTmpTp06+uqrryTdeGy+efPmkiRPT0+ZTCb169dPkvTdd9+padOm8vDwkLe3t5544gkdPXo0T9d19OhRderUSX5+fnJxcVHDhg21fv16izZBQUF666231LdvX7m4uCgwMFArV67U33//rU6dOsnFxUW1a9fWL7/8Yj7mn3/+UY8ePVSmTBk5OzurVq1a+vzzz/NU663S0tI0YMAA832sUqWKZs+ebdGmX79+6ty5syZPniw/Pz95eHho4sSJun79ul577TV5eXmpbNmyWrhwocVxI0eO1AMPPCBnZ2dVqFBBY8eOvWMg2KxZM3Xp0kXVqlVTxYoV9corr6h27draunXrbY+ZNWuWXn/9dTVs2FCVK1fW5MmTVblyZX377bc5nqtPnz4aN26cWrZsmWM7V1dX+fv7q0yZMmrevLlCQ0O1e/fuHI+5VXh4uOrWraslS5YoKChI7u7u6t69uy5fvmxu89VXX6lWrVpycnKSt7e3WrZsqStXrig8PFyLFi3Sf//7X5lMJplMJm3cuNG8PMQXX3yhRx99VCVKlNDSpUvN57r1Ht06q/zTTz9VjRo15OjoqNKlS2vo0KGSZG7XpUsXmUwmm2ajb9u2TZ06dVL79u0VFBSkJ598Uq1bt9bPP/9sbmNvb6927dopOjrapnuYHQJXAAAAAABQpG3ZskX169fPdl94eLg++OADbdu2TX/99ZeefvppzZo1S1FRUVq9erW+//57i1lyU6ZM0eLFi/Xhhx/q999/1/Dhw9W7d29t2rRJ5cqVM68ReejQIcXHx5tDxCtXrmjEiBH65ZdfFBsbKzs7O3Xp0kXp6em5vq6kpCS1a9dOsbGx2rNnjx5//HF16NBBcXFxFu1mzpypJk2aaM+ePWrfvr369Omjvn37qnfv3tq9e7cqVqyovn37KiMjQ5J09epV1a9fX6tXr9b+/fs1cOBA9enTxyK8yqv09HSVLVtWX375pQ4cOKBx48bpjTfe0LJlyyza/fDDDzpz5ow2b96sGTNmaPz48XriiSfk6empHTt26IUXXtCgQYN06tQp8zGurq6KjIzUgQMHNHv2bC1YsEAzZ860uraMjAzFxsbq0KFDeuSRR2y6psuXL8vLy8vqY6x1+vRpffvtt2rUqJHNxx49elTffPONVq1apVWrVmnTpk2aOnWqJCk+Pl49evTQs88+q4MHD2rjxo3q2rWrMjIyFBYWpqefflqPP/644uPjFR8fr8aNG5v7HTVqlF555RUdPHjwjssoZJo3b56GDBmigQMH6rffftPKlStVqVIlSdLOnTslSQsXLlR8fLz5szUaN26s2NhY/fnnn5KkX3/9VVu3blXbtm0t2j300EPasmWL1f3eDmu4AgAAAACAIu3kyZMKCAjIdt9bb72lJk2aSJIGDBig0aNH6+jRo6pQoYIk6cknn9SGDRs0cuRIpaSkaPLkyVq/fr1CQkIkSRUqVNDWrVv10Ucf6dFHHzWHbb6+vhZruHbr1s3ivJ9++ql8fHx04MAB1axZM1fXVadOHdWpU8f8OSIiQitWrNDKlSvNswYlqV27dho0aJAkady4cZo3b54aNmyop556StKNGaEhISE6d+6ceUZlWFiY+fiXXnpJa9eu1bJly/TQQw/lqtZbOTg4aMKECebPwcHB2r59u5YtW6ann37avN3Ly0vvvfee7OzsVKVKFU2bNk3Jycl64403JEmjR4/W1KlTtXXrVnXv3l3SjbU6MwUFBSksLEzR0dF6/fXXc6wpISFBZcqUUUpKiuzt7TV37ly1atXK6mt69913lZSUZFF/XowcOVJjxoxRWlqarl69qkaNGmnGjBk295Oenq7IyEi5urpKujHDNjY2VpMmTVJ8fLyuX7+url27KjAwUJJUq1Yt87FOTk5KSUmRv79/ln6HDRumrl272lTLW2+9pVdffVWvvPKKeVvDhg0lST4+PpIkDw+PbM+Xk1GjRikxMVFVq1aVvb290tLSNGnSJPXq1cuiXUBAgP766y+lp6fnaR1XZrgCAAAAAIAi7d9//1WJEiWy3Ve7dm3z7/38/MyPod+87fz585KkI0eOKDk5Wa1atZKLi4v51+LFi++4PMDhw4fVo0cPVahQQW5ububHpTNno7Zt29bcX40aNay6rqSkJIWFhalatWry8PCQi4uLDh48mGWG663XKFmGapnbMq8zLS1NERERqlWrlry8vOTi4qK1a9ea+126dKnF9ed2xuCcOXNUv359+fj4yMXFRfPnz89Se40aNSyCMT8/P4va7e3t5e3tba5dkr744gs1adJE/v7+cnFx0ZgxY8z9xsXFWdQ+efJk83Gurq7au3evdu7cqUmTJmnEiBHauHGjVdcSFRWlCRMmaNmyZfL19ZWU9/v02muvae/evdq3b59iY2MlSe3bt1daWpokWfT9wgsv3LafoKAgc9gqSaVLlzbfrzp16qhFixaqVauWnnrqKS1YsEAXL160qr4GDRrYdD3nz5/XmTNn1KJFC5uOs8ayZcu0dOlSRUVFaffu3Vq0aJHeffddLVq0yKKdk5OT0tPTlZKSkqfzMcMVAAAAAAAUaaVKlbptiOTg4GD+vclksvicuS3zsf+kpCRJ0urVq1WmTBmLdo6OjjnW0KFDBwUGBmrBggUKCAhQenq6atasqWvXrkmSPv74Y/37779ZaspJWFiY1q1bp3fffVeVKlWSk5OTnnzySXOft7vG223LvM533nlHs2fP1qxZs1SrVi2VLFlSw4YNM/fbsWNHi0fbb70X1oiOjlZYWJimT5+ukJAQubq66p133tGOHTtuW3tmrTl9jbZv365evXppwoQJatOmjdzd3RUdHa3p06dLujHDce/eveZjb378387Ozvx4e926dXXw4EFNmTJFzZo1u+O1PPfcc/ryyy8t1mXN630qVaqUuZ7KlStr1qxZCgkJ0YYNG9SyZUuL63Bzc7ttPzndL3t7e61bt07btm0zL5/x5ptvaseOHQoODs6xvpIlS1p8trOzMy9LkenmtXOdnJxy7C8vXnvtNY0aNco8y7lWrVo6efKkpkyZotDQUHO7CxcuqGTJknmuhcAVAAAAAAAUafXq1dNnn32W536qV68uR0dHxcXF6dFHH822TfHixSXJPAtRuvESqkOHDmnBggX6z3/+I0lZXsaUm9Dyxx9/VL9+/dSlSxdJNwLhEydO2NxPdv126tRJvXv3lnQjiP3zzz9VvXp1STdmgt48YzK352jcuLEGDx5s3pbXl4hJN16eFBgYqDfffNO87eTJk+bfFytWzBxi3ok1MyE///xzPfvss4qOjlb79u0t9hlxn25mb28vSeZg3trruBOTyaQmTZqoSZMmGjdunAIDA7VixQqNGDFCxYsXtxjLOfHx8dHZs2eVkZFhDvFvDoVdXV0VFBSk2NhY88vlbuXg4GD1+W6WnJycZYkAe3v7LGsk79+/X/Xq1bO5/1sRuAIAAAAAgCKtTZs2Gj16tC5evChPT89c9+Pq6qqwsDANHz5c6enpatq0qRISEvTjjz/Kzc1
NoaGhCgwMlMlk0qpVq9SuXTs5OTnJ09NT3t7emj9/vkqXLq24uDiNGjUqz9dVuXJlLV++XB06dJDJZNLYsWPz9BKum/v96quvtG3bNnl6emrGjBk6d+6cOXDNyd9//20Rskk3HmHP7hyLFy/W2rVrFRwcrCVLlmjnzp13nFVpTe1xcXGKjo5Ww4YNtXr1aq1YseKOx02ZMkUNGjRQxYoVlZKSopiYGC1ZskTz5s0ztxk9erROnz6txYsXS7qxjEBoaKhmz56tRo0a6ezZs5JuzOR0d3e/7bkuXLiguLg4nTlzRtKNF6xJkr+/v8XapZcvXzYHmH/99Zdef/11+fj4WLy4Kq927Nih2NhYtW7dWr6+vtqxY4f+/vtvVatWTdKN5QjWrl2rQ4cOydvbO8fratasmf7++29NmzZNTz75pL777jutWbPGYvZteHi4XnjhBfn6+qpt27a6fPmyfvzxR7300kvm88XGxqpJkyZydHS0+vu1Q4cOmjRpksqXL68aNWpoz549mjFjhp599lmLdlu2bFHr1q1tvU1ZELgCAAAAAIB85TB+ekGXkKNatWrpwQcf1LJly8wvj8qtiIgI+fj4aMqUKTp27Jg8PDz04IMPml/iVKZMGU2YMEGjRo1S//791bdvX0VGRio6Olovv/yyatasqSpVqui9996746Pqd5IZKDVu3FilSpXSyJEjlZiYmKc+pRsvnTp27JjatGkjZ2dnDRw4UJ07d1ZCQsIdj42KilJUVJTFtoiICPNs2UyDBg3Snj179Mwzz8hkMqlHjx4aPHiw1qxZk6faO3bsqOHDh2vo0KFKSUlR+/btNXbsWIWHh+d43JUrVzR48GCdOnVKTk5Oqlq1qj777DM988wz5jbx8fEWa8zOnz9f169f15AhQzRkyBDz9tDQUEVGRt72XCtXrlT//v3NnzMfgx8/frxFnePGjdO4ceMk3Zg92rBhQ33//ffy9va25lZYxc3NTZs3b9asWbOUmJiowMBATZ8+XW3btpUkPf/889q4caMaNGigpKQkbdiwwbz+8K2qVaumuXPnavLkyYqIiFC3bt0UFham+fPnm9uEhobq6tWrmjlzpsLCwlSqVCk9+eST5v3Tp0/XiBEjtGDBApUpU0YnTpzQiRMnFBwcrA0bNtz2e+b999/X2LFjNXjwYJ0/f14BAQEaNGiQ+f5J0unTp7Vt2zZDZrubMm5dPOEumjdvnubNm2eezl6jRg2NGzfO/EW7evWqXn31VUVHRyslJUVt2rTR3LlzzYs1SzcWM37xxRe1YcMGubi4KDQ0VFOmTFGxYtZnyYmJiXJ3d1dCQkKOa1rci2ZfnG1T+1c8X7lzo/+TcNPbAK3hPn681W1TU1MVExOjdu3aWb32DIouxjmKAsY5igLGOYoCxjmsVZj/HXo7V69e1fHjxxUcHHzbF1Ddy1avXq3XXntN+/fvz9PbyQHcXRs2bFDXrl117NixPM1QHzlypC5evGgRAN/Mlj/jCvRPkLJly2rq1KnatWuXfvnlFz322GPq1KmTfv/9d0nS8OHD9e233+rLL7/Upk2bdObMGXXt2tV8fFpamtq3b69r165p27ZtWrRokSIjIy3SaQAAAAAAgDtp3769Bg4cqNOnTxd0KQBsEBMTozfeeCNPYask+fr6KiIiwpCaCnRJgQ4dOlh8njRpkubNm6effvpJZcuW1SeffKKoqCg99thjkqSFCxeqWrVq+umnn/Twww/r+++/14EDB7R+/Xr5+fmpbt26ioiI0MiRIxUeHm5eiPpWKSkpFosaZ06nT01NtXg7WmFgum6yqb0t13fdxp/o2dJ3ZtvCdr9RMBjnKAoY5ygKGOcoChjnsBb3+t40bNiwgi4BgI3eeecdQ/p59dVXDelHKuAlBW6WlpamL7/8UqGhodqzZ4/Onj2rFi1a6OLFi/Lw8DC3CwwM1LBhwzR8+HCNGzdOK1eutFhs+fjx46pQoYJ2795927eKhYeHa0I2j+NERUXJ2dnZ6EsDAAAAAMBCcnKyevbsyZICAFBI2PJnXIG/NOu3335TSEiIrl69KhcXF61YsULVq1fX3r17Vbx4cYuwVZL8/PzMb3U7e/asxXqumfsz993O6NGjNWLECPPnxMRElStXTq1bty50f9HNuzTvzo1u8qLHi1a3TZw61aa+3Wx4g2JqaqrWrVunVq1asUYU7ohxjqKAcY6igHGOooBxDmsZ8eIiAMC9qcAD1ypVqmjv3r1KSEjQV199pdDQUG3atClfz+no6ChHR8cs2x0cHArd/1xkFLNtgrIt11csPT3f+r75mMJ2z3H3Mc5RFDDOURQwzlEUMM5hrfv5Pt8jD9ICgKFs+bOtwF+7V7x4cVWqVEn169fXlClTVKdOHc2ePVv+/v66du2aLl26ZNH+3Llz8vf3lyT5+/vr3LlzWfZn7gMAAAAAAHdHZoicnJxcwJUAgPGuXbsmSbK3t79j2wKf4Xqr9PR0paSkqH79+nJwcFBsbKy6desmSTp06JDi4uIUEhIiSQoJCdGkSZN0/vx5+fr6SpLWrVsnNzc3Va9evcCuAQAAAACAosbe3l4eHh46f/68JMnZ2Vkmk20vkgOAe1F6err+/vtvOTs7q1ixO8epBRq4jh49Wm3btlX58uV1+fJlRUVFaePGjVq7dq3c3d01YMAAjRgxQl5eXnJzc9NLL72kkJAQPfzww5Kk1q1bq3r16urTp4+mTZums2fPasyYMRoyZEi2SwYAAAAAAID8k/m0aWboCgD3Czs7O5UvX96qHyQVaOB6/vx59e3bV/Hx8XJ3d1ft2rW1du1atWrVSpI0c+ZM2dnZqVu3bkpJSVGbNm00d+5c8/H29vZatWqVXnzxRYWEhKhkyZIKDQ3VxIkTC+qSAAAAAAAoskwmk0qXLi1fX1+lpqYWdDkAYJjixYvLzs661VkLNHD95JNPctxfokQJzZkzR3PmzLltm8DAQMXExBhdGgAAAAAAyCV7e3ur1jkEgPtRgb80CwAAAAAAAADuFwSuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYpJitBxw/flxbtmzRyZMnlZ
ycLB8fH9WrV08hISEqUaJEftQIAAAAAAAAAIWC1YHr0qVLNXv2bP3yyy/y8/NTQECAnJycdOHCBR09elQlSpRQr169NHLkSAUGBuZnzQAAAAAAAABwT7IqcK1Xr56KFy+ufv366euvv1a5cuUs9qekpGj79u2Kjo5WgwYNNHfuXD311FP5UjAAAAAAAAAA3KusClynTp2qNm3a3Ha/o6OjmjVrpmbNmmnSpEk6ceKEUfUBAAAAAAAAQKFhVeCaU9h6K29vb3l7e+e6IAAAAAAAAAAorOxsPSAmJkZr167Nsn3t2rVas2aNIUUBAAAAAAAAQGFkc+A6atQopaWlZdmekZGhUaNGGVIUAAAAAAAAABRGNgeuhw8fVvXq1bNsr1q1qo4cOWJIUQAAAAAAAABQGNkcuLq7u+vYsWNZth85ckQlS5Y0pCgAAAAAAAAAKIxsDlw7deqkYcOG6ejRo+ZtR44c0auvvqqOHTsaWhwAAAAAAAAAFCY2B67Tpk1TyZIlVbVqVQUHBys4OFjVqlWTt7e33n333fyoEQAAAAAAAAAKhWK2HuDu7q5t27Zp3bp1+vXXX+Xk5KTatWvrkUceyY/6AAAAAAAAAKDQsDlwlSSTyaTWrVvrkUcekaOjo0wmk9F1AQAAAAAAAEChY/OSAunp6YqIiFCZMmXk4uKi48ePS5LGjh2rTz75xPACAQAAAAAAAKCwsDlwfeuttxQZGalp06apePHi5u01a9bUxx9/bGhxAAAAAAAAAFCY2By4Ll68WPPnz1evXr1kb29v3l6nTh398ccfhhYHAAAAAAAAAIWJzYHr6dOnValSpSzb09PTlZqaakhRAAAAAAAAAFAY2Ry4Vq9eXVu2bMmy/auvvlK9evUMKQoAAAAAAAAACqNith4wbtw4hYaG6vTp00pPT9fy5ct16NAhLV68WKtWrcqPGgEAAAAAAACgULB5hmunTp307bffav369SpZsqTGjRungwcP6ttvv1WrVq3yo0YAAAAAAAAAKBRsnuEqSf/5z3+0bt06o2sBAAAAAAAAgELN5hmuf/31l06dOmX+/PPPP2vYsGGaP3++oYUBAAAAAAAAQGFjc+Das2dPbdiwQZJ09uxZtWzZUj///LPefPNNTZw40fACAQAAAAAAAKCwsDlw3b9/vx566CFJ0rJly1SrVi1t27ZNS5cuVWRkpE19TZkyRQ0bNpSrq6t8fX3VuXNnHTp0yKJNs2bNZDKZLH698MILFm3i4uLUvn17OTs7y9fXV6+99pquX79u66UBAAAAAAAAQJ7YvIZramqqHB0dJUnr169Xx44dJUlVq1ZVfHy8TX1t2rRJQ4YMUcOGDXX9+nW98cYbat26tQ4cOKCSJUua2z3//PMWs2ednZ3Nv09LS1P79u3l7++vbdu2KT4+Xn379pWDg4MmT55s6+UBAAAAAAAAQK7ZHLjWqFFDH374odq3b69169YpIiJCknTmzBl5e3vb1Nd3331n8TkyMlK+vr7atWuXHnnkEfN2Z2dn+fv7Z9vH999/rwMHDmj9+vXy8/NT3bp1FRERoZEjRyo8PFzFixfPckxKSopSUlLMnxMTEyXdCJNTU1NtuoaCZrpusqm9Ldd33c62CdC29J3ZtrDdbxQMxjmKAsY5igLGOYoCxjmsxb0GgPuXKSMjI8OWAzZu3KguXbooMTFRoaGh+vTTTyVJb7zxhv744w8tX74818UcOXJElStX1m+//aaaNWtKurGkwO+//66MjAz5+/urQ4cOGjt2rHmW67hx47Ry5Urt3bvX3M/x48dVoUIF7d69W/Xq1ctynvDwcE2YMCHL9qioKIvZswAAAAAA5Ifk5GT17NlTCQkJcnNzK+hyAAAGsjlwlW48xp+YmChPT0/zthMnTpjXUM2N9PR0dezYUZcuXdLWrVvN2+fPn6/AwEAFBARo3759GjlypB566CFzsDtw4ECdPHlSa9euNR+TnJyskiVLKiYmRm3bts1yruxmuJYrV07/+9//Ct1fdPMuzbOp/YseL1rdNnHqVJv6dhs1yuq2qampWrdunVq1aiUHBwebzoOih3GOooBxjqKAcY6igHEOayUmJqpUqVIErgBwH7J5SQFJsre3twhbJSkoKChPhQwZMkT79++3CFulG4Fqplq1aql06dJq0aKFjh49qooVK+bqXI6OjuZ1aG/m4OBQ6P7nIqOYbXm5LddXLD093/q++ZjCds9x9zHOURQwzlEUMM5RFDDOYS3uMwDcv6xaBOjxxx/XTz/9dMd2ly9f1ttvv605c+bYVMTQoUO1atUqbdiwQWXLls2xbaNGjSTdWH5Akvz9/XXu3DmLNpmfb7fuKwAAAAAAAADkB6tmuD711FPq1q2b3N3d1aFDBzVo0EABAQEqUaKELl68qAMHDmjr1q2KiYlR+/bt9c4771h18oyMDL300ktasWKFNm7cqODg4Dsek7lWa+nSpSVJISEhmjRpks6fP29ezmDdunVyc3NT9erVraoDAAAAAAAAAIxgVeA6YMAA9e7dW19++aW++OILzZ8/XwkJCZIkk8mk6tWrq02bNtq5c6eqVatm9cmHDBmiqKgo/fe//5Wrq6vOnj0rSXJ3d5eTk5OOHj2qqKgotWvXTt7e3tq3b5+GDx+uRx55RLVr15YktW7dWtWrV1efPn00bdo0nT17VmPGjNGQIUOyXTYAAAAAAAAAAPKL1Wu4Ojo6qnfv3urdu7ckKSEhQf/++6+8vb1zvfbMvHk3FpRv1qyZxfaFCxeqX79+Kl68uNavX69Zs2bpypUrKleunLp166YxY8aY29rb22vVqlV68cUXFRISopIlSyo0NFQTJ07MVU0AAAAAAAAAkFu5emmWdGMWqru7e55OnpGR84Ly5cqV06ZNm+7YT2BgoGJiYvJUCwAAAAAAAADklVUvzQIAAAAAAAAA3BmBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABslV4Hrp0iV9/PHHGj16tC5cuCBJ2r17t06fPm1ocQAAAAAAAABQmBSz9YB9+/apZcuWcnd314kTJ/T888/Ly8tLy5cvV1xcnBYvXpwfdQIAAAAAAADAPc/mGa4jRoxQv379dPjwYZUoUcK8vV27dtq8ebOhxQEAAAAAAABAYWJz4Lpz504NGjQoy/YyZcro7NmzhhQFAAAAAAAAAIWRzYGro6OjEhMTs2z/888/5ePjY0hRAAAAAAAAAFAY2Ry4duzYURMnTlRqaqokyWQyKS4uTiNHjlS3bt0MLxAAAAAAAAAACgubA9fp06crKSlJvr6++vfff/Xoo4+qUqVKcnV11aRJk/KjRgAAAAAAAAAoFIrZeoC7u7vWrVunrVu3at++fUpKStKDDz6oli1b5kd9AAAAAAAAAFBo2By4ZmratKmaNm1qZC0AAAAAAAAAUKjlKnDduXOnNmzYoPPnzys9Pd1i34wZMwwpDAAAAAAAAAAKG5sD18mTJ2vMmDGqUqWK/Pz8ZDKZzPtu/j0AAAAAoHA5EXVC9rK/Y7sKoRXuQjUAABRONgeus
2fP1qeffqp+/frlQzkAAAAAAAAAUHjZ2XyAnZ2aNGmSH7UAAAAAAAAAQKFmc+A6fPhwzZkzJz9qAQAAAAAAAIBCzeYlBcLCwtS+fXtVrFhR1atXl4ODg8X+5cuXG1YcAAAAAAAAABQmNgeuL7/8sjZs2KDmzZvL29ubF2UBAAAAAAAAwP+xOXBdtGiRvv76a7Vv3z4/6gEAAAAAAACAQsvmNVy9vLxUsWLF/KgFAAAAAAAAAAo1mwPX8PBwjR8/XsnJyflRDwAAAAAAAAAUWjYvKfDee+/p6NGj8vPzU1BQUJaXZu3evduw4gAAAIB7xYmoE7KX/R3bVQitcBeqAQAAwL3K5sC1c+fO+VAGAAAAAAAAABR+Ngeu48ePz486AAAAAAAAAKDQs3kNVwAAAAAAAABA9qya4erl5aU///xTpUqVkqenp0wm023bXrhwwbDiAAAAAAAAAKAwsSpwnTlzplxdXSVJs2bNys96AAAAAAAAAKDQsipwDQ0N1WOPPably5crNDQ0v2sCAAAAAAAAgELJ6jVcN27cqGvXruVnLQAAAAAAAABQqFk1wxUAACAnJ6JOyF72d2xXIbTCXagGAAAAAAqOTYHrgQMHdPbs2Rzb1K5dO08FAQAAAAAAAEBhZVPg2qJFC2VkZGTZbjKZlJGRIZPJpLS0NMOKAwAAAAAAAIDCxKbAdceOHfLx8cmvWgAAAAAAAACgULMpcC1fvrx8fX3zqxYAAAAAAAAAKNTsCroAAAAAAAAAALhfWB24PvrooypevHh+1gIAAAAAAAAAhZrVSwps2LAhP+sAAAAAAAAAgEKPJQUAAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGMTql2ZlGjFiRLbbTSaTSpQooUqVKqlTp07y8vLKc3EAAAAAAAAAUJjYHLju2bNHu3fvVlpamqpUqSJJ+vPPP2Vvb6+qVatq7ty5evXVV7V161ZVr17d8IIBAAAAAAAA4F5l85ICnTp1UsuWLXXmzBnt2rVLu3bt0qlTp9SqVSv16NFDp0+f1iOPPKLhw4fnR70AAAAAAAAAcM+yOXB95513FBERITc3N/M2d3d3hYeHa9q0aXJ2dta4ceO0a9cuQwsFAAAAAAAAgHudzYFrQkKCzp8/n2X733//rcTEREmSh4eHrl27lvfqAAAAAAAAAKAQydWSAs8++6xWrFihU6dO6dSpU1qxYoUGDBigzp07S5J+/vlnPfDAA0bXCgAAAAAAAAD3NJtfmvXRRx9p+PDh6t69u65fv36jk2LFFBoaqpkzZ0qSqlatqo8//tjYSgEAAAAAAADgHmdz4Ori4qIFCxZo5syZOnbsmCSpQoUKcnFxMbepW7euYQUCAAAAAAAAQGFhc+CaycXFRbVr1zayFgAAAAAAAAAo1GwOXK9cuaKpU6cqNjZW58+fV3p6usX+zFmvAAAAAAAAAFDU2By4Pvfcc9q0aZP69Omj0qVLy2Qy5UddAHDfOBF1Qvayv2O7CqEV7kI1AAAAAAAgP9kcuK5Zs0arV69WkyZN8qMeAAAAAAAAACi07Gw9wNPTU15eXvlRCwAAAAAAAAAUajYHrhERERo3bpySk5Pzox4AAAAAAAAAKLRsXlJg+vTpOnr0qPz8/BQUFCQHBweL/bt37zasOAAAAAC4n03d8z+r276Yj3UAAADj2By4du7cOR/KAAAAAADcr9ru22VT+zW16+dTJQAA5D+bA9fx48cbdvIpU6Zo+fLl+uOPP+Tk5KTGjRvr7bffVpUqVcxtrl69qldffVXR0dFKSUlRmzZtNHfuXPn5+ZnbxMXF6cUXX9SGDRvk4uKi0NBQTZkyRcWK2Xx5AAAAAAAAAJBrNq/haqRNmzZpyJAh+umnn7Ru3TqlpqaqdevWunLlirnN8OHD9e233+rLL7/Upk2bdObMGXXt2tW8Py0tTe3bt9e1a9e0bds2LVq0SJGRkRo3blxBXBIAAAAAAACAIsyqKaBeXl76888/VapUKXl6espkMt227YULF6w++XfffWfxOTIyUr6+vtq1a5ceeeQRJSQk6JNPPlFUVJQee+wxSdLChQtVrVo1/fTTT3r44Yf1/fff68CBA1q/fr38/PxUt25dRUREaOTIkQoPD1fx4sWznDclJUUpKSnmz4mJiZKk1NRUpaamWl3/vcB0/fZfi+zYcn3X7WzL423pO7NtYbvfKBiFfZynKc3wvnH/YZyjKGCcoyiwdZzbpV+3um1hHucO6ek2tS8K30dF4RoBoKgyZWRkZNyp0aJFi9S9e3c5Ojpq0aJFObYNDQ3NdTFHjhxR5cqV9dtvv6lmzZr64Ycf1KJFC128eFEeHh7mdoGBgRo2bJiGDx+ucePGaeXKldq7d695//Hjx1WhQgXt3r1b9erVy3Ke8PBwTZgwIcv2qKgoOTs757p+AAAAAACskZycrJ49eyohIUFubm4FXQ4AwEBWzXC9OUTNS6Cak/T0dA0bNkxNmjRRzZo1JUlnz55V8eLFLcJWSfLz89PZs2fNbW5ezzVzf+a+7IwePVojRowwf05MTFS5cuXUunXrQvcX3bxL82xq/6KH9e82TZw61aa+3UaNsrptamqq1q1bp1atWsnBwcGm86DoKezjvMqlKrKX/R3bB/UMsqkW3F8Y5ygKGOcoCmwd51fjnra67YCYBTb1fS+N826/77Wp/dc16trUvjDKfNISAHD/ydVbpY4ePaqFCxfq6NGjmj17tnx9fbVmzRqVL19eNWrUyFUhQ4YM0f79+7V169ZcHW8LR0dHOTo6Ztnu4OBQ6MK/jGJ3nKBswZbrK2bjYz+5uXeF8Z7j7ivs49z+//7Lj75x/2CcoyhgnKMosHWcp9tZ/0+ywjzOU21cDqEofB8VhWsEgKLK5pdmbdq0SbVq1dKOHTu0fPlyJSUlSZJ+/fVXjR8/PldFDB06VKtWrdKGDRtUtmxZ83Z/f39du3ZNly5dsmh/7tw5+fv7m9ucO3cuy/7MfQAAAAAAAABwt9gcuI4aNUpvvfWW1q1bZ/FCqscee0w//fSTTX1lZGRo6NChWrFihX744QcFBwdb7K9fv74cHBwUGxtr3nbo0CHFxcUpJCREkhQSEqLffvtN58+fN7dZt26d3NzcVL16dVsvDwAAAAAAAAByzeYlBX777TdFRUVl2e7r66v//e9/NvU1ZMgQRUVF6b///a9cXV3Na666u7vLyclJ7u7uGjBggEaMGCEvLy+5ubnppZdeUkhIiB5++GFJUuvWrVW9enX16dNH06ZN09mzZzVmzBgNGTIk22UDAAAAAAAAACC/2By4enh4KD4+Psts1D179qhMmTI29TVv3o0F5Zs1a2axfeHCherXr58kaebMmbKzs1O3bt2UkpKiNm3aaO7cuea29vb2WrVqlV588UWFhISoZMmSCg0N1cSJE229NAAAAOCua7tvl03t19Sun0+VAAAAwAg2B67du3fXyJEj9eWXX8pkMik9PV0//vijwsLC1LdvX5v6ysi484LyJUqU0Jw5czRnzpzbtgkM
DFRMTIxN5wYAAAAAAAAAo9m8huvkyZNVtWpVlStXTklJSapevboeeeQRNW7cWGPGjMmPGgEAAAAAAACgULB5hmvx4sW1YMECjR07Vvv371dSUpLq1aunypUr50d9AAAAKKKm7rH+/QAv5mMdAAAAgC1sDlwzlS9fXuXLlzeyFgAAAAAAAAAo1GwOXJ999tkc93/66ae5LgYAAAAAAAAACjObA9eLFy9afE5NTdX+/ft16dIlPfbYY4YVBgAAAAAAAACFjc2B64oVK7JsS09P14svvqiKFSsaUhQAAAAAAAAAFEZ2hnRiZ6cRI0Zo5syZRnQHAAAAAAAAAIWSIYGrJB09elTXr183qjsAAAAAAAAAKHRsXlJgxIgRFp8zMjIUHx+v1atXKzQ01LDCAAAAAAAAAKCwsTlw3bNnj8VnOzs7+fj4aPr06Xr22WcNKwwAANze1D3/s7rti/lYBwAAAADAks2B64YNG/KjDgAAgCza7ttlU/s1tevnUyUAAAAAYJ1creF6/fp1rV+/Xh999JEuX74sSTpz5oySkpIMLQ4AAAAAAAAAChObZ7iePHlSjz/+uOLi4pSSkqJWrVrJ1dVVb7/9tlJSUvThhx/mR50AAAAAAFv8YbK+bZqTpM/zrRQAAIoSm2e4vvLKK2rQoIEuXrwoJycn8/YuXbooNjbW0OIAAAAAAAAAoDCxeYbrli1btG3bNhUvXtxie1BQkE6fPm1YYQAAAAAAAABQ2Ng8wzU9PV1paWlZtp86dUqurq6GFAUAAAAAAAAAhZHNgWvr1q01a9Ys82eTyaSkpCSNHz9e7dq1M7I2AAAAAAAAAChUbF5SYPr06WrTpo2qV6+uq1evqmfPnjp8+LBKlSqlzz9nkXUAAAAAAAAARZfNgWvZsmX166+/Kjo6Wvv27VNSUpIGDBigXr16WbxECwAAAAAAAACKGpsDV0kqVqyYevfubXQtAAAAAAAAAFCo2Ry4Ll68OMf9ffv2zXUxAAAAAAAAAFCY2Ry4vvLKKxafU1NTlZycrOLFi8vZ2ZnAFQAAAPe+P0zWt01zksS7CgAAAGAdO1sPuHjxosWvpKQkHTp0SE2bNuWlWQAAAAAAAACKtFyt4XqrypUra+rUqerdu7f++OMPI7pEEXEi6oTsZX/HdhVCK9yFagAAAAAAAIC8sXmG6+0UK1ZMZ86cMao7AAAAAAAAACh0bJ7hunLlSovPGRkZio+P1wcffKAmTZoYVhgA5NbUPf+zuu2L+VgHAAAAAAAoemwOXDt37mzx2WQyycfHR4899pimT59uVF0AAAAAAAAAUOjYHLimp6fnRx0AAAAAAAAAUOjleg3X//3vf0pMTDSyFgAAAAAAAAAo1Gya4Xrp0iW9+eab+uKLL3Tx4kVJko+Pj/r376+xY8fK2dk5X4oEAGTVdt8um9qvqV0/nyoBAAAAAACZrA5cL1y4oJCQEJ0+fVq9evVStWrVJEkHDhzQ+++/r3Xr1mnr1q3at2+ffvrpJ7388sv5VjQAAAAAAAAA3IusDlwnTpyo4sWL6+jRo/Lz88uyr3Xr1urTp4++//57vffee4YXCgAAAAAAAAD3OqsD12+++UYfffRRlrBVkvz9/TVt2jS1a9dO48ePV2hoqKFFAgCAu+gPk/Vt05wkfZ5vpQAAAABAYWP1S7Pi4+NVo0aN2+6vWbOm7OzsNH78eEMKAwAAAAAAAIDCxurAtVSpUjpx4sRt9x8/fly+vr5G1AQAAAAAAAAAhZLVgWubNm305ptv6tq1a1n2paSkaOzYsXr88ccNLQ4AAAAAAAAAChObXprVoEEDVa5cWUOGDFHVqlWVkZGhgwcPau7cuUpJSdHixYvzs1YAAAAAAAAAuKdZHbiWLVtW27dv1+DBgzV69GhlZGRIkkwmk1q1aqUPPvhA5cuXz7dCAQAAAAAAAOBeZ3XgKknBwcFas2aNLl68qMOHD0uSKlWqJC8vr3wpDgAAAAAAAAAKE5sC10yenp566KGHjK4FAAAAAAAAAAo1q1+aBQAAAAAAAADIGYErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGKVbQBQBAofKHyfq2aU6SPs+3UgAAAAAAwL2HGa4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMUaOC6efNmdejQQQEBATKZTPrmm28s9vfr108mk8ni1+OPP27R5sKFC+rVq5fc3Nzk4eGhAQMGKCkp6S5eBQAAAAAAAADcUKCB65UrV1SnTh3NmTPntm0ef/xxxcfHm399/vnnFvt79eql33//XevWrdOqVau0efNmDRw4ML9LBwAAAAAAAIAsihXkydu2bau2bdvm2MbR0VH+/v7Z7jt48KC+++477dy5Uw0aNJAkvf/++2rXrp3effddBQQEGF4zAAAAAAAAANxOgQau1ti4caN8fX3l6empxx57TG+99Za8vb0lSdu3b5eHh4c5bJWkli1bys7OTjt27FCXLl2y7TMlJUUpKSnmz4mJiZKk1NRUpaam5uPVGM903WRTe1uu77qdbROgbek7s22a0gzvG/cfW8e5Xfp1q9vaPM7TnKxvm36jbX6Nc4f0dJva8310b2OcZ49xfn9hnGePcX5/YZxnj3GeVVG4RgAoqkwZGRkZBV2EJJlMJq1YsUKdO3c2b4uOjpazs7OCg4N19OhRvfHGG3JxcdH27dtlb2+vyZMna9GiRTp06JBFX76+vpowYYJefPHFbM8VHh6uCRMmZNkeFRUlZ2dnQ68LAAAAAIBbJScnq2fPnkpISJCbm1tBlwMAMNA9PcO1e/fu5t/XqlVLtWvXVsWKFbVx40a1aNEi1/2OHj1aI0aMMH9OTExUuXLl1Lp160L3F928S/Nsan817mmr2w6IWWBT326jRlndNjU1VevWrVOVS1VkL/s7tg/qGWRTLbi/3FPjvOsUq9umpjtp3bFP822cd/t9r03tv65R16b2uLsY59ljnN9fGOfZY5zfXxjn2WOcZ5X5pCUA4P5zTweut6pQoYJKlSqlI0eOqEWLFvL399f58+ct2ly/fl0XLly47bqv0o11YR0dHbNsd3BwkIODg+F156eMYrZNUE63s/5LXszGx35yc+/s/++//Ogb9497apzb/2tTeyn/xnmqjY8V8n10b2OcZ49xfn9hnGePcX5/YZxnj3GeVVG4RgAoqmz7W6+AnTp1Sv/8849Kly4tSQoJCdGlS5e0a9cuc5sffvhB6enpatSoUUGVCQAAAAAAAKCIKtAZrklJSTpy5Ij58/Hjx7V37155eXnJy8tLEyZMULdu3eTv76+jR4/q9ddfV6VKldSmTRtJUrVq1fT444/r+eef14cffqjU1FQNHTpU3bt3V0BAQEFdFgAAAAA
AAIAiqkBnuP7yyy+qV6+e6tWrJ0kaMWKE6tWrp3Hjxsne3l779u1Tx44d9cADD2jAgAGqX7++tmzZYrEcwNKlS1W1alW1aNFC7dq1U9OmTTV//vyCuiQAAAAAAAAARViBznBt1qyZMjJuv8bR2rVr79iHl5eXoqKijCwLAAAAAAAAAHKlUL00C7BW23277tzoJmtq18+nSgAAAAAAAFCUFKqXZgEAAAAAAADAvYzAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMUqygC8B95A+T9W3TnCR9nm+lAAAAAAAAAAWBGa4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwSLGCLgAAAAAAUMgc/t7GA7zzpQwAAO5FzHAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQYoVdAEAAADAfeXw9zYe4J0vZQAAAKBgMMMVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYpFhBFwAAAIqQw9/beIB3vpQBAAAAAPmFGa4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgxQry5Js3b9Y777yjXbt2KT4+XitWrFDnzp3N+zMyMjR+/HgtWLBAly5dUpMmTTRv3jxVrlzZ3ObChQt66aWX9O2338rOzk7dunXT7Nmz5eLiUgBXBAAAAADIq9QJr1rd1mH89HysBAAA2xXoDNcrV66oTp06mjNnTrb7p02bpvfee08ffvihduzYoZIlS6pNmza6evWquU2vXr30+++/a926dVq1apU2b96sgQMH3q1LAAAAAAAAAACzAp3h2rZtW7Vt2zbbfRkZGZo1a5bGjBmjTp06SZIWL14sPz8/ffPNN+revbsOHjyo7777Tjt37lSDBg0kSe+//77atWund999VwEBAXftWgAAAAAAAACgQAPXnBw/flxnz55Vy5Ytzdvc3d3VqFEjbd++Xd27d9f27dvl4eFhDlslqWXLlrKzs9OOHTvUpUuXbPtOSUlRSkqK+XNiYqIkKTU1Vampqfl0RfnDdN1kU3u79OtWt71uZ9sE6NQ0J+vbpt9om6Y069rb+HVxSE+3qX1h+7oXNYzz7DHO7y9FZpynZdhUC+P8/sI4zx7j/P7COM+ezePczt6GxoXze4LvZQC4f5kyMjJs+5syn5hMJos1XLdt26YmTZrozJkzKl26tLnd008/LZPJpC+++EKTJ0/WokWLdOjQIYu+fH19NWHCBL344ovZnis8PFwTJkzIsj0qKkrOzs7GXRQAAAAAANlITk5Wz549lZCQIDc3t4IuBwBgoHt2hmt+Gj16tEaMGGH+nJiYqHLlyql169aF7i+6eZfm2dT+atzTVrcdELPApr7duk6xum1qupPWHftUVS5Vkb3u/NProJ5BNtXS7fe9NrX/ukZdm9rj7mKcZ49xfn8pMuO80TGbaul21cum9ozzexvjPHuM8/sL4zx7to7z6G+/tLqtw6hJNvV9r8h80hIAcP+5ZwNXf39/SdK5c+csZrieO3dOdevWNbc5f/68xXHXr1/XhQsXzMdnx9HRUY6Ojlm2Ozg4yMHBwYDq756MYrZNUE63s/5LXszGx34c7P+1qb0k2f/ff3fs28avS6qNj1sVtq97UcM4zx7j/P5SZMa5vW2P2jLO7y+M8+wxzu8vjPPs2TzO061b2kAqvN8ThbVuAMCd2fa33l0UHBwsf39/xcbGmrclJiZqx44dCgkJkSSFhITo0qVL2rVrl7nNDz/8oPT0dDVq1Oiu1wwAAAAAAACgaCvQGa5JSUk6cuSI+fPx48e1d+9eeXl5qXz58ho2bJjeeustVa5cWcHBwRo7dqwCAgLM67xWq1ZNjz/+uJ5//nl9+OGHSk1N1dChQ9W9e3cFBAQU0FUBAAAAAAAAKKoKNHD95Zdf1Lx5c/PnzHVVQ0NDFRkZqddff11XrlzRwIEDdenSJTVt2lTfffedSpQoYT5m6dKlGjp0qFq0aCE7Ozt169ZN77333l2/FgAAAAAAAAAo0MC1WbNmysi4/RpHJpNJEydO1MSJE2/bxsvLS1FRUflRHgAAAHDPSZ3wqtVtHcZPz8dKAAAAkJ17dg1XAAAAAAAAAChsCnSGKwDgJoe/t/EA73wpAwAAAAAA5B4zXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABilW0AUAAAAYJXXCq1a3dRg/PR8rAQAAAFBUMcMVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMEixgi4AAHB3pE541eq2DuOn52MlAAAAAADcv5jhCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGKVbQBQBWOfy9jQd450sZAAAAAAAAQE6Y4QoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAl
cAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxyTweu4eHhMplMFr+qVq1q3n/16lUNGTJE3t7ecnFxUbdu3XTu3LkCrBgAAAAAAABAUXZPB66SVKNGDcXHx5t/bd261bxv+PDh+vbbb/Xll19q06ZNOnPmjLp27VqA1QIAAAAAAAAoyooVdAF3UqxYMfn7+2fZnpCQoE8++URRUVF67LHHJEkLFy5UtWrV9NNPP+nhhx++26UCAAAAAAAAKOLu+cD18OHDCggIUIkSJRQSEqIpU6aofPny2rVrl1JTU9WyZUtz26pVq6p8+fLavn17joFrSkqKUlJSzJ8TExMlSampqUpNTc2/i8kHpusmm9rbpV+3uu11O9smQKemOVnfNv1G2zSlWdl3hk21OKSn29S+sH3dixrGefZsHud29jY05nvibmOcZ49xfn9hnGePcX5/YZxnj3GeFf8GAYD7lykjI8O2vynvojVr1igpKUlVqlRRfHy8JkyYoNOnT2v//v369ttv1b9/f4vgVJIeeughNW/eXG+//fZt+w0PD9eECROybI+KipKzs7Ph1wEAAAAAwM2Sk5PVs2dPJSQkyM3NraDLAQAY6J4OXG916dIlBQYGasaMGXJycsp14JrdDNdy5crpf//7X6H7i27epXk2tb8a97TVbQfELLCpb7euU6xum5rupHXHPlWVS1Vkrzv/9Dqo0TGbaul21cum9l/XqGtTe9xdjPPs2TrOo7/90uq2DqMm2dQ38o5xnj3G+f2FcZ49xvn9hXGePcZ5VomJiSpVqhSBKwDch+75JQVu5uHhoQceeEBHjhxRq1atdO3aNV26dEkeHh7mNufOnct2zdebOTo6ytHRMct2BwcHOTg4GF12vsooZltenm5n/Ze8mI2P/TjY/2tTe0my/7//7ty3bY9mpdr4uFVh+7oXNYzz7Nk8ztOte0RQ4nuiIDDOs8c4v78wzrPHOL+/MM6zxzjPqrDWDQC4M9v+1itgSUlJOnr0qEqXLq369evLwcFBsbGx5v2HDh1SXFycQkJCCrBKAAAAAAAAAEXVPT3DNSwsTB06dFBgYKDOnDmj8ePHy97eXj169JC7u7sGDBigESNGyMvLS25ubnrppZcUEhKS4wuzAAAAAAAAACC/3NOB66lTp9SjRw/9888/8vHxUdOmTfXTTz/Jx8dHkjRz5kzZ2dmpW7duSklJUZs2bTR37twCrhoAAAAAAABAUXVPB67R0dE57i9RooTmzJmjOXPm3KWKAAAAAAAAAOD2CtUargAAAAAAAABwLyNwBQAAAAAAAACD3NNLCgB3S+qEV61u6zB+ej5WAgAAAAAAgMKMGa4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYJD7JnCdM2eOgoKCVKJECTVq1Eg///xzQZcEAAAAAAAAoIi5LwLXL774QiNGjND48eO1e/du1alTR23atNH58+cLujQAAAAAAAAARUixgi7ACDNmzNDzzz+v/v37S5I+/PBDrV69Wp9++qlGjRqVpX1KSopSUlLMnxMSEiRJFy5cUGpq6t0p2iApCSl3bnSTa4kXrW578do1m/q+nlDC6rap6SWUnJyshH8TZC/7O7b/JyHJplp01cGm5v9cu251W4d//rGtFuQZ4/w2GOf3Fcb5bTDO7yuM89tgnN9XGOe3wTjP4vLly5KkjIyMAq4EAGA0U0Yh/9P92rVrcnZ21ldffaXOnTubt4eGhurSpUv673//m+WY8PBwTZgw4S5WCQAAAABAVn/99ZfKli1b0GUAAAxU6Ge4/u9//1NaWpr8/Pwstvv5+emPP/7I9pjRo0drxIgR5s/p6em6cOGCvL29ZTKZ8rVe3JCYmKhy5crpr7/+kpubW0GXA+QLxjmKAsY5igLGOYoCxvndl5GRocuXLysgIKCgSwEAGKzQB6654ejoKEdHR4ttHh4eBVNMEefm5sb/0OG+xzhHUcA4R1HAOEdRwDi/u9zd3Qu6BABAPij0L80qVaqU7O3tde7cOYvt586dk7+/fwFVBQAAAAAAAKAoKvSBa/HixVW/fn3Fxsaat6Wnpys2NlYhISEFWBkAAAAAAACAoua+WFJgxIgRCg0NVYMGDfTQQw9p1qxZunLlivr371/QpeE2HB0dNX78+CxLOwD3E8Y5igLGOYoCxjmKAsY5AADGMWVkZGQUdBFG+OCDD/TOO+/o7Nmzqlu3rt577z01atSooMsCAAAAAAAAUITcN4ErAAAAAAAAABS0Qr+GKwAAAAAAAADcKwhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAADINd69CQAAAACWCFwBAECuOTo66uDBgwVdBgAgF7Zs2aLevXsrJCREp0+fliQtWbJEW7duLeDKAAAo3IoVdAHAX3/9pfHjx+vTTz8t6FKAPPn333+1a9cueXl5qXr16hb7rl69qmXLlqlv374FVB2QNyNGjMh2e1pamqZOnSpvb29J0owZM+5mWUC++OCDD/Tzzz+rXbt26t69u5YsWaIpU6YoPT1dXbt21cSJE1WsGP8bjcLt66+/Vp8+fdSrVy/t2bNHKSkpkqSEhARNnjxZMTExBVwhAACFlymDZwFRwH799Vc9+OCDSktLK+hSgFz7888/1bp1a8XFxclkMqlp06aKjo5W6dKlJUnnzp1TQEAA4xyFlp2dnerUqSMPDw+L7Zs2bVKDBg1UsmRJmUwm/fDDDwVTIGCQt956S9OmTVPr1q314
48/atiwYXrnnXc0fPhw2dnZaebMmXrxxRc1YcKEgi4VyJN69epp+PDh6tu3r1xdXfXrr7+qQoUK2rNnj9q2bauzZ88WdIkAABRa/Gge+W7lypU57j927NhdqgTIPyNHjlTNmjX1yy+/6NKlSxo2bJiaNGmijRs3qnz58gVdHpBnkydP1vz58zV9+nQ99thj5u0ODg6KjIzMMqsbKKwiIyMVGRmprl276tdff1X9+vW1aNEi9erVS5JUtWpVvf766wSuKPQOHTqkRx75f+3dbUzV5ePH8c/3KBxvuBGGBmwiM1yEUQmHbmQITpJSCSIXMxTblE1dSitEfJCbmvM80FnTLVuhqGGiq2zZrEwtbzPBpJGmqaU2TmmIQ8CyI+f/wHX25weZ5he+cs77tZ0H5/refc7GA/hwnesa1WE8NDRUly9f7v5AAAD4EApXdLnc3FwZhnHTjVUMw+jGRID5Dhw4oC+++EIRERGKiIjQxx9/rFmzZiktLU27d+9W//79rY4I3JGysjKNGTNGkydPVnZ2tpYuXaqAgACrYwGmq6+vl8PhkCQ99NBDstlsevjhh73Hk5KSVF9fb1E6wDyRkZE6deqUYmNj243v27dPQ4cOtSYUAAA+gk2z0OWioqL0wQcfqK2trdPXkSNHrI4I3LGrV6+2W8/PMAy9+eabys7OVnp6uk6ePGlhOsAcKSkpqqmp0cWLF+VwOFRXV8c/zOBzIiMjdezYMUnSjz/+qOvXr3vfS9L333+vQYMGWRUPME1RUZGKi4t16NAhGYah+vp6VVZWqqSkRDNnzrQ6HgAAPRozXNHlkpOTVVNTo5ycnE6P/9vsV6AniI+PV3V1te6///5246tWrZIkPf3001bEAkwXFBSkdevWadOmTcrMzGRdYvicgoICFRYWKicnRzt37lRpaalKSkrU0NAgwzC0ZMkSTZw40eqYwB0rKytTW1ubxowZo9bWVo0aNUp2u10lJSWaPXu21fEAAOjR2DQLXW7v3r1qaWnRk08+2enxlpYWVVdXKz09vZuTAeZZunSp9u7d+487+s6aNUurV69WW1tbNycDus4vv/yimpoaZWZmsmwGfEZbW5ucTqcOHjyokSNHqqysTFVVVSotLVVra6uys7O1atUqfubhM65du6ZTp06publZCQkJCgoKsjoSAAA9HoUrAAAAAAAAAJiEJQUAAAAAwM+0tLTI6XRq586dunDhQodv4Zw5c8aiZAAA9HwUrgAAAADgZ6ZPn66vvvpKU6ZMUVRUFJsgAgBgIpYUAAAAAAA/M2DAAH3yySdKTU21OgoAAD7HZnUAAAAAAED3CgsLU3h4uNUxAADwSRSuAAAAAOBnFi9erAULFqi1tdXqKAAA+ByWFAAAAAAAPzNixAidPn1aHo9HsbGxCggIaHf8yJEjFiUDAKDnY9MsAAAAAPAzubm5VkcAAMBnMcMVAAAAAAAAAEzCGq4AAAAAAAAAYBKWFAAAAAAAPxAeHq6TJ08qIiJCYWFhMgzjH8+9dOlSNyYDAMC3ULgCAAAAgB9YsWKFgoODJUmvv/66tWEAAPBhrOEKAAAAAAAAACZhhisAAAAA+IGmpqZbPjckJKQLkwAA4NuY4QoAAAAAfsBms9103VZJ8ng8MgxD169f76ZUAAD4Hma4AgAAAIAf2L17t9URAADwC8xwBQAAAAA/kJeXp4qKCoWEhGj9+vXKz8+X3W63OhYAAD6HwhUAAAAA/EBgYKDOnj2rqKgo9erVSy6XS4MGDbI6FgAAPoclBQAAAADAD8THx2v+/PkaPXq0PB6PNm/e/I+bYxUWFnZzOgAAfAczXAEAAADAD+zfv1+vvPKKTp8+rUuXLik4OLjTTbQMw9ClS5csSAgAgG+gcAUAAAAAP2Oz2fTrr7+ypAAAAF3AZnUAAAAAAEDXy8vLU1NTkyRp7dq1Cg4OtjgRAAC+iRmuAAAAAOAH2DQLAIDuwaZZAAAAAOAH2DQLAIDuwQxXAAAAAPADBw4c0Msvv8ymWQAAdDEKVwAAAADwMzabTS6XS/fcc4/VUQAA8DkUrgAAAADgZ86ePauQkBCtWbNGx48flyQNHz5c06ZN+8dlBgAAwK2hcAUAAAAAP1NdXa2srCz17dtXjzzyiCTp8OHDunr1qj777DMlJydbnBAAgJ6LwhUAAAAA/ExaWpri4uL09ttvq3fvG3spu91uTZ8+XWfOnNGePXssTggAQM9F4QoAAAAAfqZv37769ttvFR8f32782LFjcjgcam1ttSgZAAA9n83qAAAAAACA7hUSEqJz5851GD9//ryCg4MtSAQAgO+gcAUAAAAAP5Ofn69p06apqqpK58+f1/nz57Vp0yZNnz5dkyZNsjoeAAA9Wm+rAwAAAAAAuteyZctkGIYKCwvldrslSQEBAZo5c6acTqfF6QAA6NlYwxUAAAAA/FRra6tOnz4tSbr33nvVr18/ixMBANDzUbgCAAAAAAAAgElYwxUAAAAAAAAATELhCgAAAAAAAAAmoXAFAAAAAAAAAJNQuAIAAAAAAACASShcAQDwQ4ZhaOvWrabd78svv5RhGLp8+bJp9wQAAACAnojCFQCAHuLgwYPq1auXxo8ff8f3crlceuqpp0xIdetiY2NlGIYMw1C/fv2UmJiod95557bvY3ZZDAAAAABmonAFAKCHKC8v1+zZs7Vnzx7V19ff9FyPxyO3291h/Nq1a5KkyMhI2e32Lsl5M4sWLZLLQDxxLgAABhpJREFU5VJdXZ0mT56soqIibd++vdtzAAAAAEBXoXAFAKAHaG5uVlVVlWbOnKnx48eroqKi3fG/v9K/fft2JScny263a9++fcrIyNCLL76ol156SREREcrKypLUfpboyJEjNW/evHb3u3jxogICArRnzx5J0oYNG+RwOBQcHKzIyEg9//zzunDhwm1/jr+vHzp0qObNm6fw8HDt2LHDe/zw4cN64oknFBERodDQUKWnp+vIkSPe47GxsZKkZ555RoZheN9L0kcffaSkpCT16dNHQ4cO1cKFCzstnQEAAACgK1G4AgDQA2zevFnx8fG67777NHnyZK1Zs0Yej6fDeWVlZXI6nTp+/LgefPBBSdK6desUGBio/fv3a/Xq1R2uKSgo0KZNm9rdr6qqStHR0UpLS5Mk/fXXX1q8eLFqa2u1detW/fzzz3rhhRf+8+dpa2vT+++/r8bGRgUGBnrHr1y5oqlTp2rfvn36+uuvNWzYMI0bN05XrlyRdKOQlaS1a9fK5XJ53+/du1eFhYUqLi7WsWPH9NZbb6miokJLliz5zxkBAAAA4L8wPJ39tQYAAO4qqampeu6551RcXCy3262oqCht2bJFGRkZkm7McB09erS2bt2qnJwc73UZGRlqampqN0tUujHD9cMPP1Rubq4uXryo6Oho7dq1y1uwjhw5UqNGjZLT6ew0T3V1tVJSUnTlyhUFBQV5n9/Y2KgBAwZ0ek1sbKxcLpcCAgL0559/yu12Kzw8XIcOHVJcXFyn17S1tWnAgAHauHGjJkyY0CH73zIzMzVmzBjNnz/f
O/buu++qtLT0X5dfAAAAAAAzMcMVAIC73IkTJ/TNN99o0qRJkqTevXsrPz9f5eXlHc51OBwdxpKTk296/4EDB2rs2LGqrKyUJP300086ePCgCgoKvOfU1NQoOztbMTExCg4OVnp6uiTp3Llzt/VZ5s6dq6NHj2rXrl169NFHtWLFinZl62+//aaioiINGzZMoaGhCgkJUXNz878+p7a2VosWLVJQUJD3VVRUJJfLpdbW1tvKCAAAAAB3orfVAQAAwM2Vl5fL7XYrOjraO+bxeGS327Vq1SqFhoZ6x/v379/h+s7G/ldBQYHmzJmjlStXauPGjUpMTFRiYqIkqaWlRVlZWcrKylJlZaUGDhyoc+fOKSsry7sJ162KiIhQXFyc4uLitGXLFiUmJsrhcCghIUGSNHXqVDU0NOiNN97QkCFDZLfb9fjjj//rc5qbm7Vw4ULl5eV1ONanT5/byggAAAAAd4IZrgAA3MXcbrfWr1+v5cuX6+jRo95XbW2toqOj9d5775nynJycHP3xxx/69NNPtXHjxnazW3/44Qc1NDTI6XQqLS1N8fHx/2nDrP81ePBg5efnt1sGYP/+/ZozZ47GjRun4cOHy2636/fff293XUBAgK5fv95uLCkpSSdOnPCWuf//ZbPx6w4AAACA7sMMVwAA7mLbtm1TY2Ojpk2b1m4mqyQ9++yzKi8v14wZM+74Of3791dubq5effVVHT9+3Lt8gSTFxMQoMDBQK1eu1IwZM1RXV6fFixff8TMlqbi4WA888ICqq6vlcDg0bNgwbdiwQQ6HQ01NTZo7d6769u3b7prY2Fjt3LlTqampstvtCgsL04IFCzRhwgTFxMRo4sSJstlsqq2tVV1dnV577TVTsgIAAADArWDKBwAAd7Hy8nJlZmZ2KFulG4VrdXW1vvvuO1OeVVBQoNraWqWlpSkmJsY7PnDgQFVUVGjLli1KSEiQ0+nUsmXLTHlmQkKCxo4dqwULFki68XkbGxuVlJSkKVOmaM6cORo0aFC7a5YvX64dO3Zo8ODBGjFihCQpKytL27Zt0+eff66UlBQ99thjWrFihYYMGWJKTgAAAAC4VYbH4/FYHQIAAAAAAAAAfAEzXAEAAAAAAADAJBSuAAAAAAAAAGASClcAAAAAAAAAMAmFKwAAAAAAAACYhMIVAAAAAAAAAExC4QoAAAAAAAAAJqFwBQAAAAAAAACTULgCAAAAAAAAgEkoXAEAAAAAAADAJBSuAAAAAAAAAGASClcAAAAAAAAAMMn/AST4flJEnzHJAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create a list to store the queueing time data\n", + "qt_data = []\n", + "\n", + "# Iterate over the models, batch sizes, and arrival rates to calculate queueing time\n", + "for ssm in small_model_names:\n", + " for batch_size in batch_sizes:\n", + " for arrival_rate in arrival_rates:\n", + " model_name = ssm.replace(\"/\", \"-\")\n", + " filepath = f\"/usr/FlexFlow/inference/output/specinfer_llm_meta-llama-Llama-3.1-70B-Instruct_ssm_{model_name}_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n", + " if os.path.exists(filepath):\n", + " qt = get_queueing_time(filepath)\n", + " qt_data.append({\n", + " 'Model': model_name,\n", + " 'Batch Size': batch_size,\n", + " 'Arrival Rate': arrival_rate,\n", + " 'Queueing Time': qt\n", + " })\n", + "# add incremental decoding entry\n", + "for batch_size in batch_sizes:\n", + " for arrival_rate in arrival_rates:\n", + " model_name = ssm.replace(\"/\", \"-\")\n", + " filepath = f\"/usr/FlexFlow/inference/output/incr_dec_llm_meta-llama-Llama-3.1-70B-Instruct_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n", + " if os.path.exists(filepath):\n", + " qt = get_queueing_time(filepath)\n", + " qt_data.append({\n", + " 'Model': \"Incr Dec (baseline)\",\n", + " 'Batch Size': batch_size,\n", + " 'Arrival Rate': arrival_rate,\n", + " 'Queueing Time': qt\n", + " })\n", + "\n", + "# Convert the list to a DataFrame\n", + "qt_df = pd.DataFrame(qt_data)\n", + "print(qt_df.head())\n", + "\n", + "# Pivot the dataframe to have models and batch sizes as columns\n", + "pivot_df = qt_df.pivot_table(index='Arrival Rate', columns=['Model', 'Batch Size'], values='Queueing Time')\n", + "\n", + "# Plot the data\n", + "fig, ax = plt.subplots(figsize=(12, 8))\n", + "\n", + "colors = ['lightgreen', 'skyblue', 'lightcoral', 'gold', 'plum', 'peachpuff', 'mediumturquoise', 'salmon']\n", + "pivot_df.plot(kind='bar', ax=ax, color=colors)\n", + "\n", + "ax.set_title('Queueing Time vs Arrival Rate for Different Models and Batch Sizes\\nLLM: LLAMA-3.1-70B-Instruct')\n", + "ax.set_xlabel('Arrival Rate (requests/sec)')\n", + "ax.set_ylabel('Queueing Time (sec)')\n", + "ax.grid(True)\n", + "ax.legend(title='Model and Batch Size', bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "\n", + "# Save the plot as a PDF\n", + "plt.savefig('/usr/FlexFlow/benchmarking/queueing_time_vs_arrival_rate.pdf', bbox_inches='tight')\n", + "\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/benchmarking/queueing_time_vs_arrival_rate.pdf b/benchmarking/queueing_time_vs_arrival_rate.pdf new file mode 100644 index 000000000..a552ebcea Binary files /dev/null and b/benchmarking/queueing_time_vs_arrival_rate.pdf differ diff --git a/benchmarking/throughput_vs_tpot.pdf b/benchmarking/throughput_vs_tpot.pdf new file mode 100644 index 000000000..064bfb661 Binary files /dev/null and b/benchmarking/throughput_vs_tpot.pdf differ diff --git a/benchmarking/ttft_vs_arrival_rate.pdf b/benchmarking/ttft_vs_arrival_rate.pdf new file mode 100644 index 000000000..041d5e501 Binary files /dev/null and 
b/benchmarking/ttft_vs_arrival_rate.pdf differ diff --git a/cmake/nccl.cmake b/cmake/nccl.cmake index c140a44ec..82cf3b412 100644 --- a/cmake/nccl.cmake +++ b/cmake/nccl.cmake @@ -2,140 +2,88 @@ set(NCCL_NAME nccl) # set(NCCL_CUDA_ARCH "-gencode=arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}") # message("NCCL_CUDA_ARCH: ${NCCL_CUDA_ARCH}") -set(NCCL_URL "") -if((FF_USE_PREBUILT_NCCL OR FF_USE_ALL_PREBUILT_LIBRARIES) AND CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") - if(LINUX_VERSION MATCHES "20.04") - if (CUDA_VERSION VERSION_EQUAL "11.0") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.0.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.1") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.1.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.2") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.2.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.3") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.3.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.4") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.4.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.5") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.5.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.6") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.6.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.7") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.7.0.tar.gz") - endif() - elseif(LINUX_VERSION MATCHES "18.04") - if (CUDA_VERSION VERSION_EQUAL "10.1") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_10.1.243.tar.gz") - elseif (CUDA_VERSION VERSION_EQUAL "10.2") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_10.2.89.tar.gz") - elseif (CUDA_VERSION VERSION_EQUAL "11.0") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.0.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.1") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.1.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.2") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.2.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.3") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.3.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.4") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.4.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.5") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.5.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.6") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.6.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.7") - set(NCCL_URL 
"https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.7.0.tar.gz") - endif() - endif() +if(NCCL_PATH) + set(NCCL_ROOT ${NCCL_PATH}) +else() + # if NCCL_PATH is not set, let's try to find it in the CUDA root + set(NCCL_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) endif() -if(NCCL_URL) - # Download and import pre-compiled NCCL library - message(STATUS "Using pre-compiled NCCL library") - message(STATUS "NCCL_URL: ${NCCL_URL}") +find_library(NCCL_LIBRARY + NAMES libnccl${LIBEXT} + PATHS ${NCCL_ROOT} ${CUDA_ROOT} + PATH_SUFFIXES lib lib64 + DOC "NCCL library." ) - include(FetchContent) - FetchContent_Declare(${NCCL_NAME} - URL ${NCCL_URL} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - ) - FetchContent_GetProperties(${NCCL_NAME}) - if(NOT ${NCCL_NAME}_POPULATED) - FetchContent_Populate(${NCCL_NAME}) - endif() - - set(NCCL_FOLDER_PATH ${${NCCL_NAME}_SOURCE_DIR}/deps/${NCCL_NAME}) - set(NCCL_INCLUDE_DIR ${NCCL_FOLDER_PATH}/include) - set(NCCL_LIB_DIR ${NCCL_FOLDER_PATH}/lib) - message(STATUS "NCCL library path: ${NCCL_FOLDER_PATH}") - add_library(nccl SHARED IMPORTED) - set_target_properties(nccl PROPERTIES IMPORTED_LOCATION ${NCCL_FOLDER_PATH}) +find_path(NCCL_INCLUDE_DIR + NAMES nccl.h + HINTS ${NCCL_ROOT} + PATH_SUFFIXES include + DOC "NCCL include directory.") - list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIR}) - list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIB_DIR}/libnccl${LIBEXT}) - install(DIRECTORY ${NCCL_INCLUDE_DIR}/ DESTINATION include) - install(DIRECTORY ${NCCL_LIB_DIR}/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE) - -else() - if(NCCL_PATH) - set(NCCL_ROOT ${NCCL_PATH}) +# find NCCL, set NCCL lib and include +if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR) + set(NCCL_FOUND ON) + set(NCCL_LIBRARIES ${NCCL_LIBRARY}) + set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR}) + + # Check NCCL version + if(EXISTS "${NCCL_INCLUDE_DIR}/nccl.h") + file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_DEFINES + REGEX "#define NCCL_MAJOR [0-9]+" ) + file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_DEFINES2 + REGEX "#define NCCL_MINOR [0-9]+" ) + string(REGEX MATCH "([0-9]+)" NCCL_MAJOR ${NCCL_VERSION_DEFINES}) + string(REGEX MATCH "([0-9]+)" NCCL_MINOR ${NCCL_VERSION_DEFINES2}) + set(NCCL_VERSION "${NCCL_MAJOR}.${NCCL_MINOR}") + if(NCCL_VERSION VERSION_LESS 2.23) + set(NCCL_OLD TRUE) + else() + set(NCCL_OLD FALSE) + endif() + message(STATUS "Found NCCL version: ${NCCL_VERSION}") else() - # if NCCL_PATH is not set, let's try to find it in the CUDA root - set(NCCL_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) + message(WARNING "NCCL header not found, unable to determine version") + set(NCCL_OLD TRUE) # Assume old version if we can't determine endif() - - find_library(NCCL_LIBRARY - NAMES libnccl${LIBEXT} - PATHS ${NCCL_ROOT} ${CUDA_ROOT} - PATH_SUFFIXES lib lib64 - DOC "NCCL library." 
) +endif() - find_path(NCCL_INCLUDE_DIR - NAMES nccl.h - HINTS ${NCCL_ROOT} - PATH_SUFFIXES include - DOC "NCCL include directory.") - - # find NCCL, set NCCL lib and include - if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR) - set(NCCL_FOUND ON) - set(NCCL_LIBRARIES ${NCCL_LIBRARY}) - set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR}) - endif() - - # find NCCL - if(NCCL_FOUND) - list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIBRARIES}) - list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIRS}) - message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" ) - message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" ) - add_library(nccl SHARED IMPORTED) - - # Build NCCL from source - else() - message(STATUS "Building NCCL from source") - list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE) - - ExternalProject_Add(${NCCL_NAME} - SOURCE_DIR ${PROJECT_SOURCE_DIR}/deps/${NCCL_NAME} - PREFIX ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} - INSTALL_DIR ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} - BUILD_BYPRODUCTS ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/libnccl${LIBEXT} - INSTALL_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND make src.build "${NCCL_BUILD_NVCC_GENCODE}" "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" "BUILDDIR=${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}" - BUILD_IN_SOURCE 1 - ) +# find NCCL +if(NCCL_FOUND AND (NOT NCCL_OLD OR CUDA_VERSION VERSION_LESS 12.0)) + list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIBRARIES}) + list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIRS}) + message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" ) + message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" ) + add_library(nccl SHARED IMPORTED) + +# Build NCCL from source +else() + message(STATUS "Building NCCL from source") + list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE) - ExternalProject_Get_Property(${NCCL_NAME} INSTALL_DIR) - message(STATUS "NCCL install dir: ${INSTALL_DIR}") - list(APPEND FLEXFLOW_INCLUDE_DIRS - ${INSTALL_DIR}/include) - list(APPEND FLEXFLOW_EXT_LIBRARIES - ${INSTALL_DIR}/lib/libnccl${LIBEXT}) - set_directory_properties(PROPERTIES ADDITIONAL_CLEAN_FILES "${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/") - - install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/include/ DESTINATION include) - install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE) + set(NCCL_BUILD_CMD make src.build "${NCCL_BUILD_NVCC_GENCODE}" "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" "BUILDDIR=${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}") + if(DEFINED ENV{MAKEFLAGS}) + set(NCCL_BUILD_CMD ${CMAKE_COMMAND} -E env MAKEFLAGS=$ENV{MAKEFLAGS} ${NCCL_BUILD_CMD}) endif() + ExternalProject_Add(${NCCL_NAME} + SOURCE_DIR ${PROJECT_SOURCE_DIR}/deps/${NCCL_NAME} + PREFIX ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} + INSTALL_DIR ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} + BUILD_BYPRODUCTS ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/libnccl${LIBEXT} + INSTALL_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND ${NCCL_BUILD_CMD} + BUILD_IN_SOURCE 1 + ) + ExternalProject_Get_Property(${NCCL_NAME} INSTALL_DIR) + message(STATUS "NCCL install dir: ${INSTALL_DIR}") + list(APPEND FLEXFLOW_INCLUDE_DIRS + ${INSTALL_DIR}/include) + list(APPEND FLEXFLOW_EXT_LIBRARIES + ${INSTALL_DIR}/lib/libnccl${LIBEXT}) + set_directory_properties(PROPERTIES ADDITIONAL_CLEAN_FILES "${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/") + + install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/include/ DESTINATION include) + install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/ DESTINATION lib PATTERN "pkgconfig" 
EXCLUDE) endif() diff --git a/config/config.linux b/config/config.linux index acffc210f..8eb4f3087 100755 --- a/config/config.linux +++ b/config/config.linux @@ -111,6 +111,11 @@ function get_build_configs() { BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" } +#install raft +echo "Building raft dependency ..." +INSTALL_PREFIX=./install $(dirname $0)/../deps/raft/build.sh libraft > /dev/null +echo "Building raft dependency ... Done" + if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then . $(dirname $0)/config.inc # Passing CMAKE_FLAGS or CUDA_PATH as $1 will print the value of the CMAKE_FLAGS/CUDA_PATH variable, diff --git a/deps/flashinfer b/deps/flashinfer new file mode 160000 index 000000000..be6bf5bb2 --- /dev/null +++ b/deps/flashinfer @@ -0,0 +1 @@ +Subproject commit be6bf5bb26f1f1b3edf094d903544600c574ee09 diff --git a/deps/legion b/deps/legion index 24e8c4523..0d32b3554 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 24e8c452341dea41427e0ce61e154d61715e6835 +Subproject commit 0d32b35542bc0e9aba5950e485b8fc3413ae664b diff --git a/deps/nccl b/deps/nccl index 6e24ef4e1..2ea4ee94b 160000 --- a/deps/nccl +++ b/deps/nccl @@ -1 +1 @@ -Subproject commit 6e24ef4e1f1eac9f104d115ef65429f179924ee7 +Subproject commit 2ea4ee94bfb04c886c79ccae60ac9961000fdee2 diff --git a/deps/raft b/deps/raft new file mode 160000 index 000000000..b79f15d2f --- /dev/null +++ b/deps/raft @@ -0,0 +1 @@ +Subproject commit b79f15d2f229849bc02425b2e4ffd7bd3db89d4c diff --git a/deps/tensorrt_llm/README.md b/deps/tensorrt_llm/README.md new file mode 100644 index 000000000..39fcecdd7 --- /dev/null +++ b/deps/tensorrt_llm/README.md @@ -0,0 +1,5 @@ +## Custom AllReduce Implementation + +This is an adapted version of the custom AllReduce plugin from NVIDIA's [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) repository. + +To replace the NCCL AllReduce call, we should also add a CUDA IPC support to the custom AllReduce usage. Our IPC&AllReduce implementation is referenced from [mlc-ai/relax](https://github.com/mlc-ai/relax). diff --git a/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu b/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu new file mode 100644 index 000000000..619eb8987 --- /dev/null +++ b/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include "custom_allreduce_kernels.h" + +namespace tensorrt_llm { + +static inline __device__ void st_flag_release(uint32_t &flag, + uint32_t *flag_addr) { +#if __CUDA_ARCH__ >= 700 + asm volatile("st.global.release.sys.b32 [%1], %0;" ::"r"(flag), + "l"(flag_addr)); +#else + __threadfence_system(); + asm volatile("st.global.volatile.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ void ld_flag_acquire(uint32_t &flag, + uint32_t *flag_addr) { +#if __CUDA_ARCH__ >= 700 + asm volatile("ld.global.acquire.sys.b32 %0, [%1];" + : "=r"(flag) + : "l"(flag_addr)); +#else + asm volatile("ld.global.volatile.b32 %0, [%1];" + : "=r"(flag) + : "l"(flag_addr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Type converters that pack the data format into a 128-bit data type +// +using PackedFloat = union { + int4 packed; + float unpacked[4]; +}; + +using PackedHalf = union { + int4 packed; + half2 unpacked[4]; +}; + +template <typename T> +struct PackedOn16Bytes {}; + +template <> +struct PackedOn16Bytes<float> { + using Type = PackedFloat; +}; + +template <> +struct PackedOn16Bytes<half> { + using Type = PackedHalf; +}; + +#ifdef ENABLE_BF16 +using PackedBFloat16 = union { + int4 packed; + __nv_bfloat162 unpacked[4]; +}; + +template <> +struct PackedOn16Bytes<__nv_bfloat16> { + using Type = PackedBFloat16; +}; +#endif + +// add two 128b data +template <typename T> +inline __device__ int4 add128b(T &a, T &b) { + T c; + c.unpacked[0] = a.unpacked[0] + b.unpacked[0]; + c.unpacked[1] = a.unpacked[1] + b.unpacked[1]; + c.unpacked[2] = a.unpacked[2] + b.unpacked[2]; + c.unpacked[3] = a.unpacked[3] + b.unpacked[3]; + return c.packed; +} + +__inline__ __device__ void multi_gpu_barrier(uint32_t **signals, + const uint32_t flag, + const size_t rank, + const size_t world_size, + int const tidx, + int const bidx) { + // At the end of the function, we know that at least block 0 from all other + // GPUs has reached that point. + uint32_t volatile *my_signals = signals[rank]; + if (tidx < world_size) { + // The 1st block notifies the other ranks. + if (bidx == 0) { + signals[tidx][rank] = flag; + } + + // Busy-wait until all ranks are ready. + while (my_signals[tidx] != flag) { + } + } + + // Make sure we can move on...
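+ // The __syncthreads() below also holds back the threads that skipped the busy-wait + // (tidx >= world_size), so the whole block observes the peer flags before it continues.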
+ __syncthreads(); +} + +__global__ void multiGpuBarrierKernel(AllReduceParams params) { + multi_gpu_barrier(params.peer_barrier_ptrs_out, + params.barrier_flag, + params.local_rank, + params.ranks_per_node, + threadIdx.x, + blockIdx.x); +} + +template +static __global__ void oneShotAllReduceKernel(AllReduceParams params) { + int const bidx = blockIdx.x; + int const tidx = threadIdx.x; + + // The number of elements packed into one for comms + static constexpr int NUM_ELTS = 16 / sizeof(T); + + // Packed data type for comms + using PackedStruct = typename PackedOn16Bytes::Type; + + multi_gpu_barrier(params.peer_barrier_ptrs_in, + params.barrier_flag, + params.local_rank, + RANKS_PER_NODE, + tidx, + bidx); + + // The source pointers. Distributed round-robin for the different warps. + T const *src_d[RANKS_PER_NODE]; +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + int rank = (params.local_rank + ii) % RANKS_PER_NODE; + src_d[ii] = reinterpret_cast(params.peer_comm_buffer_ptrs[rank]); + } + + // The location in the destination array (load 8 fp16 or load 4 fp32 using + // LDG.128). + size_t offset = bidx * params.elts_per_block + tidx * NUM_ELTS; + // The end of the segment computed by that block. + size_t max_offset = + min((bidx + 1) * params.elts_per_block, params.elts_per_rank); + + // Each block accumulates the values from the different GPUs on the same node. + for (size_t iter_offset = offset; iter_offset < max_offset; + iter_offset += blockDim.x * NUM_ELTS) { + // Iterate over the different ranks/devices on the node to load the values. + PackedStruct vals[RANKS_PER_NODE]; +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + vals[ii].packed = + *reinterpret_cast(&src_d[ii][iter_offset]); + } + + // Sum the values from the different ranks. + PackedStruct sums; + sums.packed = {0, 0, 0, 0}; +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + sums.packed = add128b(sums, vals[ii]); + } + + // Store to the destination buffer. + *reinterpret_cast(&reinterpret_cast( + params.local_output_buffer_ptr)[iter_offset]) = sums.packed; + } +} + +template +static __global__ void twoShotAllReduceKernel(AllReduceParams params) { + // The block index. + int const bidx = blockIdx.x; + // The thread index with the block. + int const tidx = threadIdx.x; + + // The number of elements packed into one for comms + static constexpr int NUM_ELTS = 16 / sizeof(T); + + // Packed data type for comms + using PackedType = typename PackedOn16Bytes::Type; + + // The location in the destination array (load 8 fp16 or load 4 fp32 using + // LDG.128). + const size_t block_offset = bidx * params.elts_per_block + tidx * NUM_ELTS; + const size_t block_start = params.rank_offset + block_offset; + // The end of the segment computed by that block. + size_t max_offset = min(block_start + params.elts_per_block, + params.rank_offset + params.elts_per_rank); + + multi_gpu_barrier(params.peer_barrier_ptrs_in, + params.barrier_flag, + params.local_rank, + RANKS_PER_NODE, + tidx, + bidx); + + // The source pointers. Distributed round-robin for the different warps. + T *src_d[RANKS_PER_NODE]; + // The destination ranks for round-robin gathering + size_t dst_rank[RANKS_PER_NODE]; +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + int rank = (params.local_rank + ii) % RANKS_PER_NODE; + src_d[ii] = reinterpret_cast(params.peer_comm_buffer_ptrs[rank]); + dst_rank[ii] = rank; + } + + // Each block accumulates the values from the different GPUs on the same node. 
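+ // Two-shot step 1 (reduce-scatter): this rank reduces only its own elts_per_rank slice and + // stores the partial sums back into its local comm buffer (src_d[0]); the gather of the other + // ranks' partial results happens after the flag barrier below.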
+ for (size_t local_offset = block_start; local_offset < max_offset; + local_offset += blockDim.x * NUM_ELTS) { + // Iterate over the different ranks/devices on the node to load the values. + PackedType vals[RANKS_PER_NODE]; +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + vals[ii].packed = + *reinterpret_cast(&src_d[ii][local_offset]); + } + + // Sum the values from the different ranks. + PackedType sums; + sums.packed = {0, 0, 0, 0}; +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + sums.packed = add128b(sums, vals[ii]); + } + + // Store to the local buffer. + *reinterpret_cast(&src_d[0][local_offset]) = sums.packed; + } + + // sync threads to make sure all block threads have the sums + __syncthreads(); + + // barriers among the blocks with the same idx (release-acquire semantics) + if (tidx < RANKS_PER_NODE) { + // The all blocks notifies the other ranks. + uint32_t flag_block_offset = RANKS_PER_NODE + bidx * RANKS_PER_NODE; + st_flag_release(params.barrier_flag, + params.peer_barrier_ptrs_in[tidx] + flag_block_offset + + params.local_rank); + + // Busy-wait until all ranks are ready. + uint32_t rank_barrier = 0; + uint32_t *peer_barrier_d = params.peer_barrier_ptrs_in[params.local_rank] + + flag_block_offset + tidx; + do { + ld_flag_acquire(rank_barrier, peer_barrier_d); + } while (rank_barrier != params.barrier_flag); + } + + // sync threads to make sure all other ranks has the final partial results + __syncthreads(); + + size_t max_block_offset = + min(block_offset + params.elts_per_block, params.elts_per_rank); + // Gather all needed elts from other intra-node ranks + for (size_t local_offset = block_offset; local_offset < max_block_offset; + local_offset += blockDim.x * NUM_ELTS) { +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + // use round-robin gathering from other ranks + size_t offset_rank = dst_rank[ii] * params.elts_per_rank + local_offset; + if (offset_rank >= params.elts_total) { + continue; + } + *reinterpret_cast( + &reinterpret_cast(params.local_output_buffer_ptr)[offset_rank]) = + *reinterpret_cast(&src_d[ii][offset_rank]); + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline int divUp(int a, int b) { + return (a + b - 1) / b; +} + +std::tuple kernelLaunchConfig(AllReduceStrategyType algo, + AllReduceParams ¶m, + size_t elts_per_thread) { + assert(param.elts_total % elts_per_thread == 0); + + int blocks_per_grid = 1, threads_per_block = DEFAULT_BLOCK_SIZE; + + const size_t total_threads = param.elts_total / elts_per_thread; + switch (algo) { + case AllReduceStrategyType::ONESHOT: { // one stage all reduce algo + if (total_threads <= DEFAULT_BLOCK_SIZE) { // local reduce + threads_per_block = WARP_SIZE * divUp(total_threads, WARP_SIZE); + blocks_per_grid = 1; + } else { // local reduce + threads_per_block = DEFAULT_BLOCK_SIZE; + blocks_per_grid = divUp(total_threads, DEFAULT_BLOCK_SIZE); + blocks_per_grid = + std::min(static_cast(MAX_ALL_REDUCE_BLOCKS), blocks_per_grid); + } + param.elts_per_rank = param.elts_total; + param.elts_per_block = + elts_per_thread * + divUp(param.elts_per_rank, elts_per_thread * blocks_per_grid); + break; + } + case AllReduceStrategyType::TWOSHOT: { // two stage all reduce algo + const size_t elts_per_rank = param.elts_total / param.ranks_per_node; + assert(elts_per_rank % elts_per_thread == 0); + + size_t total_threads = elts_per_rank / elts_per_thread; + total_threads = WARP_SIZE * ((total_threads + WARP_SIZE - 1) / 
WARP_SIZE); + assert(total_threads % WARP_SIZE == 0); + + while (total_threads % blocks_per_grid != 0 || + total_threads / blocks_per_grid > DEFAULT_BLOCK_SIZE) { + blocks_per_grid += 1; + } + + threads_per_block = total_threads / blocks_per_grid; + + // NOTE: cap the grid size at MAX_ALL_REDUCE_BLOCKS (each block then covers a larger slice) + if (static_cast<size_t>(blocks_per_grid) > MAX_ALL_REDUCE_BLOCKS) { + size_t iter_factor = 1; + while (blocks_per_grid / iter_factor > MAX_ALL_REDUCE_BLOCKS || + blocks_per_grid % iter_factor) { + iter_factor += 1; + } + blocks_per_grid /= iter_factor; + } + param.elts_per_rank = param.elts_total / param.ranks_per_node; + param.elts_per_block = param.elts_per_rank / blocks_per_grid; + param.elts_per_block = + elts_per_thread * divUp(param.elts_per_block, elts_per_thread); + param.rank_offset = param.rank * param.elts_per_rank; + break; + } + default: + assert(false && "Algorithm not supported here."); + } + + return std::make_tuple(blocks_per_grid, threads_per_block); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <typename T, int RANKS_PER_NODE> +void dispatchARKernels(AllReduceStrategyType algo, + AllReduceParams &param, + int blocks_per_grid, + int threads_per_block, + cudaStream_t stream) { + if (algo == AllReduceStrategyType::ONESHOT) { + oneShotAllReduceKernel<T, RANKS_PER_NODE> + <<<blocks_per_grid, threads_per_block, 0, stream>>>(param); + } else { + twoShotAllReduceKernel<T, RANKS_PER_NODE> + <<<blocks_per_grid, threads_per_block, 0, stream>>>(param); + } + multiGpuBarrierKernel<<<1, param.ranks_per_node, 0, stream>>>(param); +} + +template <typename T> +void invokeOneOrTwoShotAllReduceKernel(AllReduceParams &param, + AllReduceStrategyType strat, + cudaStream_t stream) { + assert(strat == AllReduceStrategyType::ONESHOT || + strat == AllReduceStrategyType::TWOSHOT); + auto last_error = cudaGetLastError(); + if (last_error != cudaSuccess) { + printf("cuda error: %s\n", cudaGetErrorString(last_error)); + assert(false && "Error before launching the kernel"); + } + + size_t elts_per_thread = 16 / sizeof(T); + auto [blocks_per_grid, threads_per_block] = + kernelLaunchConfig(strat, param, elts_per_thread); + switch (param.ranks_per_node) { + case 2: + dispatchARKernels<T, 2>( + strat, param, blocks_per_grid, threads_per_block, stream); + break; + case 4: + dispatchARKernels<T, 4>( + strat, param, blocks_per_grid, threads_per_block, stream); + break; + case 6: + dispatchARKernels<T, 6>( + strat, param, blocks_per_grid, threads_per_block, stream); + break; + case 8: + dispatchARKernels<T, 8>( + strat, param, blocks_per_grid, threads_per_block, stream); + break; + default: + break; + } + last_error = cudaGetLastError(); + if (last_error != cudaSuccess) { + printf("cuda error: %s\n", cudaGetErrorString(last_error)); + assert(false && "Error after launching the kernel"); + } +} + +void invokeMultiGpuBarrier(AllReduceParams &param, cudaStream_t stream) { + multiGpuBarrierKernel<<<1, param.ranks_per_node, 0, stream>>>(param); +} + +void customAllReduce(AllReduceParams &params, + void *data, + size_t elts, + DataType dataType, + AllReduceStrategyType strat, + cudaStream_t stream) { + params.local_output_buffer_ptr = data; + params.elts_total = elts; + + if (elts == 0) { + return; + } + + if (dataType == DT_FLOAT) { + invokeOneOrTwoShotAllReduceKernel<float>(params, strat, stream); + } else if (dataType == DT_HALF) { + invokeOneOrTwoShotAllReduceKernel<half>(params, strat, stream); + } else { + assert(false && "Unsupported data type"); + } +} + +} // namespace tensorrt_llm diff --git a/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.h b/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.h new file mode 100644 index 000000000..e56795047 --- /dev/null +++
b/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ffconst.h" + +#include +#include + +namespace tensorrt_llm { + +constexpr size_t WARP_SIZE = 32; +constexpr size_t MAX_ALL_REDUCE_BLOCKS = 24; +constexpr size_t MAX_RANKS_PER_NODE = 8; +constexpr size_t DEFAULT_BLOCK_SIZE = 1024; + +enum class AllReduceStrategyType : int8_t { + RING = 0, + ONESHOT = 1, + TWOSHOT = 2, + AUTO = 3, +}; + +struct AllReduceParams { + size_t elts_total; + size_t elts_per_rank; + size_t elts_per_block; + size_t rank_offset; + size_t ranks_per_node, rank, local_rank; + uint32_t barrier_flag; + uint32_t *peer_barrier_ptrs_in[MAX_RANKS_PER_NODE]; + uint32_t *peer_barrier_ptrs_out[MAX_RANKS_PER_NODE]; + void *peer_comm_buffer_ptrs[MAX_RANKS_PER_NODE]; + void *local_output_buffer_ptr; +}; + +inline size_t GetMaxRequiredWorkspaceSize(int world_size) { + if (world_size <= 2) { + return 16 * 1000 * 1000; + } + return 8 * 1000 * 1000; +} + +inline AllReduceStrategyType SelectImplementation(size_t message_size, + int world_size) { + const size_t maxWorkspaceSize = GetMaxRequiredWorkspaceSize(world_size); + + if (message_size > maxWorkspaceSize) { + return AllReduceStrategyType::RING; + } + + if (world_size <= 2) { + return AllReduceStrategyType::ONESHOT; + } + + if (world_size <= 4) { + if (message_size < 1 * 1000 * 1000) { + return AllReduceStrategyType::ONESHOT; + } + return AllReduceStrategyType::TWOSHOT; + } + + if (message_size < 500 * 1000) { + return AllReduceStrategyType::ONESHOT; + } + return AllReduceStrategyType::TWOSHOT; +} + +void customAllReduce(AllReduceParams ¶ms, + void *data, + size_t elts, + DataType dataType, + AllReduceStrategyType strat, + cudaStream_t stream); + +} // namespace tensorrt_llm diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 6ca337f58..db7164c84 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -37,21 +37,43 @@ RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ chmod +x ~/${MINICONDA_SCRIPT_NAME} && \ bash ~/${MINICONDA_SCRIPT_NAME} -b -p /opt/conda && \ rm ~/${MINICONDA_SCRIPT_NAME} && \ + /opt/conda/bin/conda config --set solver classic && \ /opt/conda/bin/conda upgrade --all && \ /opt/conda/bin/conda install conda-build conda-verify && \ /opt/conda/bin/conda clean -ya -# Optionally install HIP dependencies +# set MAKEFLAGS to speedup any dependency that uses make +ARG N_BUILD_CORES +ENV MAKEFLAGS "${MAKEFLAGS} -j${N_BUILD_CORES}" + +# Set env vars +ENV PATH /opt/conda/bin:$PATH +ENV CUDNN_DIR /usr/local/cuda +ENV CUDA_DIR /usr/local/cuda + +# GPU-specific dependencies +ARG FF_GPU_BACKEND "cuda" + +# Update NCCL if FF_GPU_BACKEND is cuda +RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \ + echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. 
Updating NCCL"; \ + ubuntu_version=$(lsb_release -rs); \ + ubuntu_version=${ubuntu_version//./}; \ + wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \ + DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \ + DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \ + rm -f cuda-keyring_1.0-1_all.deb; \ + DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \ + else \ + echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \ + fi' + +# Install hip dependencies if FF_GPU_BACKEND is hip_cuda or hip_rocm # Note that amd's docs say to also install the `hip-runtime-nvidia` package. This # package attempts to re-install cuda even though cuda is already installed # in the container. It also attempts to install packages for a graphical install. # For our container, we don't need `hip-runtime-nvidia` -ARG FF_GPU_BACKEND "cuda" ARG hip_version "5.6" -ARG N_BUILD_CORES -# set MAKEFLAGS to speedup any dependency that uses make -ENV MAKEFLAGS "${MAKEFLAGS} -j${N_BUILD_CORES}" - RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ]; then \ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing HIP dependencies"; \ # Check that hip_version is one of 5.3,5.4,5.5,5.6 @@ -82,11 +104,6 @@ RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ] fi RUN rm -rf /var/lib/apt/lists/* -# Set env vars -ENV PATH /opt/conda/bin:$PATH -ENV CUDNN_DIR /usr/local/cuda -ENV CUDA_DIR /usr/local/cuda - # Install python packages and other dependencies RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing # Install CPU-only Pytorch and related dependencies diff --git a/docker/flexflow/Dockerfile b/docker/flexflow/Dockerfile index 60f9d4d65..dff925965 100644 --- a/docker/flexflow/Dockerfile +++ b/docker/flexflow/Dockerfile @@ -27,9 +27,7 @@ RUN for pair in $BUILD_CONFIGS; do \ # Build and install C++ and Python versions of FlexFlow RUN mkdir -p build && cd build && \ eval "$BUILD_CONFIGS" ../config/config.linux && \ - make -j $N_BUILD_CORES && \ - eval "$BUILD_CONFIGS" ../config/config.linux && \ - make install && \ + make -j $N_BUILD_CORES install && \ ldconfig ENTRYPOINT ["/bin/bash"] diff --git a/docker/run.sh b/docker/run.sh index 666c8e112..2575150ae 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -18,8 +18,6 @@ ATTACH_GPUS=${ATTACH_GPUS:-true} gpu_arg="" if $ATTACH_GPUS ; then gpu_arg="--gpus all" ; fi -# Whether to attach inference weights / files (make sure to download the weights first) -ATTACH_INFERENCE_FILES=${ATTACH_INFERENCE_FILES:-false} # Amount of shared memory to give the Docker container access to # If you get a Bus Error, increase this value. 
If you don't have enough memory @@ -115,9 +113,11 @@ if [[ "$(docker images -q "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":lat exit 1 fi -inference_volumes="" -if $ATTACH_INFERENCE_FILES ; then - inference_volumes="-v ~/.cache/flexflow:/usr/FlexFlow/inference"; +hf_token_volume="" +hf_token_path="$HOME/.cache/huggingface/token" +if [ -f "$hf_token_path" ]; then + # If the token exists, add the volume mount to the Docker command + hf_token_volume+="-v $hf_token_path:/root/.cache/huggingface/token" fi -eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${inference_volumes}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" +eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${hf_token_volume}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" diff --git a/examples/cpp/AlexNet/alexnet.cc b/examples/cpp/AlexNet/alexnet.cc index 128496eab..350788232 100644 --- a/examples/cpp/AlexNet/alexnet.cc +++ b/examples/cpp/AlexNet/alexnet.cc @@ -26,7 +26,7 @@ using FlexFlow::ParallelTensor; using FlexFlow::SGDOptimizer; using FlexFlow::Tensor; -LegionRuntime::Logger::Category log_app("AlexNet"); +Legion::Logger log_app("AlexNet"); void parse_input_args(char **argv, int argc, AlexNetConfig &config) { for (int i = 1; i < argc; i++) { diff --git a/examples/cpp/DLRM/dlrm.cc b/examples/cpp/DLRM/dlrm.cc index 7dc49215b..d7dc16755 100644 --- a/examples/cpp/DLRM/dlrm.cc +++ b/examples/cpp/DLRM/dlrm.cc @@ -19,7 +19,7 @@ using namespace Legion; -LegionRuntime::Logger::Category log_app("DLRM"); +Legion::Logger log_app("DLRM"); void parse_input_args(char **argv, int argc, DLRMConfig &apConfig); diff --git a/examples/cpp/InceptionV3/inception.cc b/examples/cpp/InceptionV3/inception.cc index b2070cc52..6d0fa7ee5 100644 --- a/examples/cpp/InceptionV3/inception.cc +++ b/examples/cpp/InceptionV3/inception.cc @@ -21,7 +21,7 @@ using namespace Legion; using namespace FlexFlow; -LegionRuntime::Logger::Category log_app("Inceptionv3"); +Legion::Logger log_app("Inceptionv3"); Tensor InceptionA(FFModel &ff, Tensor input, int pool_features) { Tensor t1 = input; diff --git a/examples/cpp/ResNet/resnet.cc b/examples/cpp/ResNet/resnet.cc index 455eb743a..49ce934a6 100644 --- a/examples/cpp/ResNet/resnet.cc +++ b/examples/cpp/ResNet/resnet.cc @@ -24,7 +24,7 @@ using FlexFlow::Optimizer; using FlexFlow::SGDOptimizer; using FlexFlow::Tensor; -LegionRuntime::Logger::Category log_app("ResNet"); +Legion::Logger log_app("ResNet"); void parse_input_args(char **argv, int argc, ResNetConfig &config) { for (int i = 1; i < argc; i++) { diff --git a/examples/cpp/Transformer/transformer.cc b/examples/cpp/Transformer/transformer.cc index d61a63cd0..b04093b0a 100644 --- a/examples/cpp/Transformer/transformer.cc +++ b/examples/cpp/Transformer/transformer.cc @@ -17,7 +17,7 @@ using namespace Legion; -LegionRuntime::Logger::Category log_app("Transformer"); +Legion::Logger log_app("Transformer"); Tensor create_emb(FFModel *model, Tensor const &input, diff --git a/examples/cpp/XDL/xdl.cc b/examples/cpp/XDL/xdl.cc index 2e6c3cec9..a2272f36e 100644 --- a/examples/cpp/XDL/xdl.cc +++ b/examples/cpp/XDL/xdl.cc @@ -18,7 +18,7 @@ using namespace Legion; -LegionRuntime::Logger::Category log_app("XDL"); +Legion::Logger log_app("XDL"); void parse_input_args(char **argv, int argc, XDLConfig &apConfig); diff --git a/examples/cpp/candle_uno/candle_uno.cc b/examples/cpp/candle_uno/candle_uno.cc index 779b8e9c1..e9f4bf876 100644 --- a/examples/cpp/candle_uno/candle_uno.cc +++ b/examples/cpp/candle_uno/candle_uno.cc @@ -21,7 +21,7 @@ 
using namespace Legion; using namespace std; -LegionRuntime::Logger::Category log_app("Candle_Uno"); +Legion::Logger log_app("Candle_Uno"); void parse_input_args(char **argv, int argc, CandleConfig &apConfig); diff --git a/examples/cpp/mixture_of_experts/moe.cc b/examples/cpp/mixture_of_experts/moe.cc index a70731088..a25f94abd 100644 --- a/examples/cpp/mixture_of_experts/moe.cc +++ b/examples/cpp/mixture_of_experts/moe.cc @@ -20,7 +20,7 @@ using namespace Legion; -LegionRuntime::Logger::Category log_app("MoE"); +Legion::Logger log_app("MoE"); void parse_input_args(char **argv, int argc, MoeConfig &config) { for (int i = 1; i < argc; i++) { diff --git a/examples/cpp/resnext50/resnext.cc b/examples/cpp/resnext50/resnext.cc index 3c28ca27b..9b71b37cc 100644 --- a/examples/cpp/resnext50/resnext.cc +++ b/examples/cpp/resnext50/resnext.cc @@ -7,7 +7,7 @@ using FlexFlow::Optimizer; using FlexFlow::SGDOptimizer; using FlexFlow::Tensor; -LegionRuntime::Logger::Category log_app("resnext"); +Legion::Logger log_app("resnext"); Tensor resnext_block(FFModel &ff, Tensor input, diff --git a/examples/cpp/split_test/split_test.cc b/examples/cpp/split_test/split_test.cc index 97b98c321..ac9d516a5 100644 --- a/examples/cpp/split_test/split_test.cc +++ b/examples/cpp/split_test/split_test.cc @@ -3,7 +3,7 @@ using namespace Legion; using namespace FlexFlow; -LegionRuntime::Logger::Category log_app("split_test"); +Legion::Logger log_app("split_test"); void FlexFlow::top_level_task(Task const *task, std::vector const ®ions, diff --git a/examples/cpp/split_test_2/split_test_2.cc b/examples/cpp/split_test_2/split_test_2.cc index 69385d14c..fef078adb 100644 --- a/examples/cpp/split_test_2/split_test_2.cc +++ b/examples/cpp/split_test_2/split_test_2.cc @@ -9,7 +9,7 @@ using FlexFlow::PCG::Graph; using FlexFlow::PCG::GraphSearchHelper; using FlexFlow::PCG::Node; -LegionRuntime::Logger::Category log_app("split_test_2"); +Legion::Logger log_app("split_test_2"); void top_level_task(Task const *task, std::vector const ®ions, diff --git a/include/flexflow/attention_config.h b/include/flexflow/attention_config.h new file mode 100644 index 000000000..558246867 --- /dev/null +++ b/include/flexflow/attention_config.h @@ -0,0 +1,217 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _FLEXFLOW_ATTENTION_CONFIG_H_ +#define _FLEXFLOW_ATTENTION_CONFIG_H_ +#include "flexflow/batch_config.h" + +namespace FlexFlow { + +constexpr uint32_t kPagesize = 64; + +inline int round_up_pages(int const num_elements) { + return (num_elements + kPagesize - 1) / kPagesize; +} + +#define DISPATCH_HEADDIM(head_dim, HEAD_DIM, ...) 
\ + switch (head_dim) { \ + case 64: { \ + constexpr size_t HEAD_DIM = 64; \ + __VA_ARGS__ \ + break; \ + } \ + case 128: { \ + constexpr size_t HEAD_DIM = 128; \ + __VA_ARGS__ \ + break; \ + } \ + case 256: { \ + constexpr size_t HEAD_DIM = 256; \ + __VA_ARGS__ \ + break; \ + } \ + default: { \ + std::ostringstream err_msg; \ + err_msg << "Unsupported head_dim: " << head_dim; \ + throw std::invalid_argument(err_msg.str()); \ + } \ + } + +class AttentionMetaData { +public: + AttentionMetaData() { + num_q_heads_ = 0; + num_kv_heads_ = 0; + head_dim_ = 0; + q_indptr = nullptr; + kv_indptr = nullptr; + kv_indices = nullptr; + kv_last_page_len = nullptr; + qk_indptr = nullptr; + custom_mask = nullptr; + workspace = nullptr; + workspace_size = 0; + float_workspace = nullptr; + float_workspace_size = 0; + int_workspace = nullptr; + int_workspace_size = 0; + mem_size_ = 0; + enabled_ = false; + } + AttentionMetaData(AttentionMetaData const &rhs) { + num_q_heads_ = rhs.num_q_heads_; + num_kv_heads_ = rhs.num_kv_heads_; + head_dim_ = rhs.head_dim_; + q_indptr = rhs.q_indptr; + kv_indptr = rhs.kv_indptr; + kv_indices = rhs.kv_indices; + kv_last_page_len = rhs.kv_last_page_len; + qk_indptr = rhs.qk_indptr; + custom_mask = rhs.custom_mask; + workspace = rhs.workspace; + workspace_size = rhs.workspace_size; + float_workspace = rhs.float_workspace; + float_workspace_size = rhs.float_workspace_size; + int_workspace = rhs.int_workspace; + int_workspace_size = rhs.int_workspace_size; + mem_size_ = rhs.mem_size_; + enabled_ = rhs.enabled_; + decode_handler_collections = rhs.decode_handler_collections; + prompt_handler_collections = rhs.prompt_handler_collections; + } + + size_t mem_size() { + if (mem_size_ > 0) { + return mem_size_; + } + size_t batch_size = BatchConfig::max_requests_per_batch(); + size_t max_num_pages = + round_up_pages(BatchConfig::max_spec_tree_token_num() + + BatchConfig::max_sequence_length()); + size_t indices_size = std::max( + (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024); + size_t custom_mask_size = BatchConfig::max_requests_per_batch() * + ((BatchConfig::max_spec_tree_token_num() * + (BatchConfig::max_spec_tree_token_num() + + BatchConfig::max_sequence_length()) + + 7) / + 8); + + float_workspace_size = 128 * 1024 * 1024; // 128 MB + int_workspace_size = 8 * 1024 * 1024; // 8 MB + workspace_size = + float_workspace_size + int_workspace_size; // float + int workspace + + mem_size_ = alignTo(sizeof(int32_t) * indices_size + + sizeof(uint8_t) * custom_mask_size + workspace_size, + 16); + return mem_size_; + } + + void assign_address(void *ptr, int size) { + if (ptr == nullptr) { + q_indptr = nullptr; + kv_indptr = nullptr; + kv_indices = nullptr; + kv_last_page_len = nullptr; + qk_indptr = nullptr; + custom_mask = nullptr; + workspace = nullptr; + float_workspace = nullptr; + int_workspace = nullptr; + return; + } + assert(size >= mem_size() && + "Insufficient memory size for attention metadata"); + size_t batch_size = BatchConfig::max_requests_per_batch(); + size_t max_num_pages = + round_up_pages(BatchConfig::max_spec_tree_token_num() + + BatchConfig::max_sequence_length()); + size_t indices_size = std::max( + (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024); + size_t custom_mask_size = BatchConfig::max_requests_per_batch() * + ((BatchConfig::max_spec_tree_token_num() * + (BatchConfig::max_spec_tree_token_num() + + BatchConfig::max_sequence_length()) + + 7) / + 8); + + q_indptr = static_cast(ptr); + kv_indptr = q_indptr + batch_size + 1; 
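+ // The remaining index arrays are carved out of the same allocation in order: kv_indices, + // kv_last_page_len, qk_indptr; the byte-level custom mask and the float/int workspaces follow + // after the first indices_size int32 slots.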
+ kv_indices = kv_indptr + batch_size + 1; + kv_last_page_len = kv_indices + max_num_pages * batch_size; + qk_indptr = kv_last_page_len + batch_size + 1; + custom_mask = static_cast(ptr) + sizeof(int32_t) * indices_size; + workspace = static_cast(static_cast(ptr) + + sizeof(int32_t) * indices_size + + sizeof(uint8_t) * custom_mask_size); + float_workspace = workspace; + int_workspace = static_cast(static_cast(workspace) + + float_workspace_size); + } + + void set_num_q_heads(uint32_t const num_q_heads) { + num_q_heads_ = num_q_heads; + } + void set_num_kv_heads(uint32_t const num_kv_heads) { + num_kv_heads_ = num_kv_heads; + } + void set_head_dim(uint32_t const head_dim) { + head_dim_ = head_dim; + } + uint32_t num_q_heads() const { + return num_q_heads_; + } + uint32_t num_kv_heads() const { + return num_kv_heads_; + } + uint32_t head_dim() const { + return head_dim_; + } + + void set_enabled(bool const enabled) { + enabled_ = enabled; + } + bool enabled() const { + return enabled_; + } + + uint32_t num_q_heads_; + uint32_t num_kv_heads_; + uint32_t head_dim_; + + int32_t *q_indptr; + int32_t *kv_indptr; + int32_t *kv_indices; + int32_t *kv_last_page_len; + int32_t *qk_indptr; + uint8_t *custom_mask; + void *workspace; + size_t workspace_size; + void *float_workspace; + size_t float_workspace_size; + void *int_workspace; + size_t int_workspace_size; + + size_t mem_size_; + + // batchsize -> handler + bool enabled_; + std::unordered_map decode_handler_collections; + std::unordered_map prompt_handler_collections; +}; +} // namespace FlexFlow + +#endif // _FLEXFLOW_ATTENTION_CONFIG_H_ diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 009d1c250..76521e5cf 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -20,185 +20,207 @@ #include #include -// #define MAX_SEQ_LEN 1024 -// #define BATCH_SIZE 2 -// #define BATCH_SIZE 16 -// #define MAX_REQUESTS 256 - namespace FlexFlow { +inline int alignTo(int x, int y) { + return ((x + y - 1) / y) * y; +} + class InferenceResult; -class BeamInferenceResult; using BatchConfigFuture = Legion::Future; using InferenceResultFuture = Legion::Future; -using BeamSearchBatchConfigFuture = Legion::Future; -using TreeVerifyBatchConfigFuture = Legion::Future; -using BeamInferenceResultFuture = Legion::Future; + +/* + * StreamingCacheInfo is a class that manages the streaming kv cache for + * attention operator (https://arxiv.org/abs/2309.17453), and we use it in the + * draft model. It maintains a fixed-content *sink* cache and a fixed-size + * *window* cache. The *sink* cache is the foremost part of the original kv + * cache, while the *window* cache is the backmost part of the original kv cache + * and is rolling updated. The information is per-request. Note that the + * position encoding of the q&k alters each iteration (relative position), so we + * store the *pre-pos-encoding* kv value in the cache. + */ +class StreamingCacheInfo { +public: + StreamingCacheInfo(); + StreamingCacheInfo(int sink_cache_size, int window_cache_size); + StreamingCacheInfo(StreamingCacheInfo const &other); + + StreamingCacheInfo &operator=(StreamingCacheInfo const &other); + + void commit_cache(int len); + void reset_cache(); + int global_2_cache_index(int global_index); + int cache_2_global_index(int cache_index); + +public: + int sink_cache_size, window_cache_size; + // the meta info of the window cache, commit_len helps to determine if we fill + // up the window. 
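+ // window_back appears to be the rolling write cursor within the window cache and total_len the + // running token count for the request; see commit_cache() / reset_cache() for the exact update rules.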
+ int window_back, commit_len, total_len; +}; class BatchConfig { public: using RequestGuid = size_t; using TokenId = int; - BatchConfig(); + BatchConfig(InferenceMode inference_mode = INC_DECODING_MODE, + int model_id = 0); + BatchConfig(BatchConfig const &other); int num_active_requests() const; int num_active_tokens() const; static int max_requests_per_batch(); static int max_tokens_per_batch(); - static int max_verify_tokens_per_batch(); + static int max_tokens_per_ssm_batch(); + static int max_tokens_per_prefilling_batch(); static int max_spec_tree_token_num(); static int max_sequence_length(); + static int max_output_length(); + static size_t max_kv_cache_size(); + static bool streaming_cache(); + static int get_max_tree_depth(); friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc); void print() const; void save_to_file(std::string const &filename) const; virtual InferenceMode get_mode() const; static BatchConfig const *from_future(BatchConfigFuture const &future); + // Maximum possible values for different parameters // These maximum values are used for copying BatchConfig // across workers - static int const MAX_NUM_REQUESTS = 64; - static int const MAX_NUM_TOKENS = 1024; - static int const MAX_SPEC_TREE_TOKEN_NUM = 64; - - // Set by update - int num_tokens; - // number of tokens in prompt phase, start offset of tokens in inc_decoding - // phase. num_tokens - num_prompt_tokens = num_generation_tokens; - int num_generation_tokens; + inline static int const MAX_NUM_REQUESTS = 64; + inline static int const MAX_NUM_TOKENS = 1024; + inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 8; + inline static int const MAX_TREE_DEPTH = 8; + inline static int const MAX_TREE_WIDTH = 16; + inline static int const MAX_SPEC_TREE_TOKEN_NUM = + MAX_TREE_DEPTH * MAX_TREE_WIDTH; + inline static int const MAX_K_LOGITS = 16; + + // The Constants for the Streaming KVCache + inline static int const SINK_SIZE = 4; + // size_SINK + size_WINDOW + depth_DRAFT shouldn't exceed this value + inline static int const MAX_STREAMING_POS = 2048; + + int num_tokens = 0; + int num_available_requests = 0; + bool prompt_phase = false; + int num_tokens_to_commit = 0; + int model_id; + InferenceMode inference_mode; struct PerRequestInfo { - int first_token_depth_in_request; - int first_token_offset_in_batch; - int num_tokens_in_batch; - int max_sequence_length; - - // request id in batch config: - int batch_config_request_id; - bool prompt_phase = false; + int first_token_index_in_request = -1; + int first_token_offset_in_batch = -1; + int num_tokens_in_batch = 0; RequestGuid request_guid; + + static constexpr size_t request_guid_size = sizeof(RequestGuid); + static constexpr size_t alignment = 16; + static constexpr size_t padding_size = + (alignment - (sizeof(int) * 3 + request_guid_size) % alignment) % + alignment; + static constexpr size_t padding_length = padding_size / sizeof(int); + int padding[padding_length] = {}; // Padding for memory pointer alignment }; + struct PerTokenInfo { - int abs_depth_in_request; - int request_index; - TokenId token_id; + TokenId token_id = -1; + // Difference between the two: + // abs_index_in_request: non-tree cache size + index in the flattened + // speculative tree + // abs_depth_in_request: non_tree cache size + depth in the speculative tree + int abs_index_in_request = -1; + int abs_depth_in_request = -1; + int request_index = -1; }; - struct BitMask { - unsigned long long mask[MAX_SPEC_TREE_TOKEN_NUM] = {0}; + struct CommittedTokensInfo { + int 
index_in_kv_cache = -1; // the index in the temporary key-value cache + int request_index = -1; // request index in the batch + int token_depth = -1; // position of the token in the request's sequence + }; - // how many tokens before the tree, every sub requests need this part of - // cache + class BitMask { + public: + class Bitset { + public: + Bitset() : bits{0} {} + + Bitset(Bitset const &other) { + // Copy the entire array of bits from 'other' to this object + std::copy( + std::begin(other.bits), std::end(other.bits), std::begin(bits)); + } + + void set_bit(size_t pos) { + size_t idx = pos / 64; // Find the index in the array + size_t bit = pos % 64; // Find the bit position within the uint64_t + bits[idx] |= (1ULL << bit); + } + + bool test_bit(size_t pos) const { + size_t idx = pos / 64; + size_t bit = pos % 64; + return (bits[idx] & (1ULL << bit)) != 0; + } + + void clear() { + std::fill(std::begin(bits), std::end(bits), 0); + } + + uint64_t bits[(MAX_SPEC_TREE_TOKEN_NUM + 63) / 64]; + }; + + Bitset bit_mask[MAX_SPEC_TREE_TOKEN_NUM]; + // the number of generated tokens before the speculation tree (excluding the + // prompt tokens) int non_tree_cache_size = 0; - - // current tree size - int tree_size = 0; - - int this_layer_size = 0; - - // input length-> prompt/root - int prompt_size = 0; + // Tree size or prompt size. Because the prefilling phase and the decoding + // phase are separated, we only need one field to store the size of the tree + // or the prompt. + int tree_or_prompt_size = 0; + int current_layer_size = 0; + + BitMask() = default; + + BitMask(BitMask const &other) { + non_tree_cache_size = other.non_tree_cache_size; + tree_or_prompt_size = other.tree_or_prompt_size; + current_layer_size = other.current_layer_size; + for (int i = 0; i < MAX_SPEC_TREE_TOKEN_NUM; i++) { + bit_mask[i] = other.bit_mask[i]; + } + } + + void clear_bitmask() { + // Clear bit_mask but keep the other fields + for (int i = 0; i < MAX_SPEC_TREE_TOKEN_NUM; i++) { + bit_mask[i].clear(); + } + } }; BitMask causalMask[MAX_NUM_REQUESTS]; PerRequestInfo requestsInfo[MAX_NUM_REQUESTS]; + StreamingCacheInfo streamingCacheInfo[MAX_NUM_REQUESTS]; PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; - - bool request_completed[MAX_NUM_REQUESTS]; - bool request_running[MAX_NUM_REQUESTS]; -}; - -class TreeVerifyBatchConfig : public BatchConfig { -public: - TreeVerifyBatchConfig(); - ~TreeVerifyBatchConfig(); - InferenceMode get_mode() const; - friend std::ostream &operator<<(std::ostream &os, - TreeVerifyBatchConfig const &bc); - void print() const; - void save_to_file(std::string const &filename) const; - struct CommittedTokensInfo { - int token_index; // the index of the token in the previous batch - int request_index; // request index in the batch - int token_depth; // position of the token in the request's sequence - }; - - int num_tokens_to_commit; CommittedTokensInfo committed_tokens[MAX_NUM_TOKENS]; + bool request_available[MAX_NUM_REQUESTS]; }; struct InferenceResult { - static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS; - BatchConfig::TokenId token_ids[MAX_NUM_TOKENS]; -}; - -class BeamSearchBatchConfig : public BatchConfig { -public: - BeamSearchBatchConfig(); - BeamSearchBatchConfig(int model_id); - BeamSearchBatchConfig(size_t beam_width, size_t target_iterations); - BeamSearchBatchConfig(BeamSearchBatchConfig const &other, int model_id); - InferenceMode get_mode() const; - - ~BeamSearchBatchConfig(); - - friend std::ostream &operator<<(std::ostream &os, - BeamSearchBatchConfig const &bc); - void 
print() const; - void save_to_file(std::string const &filename) const; - bool done() const; - int max_beam_depth_all_requests() const; - int current_depth_all_requests() const; - int get_speculative_request_num() const; - - size_t beam_width; - size_t target_iterations; - - // how many requests is in speculative phase - int speculative_request_num = 0; - inline static int const MAX_BEAM_WIDTH = 3; - inline static int const MAX_BEAM_DEPTH = 8; - - // maximum tree branches for a request - inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3; - - int model_id; - - struct BeamSearchPerRequestInfo { - int beam_size; - int current_depth = -1; - int max_depth = MAX_BEAM_DEPTH; - - BatchConfig::TokenId - tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - int parent_id[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - int sub_request_num; - }; - - struct BeamSearchPerTokenInfo { - int sub_request_index; - }; - - BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS]; - BeamSearchPerTokenInfo - beamTokenInfo[MAX_NUM_TOKENS + - MAX_SPEC_TREE_TOKEN_NUM * MAX_NUM_REQUESTS]; - - int sub_requests[MAX_NUM_REQUESTS]; - -private: - size_t current_iteration; -}; - -struct BeamInferenceResult { - static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS; + int num_token_ids; + int num_gumbel_logits; BatchConfig::TokenId - token_ids[MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - float probs[MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - int parent_id[MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; + token_ids[BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_K_LOGITS]; + float probs[BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_K_LOGITS]; + float gumbel_logits[BatchConfig::MAX_NUM_TOKENS * + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; + InferenceResult() : num_token_ids(0), num_gumbel_logits(0) {} + InferenceResult(InferenceResult const &other); + friend std::ostream &operator<<(std::ostream &os, InferenceResult const &ir); }; }; // namespace FlexFlow diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 2c11ae113..1aa80112b 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -16,10 +16,14 @@ #ifndef _FLEXFLOW_CONFIG_H_ #define _FLEXFLOW_CONFIG_H_ #include "ffconst.h" +#include "flexflow/attention_config.h" #include "flexflow/batch_config.h" +#include "flexflow/ops/kernels/gemm_impl.h" #include "legion.h" +#include #include #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include #include #include #elif defined(FF_USE_HIP_ROCM) @@ -70,6 +74,8 @@ struct FFHandler { #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnHandle_t dnn; cublasHandle_t blas; + cublasLtHandle_t blasLt; + Internal::GemmEngine *gemm_engine; #else miopenHandle_t dnn; hipblasHandle_t blas; @@ -77,19 +83,24 @@ struct FFHandler { void *workSpace; size_t workSpaceSize; void *batch_config_metadata; + AttentionMetaData *incr_attention_metadata; + AttentionMetaData *tree_search_attention_metadata; + AttentionMetaData *tree_verify_attention_metadata; - // request info + token info + topolopgy mask info - size_t batch_config_metadata_size = + size_t batch_config_metadata_size = alignTo( sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask) + - 
sizeof(TreeVerifyBatchConfig::committed_tokens) + - sizeof(BatchConfig::request_completed); + sizeof(BatchConfig::request_available) + + sizeof(BatchConfig::causalMask) + + sizeof(BatchConfig::streamingCacheInfo) + + sizeof(BatchConfig::committed_tokens) + sizeof(int), + 16); + void *offload_reserve_space; size_t offload_reserve_space_size; DataType quantization_type; bool allowTensorOpMathConversion; + int num_devices; + int device_id; #ifdef FF_USE_NCCL ncclComm_t ncclComm; #endif @@ -145,6 +156,7 @@ class FFConfig { Legion::Runtime *lg_hlr; Legion::IndexSpaceT<1> all_gpu_task_is; // Legion::FieldSpace field_space; + bool log_instance_creation; bool benchmarking, profiling, perform_fusion; bool inference_debugging; size_t simulator_work_space_size; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 512645e62..f713e4592 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -68,7 +68,7 @@ enum MetricsType { enum InferenceMode { INC_DECODING_MODE = 2001, - BEAM_SEARCH_MODE = 2002, + TREE_SEARCH_MODE = 2002, TREE_VERIFY_MODE = 2003, }; @@ -137,6 +137,7 @@ enum OperatorType { OP_SHAPE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Shape OP_SIZE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Size OP_TOPK, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#TopK + OP_GUMBEL_TOPK, OP_ARG_TOPK, OP_WHERE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Where OP_CEIL, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Ceil @@ -166,7 +167,7 @@ enum OperatorType { OP_GATHER, // https://pytorch.org/docs/stable/generated/torch.gather.html OP_RMS_NORM, OP_RESIDUAL_RMS_NORM, - OP_BEAM_TOPK, + // OP_BEAM_TOPK, OP_ARGMAX, OP_INC_MULTIHEAD_SELF_ATTENTION, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 0b74b7fce..60372780e 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -133,71 +133,71 @@ flexflow_tensor_t flexflow_model_get_label_tensor(flexflow_model_t handle); void flexflow_model_zero_gradients(flexflow_model_t handle); flexflow_tensor_t flexflow_model_add_exp(flexflow_model_t handle, - const flexflow_tensor_t x, + flexflow_tensor_t const x, char const *name); flexflow_tensor_t flexflow_model_add_sin(flexflow_model_t handle, - const flexflow_tensor_t x, + flexflow_tensor_t const x, char const *name); flexflow_tensor_t flexflow_model_add_cos(flexflow_model_t handle, - const flexflow_tensor_t x, + flexflow_tensor_t const x, char const *name); flexflow_tensor_t flexflow_model_add_add(flexflow_model_t handle, - const flexflow_tensor_t x, - const flexflow_tensor_t y, + flexflow_tensor_t const x, + flexflow_tensor_t const y, bool inplace_a, char const *name); flexflow_tensor_t flexflow_model_add_subtract(flexflow_model_t handle, - const flexflow_tensor_t x, - const flexflow_tensor_t y, + flexflow_tensor_t const x, + flexflow_tensor_t const y, bool inplace_a, char const *name); flexflow_tensor_t flexflow_model_add_multiply(flexflow_model_t handle, - const flexflow_tensor_t x, - const flexflow_tensor_t y, + flexflow_tensor_t const x, + flexflow_tensor_t const y, bool inplace_a, char const *name); flexflow_tensor_t flexflow_model_add_divide(flexflow_model_t handle, - const flexflow_tensor_t x, - const flexflow_tensor_t y, + flexflow_tensor_t const x, + flexflow_tensor_t const y, bool inplace_a, char const *name); flexflow_tensor_t flexflow_model_add_max(flexflow_model_t handle, - const 
flexflow_tensor_t x, - const flexflow_tensor_t y, + flexflow_tensor_t const x, + flexflow_tensor_t const y, bool inplace_a, char const *name); flexflow_tensor_t flexflow_model_add_min(flexflow_model_t handle, - const flexflow_tensor_t x, - const flexflow_tensor_t y, + flexflow_tensor_t const x, + flexflow_tensor_t const y, bool inplace_a, char const *name); flexflow_tensor_t flexflow_model_add_reduce_sum(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int *axes, int n, bool keepdims, char const *name); flexflow_tensor_t flexflow_model_add_rsqrt(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, char const *name); flexflow_tensor_t flexflow_model_add_pow(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float const exponent, char const *name); flexflow_tensor_t flexflow_model_add_mean(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int *dims, int n, bool keepdims, @@ -205,7 +205,7 @@ flexflow_tensor_t flexflow_model_add_mean(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_conv2d(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, int out_channels, int kernel_h, int kernel_w, @@ -223,7 +223,7 @@ flexflow_tensor_t flexflow_tensor_t flexflow_model_add_embedding(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, int num_entries, int out_dim, enum AggrMode aggr, @@ -246,12 +246,12 @@ flexflow_tensor_t char const *name); flexflow_tensor_t flexflow_model_add_batch_norm(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, bool relu, char const *name); flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, int n, int *axes, bool elementwise_affine, @@ -261,9 +261,9 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle, flexflow_tensor_t * flexflow_model_add_residual_layer_norm(flexflow_model_t handle, - const flexflow_tensor_t input, - const flexflow_tensor_t residual1, - const flexflow_tensor_t residual2, + flexflow_tensor_t const input, + flexflow_tensor_t const residual1, + flexflow_tensor_t const residual2, bool use_two_residuals, int n, int *axes, @@ -274,8 +274,8 @@ flexflow_tensor_t * flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( flexflow_model_t handle, - const flexflow_tensor_t input, - const flexflow_tensor_t residual, + flexflow_tensor_t const input, + flexflow_tensor_t const residual, int n, int *axes, bool elementwise_affine, @@ -285,20 +285,21 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( flexflow_tensor_t flexflow_model_add_sigmoid_silu_multi(flexflow_model_t handle, - const flexflow_tensor_t input1, - const flexflow_tensor_t input2, + flexflow_tensor_t const input1, + flexflow_tensor_t const input2, + int intermediate_size, char const *name); flexflow_tensor_t flexflow_model_add_batch_matmul(flexflow_model_t handle, - const flexflow_tensor_t a, - const flexflow_tensor_t b, + flexflow_tensor_t const a, + flexflow_tensor_t const b, int a_seq_length_dim /* -1 */, int b_seq_length_dim /* -1 */); flexflow_tensor_t flexflow_model_add_dense( flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, int out_dim, enum ActiMode activation /* AC_MODE_NONE */, bool use_bias /* true */, @@ -329,96 +330,96 @@ 
flexflow_tensor_t flexflow_model_add_flat(flexflow_model_t handle, char const *name); flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle, - const flexflow_tensor_t input, - const flexflow_tensor_t index, + flexflow_tensor_t const input, + flexflow_tensor_t const index, int dim, char const *name); flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, int dim, char const *name); flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, int n, int *perm, char const *name); flexflow_tensor_t flexflow_model_add_reshape(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, int n, int *shape, char const *name); flexflow_tensor_t flexflow_model_add_reverse(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, int axis, char const *name); flexflow_tensor_t flexflow_model_add_relu(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, bool inplace, char const *name); flexflow_tensor_t flexflow_model_add_scalar_multiply(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, float const scalar, bool inplace, char const *name); flexflow_tensor_t flexflow_model_add_scalar_add(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, float const scalar, bool inplace, char const *name); flexflow_tensor_t flexflow_model_add_scalar_sub(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, float const scalar, bool inplace, char const *name); flexflow_tensor_t flexflow_model_add_scalar_truediv(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, float const scalar, bool inplace, char const *name); flexflow_tensor_t flexflow_model_add_gelu(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, char const *name); flexflow_tensor_t flexflow_model_add_identity(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, char const *name); flexflow_tensor_t flexflow_model_add_sigmoid(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, char const *name); flexflow_tensor_t flexflow_model_add_tanh(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, char const *name); flexflow_tensor_t flexflow_model_add_elu(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, bool inplace, char const *name); flexflow_tensor_t flexflow_model_add_dropout(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, float rate, unsigned long long seed, char const *name); flexflow_tensor_t flexflow_model_add_multihead_attention( flexflow_model_t handle, - const flexflow_tensor_t query, - const flexflow_tensor_t key, - const flexflow_tensor_t value, + flexflow_tensor_t const query, + flexflow_tensor_t const key, + flexflow_tensor_t const value, int embed_dim, int num_heads, int kdim, @@ -432,7 +433,7 @@ flexflow_tensor_t flexflow_model_add_multihead_attention( flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_heads, int kdim, @@ -444,15 +445,22 @@ flexflow_tensor_t 
flexflow_model_add_inc_multihead_self_attention( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, + bool streaming_cache, char const *name); flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_heads, int kdim, @@ -464,15 +472,22 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, + bool streaming_cache, char const *name); flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_heads, int kdim, @@ -484,15 +499,21 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, char const *name); -flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( +flexflow_tensor_t flexflow_model_add_groupquery_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_q_heads, int num_kv_heads, @@ -505,15 +526,22 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, + bool streaming_cache, char const *name); flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_q_heads, int num_kv_heads, @@ -526,15 +554,22 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, + bool streaming_cache, char const *name); flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_q_heads, int num_kv_heads, @@ -547,6 +582,12 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( enum DataType 
data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -554,39 +595,39 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( char const *name); flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float eps, int dim, char const *name); flexflow_tensor_t * flexflow_model_add_residual_rms_norm(flexflow_model_t handle_, - const flexflow_tensor_t input1_, - const flexflow_tensor_t input2_, + flexflow_tensor_t const input1_, + flexflow_tensor_t const input2_, float eps, int dim, char const *name); flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int k, bool sorted, - bool speculative_decoding, + bool renormalize, char const *name); -flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_, - const flexflow_tensor_t input_, - int max_beam_size, - bool sorted, - char const *name); +// flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_, +// const flexflow_tensor_t +// input_, int max_beam_size, +// bool sorted, +// char const *name); flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float top_p, char const *name); flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, bool beam_search, char const *name); @@ -972,12 +1013,21 @@ void flexflow_request_manager_set_max_requests_per_batch( void flexflow_request_manager_set_max_tokens_per_batch( flexflow_request_manager_t handle_, int max_num_tokens); -void flexflow_request_manager_set_max_spec_tree_token_num( - flexflow_request_manager_t handle_, int max_num_tokens); +void flexflow_request_manager_set_max_tokens_per_ssm_batch( + flexflow_request_manager_t handle_, int max_num_ssm_tokens); + +void flexflow_request_manager_set_max_tokens_per_prefilling_batch( + flexflow_request_manager_t handle_, int max_num_prefilling_tokens); void flexflow_request_manager_set_max_sequence_length( flexflow_request_manager_t handle_, int max_seq_length); +void flexflow_request_manager_set_max_output_length( + flexflow_request_manager_t handle_, int max_output_length); + +void flexflow_request_manager_set_max_kv_cache_size( + flexflow_request_manager_t handle_, int max_kv_cache_size); + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, @@ -1027,7 +1077,7 @@ flexflow_file_data_loader_t int num_q_heads, int num_kv_heads, int hidden_dim, - int qkv_inner_dim, + int head_dim, int tensor_parallelism_degree, bool use_full_precision); diff --git a/include/flexflow/graph.h b/include/flexflow/graph.h index 2e0cf1ca4..9dc657259 100644 --- a/include/flexflow/graph.h +++ b/include/flexflow/graph.h @@ -24,7 +24,7 @@ #include "legion/legion_utilities.h" #include -extern LegionRuntime::Logger::Category log_dp; +extern Legion::Logger log_dp; namespace FlexFlow::PCG { diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index f24a797ff..a866e52cb 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -15,21 +15,47 @@ #pragma once 
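For context, the rope_theta / rope_type / rope_factor / low_freq_factor / high_freq_factor / original_max_position_embeddings arguments threaded through the attention builders above match the long-context ("llama3"-style) rotary-embedding scaling convention. The sketch below shows one common way such values are folded into per-dimension inverse frequencies; it is an illustration of that convention under the assumption that rope_type is "llama3", not necessarily the exact formula the attention kernels in this patch implement:

#include <cmath>
#include <vector>

// Hypothetical illustration of "llama3"-style rotary scaling; the function
// name and the exact rule are assumptions, not taken from this patch.
inline std::vector<float> llama3_scaled_inv_freq(int head_dim,
                                                 float rope_theta,
                                                 float factor,
                                                 float low_freq_factor,
                                                 float high_freq_factor,
                                                 int original_max_position_embeddings) {
  float const kPi = 3.14159265358979f;
  std::vector<float> inv_freq;
  for (int i = 0; i < head_dim; i += 2) {
    // Base RoPE frequencies: theta^(-i/head_dim) over the even dimensions.
    inv_freq.push_back(1.0f / std::pow(rope_theta, (float)i / (float)head_dim));
  }
  float low_freq_wavelen = (float)original_max_position_embeddings / low_freq_factor;
  float high_freq_wavelen = (float)original_max_position_embeddings / high_freq_factor;
  for (float &freq : inv_freq) {
    float wavelen = 2.0f * kPi / freq;
    if (wavelen > low_freq_wavelen) {
      freq /= factor; // long wavelengths are fully rescaled
    } else if (wavelen >= high_freq_wavelen) {
      // Wavelengths between the two cutoffs are smoothly interpolated.
      float smooth = ((float)original_max_position_embeddings / wavelen - low_freq_factor) /
                     (high_freq_factor - low_freq_factor);
      freq = (1.0f - smooth) * freq / factor + smooth * freq;
    } // short wavelengths (high frequencies) are left unchanged
  }
  return inv_freq;
}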
#include "flexflow/batch_config.h" +#include #include #include +using json = nlohmann::json; + namespace FlexFlow { struct GenerationConfig { bool do_sample = false; + bool spec_sample = false; float temperature = 0.8; + // top-p renormalization float topp = 0.6; - GenerationConfig(bool _do_sample, float _temperature, float _topp) { - temperature = _temperature > 0 ? _temperature : temperature; - topp = _topp > 0 ? _topp : topp; - do_sample = _do_sample; + // top-k renormalization + int topk = 16; + GenerationConfig(bool _do_sample = false, + float _temperature = 0.8, + float _topp = 0.6, + bool _spec_sample = false, + int _topk = 16) + : do_sample(_do_sample), temperature(_temperature), topp(_topp), + spec_sample(_spec_sample), topk(_topk) { + assert(temperature > 0.0); + assert(topk <= BatchConfig::MAX_K_LOGITS); } - GenerationConfig() {} +}; + +struct GenerationRequest { + std::string prompt; + bool add_special_tokens = true; + double slo_ratio; + double emission_time_ms; + + GenerationRequest(std::string const &prompt_, + double slo_ratio_, + double emission_time_ms_, + bool add_special_tokens_ = true) + : prompt(prompt_), slo_ratio(slo_ratio_), + emission_time_ms(emission_time_ms_), + add_special_tokens(add_special_tokens_) {} }; struct GenerationResult { @@ -40,10 +66,139 @@ struct GenerationResult { std::string output_text; std::vector input_tokens; std::vector output_tokens; + double slo_ratio; + double emission_time_ms; + int decoding_steps; }; -#include -#include +// Contains the configuration for how to emit requests to the server, +// managing the request arrival rate. +class EmissionMachine { +public: + enum class EmissionMode { Constant, Poisson, Trace }; + EmissionMode mode; + double elapsed_time_ms; + double last_request_time_ms; + double req_per_s; + std::vector> slo_ratios; + + EmissionMachine(EmissionMode mode_, + double req_per_s_, + std::vector> slo_ratios_) + : mode(mode_), elapsed_time_ms(0), last_request_time_ms(0), + req_per_s(req_per_s_), slo_ratios(slo_ratios_) { + // cumulate the slo ratios for sampling + for (size_t i = 1; i < slo_ratios.size(); i++) { + slo_ratios[i].second += slo_ratios[i - 1].second; + } + } + void wait_until_next_request(); + + // Simulate next request arrival time + virtual double get_next_interval_ms() = 0; + virtual double sample_slo_ratio(); + double get_elapsed_time_ms(); +}; + +class EmissionTrace { +public: + std::string prompt; + int input_length, output_length; + double slo_ratio; + double emission_time_ms; + + EmissionTrace(std::string prompt_, + int input_length_, + int output_length_, + double slo_ratio_, + double emission_time_ms_) + : prompt(prompt_), input_length(input_length_), + output_length(output_length_), slo_ratio(slo_ratio_), + emission_time_ms(emission_time_ms_) {} + EmissionTrace(GenerationResult const &result) + : prompt(result.input_text), input_length(result.input_tokens.size()), + output_length(result.output_tokens.size()), slo_ratio(result.slo_ratio), + emission_time_ms(result.emission_time_ms) {} + EmissionTrace(json const &json_obj); + + json to_json() const; +}; + +class ConstantEmissionMachine : public EmissionMachine { +public: + double interval_ms; + + ConstantEmissionMachine(double req_per_s_, + std::vector> slo_ratios_) + : EmissionMachine(EmissionMode::Constant, req_per_s_, slo_ratios_), + interval_ms(req_per_s_ > 0 ? 
1e3 / req_per_s_ : 0) {} + + double get_next_interval_ms() override; +}; + +class PoissonEmissionMachine : public EmissionMachine { +public: + double lambda; + + PoissonEmissionMachine(double req_per_s_, + std::vector> slo_ratios_) + : EmissionMachine(EmissionMode::Poisson, req_per_s_, slo_ratios_), + lambda(req_per_s_) {} + + double get_next_interval_ms() override; +}; + +class TraceEmissionMachine : public EmissionMachine { +public: + std::vector timestamps, ratios; + size_t idx; + + TraceEmissionMachine(std::vector const ×tamps_, + std::vector const &ratios_) + : EmissionMachine(EmissionMode::Trace, 0, {}), timestamps(timestamps_), + ratios(ratios_), idx(0) {} + + double get_next_interval_ms() override; + double sample_slo_ratio() override; +}; + +struct RotaryEmbeddingMeta { + bool apply_rotary_embedding = false; + float rope_theta = 10000.0f; + std::string rope_type = "default"; + float factor = 8.0f; + float low_freq_factor = 1.0f; + float high_freq_factor = 4.0f; + int original_max_position_embeddings = 8192; + + RotaryEmbeddingMeta(bool apply_rotary_embedding_ = false, + float rope_theta_ = 10000.0f, + std::string rope_type_ = "default", + float factor_ = 8.0f, + float low_freq_factor_ = 1.0f, + float high_freq_factor_ = 4.0f, + int original_max_position_embeddings_ = 8192) + : apply_rotary_embedding(apply_rotary_embedding_), + rope_theta(rope_theta_), rope_type(rope_type_), factor(factor_), + low_freq_factor(low_freq_factor_), high_freq_factor(high_freq_factor_), + original_max_position_embeddings(original_max_position_embeddings_) {} + + friend std::ostream &operator<<(std::ostream &os, + RotaryEmbeddingMeta const &meta) { + os << std::boolalpha // To print bool as true/false instead of 1/0 + << "RotaryEmbeddingMeta {\n" + << " apply_rotary_embedding: " << meta.apply_rotary_embedding << ",\n" + << " rope_theta: " << meta.rope_theta << ",\n" + << " rope_type: \"" << meta.rope_type << "\",\n" + << " factor: " << meta.factor << ",\n" + << " low_freq_factor: " << meta.low_freq_factor << ",\n" + << " high_freq_factor: " << meta.high_freq_factor << ",\n" + << " original_max_position_embeddings: " + << meta.original_max_position_embeddings << "\n" + << "}"; + return os; + } +}; std::string join_path(std::vector const &paths); diff --git a/include/flexflow/layer.h b/include/flexflow/layer.h index 69a57e4e1..9d9045a44 100644 --- a/include/flexflow/layer.h +++ b/include/flexflow/layer.h @@ -32,11 +32,13 @@ class Layer { void add_float_property(std::string const &key, float value); void add_int_vector_property(std::string const &key, std::vector const &value); + void add_string_property(std::string const &key, std::string const &value); void add_initializer(std::string const &key, Initializer *initializer); bool get_int_property(std::string const &key, long long &value) const; bool get_float_property(std::string const &key, float &value) const; bool get_int_vector_property(std::string const &key, std::vector &value) const; + bool get_string_property(std::string const &key, std::string &value) const; bool get_initializer(std::string const &key, Initializer *&initializer) const; Tensor get_parameter(int index); void print(); @@ -59,6 +61,7 @@ class Layer { std::unordered_map float_properties; std::unordered_map initializers; std::unordered_map> int_vector_properties; + std::unordered_map string_properties; }; }; // namespace FlexFlow diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 95be9ab58..6d9356aee 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h 
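The emission classes above only declare their sampling hooks; their definitions live elsewhere in the patch. As a rough sketch of the intended behavior (assuming slo_ratios holds std::pair<double, double> entries of {slo_ratio, weight} whose second components were turned into cumulative weights by the EmissionMachine constructor, and that Poisson arrivals use exponential inter-arrival times at rate req_per_s), hypothetical helpers could look like this:

#include <random>
#include <utility>
#include <vector>

// Hypothetical helper: exponential inter-arrival time, in milliseconds, for a
// Poisson arrival process with rate req_per_s (requests per second).
inline double poisson_interval_ms(double req_per_s) {
  if (req_per_s <= 0.0) {
    return 0.0;
  }
  static thread_local std::mt19937_64 gen{std::random_device{}()};
  std::exponential_distribution<double> dist(req_per_s); // events per second
  return dist(gen) * 1e3;                                // seconds -> ms
}

// Hypothetical helper: draw an SLO ratio from the cumulative weights that the
// EmissionMachine constructor builds up in slo_ratios.
inline double sample_slo_ratio_from_cumulative(
    std::vector<std::pair<double, double>> const &slo_ratios) {
  if (slo_ratios.empty()) {
    return 1.0; // assumed default when no ratios are configured
  }
  static thread_local std::mt19937_64 gen{std::random_device{}()};
  std::uniform_real_distribution<double> dist(0.0, slo_ratios.back().second);
  double u = dist(gen);
  for (auto const &entry : slo_ratios) {
    if (u <= entry.second) { // first bucket whose cumulative weight covers u
      return entry.first;
    }
  }
  return slo_ratios.back().first;
}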
@@ -145,6 +145,9 @@ enum TaskIDs { TOPK_INIT_TASK_ID, TOPK_FWD_TASK_ID, TOPK_BWD_TASK_ID, + GUMBEL_TOPK_INIT_TASK_ID, + GUMBEL_TOPK_INF_TASK_ID, + GUMBEL_TOPK_INF_SPECULATIVE_TASK_ID, ARG_TOPK_INIT_TASK_ID, ARG_TOPK_INF_TASK_ID, ARG_TOPK_INF_SPECULATIVE_TASK_ID, @@ -164,8 +167,8 @@ enum TaskIDs { RMSNORM_INF_TASK_ID, RESIDUAL_RMSNORM_INIT_TASK_ID, RESIDUAL_RMSNORM_INF_TASK_ID, - BEAM_TOPK_INIT_TASK_ID, - BEAM_TOPK_INF_TASK_ID, + // BEAM_TOPK_INIT_TASK_ID, + // BEAM_TOPK_INF_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID, @@ -244,11 +247,13 @@ enum TaskIDs { RM_LOAD_TOKENS_TASK_ID, RM_LOAD_POSITION_TASK_ID, RM_LOAD_BATCH_CONFIG_TASK_ID, + RM_GET_NEXT_BATCH_CONFIG_TASK_ID, RM_PREPARE_NEXT_BATCH_TASK_ID, RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, - RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, + RM_PREPARE_NEXT_BATCH_SPEC_TASK_ID, RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, RM_BACKGROUND_SERVING_TASK_ID, + LOAD_WEIGHT_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, CUSTOM_GPU_TASK_ID_1, @@ -335,11 +340,12 @@ class Reshape; class Softmax; class Split; class TopK; +class GumbelTopK; class ArgTopK; class Transpose; class RMSNorm; class ResidualRMSNorm; -class BeamTopK; +// class BeamTopK; class SpecIncMultiHeadSelfAttention; class Sampling; class ArgMax; @@ -407,74 +413,74 @@ class FFModel { bool cpu_offload; // C++ APIs for constructing models // Add an exp layer - Tensor exp(const Tensor x, char const *name = NULL); + Tensor exp(Tensor const x, char const *name = NULL); // Add an add layer - Tensor add(const Tensor x, - const Tensor y, + Tensor add(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a subtract layer - Tensor subtract(const Tensor x, - const Tensor y, + Tensor subtract(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a multiply layer - Tensor multiply(const Tensor x, - const Tensor y, + Tensor multiply(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a divide layer - Tensor divide(const Tensor x, - const Tensor y, + Tensor divide(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a max layer - Tensor max(const Tensor x, - const Tensor y, + Tensor max(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a min layer - Tensor min(const Tensor x, - const Tensor y, + Tensor min(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a rsqrt layer - Tensor rsqrt(const Tensor x, bool inplace = true, char const *name = NULL); + Tensor rsqrt(Tensor const x, bool inplace = true, char const *name = NULL); // Add a pow layer - Tensor pow(const Tensor x, + Tensor pow(Tensor const x, float const exponent, bool inplace = true, char const *name = NULL); // Add a scalar multiply layer - Tensor scalar_multiply(const Tensor x, + Tensor scalar_multiply(Tensor const x, float const scalar, bool inplace = true, char const *name = NULL); - Tensor scalar_add(const Tensor x, + Tensor scalar_add(Tensor const x, float const scalar, bool inplace = true, char const *name = NULL); - Tensor scalar_sub(const Tensor x, + Tensor scalar_sub(Tensor const x, float const scalar, bool inplace = true, char const *name = NULL); - Tensor scalar_truediv(const Tensor x, + Tensor scalar_truediv(Tensor const x, float const scalar, bool inplace = true, char const *name = NULL); // Add a sin layer - Tensor sin(const Tensor x, 
char const *name = NULL); + Tensor sin(Tensor const x, char const *name = NULL); // Add a cos layer - Tensor cos(const Tensor x, char const *name = NULL); + Tensor cos(Tensor const x, char const *name = NULL); // Add an activation layer - Tensor relu(const Tensor x, bool inplace = true, char const *name = NULL); - Tensor identity(const Tensor x, char const *name = NULL); - Tensor gelu(const Tensor x, char const *name = NULL); - Tensor sigmoid(const Tensor x, char const *name = NULL); - Tensor tanh(const Tensor x, char const *name = NULL); - Tensor elu(const Tensor x, bool inplace = true, char const *name = NULL); + Tensor relu(Tensor const x, bool inplace = true, char const *name = NULL); + Tensor identity(Tensor const x, char const *name = NULL); + Tensor gelu(Tensor const x, char const *name = NULL); + Tensor sigmoid(Tensor const x, char const *name = NULL); + Tensor tanh(Tensor const x, char const *name = NULL); + Tensor elu(Tensor const x, bool inplace = true, char const *name = NULL); // Add a 2D convolutional layer - Tensor conv2d(const Tensor input, + Tensor conv2d(Tensor const input, int outChannels, int kernelH, int kernelW, @@ -490,12 +496,12 @@ class FFModel { Initializer *bias_initializer = NULL, char const *name = NULL); // Add a dropout layer - Tensor dropout(const Tensor input, + Tensor dropout(Tensor const input, float rate, unsigned long long seed = 0, char const *name = NULL); // Add an embedding layer - Tensor embedding(const Tensor input, + Tensor embedding(Tensor const input, int num_entries, int outDim, AggrMode aggr, @@ -504,13 +510,13 @@ class FFModel { Initializer *kernel_initializer = NULL, char const *name = NULL); // Add a gather layer - Tensor gather(const Tensor input, - const Tensor index, + Tensor gather(Tensor const input, + Tensor const index, int dim, char const *name = NULL); // Add a group_by layer - void group_by(const Tensor data, - const Tensor assign, + void group_by(Tensor const data, + Tensor const assign, Tensor *outputs, int n, float alpha, @@ -532,7 +538,7 @@ class FFModel { float lambda_bal, char const *name = NULL); // Add a 2D pooling layer - Tensor pool2d(const Tensor input, + Tensor pool2d(Tensor const input, int kernelH, int kernelW, int strideH, @@ -543,7 +549,7 @@ class FFModel { ActiMode activation = AC_MODE_NONE, char const *name = NULL); // Add a layer_norm layer - Tensor layer_norm(const Tensor input, + Tensor layer_norm(Tensor const input, std::vector const &axes, bool elementwise_affine, float eps, @@ -551,9 +557,9 @@ class FFModel { DataType data_type = DT_NONE, char const *name = NULL); // Add a layer_norm layer with residual(s) - void residual_layer_norm(const Tensor input, - const Tensor residual1, - const Tensor residual2, + void residual_layer_norm(Tensor const input, + Tensor const residual1, + Tensor const residual2, Tensor *outputs, bool use_two_residuals, std::vector const &axes, @@ -563,8 +569,8 @@ class FFModel { DataType data_type = DT_NONE, char const *name = NULL); // Add a add_bias_residual_layer_norm layer - void add_bias_residual_layer_norm(const Tensor input, - const Tensor residual, + void add_bias_residual_layer_norm(Tensor const input, + Tensor const residual, Tensor *outputs, std::vector const &axes, bool elementwise_affine, @@ -573,41 +579,42 @@ class FFModel { DataType data_type = DT_NONE, char const *name = NULL); // Add a sigmoid_silu_multi layer - Tensor sigmoid_silu_multi(const Tensor input1, - const Tensor input2, + Tensor sigmoid_silu_multi(Tensor const input1, + Tensor const input2, + int 
intermediate_size, DataType data_type = DT_NONE, char const *name = NULL); // Add a batch_norm layer Tensor - batch_norm(const Tensor input, bool relu = true, char const *name = NULL); + batch_norm(Tensor const input, bool relu = true, char const *name = NULL); // Add a batch_matmul layer - Tensor batch_matmul(const Tensor A, - const Tensor B, + Tensor batch_matmul(Tensor const A, + Tensor const B, int a_seq_length_dim = -1, int b_seq_length_dim = -1, char const *name = nullptr); // Add a root mean square layer - Tensor rms_norm(const Tensor input, + Tensor rms_norm(Tensor const input, float eps, int dim, DataType data_type = DT_NONE, char const *name = NULL); // Add a residual root mean square layer - void residual_rms_norm(const Tensor input1, - const Tensor input2, + void residual_rms_norm(Tensor const input1, + Tensor const input2, Tensor *outputs, float eps, int dim, DataType data_type = DT_NONE, char const *name = NULL); - // Add a beam search top k layer - Tensor beam_top_k(const Tensor input, - int max_beam_size, - bool sorted, - char const *name = NULL); + // // Add a beam search top k layer + // Tensor beam_top_k(Tensor const input, + // int max_beam_size, + // bool sorted, + // char const *name = NULL); // Add a dense layer - Tensor dense(const Tensor input, + Tensor dense(Tensor const input, int outDim, ActiMode activation = AC_MODE_NONE, bool use_bias = true, @@ -619,7 +626,7 @@ class FFModel { float regularizer_lambda = 0.0, char const *name = NULL); // Add a cast layer - Tensor cast(const Tensor input, DataType dtype, char const *name = nullptr); + Tensor cast(Tensor const input, DataType dtype, char const *name = nullptr); // Add a concat layer Tensor concat(int n, Tensor const *tensors, int axis, char const *name = NULL); @@ -634,58 +641,64 @@ class FFModel { int experts_internal_dim_size = 0, // hidden dimension for internal layers char const *name = NULL); // Add a mean layer - Tensor mean(const Tensor input, + Tensor mean(Tensor const input, std::vector const &dims, bool keepdims, char const *name); // Add a moe layer (wrapping topk, group_by and aggregate operators) - Tensor moe(const Tensor input, + Tensor moe(Tensor const input, int num_exp, int num_select, int expert_hidden_size, float alpha, float lambda); // Add a split layer - void split(const Tensor input, + void split(Tensor const input, Tensor *outputs, std::vector const &split, int axis, char const *name = NULL); // Add a flat layer - Tensor flat(const Tensor input, char const *name = NULL); + Tensor flat(Tensor const input, char const *name = NULL); // Add a softmax layer - Tensor softmax(const Tensor input, + Tensor softmax(Tensor const input, int dim = -1, DataType data_type = DT_NONE, char const *name = NULL); // Create input tensors and constants - Tensor transpose(const Tensor input, + Tensor transpose(Tensor const input, std::vector const &perm, char const *name = NULL); - Tensor reduce_sum(const Tensor input, + Tensor reduce_sum(Tensor const input, std::vector const &axes, bool keepdims = false, char const *name = nullptr); - Tensor reshape(const Tensor input, + Tensor reshape(Tensor const input, std::vector const &shape, char const *name = NULL); - Tensor reverse(const Tensor input, int axis, char const *name = NULL); - void top_k(const Tensor input, + Tensor reverse(Tensor const input, int axis, char const *name = NULL); + void top_k(Tensor const input, Tensor *outputs, int k, bool sorted, char const *name = NULL); - Tensor arg_top_k(const Tensor input, + Tensor gumbel_top_k(Tensor const input, + // 
Tensor *outputs, + int k, + bool sorted, + bool speculative_decoding, + char const *name = NULL); + Tensor arg_top_k(Tensor const input, // Tensor *outputs, int k, bool sorted, - bool speculative_decoding, + bool renormalize, char const *name = NULL); - Tensor argmax(const Tensor input, bool beam_search, char const *name = NULL); - Tensor sampling(const Tensor input, float top_p, char const *name = NULL); - Tensor multihead_attention(const Tensor query, - const Tensor key, - const Tensor value, + Tensor argmax(Tensor const input, bool beam_search, char const *name = NULL); + Tensor sampling(Tensor const input, float top_p, char const *name = NULL); + Tensor multihead_attention(Tensor const query, + Tensor const key, + Tensor const value, int embed_dim, int num_heads, int kdim = 0, @@ -697,42 +710,7 @@ class FFModel { DataType data_type = DT_NONE, Initializer *kernel_initializer = NULL, char const *name = NULL); - Tensor inc_multihead_self_attention(const Tensor input, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); - Tensor - spec_inc_multihead_self_attention(const Tensor input, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); - Tensor inc_multihead_self_attention_verify( + Tensor inc_multihead_self_attention( const Tensor input, int embed_dim, int num_heads, @@ -744,52 +722,73 @@ class FFModel { bool add_zero_attn = false, DataType data_type = DT_NONE, Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), bool scaling_query = false, float scaling_factor = 1.0f, bool qk_prod_scaling = true, bool position_bias = false, + bool streaming_cache = false, char const *name = NULL); - Tensor inc_multiquery_self_attention(const Tensor input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); - Tensor - spec_inc_multiquery_self_attention(const Tensor input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); - Tensor inc_multiquery_self_attention_verify( + Tensor 
spec_inc_multihead_self_attention( const Tensor input, int embed_dim, + int num_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = false, + bool add_bias_kv = false, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + bool streaming_cache = false, + char const *name = NULL); + Tensor inc_multihead_self_attention_verify( + Tensor const input, + int embed_dim, + int num_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = false, + bool add_bias_kv = false, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + char const *name = NULL); + Tensor groupquery_self_attention( + Tensor const input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = false, + bool add_bias_kv = false, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + bool streaming_cache = false, + char const *name = NULL); + Tensor spec_inc_multiquery_self_attention( + Tensor const input, + int embed_dim, int num_q_heads, int num_kv_heads, int kdim = 0, @@ -800,7 +799,27 @@ class FFModel { bool add_zero_attn = false, DataType data_type = DT_NONE, Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + bool streaming_cache = false, + char const *name = NULL); + Tensor inc_multiquery_self_attention_verify( + Tensor const input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = false, + bool add_bias_kv = false, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), bool scaling_query = false, float scaling_factor = 1.0f, bool qk_prod_scaling = true, @@ -810,7 +829,11 @@ class FFModel { // Inference APIs // ======================================== std::vector generate(std::vector &prompts, - int max_seq_length); + EmissionMachine &emission_machine); + + std::vector + generate(std::vector &requests, + EmissionMachine &emission_machine); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], @@ -820,7 +843,7 @@ class FFModel { bool create_grad = true); ParallelTensor create_parallel_tensor_legion_ordering(int num_dim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, int owner_idx = 0, @@ -833,7 +856,7 @@ class FFModel { int owner_idx = 0, bool create_grad = true); ParallelTensor create_parallel_tensor(int num_dim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, int 
owner_idx = 0, @@ -846,7 +869,7 @@ class FFModel { int owner_idx = 0, bool create_grad = true); template - ParallelTensor create_parallel_tensor(const ParallelDim dims[], + ParallelTensor create_parallel_tensor(ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, int owner_idx = 0, @@ -870,7 +893,7 @@ class FFModel { ParameterSyncType sync_type = ParameterSyncType::NONE); template ParallelParameter create_parallel_weight( - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, bool create_grad = true, @@ -878,7 +901,7 @@ class FFModel { ParameterSyncType sync_type = ParameterSyncType::NONE); ParallelParameter create_parallel_weight( int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, bool create_grad = true, @@ -886,7 +909,7 @@ class FFModel { ParameterSyncType sync_type = ParameterSyncType::NONE); ParallelParameter create_parallel_weight_legion_ordering( int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, bool create_grad = true, @@ -895,7 +918,7 @@ class FFModel { void map_tensor(ParallelTensor tensor, Op const *parallel_op); void map_weight(ParallelTensor tensor, Op const *parallel_op); - bool get_parallel_tensor_from_tensor(const Tensor tensor, + bool get_parallel_tensor_from_tensor(Tensor const tensor, ParallelTensor ¶llel_tensor) const; template @@ -936,7 +959,7 @@ class FFModel { // Internal PCG::Node creation APIs // ======================================== template - PCG::Node get_or_create_node(const typename T::Input &input, + PCG::Node get_or_create_node(typename T::Input const &input, typename T::Params const ¶ms) { using Params = typename T::Params; @@ -966,50 +989,50 @@ class FFModel { return this->new_node(op); } - PCG::Node get_or_create_noop_node(const ParallelTensor input); + PCG::Node get_or_create_noop_node(ParallelTensor const input); PCG::Node get_or_create_input_node(ParallelTensorShape const &); PCG::Node get_or_create_fused_parallel_node( - const ParallelTensor input, + ParallelTensor const input, std::vector const ¶llel_ops); - PCG::Node get_or_create_parallel_op_node(const ParallelTensor input, + PCG::Node get_or_create_parallel_op_node(ParallelTensor const input, ParallelOpInfo const &); // ======================================== // Internal APIs that should not be invoked from applications // ======================================== void create_disjoint_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], Legion::IndexSpace const &part_is, Legion::LogicalRegion const ®ion, Legion::LogicalPartition &part); template void create_disjoint_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], Legion::IndexSpaceT const &part_is, Legion::LogicalRegion const ®ion, Legion::LogicalPartition &part); void create_aliased_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, Legion::IndexSpace const &part_is, Legion::LogicalRegion const ®ion, Legion::LogicalPartition &part); template void create_aliased_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, Legion::IndexSpaceT const &part_is, Legion::LogicalRegion const ®ion, Legion::LogicalPartition &part); template - void create_disjoint_partition(const ParallelTensor tensor, + void create_disjoint_partition(ParallelTensor const tensor, Legion::IndexSpaceT const &part_is, Legion::LogicalPartition 
&part_fwd, Legion::LogicalPartition &part_bwd); template void create_data_parallel_partition_with_diff_dims( - const ParallelTensor tensor, + ParallelTensor const tensor, Legion::IndexSpaceT const &task_is, Legion::LogicalPartition &part_fwd, Legion::LogicalPartition &part_bwd); @@ -1059,6 +1082,10 @@ class FFModel { CompMode comp_mode = COMP_MODE_TRAINING); void compile_inference(); void set_transformer_layer_id(int id); + void set_num_transformer_layers(int num_layers); + void set_num_kv_heads(int num_heads); + void set_qkv_dim(int qkv_dim); + void set_size_dt(int size_dt); void set_position_offset(int offset); void graph_optimize(size_t budget, bool only_data_parallel, @@ -1078,6 +1105,7 @@ class FFModel { bool use_propagation) const; #ifdef FF_USE_NCCL ncclComm_t *find_nccl_comms(MachineView const &view) const; + void finish_nccl_comms(); #endif #ifdef FF_USE_PROPAGATE void propagate(std::map const ¤t, @@ -1097,7 +1125,7 @@ class FFModel { Legion::IndexSpace get_or_create_task_is(ParallelConfig const &pc); Legion::IndexSpace get_or_create_task_is(MachineView const &view); Legion::IndexSpace get_or_create_task_is(Legion::Domain const &domain); - Legion::IndexSpace get_or_create_task_is(const ParallelTensor); + Legion::IndexSpace get_or_create_task_is(ParallelTensor const); Legion::IndexSpace get_task_is(Legion::Domain const &domain) const; Legion::IndexSpace get_task_is(ParallelConfig const &pc) const; Legion::IndexSpace get_task_is(MachineView const &view) const; @@ -1119,6 +1147,10 @@ class FFModel { size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid; size_t current_transformer_layer_id; // positional embedding start offset + int num_transformer_layers; + int num_kv_heads; + int qkv_dim; + int size_dt; int position_offset; FFConfig config; FFIterationConfig iter_config; @@ -1203,8 +1235,8 @@ class FFModel { std::unordered_map< std::pair, IncMultiHeadSelfAttention *>, - std::unordered_map, - BeamTopK *>, + // std::unordered_map, + // BeamTopK *>, std::unordered_map, Sampling *>, std::unordered_map, @@ -1223,6 +1255,8 @@ class FFModel { std::unordered_map, Softmax *>, std::unordered_map, TopK *>, + std::unordered_map, + GumbelTopK *>, std::unordered_map, ArgTopK *>, std::unordered_map, diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 1b19bdb82..311699d92 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -19,7 +19,7 @@ namespace FlexFlow { -extern LegionRuntime::Logger::Category log_measure; +extern Legion::Logger log_measure; class OpMeta; class Simulator; @@ -233,6 +233,8 @@ class Op { std::vector const &, MachineView const *mv = nullptr) { assert(false); + Legion::FutureMap empty_map; + return empty_map; }; virtual void print_layer(FFModel const &model) = 0; template diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 5b187839e..766d4a582 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -8,7 +8,7 @@ #include "flexflow/ops/argmax_params.h" #include "flexflow/ops/attention_params.h" #include "flexflow/ops/batch_matmul_params.h" -#include "flexflow/ops/beam_topk_params.h" +// #include "flexflow/ops/beam_topk_params.h" #include "flexflow/ops/cast_params.h" #include "flexflow/ops/concat_params.h" #include "flexflow/ops/conv_2d_params.h" @@ -20,6 +20,7 @@ #include "flexflow/ops/flat_params.h" #include "flexflow/ops/gather_params.h" #include "flexflow/ops/groupby_params.h" +#include "flexflow/ops/gumbel_topk_params.h" #include 
"flexflow/ops/inc_multihead_self_attention_params.h" #include "flexflow/ops/layer_norm_params.h" #include "flexflow/ops/linear_params.h" @@ -69,7 +70,7 @@ using OperatorParameters = mp::variant namespace FlexFlow { class ArgTopKMeta : public OpMeta { public: - ArgTopKMeta(FFHandler handle, Op const *op); bool sorted; int k; - bool speculative_decoding; + bool renormalize; + Realm::RegionInstance reserveInst; + void *half_precision_output; + int max_output_size; + std::unordered_map device_resources; + ArgTopKMeta(FFHandler handle, + Op const *op, + MemoryAllocator &gpu_mem_allocator); + ~ArgTopKMeta(void); }; class ArgTopK : public Op { @@ -22,15 +32,15 @@ class ArgTopK : public Op { using Input = ParallelTensor; ArgTopK(FFModel &model, LayerID const &layer_guid, - const ParallelTensor input, + ParallelTensor const input, int k, bool sorted, - bool speculative_decoding, + bool renormalize, char const *name); ArgTopK(FFModel &model, LayerID const &layer_guid, ArgTopK const &other, - const ParallelTensor input); + ParallelTensor const input); ArgTopK(FFModel &model, Params const ¶ms, Input const input, @@ -64,7 +74,7 @@ class ArgTopK : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static BeamInferenceResult inference_speculative_task( + static InferenceResult inference_speculative_task( Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -81,28 +91,29 @@ class ArgTopK : public Op { MachineView const &pc, CostMetrics &cost_metrics) const override; template - static void forward_kernel(ArgTopKMeta const *m, + static void forward_kernel(ArgTopKMeta *m, DT const *input_ptr, - float *output_ptr, + DT *output_ptr, int *indices_ptr, size_t batch_size, int length, int k, bool sorted, - BeamSearchBatchConfig const *bc, + bool renormalize, + BatchConfig const *bc, ffStream_t stream); - static void forward_kernel_wrapper(ArgTopKMeta const *m, + static void forward_kernel_wrapper(ArgTopKMeta *m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &prob, GenericTensorAccessorW const &indices, int batch_size, - BeamSearchBatchConfig const *bc); + BatchConfig const *bc); Params get_params() const; public: int k; bool sorted; - bool speculative_decoding; + bool renormalize; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/arg_topk_params.h b/include/flexflow/ops/arg_topk_params.h index b2876c011..306ce9dd1 100644 --- a/include/flexflow/ops/arg_topk_params.h +++ b/include/flexflow/ops/arg_topk_params.h @@ -11,7 +11,7 @@ struct ArgTopKParams { LayerID layer_guid; int k; bool sorted; - bool speculative_decoding; + bool renormalize; char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/argmax.h b/include/flexflow/ops/argmax.h index 298059e3e..e58e8ca80 100644 --- a/include/flexflow/ops/argmax.h +++ b/include/flexflow/ops/argmax.h @@ -34,10 +34,10 @@ class ArgMax : public Op { using Params = ArgMaxParams; using Input = ParallelTensor; ArgMax(FFModel &model, - const ParallelTensor input, + ParallelTensor const input, bool beam_search, char const *name); - ArgMax(FFModel &model, ArgMax const &other, const ParallelTensor input); + ArgMax(FFModel &model, ArgMax const &other, ParallelTensor const input); ArgMax(FFModel &model, Params const ¶ms, Input const input, @@ -66,7 +66,7 @@ class ArgMax : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static BeamInferenceResult + static InferenceResult inference_task_beam(Legion::Task const 
*task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/beam_topk_params.h b/include/flexflow/ops/beam_topk_params.h deleted file mode 100644 index 3e09848c9..000000000 --- a/include/flexflow/ops/beam_topk_params.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef _FLEXFLOW_BEAM_TOPK_PARAMS_H -#define _FLEXFLOW_BEAM_TOPK_PARAMS_H - -#include "flexflow/ffconst.h" -#include "flexflow/fftype.h" -#include "flexflow/parallel_tensor.h" - -namespace FlexFlow { - -struct BeamTopKParams { - LayerID layer_guid; - bool sorted; - int max_beam_width; - char name[MAX_OPNAME]; - bool is_valid(ParallelTensorShape const &) const; -}; -bool operator==(BeamTopKParams const &, BeamTopKParams const &); - -} // namespace FlexFlow - -namespace std { -template <> -struct hash { - size_t operator()(FlexFlow::BeamTopKParams const &) const; -}; -} // namespace std - -#endif // _FLEXFLOW_BEAM_TOPK_PARAMS_H diff --git a/include/flexflow/ops/fused.h b/include/flexflow/ops/fused.h index a8326e9ab..b8e417ddc 100644 --- a/include/flexflow/ops/fused.h +++ b/include/flexflow/ops/fused.h @@ -1,17 +1,41 @@ #ifndef _FLEXFLOW_FUSED_H_ #define _FLEXFLOW_FUSED_H_ +#include "flexflow/batch_config.h" #include "flexflow/model.h" +#include "graph_params.h" namespace FlexFlow { +// declare Legion names +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::LogicalPartition; +using Legion::LogicalRegion; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Runtime; +using Legion::Task; + class FusedOp; class FusedOpMeta { public: - FusedOpMeta(void) {} + FusedOpMeta(void) { + graphCaptured = false; + graph_collections.reserve(BatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_NUM_TOKENS * 2); + } OpMeta *meta[MAX_NUM_FUSED_OPERATORS]; FusedOp *fused_op; int numOperators; + bool graphCaptured = false; +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + std::unordered_map graph_collections; +#else + std::unordered_map graph_collections; +#endif }; class FusedOp : public Op { diff --git a/include/flexflow/ops/graph_params.h b/include/flexflow/ops/graph_params.h new file mode 100644 index 000000000..0362801c8 --- /dev/null +++ b/include/flexflow/ops/graph_params.h @@ -0,0 +1,51 @@ +#ifndef _FLEXFLOW_GRAPH_PARAMS_H_ +#define _FLEXFLOW_GRAPH_PARAMS_H_ + +#include +#include + +namespace FlexFlow { +struct GraphParams { + int num_active_requests; + int num_active_tokens; + bool prompt_phase; + + GraphParams(int num_active_requests, int num_active_tokens, bool prompt_phase) + : num_active_requests(num_active_requests), + num_active_tokens(num_active_tokens), prompt_phase(prompt_phase) {} + + void Print() const { + printf("GraphParams, num_active_requests: %d, num_active_tokens: %d, " + "prompt_phase: %d\n\n", + num_active_requests, + num_active_tokens, + prompt_phase); + } +}; + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::GraphParams const &gp) const { + return std::hash()(gp.num_active_requests) ^ + std::hash()(gp.num_active_tokens) ^ + std::hash()(gp.prompt_phase); + } +}; +} // namespace std + +namespace std { +template <> +struct equal_to { + bool operator()(FlexFlow::GraphParams const &lhs, + FlexFlow::GraphParams const &rhs) const { + return lhs.num_active_requests == rhs.num_active_requests && + lhs.num_active_tokens == rhs.num_active_tokens && + lhs.prompt_phase == rhs.prompt_phase; + } +}; +} // namespace std + +#endif diff --git a/include/flexflow/ops/beam_topk.h 
b/include/flexflow/ops/gumbel_topk.h similarity index 53% rename from include/flexflow/ops/beam_topk.h rename to include/flexflow/ops/gumbel_topk.h index 9466ba2a3..b74361fb2 100644 --- a/include/flexflow/ops/beam_topk.h +++ b/include/flexflow/ops/gumbel_topk.h @@ -1,45 +1,58 @@ -#ifndef _FLEXFLOW_BEAM_TOPK_H_ -#define _FLEXFLOW_BEAM_TOPK_H_ +#ifndef _FLEXFLOW_GUMBEL_TOPK_H_ +#define _FLEXFLOW_GUMBEL_TOPK_H_ #include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/node.h" -#include "flexflow/ops/beam_topk_params.h" +#include "flexflow/ops/gumbel_topk_params.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include +#include +#elif defined(FF_USE_HIP_ROCM) +#include +#include +#endif #include "flexflow/utils/memory_allocator.h" namespace FlexFlow { -class BeamTopKMeta : public OpMeta { +class GumbelTopKMeta : public OpMeta { public: - BeamTopKMeta(FFHandler handle, - Op const *op, - MemoryAllocator &gpu_mem_allocator); - ~BeamTopKMeta(void); bool sorted; - int max_beam_width; - int *parent_ids; - void *acc_probs; - int *block_start_index; - int *request_id; - int *tokens_per_request; + int k; + bool speculative_decoding; Realm::RegionInstance reserveInst; +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + curandState *state; + int state_max_length; +#elif defined(FF_USE_HIP_ROCM) + hiprandState *state; +#endif + GumbelTopKMeta(FFHandler handle, + Op const *op, + MemoryAllocator &gpu_mem_allocator); + ~GumbelTopKMeta(void); }; -class BeamTopK : public Op { +class GumbelTopK : public Op { public: - using Params = BeamTopKParams; + using Params = GumbelTopKParams; using Input = ParallelTensor; - BeamTopK(FFModel &model, - const ParallelTensor input, - LayerID const &_layer_guid, - int max_beam_width, - bool sorted, - char const *name); - BeamTopK(FFModel &model, BeamTopK const &other, const ParallelTensor input); - BeamTopK(FFModel &model, - Params const ¶ms, - Input const input, - char const *name = nullptr); + GumbelTopK(FFModel &model, + LayerID const &layer_guid, + ParallelTensor const input, + int k, + bool sorted, + bool speculative_decoding, + char const *name); + GumbelTopK(FFModel &model, + LayerID const &layer_guid, + GumbelTopK const &other, + ParallelTensor const input); + GumbelTopK(FFModel &model, + Params const ¶ms, + Input const input, + char const *name = nullptr); void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, @@ -64,11 +77,16 @@ class BeamTopK : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static BeamInferenceResult + static InferenceResult inference_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static InferenceResult inference_speculative_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); void serialize(Legion::Serializer &s) const override; static PCG::Node deserialize(FFModel &ff, Legion::Deserializer &d, @@ -81,30 +99,31 @@ class BeamTopK : public Op { MachineView const &pc, CostMetrics &cost_metrics) const override; template - static void forward_kernel(BeamTopKMeta const *m, - BeamSearchBatchConfig const *bc, + static void forward_kernel(GumbelTopKMeta const *m, DT const *input_ptr, - float *output_ptr, + float *log_probs_ptr, + float *perturbed_log_probs_ptr, int *indices_ptr, - int *parent_ptr, - int batch_size, + size_t batch_size, int length, + int k, bool sorted, + BatchConfig const *bc, ffStream_t stream); - static void 
forward_kernel_wrapper(BeamTopKMeta const *m, - BeamSearchBatchConfig const *bc, - GenericTensorAccessorR const &input, - float *output_ptr, - int *indices_ptr, - int *parent_ptr, - int batch_size, - int length, - bool sorted); + static void + forward_kernel_wrapper(GumbelTopKMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &log_probs, + GenericTensorAccessorW const &perturbed_log_probs, + GenericTensorAccessorW const &indices, + int batch_size, + BatchConfig const *bc); Params get_params() const; public: + int k; bool sorted; - int max_beam_width; + bool speculative_decoding; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/gumbel_topk_params.h b/include/flexflow/ops/gumbel_topk_params.h new file mode 100644 index 000000000..480e7b9cc --- /dev/null +++ b/include/flexflow/ops/gumbel_topk_params.h @@ -0,0 +1,29 @@ +#ifndef _FLEXFLOW_GUMBEL_TOPK_PARAMS_H +#define _FLEXFLOW_GUMBEL_TOPK_PARAMS_H + +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct GumbelTopKParams { + LayerID layer_guid; + int k; + bool sorted; + bool speculative_decoding; + char name[MAX_OPNAME]; + bool is_valid(ParallelTensorShape const &) const; +}; +bool operator==(GumbelTopKParams const &, GumbelTopKParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::GumbelTopKParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_GUMBEL_TOPK_PARAMS_H diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 43dc527bc..8bc3b15a3 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_H #include "flexflow/accessor.h" +#include "flexflow/batch_config.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/inference.h" @@ -39,7 +40,7 @@ class IncMultiHeadSelfAttention : public Op { bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, @@ -47,6 +48,7 @@ class IncMultiHeadSelfAttention : public Op { bool allocate_weights, DataType _quantization_type, bool _offload, + bool _streaming_cache, int _tensor_parallelism_degree, char const *name); IncMultiHeadSelfAttention(FFModel &model, @@ -61,7 +63,7 @@ class IncMultiHeadSelfAttention : public Op { bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, @@ -69,6 +71,7 @@ class IncMultiHeadSelfAttention : public Op { bool allocate_weights, DataType _quantization_type, bool _offload, + bool _streaming_cache, int _tensor_parallelism_degree, char const *name); IncMultiHeadSelfAttention(FFModel &model, @@ -113,7 +116,7 @@ class IncMultiHeadSelfAttention : public Op { MachineView const &mv, CostMetrics &cost_metrics) const override; - static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, + static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, @@ -126,12 +129,12 @@ class IncMultiHeadSelfAttention : public Op { int num_q_heads, num_kv_heads, tensor_parallelism_degree; float 
dropout, scaling_factor; bool qkv_bias; - bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling, position_bias; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + bool final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; + int hidden_size, qk_dim, v_dim, o_dim; int qoSeqLength, kvSeqLength; DataType quantization_type; - bool offload; + bool offload, streaming_cache; }; class IncMultiHeadSelfAttentionMeta : public OpMeta { @@ -146,14 +149,11 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { IncMultiHeadSelfAttentionMeta(FFHandler handler, InferenceMode infer_mode, Op const *attn, - int _qSize, - int _kSize, - int _vSize, - int _qProjSize, - int _kProjSize, - int _vProjSize, - int _oProjSize, - bool _apply_rotary_embedding, + int _hidden_size, + int _qk_dim, + int _v_dim, + int _o_dim, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _qkv_bias, bool _scaling_query, bool _qk_prod_scaling, @@ -168,18 +168,19 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { int _num_q_heads, int _num_kv_heads, DataType _quantization_type, - bool _offload); + bool _offload, + bool _streaming_cache); ~IncMultiHeadSelfAttentionMeta(void); public: Realm::RegionInstance reserveInst; size_t weights_params, weightSize, biasSize, reserveSpaceSize, quantized_weightSize; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + int hidden_size, qk_dim, v_dim, o_dim; int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads, - hidden_size; + local_hidden_size; bool *has_load_weights; - bool *apply_rotary_embedding; + RotaryEmbeddingMeta *rotary_embedding_meta; bool *qkv_bias; bool *final_bias; bool *scaling_query; @@ -187,12 +188,20 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { bool *position_bias; float scaling_factor; void *weight_ptr, *bias_ptr; // for weight offload - void *devQKVProjArray, *keyCache, *valueCache; - void *qk_prods, *qk_prods_softmax; + void *devQKVProjArray, *queryTmp; + half *outputTmp; + void *kvCache; + bool streaming_cache; + // When enable Streaming cache, we alter relative position each iteration, so + // we need below memory buffer for storing the pre-pos-encoding key value in + // sink and window. 
+ void *streamingPrePosEncBuf; void *attn_heads; char *quantized_weight_ptr; BatchConfig::PerTokenInfo *token_infos; BatchConfig::PerRequestInfo *request_infos; + bool *request_available; + StreamingCacheInfo *streaming_cache_infos; DataType quantization_type; bool offload; #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h index 58681069e..809c4f19e 100644 --- a/include/flexflow/ops/inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -3,6 +3,7 @@ #include "flexflow/ffconst.h" #include "flexflow/fftype.h" +#include "flexflow/inference.h" #include "flexflow/parallel_tensor.h" namespace FlexFlow { @@ -12,10 +13,11 @@ struct IncMultiHeadSelfAttentionParams { int embed_dim, num_q_heads, kdim, vdim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; + bool qkv_bias, final_bias, add_zero_attn, scaling_query, qk_prod_scaling, + position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; DataType quantization_type; - bool offload; + bool offload, streaming_cache; char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/kernels/gemm_impl.h b/include/flexflow/ops/kernels/gemm_impl.h new file mode 100644 index 000000000..f0e08a67d --- /dev/null +++ b/include/flexflow/ops/kernels/gemm_impl.h @@ -0,0 +1,129 @@ +#ifndef GEMM_IMPL_H +#define GEMM_IMPL_H + +#include +#include + +namespace Internal { + +/* TODO: Consider appropriate case to use Lt */ +// #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) +// // Strangely, if mat2 has only 1 row or column, we get +// // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. +// // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == +// mat2_sizes[1] +// // is to use lt interface only when self is bias. 
+// // for cuda 11.4, cublasLtMatmul is activated +// // the last two conditions is to skip 16b transA and non-trans-B having +// // leading dim >> rows when they are sliced from a large tensor +// // see +// fbcode/caffe2/test/test_linalg.py:test_corner_cases_of_cublasltmatmul if +// (!disable_addmm_cuda_lt) { +// useLtInterface = beta.toComplexDouble() == 1.0 && self.dim() == 1 && +// result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] && +// self.is_contiguous() && result.is_contiguous() && +// (scalar_type == at::ScalarType::Double || +// scalar_type == at::ScalarType::Float || +// scalar_type == at::ScalarType::Half || +// scalar_type == at::ScalarType::BFloat16) && +// #if (defined(CUDA_VERSION) && CUDA_VERSION >= 12010) +// mat2_sizes[0] > 1 && mat2_sizes[1] > 1; +// #else +// mat2_sizes[0] > 1 && mat2_sizes[1] > 1 && +// mat2_sizes[0] < 65535 * 32 && mat2_sizes[1] < 65535 * 32 && +// mat1_sizes[0] < 65535 * 32 && mat1_sizes[1] < 65535 * 32 && +// // avoid leading dim >> rows bugs +// ((mat1.strides()[0] == 1 && mat1.strides()[1] == mat1_sizes[0]) || +// (mat1.strides()[1] == 1 && mat1.strides()[0] == mat1_sizes[1]) || +// (scalar_type != at::ScalarType::Half && +// scalar_type != at::ScalarType::BFloat16)) && +// ((mat2.strides()[0] == 1 && mat2.strides()[1] == mat2_sizes[0]) || +// (mat2.strides()[1] == 1 && mat2.strides()[0] == mat2_sizes[1]) || +// (scalar_type != at::ScalarType::Half && +// scalar_type != at::ScalarType::BFloat16)); +// #endif +// } +// #endif + +#define USE_CUBLASLT + +#ifdef USE_CUBLASLT +template +inline void gemm_internal_cublaslt(cublasLtHandle_t handle, + cudaDeviceProp *prop, + void *workspace, + size_t workspace_size, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + Dtype alpha, + Dtype const *a, + int64_t lda, + Dtype const *b, + int64_t ldb, + Dtype beta, + Dtype *c, + int64_t ldc, + cudaStream_t stream); +#else +template +inline void gemm_internal_cublas(cublasHandle_t handle, + cudaDeviceProp *prop, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + Dtype alpha, + Dtype const *a, + int64_t lda, + Dtype const *b, + int64_t ldb, + Dtype beta, + Dtype *c, + int64_t ldc, + cudaStream_t stream); +#endif + +// Wrapper for gemm +// Adopted from pytorch: +// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/cuda/CUDABlas.cpp +class GemmEngine { +public: + // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind + // defaultlt setting workspace size to 1M. 
+ GemmEngine(cublasHandle_t blas_, + cublasLtHandle_t blasLt_, + cudaDeviceProp *device_prop_ = nullptr, + size_t workspace_size_ = 1024 * 1024); + void assign_workspace(void *workspace_, size_t workspace_size_); + + template + void gemm_internal(cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + Dtype alpha, + Dtype const *a, + int64_t lda, + Dtype const *b, + int64_t ldb, + Dtype beta, + Dtype *c, + int64_t ldc, + cudaStream_t stream); + +public: + cublasHandle_t blas; + cublasLtHandle_t blasLt; + cudaDeviceProp *device_prop; + size_t workspace_size; // in bytes + void *workspace; +}; + +} // namespace Internal + +#endif // GEMM_IMPL_H diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index 9bf2f581e..4c66c1f2c 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -14,97 +14,143 @@ namespace FlexFlow { namespace Kernels { namespace IncMultiHeadAttention { +// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim] +__device__ __forceinline__ size_t + get_k_entry_offset_verify(int const token_idx, + int const page_idx, + int const num_heads, + int const head_dim) { + size_t index = ((page_idx)*kPagesize * 2 + (token_idx % kPagesize)) * + head_dim * num_heads; + return index; +} + +// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim] +__device__ __forceinline__ size_t + get_v_entry_offset_verify(int const token_idx, + int const page_idx, + int const num_heads, + int const head_dim) { + size_t index = + ((page_idx)*kPagesize * 2 + kPagesize + (token_idx % kPagesize)) * + head_dim * num_heads; + return index; +} + +// // kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim] +__device__ __forceinline__ size_t get_k_entry_offset(int const req_idx, + int const token_idx, + int const max_num_pages, + int const num_heads, + int const head_dim) { + return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 + + token_idx % kPagesize) * /* page slot index */ + num_heads * + head_dim; +} + +// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim] +__device__ __forceinline__ size_t get_v_entry_offset(int const req_idx, + int const token_idx, + int const max_num_pages, + int const num_heads, + int const head_dim) { + return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 + + kPagesize + token_idx % kPagesize) * /* page slot index */ + num_heads * + head_dim; +} + template -void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - DT *output_ptr, - ffStream_t stream); +void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + ffStream_t stream); +// [For the tokens in batch] +// Compute qkv projection for the tokens in the batch. template -void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *weight_ptr, - DT const *bias_ptr, - int num_tokens, - ffStream_t stream); +void compute_qkv(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + ffStream_t stream); +// [For the tokens in batch] +// Apply position embedding for qk. +// Note that this is only used for tokens in the current batch. 
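Editor's note: the entry-offset helpers above assume a paged KV layout of [num_pages, 2, page_size, num_kv_heads, head_dim], with the K half of each page followed by the V half. The sketch below only illustrates that index arithmetic; kPagesize and every size in it are made-up stand-ins, not values from this patch.

```cpp
#include <cstddef>
#include <cstdio>

// Illustration only: kPagesize and all sizes below are made-up stand-ins.
constexpr int kPagesize = 64;

// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
// K entries occupy the first half of each page, V entries the second half.
std::size_t k_entry_offset(int req_idx, int token_idx, int max_num_pages,
                           int num_heads, int head_dim) {
  return ((std::size_t)(req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
          token_idx % kPagesize) *
         num_heads * head_dim;
}

std::size_t v_entry_offset(int req_idx, int token_idx, int max_num_pages,
                           int num_heads, int head_dim) {
  return ((std::size_t)(req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
          kPagesize + token_idx % kPagesize) *
         num_heads * head_dim;
}

int main() {
  // Token 70 of request 1 lands in the request's page 70 / 64 = 1, slot 70 % 64 = 6.
  printf("k offset = %zu\n", k_entry_offset(1, 70, /*max_num_pages=*/32, 8, 128));
  printf("v offset = %zu\n", v_entry_offset(1, 70, /*max_num_pages=*/32, 8, 128));
  return 0;
}
```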
+// For other Key tokens like in streaming cache, we need another kernel to apply +// the position embedding. template -__global__ void apply_position_bias_qkprd(DT *input_ptr, - int num_tokens, - int num_total_tokens, - int num_heads, - int global_num_q_heads, - int shard_id); +void apply_pos_encoding_to_tokens_in_batch( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream); +// [For the tokens in streaming cache] +// Apply position embedding for k projection in the streaming cache. +// Note that before the position encoding, the projection is moved *in order* to +// the kv memory taken by the attention kernel. So our operation is applied where +// kvCache points to. template -__global__ void apply_proj_bias_w(DT *input_ptr, - DT const *bias_ptr, - int num_tokens, - int qkv_weight_size, - int oProjSize); +void apply_pos_encoding_to_streaming_proj( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); +// [For the tokens in batch] +// Update the kv cache, and compact the q array. +// Source: qkv projection array of tokens in the batch. +// Destination: q&kv ptr taken by the attention kernel. +// Note that the q&k here are the values after applying the position encoding. template -__global__ void apply_proj_bias_qkv(DT *input_ptr, - DT const *bias_ptr, - int shard_id, - int num_tokens, - int qProjSize, - int kProjSize, - int vProjSize, - int num_heads, - int num_kv_heads, - bool scaling_query, - float scaling_factor); - -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +template +void update_qkv_in_batch_paged(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream, + bool is_spec); + +// [For the tokens in streaming cache] +// Convert the out-of-order cache to in-order relative position. +// Source: pre-pos-encoding kv values in the streaming cache. +// Destination: kv ptr taken by the attention kernel. template -__global__ void - apply_rotary_embedding(DT *input_ptr, - cuFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_heads, - int num_tokens, - int num_kv_heads, - int q_block_size, - int k_block_size, - int q_array_size, - bool q_tensor); -#elif defined(FF_USE_HIP_ROCM) +void update_kv_in_streaming_cache(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +// [For the tokens in batch] +// Commit the kv values to the streaming cache. +// Source: qkv projection array of tokens in the batch. +// Destination: pre-pos-encoding kv values in the streaming cache.
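Editor's note: these streaming-cache helpers move KV entries between a pre-position-encoding sink-plus-window buffer and the cache the attention kernel reads. The sketch below is only a rough model of the sink + rolling-window policy described later in request_manager.h (fill the sink first, then wrap around inside the window); the sizes and the helper name are hypothetical and the real bookkeeping lives in StreamingCacheInfo.

```cpp
#include <cassert>
#include <cstdio>

// Hypothetical sizes; the real bookkeeping lives in StreamingCacheInfo.
constexpr int kSinkSize = 4;    // foremost tokens, never evicted
constexpr int kWindowSize = 16; // rolling window over the most recent tokens

// Cache slot occupied by the token at absolute position `pos`, given that
// `total_tokens` tokens have been processed so far; -1 if already evicted.
int streaming_cache_slot(int pos, int total_tokens) {
  if (pos < kSinkSize) {
    return pos; // sink entries keep their slot forever
  }
  int window_tokens = total_tokens - kSinkSize;
  int oldest_kept =
      kSinkSize + (window_tokens > kWindowSize ? window_tokens - kWindowSize : 0);
  if (pos < oldest_kept) {
    return -1; // overwritten by a newer token
  }
  return kSinkSize + (pos - kSinkSize) % kWindowSize; // wrap inside the window
}

int main() {
  // With 30 tokens seen: token 5 has been evicted, token 25 sits at 4 + (25 - 4) % 16 = 9.
  assert(streaming_cache_slot(5, 30) == -1);
  printf("slot of token 25 = %d\n", streaming_cache_slot(25, 30));
  return 0;
}
```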
template -__global__ void - apply_rotary_embedding(DT *input_ptr, - hipFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_heads, - int num_tokens, - int num_kv_heads, - int q_block_size, - int k_block_size, - int q_array_size, - bool q_tensor); -#endif +void commit_kv(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); template -void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT const *input_ptr, - DT const *weight_ptr, - DT *output_ptr, - DT const *bias_ptr, - ffStream_t stream); +void produce_output(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream); template -void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - ffStream_t stream); +void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *weight_ptr, + DT const *bias_ptr, + int num_tokens, + ffStream_t stream); } // namespace IncMultiHeadAttention } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index d1e0e050b..481243867 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -476,24 +476,24 @@ inline void smem_size_in_bytes_tree(int hidden_size_per_head, int max_sequence_length, int threads_per_value, int threads_per_block, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, int shared_mem[]) { int max_query_length = 0; int max_total_length = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (!bc->request_available[i]) { continue; } max_query_length = max(max_query_length, bc->requestsInfo[i].num_tokens_in_batch); max_total_length = max(max_total_length, - bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].first_token_index_in_request + bc->requestsInfo[i].num_tokens_in_batch); } // todo fix this - int max_qk_length = max_query_length; + int max_qk_length = max_total_length; // The amount of shared memory needed to store the Q*K^T values in float. size_t qk_sz = div_up(max_qk_length + 1, 4) * 16; @@ -512,7 +512,7 @@ inline void smem_size_in_bytes_tree(int hidden_size_per_head, size_t red_sz = rows_per_red * hidden_size_per_head * sizeof(float) / 2; // The max. 
shared_mem[0] = qk_sz; - shared_mem[1] = softmax_sz + red_sz + q_size; + shared_mem[1] = max(softmax_sz, red_sz) + q_size; } template @@ -520,5 +520,8 @@ struct threads_per_value_t { static int const value = Dh * sizeof(T) / 16; }; +#define test_bit(bit_mask, idx, pos) \ + (((bit_mask)[idx][(pos) / 64] & (1ULL << ((pos) % 64))) != 0) + } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_UTILS_H \ No newline at end of file diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 0eef4ca72..084898710 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -47,7 +47,8 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, GenericTensorAccessorR const &input2, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &residual_output, - GenericTensorAccessorW const &output); + GenericTensorAccessorW const &output, + int batch_size); } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/sampling.h b/include/flexflow/ops/sampling.h index 1696582cc..026be221f 100644 --- a/include/flexflow/ops/sampling.h +++ b/include/flexflow/ops/sampling.h @@ -46,10 +46,10 @@ class Sampling : public Op { using Params = SamplingParams; using Input = ParallelTensor; Sampling(FFModel &model, - const ParallelTensor input, + ParallelTensor const input, float top_p, char const *name); - Sampling(FFModel &model, Sampling const &other, const ParallelTensor input); + Sampling(FFModel &model, Sampling const &other, ParallelTensor const input); Sampling(FFModel &model, Params const ¶ms, Input const input, diff --git a/include/flexflow/ops/sigmoid_silu_multi.h b/include/flexflow/ops/sigmoid_silu_multi.h index 604438260..bc07e253e 100644 --- a/include/flexflow/ops/sigmoid_silu_multi.h +++ b/include/flexflow/ops/sigmoid_silu_multi.h @@ -19,6 +19,8 @@ class SigmoidSiluMulti : public Op { LayerID const &_layer_guid, const ParallelTensor _input1, const ParallelTensor _input2, + int _intermediate_size, + int _tensor_parallelism_degree, char const *name = nullptr); void init(FFModel const &) override; void init_inference(FFModel const &, @@ -68,18 +70,25 @@ class SigmoidSiluMulti : public Op { static void inference_kernel_wrapper(SigmoidSiluMultiMeta const *m, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, - GenericTensorAccessorW const &output); + GenericTensorAccessorW const &output, + int token_size); + +public: + int intermediate_size, tensor_parallelism_degree; }; class SigmoidSiluMultiMeta : public OpMeta { public: SigmoidSiluMultiMeta(FFHandler handle, SigmoidSiluMulti const *ln, - MemoryAllocator &gpu_mem_allocator); + MemoryAllocator &gpu_mem_allocator, + int _global_intermediate_size, + int _intermediate_size); ~SigmoidSiluMultiMeta(void); public: Realm::RegionInstance reserveInst; + int global_intermediate_size, intermediate_size; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/sigmoid_silu_multi_params.h b/include/flexflow/ops/sigmoid_silu_multi_params.h index eb152db5c..0e92c0aa6 100644 --- a/include/flexflow/ops/sigmoid_silu_multi_params.h +++ b/include/flexflow/ops/sigmoid_silu_multi_params.h @@ -8,6 +8,7 @@ namespace FlexFlow { struct SigmoidSiluMultiParams { LayerID layer_guid; + int intermediate_size, tensor_parallelism_degree; char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; diff --git 
a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index a0d01092b..e4e077e78 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -26,7 +26,7 @@ class SpecIncMultiHeadSelfAttention : public Op { SpecIncMultiHeadSelfAttention(FFModel &model, LayerID const &layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -36,16 +36,18 @@ class SpecIncMultiHeadSelfAttention : public Op { bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, bool allocate_weights, + bool _streaming_cache, + int _tensor_parallelism_degree, char const *name); SpecIncMultiHeadSelfAttention(FFModel &model, - const ParallelTensor _input, - const ParallelTensor _weight, + ParallelTensor const _input, + ParallelTensor const _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -55,16 +57,18 @@ class SpecIncMultiHeadSelfAttention : public Op { bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, bool allocate_weights, + bool _streaming_cache, + int _tensor_parallelism_degree, char const *name); SpecIncMultiHeadSelfAttention(FFModel &model, SpecIncMultiHeadSelfAttention const &other, - const ParallelTensor input, + ParallelTensor const input, bool allocate_weights); SpecIncMultiHeadSelfAttention(FFModel &model, Params const ¶ms, @@ -107,24 +111,24 @@ class SpecIncMultiHeadSelfAttention : public Op { MachineView const &mv, CostMetrics &cost_metrics) const override; - static void - inference_kernel_wrapper(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias); + static void inference_kernel_wrapper(SpecIncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &bias); Params get_params() const; public: int num_q_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; bool qkv_bias; - bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling, position_bias; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + bool final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; + int hidden_size, qk_dim, v_dim, o_dim; int qoSeqLength, kvSeqLength; + bool streaming_cache; }; class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { @@ -137,13 +141,6 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { int _num_q_heads, int _num_kv_heads); ~SpecIncMultiHeadSelfAttentionMeta(void); - -public: - Realm::RegionInstance beam_search_reserve_inst; - BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; - BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; - bool *request_completed; - BatchConfig::BitMask *causalMask; 
}; }; // namespace FlexFlow diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h index 1461224ba..75cb576dc 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h @@ -9,10 +9,13 @@ namespace FlexFlow { struct SpecIncMultiHeadSelfAttentionParams { LayerID layer_guid; - int embed_dim, num_q_heads, num_kv_heads, kdim, vdim; + int embed_dim, num_q_heads, num_kv_heads, kdim, vdim, + tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; + bool qkv_bias, final_bias, add_zero_attn, scaling_query, qk_prod_scaling, + position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; + bool streaming_cache; char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 02df0c013..3edf4dbd7 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -26,7 +26,7 @@ class TreeIncMultiHeadSelfAttention : public Op { TreeIncMultiHeadSelfAttention(FFModel &model, LayerID const &layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -36,7 +36,7 @@ class TreeIncMultiHeadSelfAttention : public Op { bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, @@ -47,8 +47,8 @@ class TreeIncMultiHeadSelfAttention : public Op { int _tensor_parallelism_degree, char const *name); TreeIncMultiHeadSelfAttention(FFModel &model, - const ParallelTensor _input, - const ParallelTensor _weight, + ParallelTensor const _input, + ParallelTensor const _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -58,7 +58,7 @@ class TreeIncMultiHeadSelfAttention : public Op { bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, @@ -70,7 +70,7 @@ class TreeIncMultiHeadSelfAttention : public Op { char const *name); TreeIncMultiHeadSelfAttention(FFModel &model, TreeIncMultiHeadSelfAttention const &other, - const ParallelTensor input, + ParallelTensor const input, bool allocate_weights); TreeIncMultiHeadSelfAttention(FFModel &model, Params const ¶ms, @@ -111,7 +111,7 @@ class TreeIncMultiHeadSelfAttention : public Op { CostMetrics &cost_metrics) const override; static void inference_kernel_wrapper(TreeIncMultiHeadSelfAttentionMeta *m, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, @@ -124,9 +124,9 @@ class TreeIncMultiHeadSelfAttention : public Op { int num_q_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; bool qkv_bias; - bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling, position_bias; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + bool final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; + int 
hidden_size, qk_dim, v_dim, o_dim; int qoSeqLength, kvSeqLength; DataType quantization_type; bool offload; @@ -145,10 +145,8 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { public: int num_active_tokens; - Realm::RegionInstance committed_token_reserve_inst; - TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; - bool *request_completed; - BatchConfig::BitMask *causalMask; + BatchConfig::CommittedTokensInfo *committed_token_infos; + int *num_tokens_to_commit; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h index d1a51b8b8..3906210d4 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h @@ -12,8 +12,9 @@ struct TreeIncMultiHeadSelfAttentionParams { int embed_dim, num_q_heads, kdim, vdim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; + bool qkv_bias, final_bias, add_zero_attn, scaling_query, qk_prod_scaling, + position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; DataType quantization_type; bool offload; char name[MAX_OPNAME]; diff --git a/include/flexflow/optimizer.h b/include/flexflow/optimizer.h index bab7e6e4e..35f0c8542 100644 --- a/include/flexflow/optimizer.h +++ b/include/flexflow/optimizer.h @@ -20,7 +20,8 @@ #include "legion.h" namespace FlexFlow { - +using Legion::Context; +using Legion::Runtime; class FFModel; class OpMeta; @@ -60,7 +61,9 @@ class SGDOptimizer : public Optimizer { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void nccl_update_task_gpu(SGDOptimizer const *op, + static void nccl_update_task_gpu(Legion::Context ctx, + Legion::Runtime *runtime, + SGDOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -103,7 +106,9 @@ class AdamOptimizer : public Optimizer { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void nccl_update_task_gpu(AdamOptimizer const *op, + static void nccl_update_task_gpu(Legion::Context ctx, + Legion::Runtime *runtime, + AdamOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, diff --git a/include/flexflow/page_manager.h b/include/flexflow/page_manager.h new file mode 100644 index 000000000..c0d6df085 --- /dev/null +++ b/include/flexflow/page_manager.h @@ -0,0 +1,162 @@ +#pragma once + +#include "flexflow/batch_config.h" +#include "flexflow/config.h" +#include "flexflow/inference.h" +#include "flexflow/model.h" +#include "flexflow/utils/file_loader.h" +#include +#include +#include +#include + +namespace FlexFlow { + +using TokenId = BatchConfig::TokenId; + +/** + * @class LogicalTokenBlock + * @brief A class to represent a sequence of tokens for each request + */ +class LogicalTokenBlock { +public: + using TokenId = BatchConfig::TokenId; + + // Constructor + LogicalTokenBlock(int block_number, uint32_t block_size); + + // Method to check if the block is empty + bool is_empty() const; + + // Method to check if the block is full + bool is_full() const; + + // Method to get the number of empty slots + int get_num_empty_slots() const; + + // Method to get the number of allocated slots + int get_num_alloc_slots() const; + + // Used to clean up the spec tokens in a block since these spec tokens may not + // be committed 
after use + void reset_num_spec_tokens(); + + // Method to append tokens + void append_tokens(std::vector const &token_ids_to_append, + bool committed); + + int get_num_tokens() const { + return num_tokens; + } + int get_num_commit_tokens() const { + return num_commit_tokens; + } + int get_num_spec_tokens() const { + return num_spec_tokens; + } + + std::vector get_token_ids() const; + +private: + int block_number; // the index of the logical token block + int block_size; // the size of the block + int num_tokens; // the number of tokens currently stored in the block + int num_commit_tokens; // the number of tokens inside this block that are + // already committed + int num_spec_tokens; // the number of tokens inside this block that are + // speculative tokens, which is stored temporarily + std::vector token_ids; // store the token ids in an order that + // corresponds to the inference sequence +}; + +/** + * @class PhysicalTokenBlock + * @brief A class to represent a physical block of tokens similar to a physical + * memory address. It keeps track of the location of the tokens stored in GPU + * memory + */ +class PhysicalTokenBlock { +public: + // Constructor + PhysicalTokenBlock(int block_number, int block_size); + + // Method to get the block number + int get_block_number() const { + return block_number; + } + void incr_ref_count() { + ref_count++; + } + void decr_ref_count() { + ref_count--; + } + int ref_count; // reference count, TODO: move to private + +private: + int block_number; // the index of the physical token block + int block_size; // the size of the block +}; + +/** + * @class BlockAllocator + * @brief A Block Manager that is responsible for maintaining a pool of free + * blocks + */ +class BlockAllocator { +public: + // Constructor + BlockAllocator(int block_size, int num_total_blocks); + + // Allocate a block + PhysicalTokenBlock allocate(); + + // Free a block + void free(PhysicalTokenBlock &block); + + // Get the number of free blocks + int get_num_free_blocks() const; + +private: + int block_size; + size_t num_total_blocks; + std::deque free_blocks; +}; + +/* + * @class PageManager + * @brief A wrapper class that manages the kv cache allocation status; + * note that all layers of the model share the same page manager because + * the position of the kv cache is the same + */ +class PageManager { +public: + // Get the singleton instance of the PageManager as it will be shared in + // multiple places + static PageManager *get_page_manager(); + static PageManager *get_page_manager(FFModel *ff, size_t kv_cache_size); + size_t get_kv_cache_size_per_layer(); + using BlockTable = std::vector; + using RequestGuid = BatchConfig::RequestGuid; + PageManager(int block_size, size_t num_total_blocks); + int allocate_one_block(RequestGuid const &request_guid); + void free_request(RequestGuid const &request_guid); + // used for the case where we want to free the last num_blocks that store spec + // tokens (which are the tokens not yet committed) + void free_multiple_blocks(RequestGuid const &request_guid, int num_blocks); + std::vector + get_block_table_indices(RequestGuid const &request_guid) const; + + void free_block_table(BlockTable &block_table); + +private: + size_t kv_cache_size_per_layer; + int block_size; // the size of the block + int num_total_blocks; // the total number of blocks + BlockAllocator block_allocator; + std::unordered_map block_tables; + + int get_num_total_free_blocks() const; + int get_num_allocated_blocks(RequestGuid const &request_guid) const; +}; + +};
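Editor's note: PageManager and BlockAllocator follow a vLLM-style paged-attention scheme: each request holds a table of physical blocks drawn from a shared free pool, and a token's logical index maps to a physical cache slot through that table. The toy sketch below illustrates that mapping only; the names and the missing reference counting are simplifications, not the FlexFlow API.

```cpp
#include <cassert>
#include <deque>
#include <unordered_map>
#include <vector>

// Toy paged-KV bookkeeping, loosely following the BlockAllocator/PageManager
// classes declared above. Illustration only.
class ToyPageManager {
public:
  ToyPageManager(int block_size, int num_total_blocks) : block_size(block_size) {
    for (int i = 0; i < num_total_blocks; i++) {
      free_blocks.push_back(i);
    }
  }

  // Make sure request `guid` has room for `num_tokens` tokens.
  void reserve(int guid, int num_tokens) {
    auto &table = block_tables[guid];
    while ((int)table.size() * block_size < num_tokens) {
      assert(!free_blocks.empty() && "out of KV-cache blocks");
      table.push_back(free_blocks.front());
      free_blocks.pop_front();
    }
  }

  // Map a logical token index within the request to a physical cache slot.
  int physical_slot(int guid, int token_idx) const {
    auto const &table = block_tables.at(guid);
    return table[token_idx / block_size] * block_size + token_idx % block_size;
  }

  // Return all of a request's blocks to the free pool.
  void free_request(int guid) {
    for (int b : block_tables[guid]) {
      free_blocks.push_back(b);
    }
    block_tables.erase(guid);
  }

private:
  int block_size;
  std::deque<int> free_blocks;
  std::unordered_map<int, std::vector<int>> block_tables;
};

int main() {
  ToyPageManager pm(/*block_size=*/16, /*num_total_blocks=*/8);
  pm.reserve(/*guid=*/42, /*num_tokens=*/20); // needs two blocks
  int slot = pm.physical_slot(42, 17);        // second block, offset 1
  (void)slot;
  pm.free_request(42);
  return 0;
}
```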
// namespace FlexFlow \ No newline at end of file diff --git a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h index bdf7aae50..3436fc2a6 100644 --- a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h +++ b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h @@ -6,23 +6,41 @@ #include "flexflow/fftype.h" #include "flexflow/op_meta.h" #include "flexflow/parallel_ops/allreduce.h" +#include "flexflow/utils/communication_buffer.h" +#include "flexflow/utils/memory_allocator.h" +#include namespace FlexFlow { class AllReduceMeta : public OpMeta { public: - AllReduceMeta(FFHandler handle, AllReduce const *reduct); + AllReduceMeta(FFHandler handle, + AllReduce const *reduct, + MemoryAllocator &gpu_mem_allocator); + ~AllReduceMeta(void); + +public: + std::unordered_map comm_bufs; + Realm::RegionInstance reserveInst; + void *allgather_src, *allgather_dst; + // reuse for communication buffer + void *barrier_in_ptr, *barrier_out_ptr; + int barrier_ptr_size, barrier_flag; }; namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, +void inference_kernel_wrapper(Legion::Context ctx, + Legion::Runtime *runtime, + AllReduceMeta *m, BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); -void forward_kernel_wrapper(AllReduceMeta const *m, +void forward_kernel_wrapper(Legion::Context ctx, + Legion::Runtime *runtime, + AllReduceMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h old mode 100644 new mode 100755 index a38a3b267..16b41285b --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -18,7 +18,9 @@ #include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/model.h" +#include "flexflow/page_manager.h" #include "flexflow/utils/file_loader.h" +#include #include #include #include @@ -26,7 +28,7 @@ namespace FlexFlow { class FFModel; -class BeamTree; +class TokenTree; class RequestManager; using tokenizers::Tokenizer; @@ -34,7 +36,7 @@ class InferenceManager { public: InferenceManager(); static InferenceManager *get_inference_manager(); - void compile_model_and_allocate_buffer(FFModel *model); + void compile_model_and_allocate_buffer(FFModel *model, bool is_llm = true); void init_operators_inference(FFModel *model); Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc); Legion::FutureMap @@ -57,6 +59,72 @@ class InferenceManager { std::unordered_map model_weights_loaders; }; +class TokenTreeNode { +public: + BatchConfig::TokenId id; + double log_accumulated_prob; + int parent_pos; + bool included = false; + bool gumbel = false; + float gumbel_logit = 0.0f; + + TokenTreeNode(BatchConfig::TokenId id, + double log_accumulated_prob, + int parent_pos, + bool gumbel = false, + float gumbel_logit = 0.0f) + : id(id), log_accumulated_prob(log_accumulated_prob), + parent_pos(parent_pos), gumbel(gumbel), gumbel_logit(gumbel_logit) {} +}; + +bool operator<(std::shared_ptr const &lhs, + std::shared_ptr const &rhs); + +bool operator<=(std::shared_ptr const &lhs, + std::shared_ptr const &rhs); + +// A comparator for std::shared_ptr +// This is used to construct a max heap for the token tree nodes +struct SharedTokenTreeNodePtrLess { + bool operator()(std::shared_ptr const &lhs, + std::shared_ptr const &rhs) const { + if (lhs->gumbel) { + 
assert(rhs->gumbel); + return lhs->gumbel_logit < rhs->gumbel_logit; + } + return lhs->log_accumulated_prob < rhs->log_accumulated_prob; + } +}; + +// A comparator for std::pair, double> +// This is used to construct a max heap for the token tree nodes +struct SharedTokenTreeNodePtrDoubleLess { + bool operator()( + std::pair, double> const &lhs, + std::pair, double> const &rhs) const { + return lhs.second < rhs.second; + } +}; + +class TokenTree { +public: + std::vector>> tree_layers = {}; + void add_layer() { + tree_layers.emplace_back(); + tree_layers.back().reserve(BatchConfig::MAX_TREE_WIDTH); + } + + void clear() { + tree_layers.clear(); + } + + TokenTree() { + tree_layers.reserve(BatchConfig::MAX_TREE_DEPTH + 1); + } +}; + +std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree); + struct Request { enum Status { PENDING = 101, // loading prompt @@ -65,46 +133,180 @@ struct Request { FINISHING = 104, // finishing request, but not yet verified }; BatchConfig::RequestGuid guid; - int max_sequence_length; - int initial_len; + int batch_index = -1; int ssm_cache_size = 0; int llm_cache_size = 0; + double slo_ratio = 1.0; + double decode_latency_ms = 0.0; + int ssm_prefill_len = 0; + int llm_prefill_len = 0; + bool attained = true; + bool add_special_tokens = true; + + int first_token_offset_in_batch = 0; + int num_tokens_in_batch = 0; Status status = PENDING; std::vector tokens; - std::vector beam_trees; -}; - -// store the result of beam search -struct BeamTree { - struct treeLayer { - BeamSearchBatchConfig::TokenId - tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - int parent_ids[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - int nodes_num_this_layer = 0; + // page attention, page_last_committed should be -1 because there are no + // blocks at the beginning + int page_last_committed = -1; + std::vector blocks; + + // TokenTree speculative_token_tree; + std::vector speculative_token_trees; + // To make request manager stateful, we need to store the causal mask here + BatchConfig::BitMask causal_mask; + // Here we maintain a struct CommittedToken which has a field `from_index` and + // `to_index`. The `from_index` is used by the LLM KV cache commitment and the + // `to_index` is used both by the the SSM KV cache recomputation and the LLM + // KV cache commitment. Details are as follows: + // + // 1. Recompute the SSM KV cache: We don't commit the KV cache of the SSM + // committed tokens but recompute them instead. That is, after the we append + // the committed tokens to the generated sequence, just like in the prefilling + // phase, and pass them into the SSM to recompute the KV cache. Here we don't + // need `from_index` because we don't copy the KV cache, but we need + // `to_index`, which is the indices of the committed tokens in the request. + // + // to_index -> BatchConfig::PerTokenInfo.abs_index_in_request + // + // 2. Commit the LLM KV cache: On the GPU, the KV cache of the speculative + // token tree and the generated tokens are stored separately. So the + // `from_index` should be the index of the token in the speculative token + // tree. `to_index` should be the place to put the KV cache in the LLM KV + // cache: prompt_length + generated_sequence_length + + // index_in_committed_tokens. 
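Editor's note: in the convention spelled out above, from_index locates a committed token's KV inside the just-verified speculative batch, while to_index is its final absolute position in the request. A minimal sketch of recording one verification step's accepted tokens under that convention (types and the helper are illustrative, not the actual RequestManager code):

```cpp
#include <utility>
#include <vector>

// Simplified stand-in for BatchConfig::CommittedTokensInfo.
struct CommittedToken {
  int from_index; // offset of the token within the verified batch / token tree
  int to_index;   // absolute index of the token in the request
  int token_id;
};

// `accepted` holds (offset-in-verify-batch, token id) pairs for the accepted
// path, in order; `tokens_so_far` is prompt length + generated length so far.
std::vector<CommittedToken>
record_committed_tokens(std::vector<std::pair<int, int>> const &accepted,
                        int tokens_so_far) {
  std::vector<CommittedToken> committed;
  committed.reserve(accepted.size());
  for (int i = 0; i < (int)accepted.size(); i++) {
    committed.push_back({/*from_index=*/accepted[i].first,
                         /*to_index=*/tokens_so_far + i,
                         /*token_id=*/accepted[i].second});
  }
  return committed;
}
```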
+ // + // from_index -> TreeVerifyBatchConfig::CommittedTokensInfo.index_in_kv_cache + // to_index -> TreeVerifyBatchConfig::CommittedTokensInfo.token_depth + // + // Actually, for a committed token, the `to_index` for the LLM KV cache and + // the SSM KV cache are the same thing, so we can use the same field to store + // the information. + // + // When storing the committed tokens: + // from_index: The offset of the committed token in the request in the + // TreeVerifyBatchConfig + // to_index: The absolute index of the token in the request + + struct CommittedToken { + int from_index; + int to_index; + BatchConfig::TokenId token_id; + CommittedToken(int from_index, int to_index, BatchConfig::TokenId token_id) + : from_index(from_index), to_index(to_index), token_id(token_id) {} }; - treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1]; + std::vector committed_tokens; + + // Enabling Streaming KVCache means we doesn't store the whole KV sequence of + // the tokens in a request. Instead, we only store the sink cache (a few + // foremost tokens) and the window cache (rolling-updated backmost tokens + // through decoding). Currently, we only use streaming cache in the *draft + // model* calculation. + // - Maintain the streaming cache: During inference, we + // first fill up the sink cache then the window cache. After the window cache + // is full, we move back to the beginning of the window cache and commit the + // tokens in replace there. + // - When to update the streaming cache: + // 1. Prefilling phase + // 2. Committing phase after the target model verification + StreamingCacheInfo streaming_cache_info; + + std::priority_queue< + std::pair, double>, + std::vector, double>>, + SharedTokenTreeNodePtrDoubleLess> + token_tree_nodes_acc_prob_pair_pq; + + double get_length_weight(); + void set_slo_ratio(double slo_ratio_); + double get_slo_ratio(); + int decode_length() const; + + Request() { + std::vector, double>> + _prealloc_vector; + _prealloc_vector.reserve(BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); + token_tree_nodes_acc_prob_pair_pq = std::priority_queue< + std::pair, double>, + std::vector, double>>, + SharedTokenTreeNodePtrDoubleLess>(SharedTokenTreeNodePtrDoubleLess(), + std::move(_prealloc_vector)); + } }; -// struct BeamTree_v2 { -// std::vector tokens; -// std::vector parent_ids; -// std::vector probs; -// }; +struct NewProfileInfo { + long long timestamp; + BatchConfig::RequestGuid request_guid; + int request_step_idx; + int num_speculated_tokens; + int num_accepted_tokens; + double speculation_score; + int num_generated_tokens; + long long speculation_start_timestamp; + long long speculation_end_timestamp; +}; +struct RequestProfileInfo { + int llm_prefilling_steps = 0; + int ssm_prefilling_steps = 0; + int llm_decoding_steps = 0; + int ssm_decoding_steps = 0; + long long start_time = 0, start_decoding_time = 0, finish_time = 0; + long long speculation_start_timestamp; + long long speculation_end_timestamp; + std::vector speculated_size_per_step; + std::vector accepted_tokens_per_step; + std::vector generated_tokens_per_step__; +}; +struct ProfileInfo { + // For SpecInfer: One step is comprised of one ssm speculation phase + a + // single llm verification phase (forward pass + verification) For Incr + // Decoding: One step is one LLM decoding phase + long long llm_step_start = 0, ssm_step_start = 0; + // Times for each LLM verification phase (in ms) + std::vector llm_step_times; + // Number of requests in batch at each step + std::vector requests_per_step; + // Times 
for each SSM speculation phase (in ms) + std::vector ssm_step_times; + // Number of requests getting decoded at each step + std::vector ssm_steps; + std::vector tree_operation_step_times; + // Number of generated tokens at each step + std::vector generated_tokens_per_step; + // To calculate the E2E time of serving + long long server_start_time = 0; + long long server_end_time = 0; +}; class RequestManager { public: - enum Status { - INITIALIZED = 1001, - SERVING = 1002, - TERMINATED = 1003, + enum State { + PREFILLING = 1001, + DECODING = 1002, + SSM_SPEC = 1003, + LLM_VERIFY = 1004, + }; + enum BackgroundServerStatus { + INITIALIZED = 2001, + SERVING = 2002, + TERMINATED = 2003, + }; + enum DecodingMode { + INCREMENTAL_DECODING = 3001, + SPECULATIVE_DECODING = 3002, }; + enum PrefillModel { + LLM = 4001, + SSM = 4002, + }; + using RequestGuid = BatchConfig::RequestGuid; using TokenId = BatchConfig::TokenId; - static const RequestGuid INVALID_GUID = 0; + inline static RequestGuid const INVALID_GUID = 0; RequestManager(); static RequestManager *get_request_manager(); size_t get_num_processed_requests(); @@ -114,104 +316,81 @@ class RequestManager { int get_max_requests_per_batch(); void set_max_tokens_per_batch(int max_num_tokens); int get_max_tokens_per_batch(); - void set_max_spec_tree_token_num(int max_num_tokens); + void set_max_tokens_per_ssm_batch(int max_num_ssm_tokens); + int get_max_tokens_per_ssm_batch(); + void set_max_tokens_per_prefilling_batch(int max_num_prefilling_tokens); + int get_max_tokens_per_prefilling_batch(); int get_max_spec_tree_token_num(); - int get_max_verify_tokens_per_batch(); void set_max_sequence_length(int max_seq_length); - void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); + void set_max_kv_cache_size(size_t max_kv_cache_size); + size_t get_max_kv_cache_size(); + void set_max_output_length(int max_output_length); + int get_max_output_length(); + void set_decoding_mode(DecodingMode mode); + void set_verbose(bool verbose_); + int get_k(); + void set_k(int k); + int get_max_tree_depth(); + void set_max_tree_depth(int max_tree_depth); + int get_max_tree_width(); + void set_max_tree_width(int max_tree_width); + int get_expansion_degree(); + void set_expansion_degree(int expansion_degree_); + void set_speculative_sampling(bool speculative_sampling); + void set_baseline_latency(double baseline_latency_ms); + double get_baseline_latency(); + void set_ssm_spec_latency(double ssm_spec_latency_ms); + double get_ssm_spec_latency(); + void set_llm_verify_latency(double llm_verify_latency_ms); + double get_llm_verify_latency(); + void set_correction_factor(double correction_factor); + double get_correction_factor(); + void set_streaming_cache(bool streaming_cache); + bool get_streaming_cache(); + bool get_memory_occupancy(); + void set_memory_occupancy(bool memory_occupancy); + void + set_slo_violation_early_termination(bool slo_violation_early_termination); + void set_spec_infer_old_version(bool spec_infer_old_version); + void set_greedy_schedule(bool greedy_schedule); + void set_equal_schedule(bool equal_schedule); + void set_fcfs_slo(bool fcfs_slo); + void set_stta(bool stta); + bool get_spec_infer_old_version(); + bool get_greedy_schedule(); + bool get_equal_schedule(); + bool get_fcfs_slo(); + bool get_stta(); + inline double get_slo_constraint(Request &request); + void set_eval_overhead_breakdown(bool eval_overhead_breakdown); + bool get_eval_overhead_breakdown(); + double get_request_expected_latency(Request &request); + Request 
&get_request_with_guid(RequestGuid guid); int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, int bos_token_id, - int eos_token_id, + std::vector eos_token_ids, std::string const &path); + std::vector tokenize(std::string const &text); void register_output_filepath(std::string const &); - void initBitMask(BatchConfig::BitMask &bitmask, int initLength); - void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength); - void appendBitMask(BatchConfig::BitMask &bitmask, - int newNodes, - int preBeamSize, - int old_sub_num, - BeamTree const tree, - int currentDepth); - void updateBitMask(BatchConfig::BitMask &bitmask, - int initLength, - int non_tree_size); FFModel *get_ssm_model(int model_id); - void serve_incr_decoding(FFModel *model); void serve_spec_infer(FFModel *model); + void serve_spec_infer_sync(FFModel *model); + void serve_decoding(FFModel *model); GenerationResult get_generation_result(RequestGuid const &guid); - RequestGuid register_new_request(std::string const &prompt, - int max_sequence_length); - RequestGuid register_new_request(std::vector const &prompt, - int max_sequence_length); + RequestGuid register_new_request(GenerationRequest const &req); // Methods to start and terminate request manager's background task void start_background_server(FFModel *model); + bool is_background_server_serving(); bool is_background_server_terminated(); void terminate_background_server(); static void terminate_background_server_at_exit(); // Methods to check and mark request completion - bool is_request_completed(RequestGuid const &guid); void trigger_request_completion_future(RequestGuid const &guid); - // Methods for preparing next batches - BatchConfig prepare_next_batch(BatchConfig const &bc, - InferenceResult const &result); - BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc, - InferenceResultFuture const &result, - Legion::Context ctx, - Legion::Runtime *runtime); - BeamSearchBatchConfig - prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, - BeamInferenceResult const &result); - BeamSearchBatchConfigFuture - prepare_next_batch_beam(BeamSearchBatchConfigFuture const &old_bc, - BeamInferenceResultFuture const &result, - Legion::Context ctx, - Legion::Runtime *runtime); - BeamSearchBatchConfig - prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, - InferenceResult const &result, - int model_id); - BeamSearchBatchConfigFuture - prepare_next_batch_init(TreeVerifyBatchConfigFuture const &old_bc, - InferenceResultFuture const &result, - int model_id, - Legion::Context ctx, - Legion::Runtime *runtime); - TreeVerifyBatchConfig prepare_next_batch_verify( - std::vector const &old_batches); - TreeVerifyBatchConfigFuture prepare_next_batch_verify( - std::vector const &old_batches, - Legion::Context ctx, - Legion::Runtime *runtime); - - void store_beam_metadata(BeamSearchBatchConfig const &old_bc, - BeamInferenceResult const &result); - void update_beam_metadata(BeamSearchBatchConfig &new_bc, - BeamSearchBatchConfig const &old_bc, - BeamTree &tree, - int request_index); - - std::vector> - traverse_beam_tree(BeamSearchBatchConfig const &old_bc, - int request_index, - int first_token_depth_in_request); - - // remove guid after put the cached tree in request - std::vector> merge_dfs_trees( - std::vector>> - input_trees, - int root_depth, - RequestGuid guid); - - std::vector> traverse_verify_tree( - size_t guid, - std::vector> const - &inputSerializedTree, - std::vector> const - &outputSerializedTree); + bool 
is_eos_token(TokenId token_id); static void background_serving_task( Legion::Task const *task, std::vector const ®ions, @@ -233,80 +412,218 @@ class RequestManager { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static BatchConfig prepare_next_batch_task( - Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - - static BeamSearchBatchConfig prepare_next_batch_beam_task( - Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - - static BeamSearchBatchConfig prepare_next_batch_init_task( - Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - - static TreeVerifyBatchConfig prepare_next_batch_verify_task( + // API for rm state machine + BatchConfigFuture get_next_batch_config(InferenceResultFuture const &result, + Legion::Context ctx, + Legion::Runtime *runtime); + static BatchConfig get_next_batch_config_task( Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + BatchConfig get_next_batch_config(InferenceResult const &result); + void update_inference_results(InferenceResult const &result); + BatchConfig prepare_next_batch(); + + int get_num_active_requests(); + int get_empty_request_index(); + + std::unordered_map get_requests_profiling(); + std::unordered_map + get_request_generation_results(); + ProfileInfo get_profiling_info(); + std::vector get_new_profiling_info(); + + // Comparters + struct SharedTokenTreeNodePtrRequestGuidWeightedLess { + bool operator()( + std::pair, RequestGuid> const &lhs, + std::pair, RequestGuid> const &rhs) + const; + }; + struct SharedTokenTreeNodePtrDoubleRequestGuidLess { + bool operator()( + std::tuple, double, RequestGuid> const + &lhs, + std::tuple, double, RequestGuid> const + &rhs) const; + }; private: // configuration parameters int max_requests_per_batch; int max_tokens_per_batch; + int max_tokens_per_ssm_batch; + int max_tokens_per_prefilling_batch; int max_spec_tree_token_num; int max_sequence_length; - Status request_manager_status; + int max_output_length; + size_t max_kv_cache_size; + int max_tree_depth; + int max_tree_width; + int k; + int expansion_degree = 3; + // Profile based latency + double baseline_latency_ms = 43; + double ssm_spec_latency_ms = 17; + double llm_verify_latency_ms = 65; + double correction_factor = 1.05; + + State request_manager_status; + BackgroundServerStatus background_server_status; + DecodingMode decoding_mode; + PrefillModel prefill_model; + bool speculative_sampling = false; + // specify if enable streaming cache for incremental decoding or draft model + bool streaming_cache = false; + bool memory_occupancy = false; + bool slo_violation_early_termination = false; + bool spec_infer_old_version = false; + bool greedy_schedule = false; + bool equal_schedule = false; + bool fcfs_slo = false; + bool stta = false; // The smallest time to attain policy + bool eval_overhead_breakdown = false; // for evaluation purpose + double eval_ssm_prefill_latency_us = 0.0; + double eval_llm_prefill_latency_us = 0.0; + double eval_ssm_spec_latency_us = 0.0; + double eval_llm_verify_latency_us = 0.0; + double eval_process_latency_us = 0.0; + double eval_schedule_latency_us = 0.0; + double eval_other_latency_us = 0.0; // load pending request, request complete - // tree width in each speculative step, if not specified 1 - std::vector spec_infer_tree_width; - - // private fields std::unique_ptr tokenizer_; 
bool verbose; ModelType model_type; int bos_token_id; - int eos_token_id; - std::string output_filepath; + std::vector eos_token_ids; + bool old_llama_tokenizer = false; + std::string output_filepath, csv_filepath; std::queue pending_request_queue; std::unordered_map all_requests; std::unordered_map request_generation_results; std::mutex request_queue_mutex; + std::condition_variable request_queue_cv; + std::mutex request_result_mutex; std::unordered_map *> request_to_promise; std::mutex request_to_promise_mutex; RequestGuid next_available_guid; - - // TODO: Move this two vector to request struct - std::unordered_map>> - dfs_tree_inputs; - std::unordered_map>> - committed_tokens; + std::queue prefilled_requests; + std::vector prefilling_requests; + + // Added to make the request manager stateful. During the processing of the + // first small model inference results, the step equals to 1. That is, every + // time a small model inference task is launched, the step is increased + // by 1. + int current_ssm_step = 0; + // Maps the index of the request in the batch config to the request guid. + // Note that we may have some prefilled requests not in the batch config, + // but should be re-considered in the decoding phase. + int guid_of_requests[BatchConfig::MAX_NUM_REQUESTS]; + int num_running_requests = 0; + // Available requests in the batch config + bool request_available[BatchConfig::MAX_NUM_REQUESTS]; + int num_available_requests = 0; + int ssm_completed = true; + int ssm_tree_depth = 0; // Multi-model support std::vector ssm_models; - // Performance profiling - size_t num_processed_requests; - // Background server handler Legion::Future background_server_handler; -private: - struct ProfileInfo { - int llm_decoding_steps; - int ssm_decoding_steps; - double start_time, finish_time; - }; - std::unordered_map profiling_requests; + // Performance profiling + // TODO: maintain this field + size_t num_processed_requests; + + ProfileInfo profiling; + std::unordered_map profiling_requests; + std::vector new_profiling_info; double total_request_run_time; + bool load_pending_request_to_batch(); + void request_update_attainment(int index, bool attained); + void request_complete_clean_up(int batch_index); + void request_offload_from_batch(int batch_index); + void request_load_onto_batch(int batch_index); + /* ---------- Incremental Decoding Helper Functions ---------- */ + bool update_llm_prefill_results(InferenceResult const &result); + bool update_llm_decode_results(InferenceResult const &result); + BatchConfig prepare_llm_prefilling_batch(); + BatchConfig prepare_decoding_batch(); + BatchConfig prepare_decoding_batch_fcfs_slo(); + BatchConfig prepare_decoding_batch_stta(); + /* ---------- Incremental Decoding Helper Functions ---------- */ + + /* ---------- Spec Decoding Helper Functions ---------- */ + BatchConfig prepare_ssm_prefilling_batch(); + bool update_llm_verify_results(InferenceResult const &llm_verify_result); + bool + update_ssm_inference_results(InferenceResult const &ssm_inference_result); + void update_ssm_prefill_results(InferenceResult const &ssm_prefill_result); + // Prepare the next speculation batch config. This function is called before + // the second step of the speculation. + BatchConfig prepare_next_spec_batch_config(); + // Prepare the first speculation batch config. This function is called before + // the first step of the speculation. 
The difference with + // prepare_next_batch_config_spec is that we put the info of the committed + // tokens into the batch config in the first speculation step to commit the KV + // cache of the small model. + BatchConfig prepare_first_spec_batch_config(); + BatchConfig prepare_verify_batch_config(); + + // LLM result verification + void get_verify_results_greedy(InferenceResult const &llm_verify_result); + void get_verify_results_sample(InferenceResult const &llm_verify_result); + + // Bitmask related + void init_bitmask_prompt(RequestGuid guid, int prompt_length); + void append_bitmask(RequestGuid guid); + void update_bitmask_prompt(RequestGuid guid, int num_committed_tokens); + void init_bitmask_spec(RequestGuid guid); + BatchConfig::BitMask create_llm_bitmask(RequestGuid guid); + + // Page Attention related + int get_num_blocks_allocated(Request &request) const; + int get_len_last_block(Request &request) const; + int get_idx_last_logical_token(Request &request) const; + int idx_logical_to_physical(Request &request, int idx_logical); + void _append_block_to_request(Request &request, bool is_commit); + int append_token_to_block(Request &request, TokenId token, bool is_commit); + void reset_block_table(Request &request); + void print_num_tokens(Request &request); + + // Token tree related + void init_token_tree(RequestGuid guid); + void add_root_to_spec_token_tree(RequestGuid guid, + BatchConfig::TokenId token_id); + void add_tokens_to_spec_token_tree( + InferenceResult const &ssm_inference_result); + void add_tokens_to_spec_token_tree_old_version( + InferenceResult const &ssm_inference_result); + void prune_token_tree(); + void prune_token_tree_equal(); + void prune_token_tree_greedy(); + void add_tokens_toward_slo(RequestGuid guid, + int &budget, + int num_req_with_slo); + void add_tokens_toward_memory_occupancy(int budget); + void add_tokens_toward_goodput(int budget); + void add_tokens_toward_goodput_per_request(int budget, int request_index); + void update_token_tree_depth(); + + /* ---------- Spec Decoding Helper Functions ---------- */ + void renormalize(std::vector> &D, + std::unordered_map &R, + TokenId token_id); + std::tuple + reject_sampling(std::vector> &D, + std::unordered_map &R, + int k); + void gumbel_conditioned_on_max(double target_max, + std::vector> &logits); + + // Profiling related functions + void reset_profiling_statistics(); }; - }; // namespace FlexFlow diff --git a/include/flexflow/substitution_loader.h b/include/flexflow/substitution_loader.h index e0c252ffd..e7367c5bb 100644 --- a/include/flexflow/substitution_loader.h +++ b/include/flexflow/substitution_loader.h @@ -103,6 +103,7 @@ NLOHMANN_JSON_SERIALIZE_ENUM( {OP_SHAPE, "OP_SHAPE"}, {OP_SIZE, "OP_SIZE"}, {OP_TOPK, "OP_TOPK"}, + {OP_GUMBEL_TOPK, "OP_GUMBEL_TOPK"}, {OP_WHERE, "OP_WHERE"}, {OP_CEIL, "OP_CEIL"}, {OP_CAST, "OP_CAST"}, diff --git a/include/flexflow/utils/communication_buffer.h b/include/flexflow/utils/communication_buffer.h new file mode 100644 index 000000000..016860bf6 --- /dev/null +++ b/include/flexflow/utils/communication_buffer.h @@ -0,0 +1,78 @@ +/* Copyright 2023 CMU, Stanford, Facebook, LANL + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _COMMUNICATION_BUFFER_H
+#define _COMMUNICATION_BUFFER_H
+
+#include "legion.h"
+#include
+#ifdef FF_USE_NCCL
+#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
+#include
+#else
+#include
+#endif
+#endif
+
+// adapted from https://github.com/mlc-ai/relax
+
+// The CUDA interdevice communication memory object,
+// which internally contains data pointers to other devices' peer memory.
+// It is useful for an efficient all-reduce implementation.
+// Right now the class members are closely tied to the customized
+// all-reduce kernel. They may also be extended for other uses in
+// the future.
+class CommunicationBuffer {
+public:
+  // The device information for the CUDA CommunicationBuffer.
+  int num_devices;
+  int device_id;
+  void *local_ptr;
+
+  // The data pointers of all all-reduce inputs.
+  // It has "num_devices" pointers. The i-th pointer is the data pointer on
+  // worker i. If "i != device_id", the pointer is a peer data pointer of
+  // another device. Otherwise, the pointer is a local CUDA data pointer.
+  std::vector comm_ptrs;
+
+  // The barrier helper data per CommunicationBuffer, which can be used
+  // by custom collective operations and allows fine-grained synchronization on
+  // each buffer. They have "num_devices" pointers, and the pointer arrangement
+  // is the same as "comm_ptrs".
+  std::vector barrier_in;
+  std::vector barrier_out;
+
+  // The integer buffer flag for all-reduce.
+  // It self-increments by 1 after each all-reduce operation.
+  int *barrier_flag;
+};
+
+// All NCCL operations need to be wrapped by Legion concurrent_task_barrier.
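As a point of reference for how comm_ptrs could be filled in, the following is a minimal single-process sketch, assuming the NCCL rank equals device_id and that peer access between the GPUs is already enabled. The helper name allgather_local_ptrs is hypothetical; it is not FlexFlow's create_comm_buf_with_local_ptr, which additionally exchanges the barrier pointers and reuses the caller-provided allgather buffers.

#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

// Hypothetical helper (illustrative only): each rank contributes the raw
// address of its local buffer, ncclAllGather replicates the pointer values
// across all ranks, and the gathered values are copied back to the host to
// populate comm_ptrs.
std::vector<void *> allgather_local_ptrs(int num_devices,
                                         int device_id,
                                         void *local_ptr,
                                         ncclComm_t nccl_comm,
                                         cudaStream_t stream) {
  void **d_send = nullptr; // staging buffer for this rank's pointer value
  void **d_recv = nullptr; // receives one pointer value per rank
  cudaMalloc(&d_send, sizeof(void *));
  cudaMalloc(&d_recv, num_devices * sizeof(void *));
  cudaMemcpyAsync(
      d_send, &local_ptr, sizeof(void *), cudaMemcpyHostToDevice, stream);
  // Gather sizeof(void *) bytes from every rank into d_recv.
  ncclAllGather(d_send, d_recv, sizeof(void *), ncclChar, nccl_comm, stream);
  std::vector<void *> comm_ptrs(num_devices);
  cudaMemcpyAsync(comm_ptrs.data(),
                  d_recv,
                  num_devices * sizeof(void *),
                  cudaMemcpyDeviceToHost,
                  stream);
  cudaStreamSynchronize(stream);
  comm_ptrs[device_id] = local_ptr; // the local slot is just our own pointer
  cudaFree(d_send);
  cudaFree(d_recv);
  return comm_ptrs;
}

The barrier_in and barrier_out pointer sets could be exchanged the same way, with one all-gather per set.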
+CommunicationBuffer *create_comm_buf_with_local_ptr(Legion::Context ctx, + Legion::Runtime *runtime, + int num_devices, + int device_id, + ncclComm_t ncclComm, + void *allgather_src, + void *allgather_dst, + void *local_ptr, + void *barrier_in_ptr, + void *barrier_out_ptr, + int *barrier_flag, + cudaStream_t stream); + +void release_comm_buf(CommunicationBuffer *comm_buf); + +#endif // _COMMUNICATION_BUFFER_H diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index f8bf67b3e..f5ea76c5b 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -3,6 +3,7 @@ #include "flexflow/accessor.h" #include "flexflow/ffconst.h" #include "legion.h" +#include #include #include #ifdef FF_USE_NCCL diff --git a/include/flexflow/utils/file_loader.h b/include/flexflow/utils/file_loader.h index 646eb18da..4ccc6db48 100644 --- a/include/flexflow/utils/file_loader.h +++ b/include/flexflow/utils/file_loader.h @@ -21,6 +21,7 @@ using namespace std; using namespace FlexFlow; +using namespace Legion; class FileDataLoader { public: @@ -29,17 +30,38 @@ class FileDataLoader { int _num_heads, int _num_kv_heads, size_t _hidden_dim, - size_t _qkv_inner_dim, + size_t _head_dim, int _tensor_parallelism_degree, bool _use_full_precision); BatchConfig::TokenId *generate_requests(int num, int length); template - void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx); + void load_single_weight_tensor(FFModel *ff, + Layer *l, + int weight_idx, + size_t volume, + size_t num_replicas, + DT *weight, + Domain weight_domain); - void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx); - void load_weights(FFModel *ff); + void load_quantization_weight(FFModel *ff, + Layer *l, + int weight_idx, + size_t volume, + size_t num_replicas, + char *weight, + DataType data_type, + Domain weight_domain); + + static void + load_weight_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + void load_weights_parallel(FFModel *ff, + Legion::Context ctx, + Legion::Runtime *runtime); void load_positions(FFModel *ff, Tensor pt, @@ -49,8 +71,26 @@ class FileDataLoader { private: int num_heads, num_kv_heads, tensor_parallelism_degree; - size_t hidden_dim, qkv_inner_dim; + size_t hidden_dim, head_dim; std::string prompts_filepath; std::string weights_folder; bool use_full_precision; }; + +struct WeightLoadTaskArgs { + FFModel *ff; + FileDataLoader *loader; + Layer *layer; + int weight_idx; + size_t volume, num_replicas; + DataType data_type; + WeightLoadTaskArgs(FFModel *_ff, + FileDataLoader *_loader, + Layer *_l, + int _idx, + size_t _volume, + size_t _num_replicas, + DataType _data_type) + : ff(_ff), loader(_loader), layer(_l), weight_idx(_idx), volume(_volume), + num_replicas(_num_replicas), data_type(_data_type) {} +}; diff --git a/include/flexflow/utils/memory_allocator.h b/include/flexflow/utils/memory_allocator.h index 8e50a4c3b..af3327b04 100644 --- a/include/flexflow/utils/memory_allocator.h +++ b/include/flexflow/utils/memory_allocator.h @@ -23,7 +23,9 @@ namespace FlexFlow { class MemoryAllocator { public: MemoryAllocator(Legion::Memory memory); - void create_legion_instance(Realm::RegionInstance &inst, size_t size); + void create_legion_instance(Realm::RegionInstance &inst, + size_t size, + char const *task_name = NULL); void register_reserved_work_space(void *base, size_t size); inline void *allocate_reserved_untyped(size_t datalen) { void *ptr = static_cast(reserved_ptr) + 
reserved_allocated_size; @@ -60,6 +62,7 @@ class MemoryAllocator { void *instance_ptr; size_t reserved_total_size, reserved_allocated_size; size_t instance_total_size, instance_allocated_size; + bool log_instance_creation; }; }; // namespace FlexFlow diff --git a/include/flexflow/utils/recursive_logger.h b/include/flexflow/utils/recursive_logger.h index 2c43b4230..d073f58f3 100644 --- a/include/flexflow/utils/recursive_logger.h +++ b/include/flexflow/utils/recursive_logger.h @@ -26,7 +26,7 @@ class DepthTag { class RecursiveLogger { public: - /* RecursiveLogger(LegionRuntime::Logger::Category const &); */ + /* RecursiveLogger(Legion::Logger const &); */ RecursiveLogger(std::string const &category_name); Realm::LoggerMessage info(); @@ -42,7 +42,7 @@ class RecursiveLogger { void print_prefix(Realm::LoggerMessage &) const; - LegionRuntime::Logger::Category logger; + Legion::Logger logger; }; }; // namespace FlexFlow diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index aae7256ff..a34d27e9a 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -20,19 +20,19 @@ #include "models/mpt.h" #include "models/opt.h" #include "models/starcoder.h" +#include #include -#include - using namespace FlexFlow; using namespace Legion; using json = nlohmann::json; -LegionRuntime::Logger::Category log_app("llama"); +Legion::Logger log_app("llama"); struct FilePaths { std::string cache_folder_path; std::string prompt_file_path; + std::string trace_file_path; std::string output_file_path; }; @@ -47,7 +47,24 @@ void parse_input_args(char **argv, float &topp, int &max_requests_per_batch, int &max_tokens_per_batch, - int &max_sequence_length) { + int &max_tokens_per_ssm_batch, + int &max_tokens_per_prefilling_batch, + int &max_sequence_length, + int &max_output_length, + size_t &max_kv_cache_size, + int &sampling_seed, + bool &streaming_cache, + bool &slo_attainment_early_termination, + double &baseline_latency_ms, + double &ssm_spec_latency_ms, + double &llm_verify_latency_ms, + double &slo_filter, + int &replica, + double &request_per_second, + std::string &emission_file_path, + bool &add_special_tokens, + bool &fcfs_slo, + bool &stta) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -67,6 +84,11 @@ void parse_input_args(char **argv, paths.prompt_file_path = std::string(argv[++i]); continue; } + // traces + if (!strcmp(argv[i], "-trace")) { + paths.trace_file_path = std::string(argv[++i]); + continue; + } // output file if (!strcmp(argv[i], "-output-file")) { paths.output_file_path = std::string(argv[++i]); @@ -101,10 +123,78 @@ void parse_input_args(char **argv, max_tokens_per_batch = std::stoi(argv[++i]); continue; } + if (!strcmp(argv[i], "--max-tokens-per-ssm-batch")) { + max_tokens_per_ssm_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-prefilling-batch")) { + max_tokens_per_prefilling_batch = std::stoi(argv[++i]); + continue; + } if (!strcmp(argv[i], "--max-sequence-length")) { max_sequence_length = std::stoi(argv[++i]); continue; } + if (!strcmp(argv[i], "--max-output-length")) { + max_output_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-kv-cache-size")) { + max_kv_cache_size = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--sampling-seed")) { + sampling_seed = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--enable-streaming-cache")) { + streaming_cache = true; + 
continue; + } + if (!strcmp(argv[i], "--slo-attainment-early-termination")) { + slo_attainment_early_termination = true; + continue; + } + if (!strcmp(argv[i], "--baseline-latency-ms")) { + baseline_latency_ms = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--ssm-spec-latency-ms")) { + ssm_spec_latency_ms = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--llm-verify-latency-ms")) { + llm_verify_latency_ms = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--eval-slo-filter")) { + slo_filter = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--eval-replica")) { + replica = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--request-per-second")) { + request_per_second = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--emission-file-path")) { + emission_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--no-special-tokens")) { + add_special_tokens = false; + continue; + } + if (!strcmp(argv[i], "--fcfs-serving")) { + fcfs_slo = true; + continue; + } + if (!strcmp(argv[i], "--stta-serving")) { + stta = true; + continue; + } } if (paths.cache_folder_path.empty()) { char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); @@ -131,11 +221,30 @@ void FlexFlow::top_level_task(Task const *task, bool use_full_precision = false; bool verbose = false; bool do_sample = false; - float temperature = 0.0f; - float topp = 0.0f; - int max_requests_per_batch = 8; + float temperature = 0.8f; + float topp = 0.6f; + int max_requests_per_batch = 1; int max_tokens_per_batch = 128; + int max_tokens_per_ssm_batch = -1; + int max_tokens_per_prefilling_batch = -1; int max_sequence_length = 256; + int max_output_length = 512; + size_t max_kv_cache_size = 0; // if -1, then use the default value + RequestManager::DecodingMode decoding_mode = + RequestManager::INCREMENTAL_DECODING; + int sampling_seed = 0; + bool streaming_cache = false; + bool slo_attainment_early_termination = false; + double baseline_latency_ms = 50; + double ssm_spec_latency_ms = 20; + double llm_verify_latency_ms = 50; + double slo_filter = 0.0; + int replica = 1; + double request_per_second = 1.0; + bool add_special_tokens = true; + bool fcfs_slo = false; + bool stta = false; + std::string emission_file_path; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -151,7 +260,33 @@ void FlexFlow::top_level_task(Task const *task, topp, max_requests_per_batch, max_tokens_per_batch, - max_sequence_length); + max_tokens_per_ssm_batch, + max_tokens_per_prefilling_batch, + max_sequence_length, + max_output_length, + max_kv_cache_size, + sampling_seed, + streaming_cache, + slo_attainment_early_termination, + baseline_latency_ms, + ssm_spec_latency_ms, + llm_verify_latency_ms, + slo_filter, + replica, + request_per_second, + emission_file_path, + add_special_tokens, + fcfs_slo, + stta); + if (max_tokens_per_ssm_batch == -1) { + max_tokens_per_ssm_batch = max_tokens_per_batch; + } + if (max_tokens_per_prefilling_batch == -1) { + max_tokens_per_prefilling_batch = max_tokens_per_batch; + } + if (slo_filter == 0.0) { + replica = 1; + } assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == @@ -179,7 +314,8 @@ void FlexFlow::top_level_task(Task const *task, ModelType model_type = ModelType::UNKNOWN; auto architectures = model_config["architectures"]; for (auto const &str : architectures) { - if (str == "LlamaForCausalLM" || str == 
"LLaMAForCausalLM") { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" || + str == "MistralForCausalLM") { model_type = ModelType::LLAMA; break; } else if (str == "OPTForCausalLM") { @@ -199,20 +335,48 @@ void FlexFlow::top_level_task(Task const *task, int bos_token_id = model_config.find("bos_token_id") == model_config.end() ? -1 : (int)model_config.at("bos_token_id"); - int eos_token_id = model_config.find("eos_token_id") == model_config.end() - ? -1 - : (int)model_config.at("eos_token_id"); + // int eos_token_id = model_config.find("eos_token_id") == model_config.end() + // ? -1 + // : (int)model_config.at("eos_token_id"); + std::vector eos_token_ids; + if (model_config.find("eos_token_id") != model_config.end()) { + if (model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : model_config["eos_token_id"]) { + eos_token_ids.push_back(eos_token_id); + } + } else { + eos_token_ids.push_back(model_config["eos_token_id"]); + } + } else { + eos_token_ids.push_back(-1); + } assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); + srand(sampling_seed); GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); rm->set_max_requests_per_batch(max_requests_per_batch); rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_tokens_per_ssm_batch(max_tokens_per_ssm_batch); + rm->set_max_tokens_per_prefilling_batch(max_tokens_per_prefilling_batch); rm->set_max_sequence_length(max_sequence_length); + rm->set_max_output_length(max_output_length); + rm->set_max_kv_cache_size(max_kv_cache_size); + rm->set_decoding_mode(decoding_mode); + rm->set_slo_violation_early_termination(slo_attainment_early_termination); + rm->set_baseline_latency(baseline_latency_ms); + rm->set_ssm_spec_latency(ssm_spec_latency_ms); + rm->set_llm_verify_latency(llm_verify_latency_ms); + rm->set_max_tree_depth(8); + rm->set_max_tree_width(16); + rm->set_verbose(verbose); + rm->set_streaming_cache(streaming_cache); + rm->set_fcfs_slo(fcfs_slo); + rm->set_stta(stta); rm->register_tokenizer( - model_type, bos_token_id, eos_token_id, tokenizer_filepath); + model_type, bos_token_id, eos_token_ids, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); FFModel model(ffconfig, ffconfig.cpu_offload); @@ -222,6 +386,7 @@ void FlexFlow::top_level_task(Task const *task, weights_filepath, INC_DECODING_MODE, generationConfig, + streaming_cache, use_full_precision); } else if (model_type == ModelType::OPT) { OPT::create_opt_model(model, @@ -255,24 +420,84 @@ void FlexFlow::top_level_task(Task const *task, rm->start_background_server(&model); - int total_num_requests = 0; { - using json = nlohmann::json; - std::ifstream file_handle(file_paths.prompt_file_path); - assert(file_handle.good() && "Prompt file does not exist."); - json prompt_json = json::parse(file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - std::vector prompts; - for (auto &prompt : prompt_json) { - std::string text = prompt.get(); - printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - total_num_requests++; - prompts.push_back(text); - } - std::vector result = - model.generate(prompts, 128 /*max_sequence_length*/); + std::vector requests; + std::vector results; + + if (!file_paths.prompt_file_path.empty()) { + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json 
prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + // Parse slo_ratios + std::vector> slo_ratios; + if (prompt_json[0].contains("slo_ratios")) { + for (auto &[key, value] : prompt_json[0]["slo_ratios"].items()) { + slo_ratios.emplace_back(std::stod(key), value.get()); + } + } + double total = std::accumulate( + slo_ratios.begin(), + slo_ratios.end(), + 0.0, + [](double sum, std::pair const &pair) { + return sum + pair.second; + }); + if (std::abs(total - 1.0) > 1e-6) { + std::cerr << "Error: slo_ratios values do not sum to 1. Total sum: " + << total << std::endl; + assert(false); + } + for (size_t i = 1; i < prompt_json.size(); ++i) { + requests.push_back( + GenerationRequest(prompt_json[i]["prompt"].get(), + -1.0, + 0, + add_special_tokens)); + } + PoissonEmissionMachine emission_machine(request_per_second, slo_ratios); + // ConstantEmissionMachine emission_machine(-1, slo_ratios); + results = model.generate(requests, emission_machine); + } else if (!file_paths.trace_file_path.empty()) { + std::ifstream file_handle(file_paths.trace_file_path); + assert(file_handle.good() && "Trace file does not exist."); + json trace_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + std::vector timestamps, ratios; + for (auto const &json_obj : trace_json) { + EmissionTrace trace(json_obj); + if (slo_filter != 0.0 && + std::fabs(trace.slo_ratio - slo_filter) > 1e-6) { + continue; + } + for (size_t i = 0; i < replica; ++i) { + requests.push_back( + GenerationRequest(trace.prompt, -1.0, 0, add_special_tokens)); + timestamps.push_back(trace.emission_time_ms); + ratios.push_back(trace.slo_ratio); + } + } + TraceEmissionMachine emission_machine(timestamps, ratios); + results = model.generate(requests, emission_machine); + } else { + assert(false && "No prompt or trace file provided."); + } + + // output generation results as json + if (!emission_file_path.empty()) { + json output_json; + for (size_t i = 0; i < results.size(); ++i) { + EmissionTrace trace(results[i]); + output_json.push_back(trace.to_json()); + } + std::ofstream emission_file_handle(emission_file_path); + emission_file_handle << output_json.dump(2) << std::endl; + } } // terminate the request manager by stopping the background thread diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index a529411dd..9049b3885 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -40,9 +40,10 @@ void FALCON::create_falcon_model(FFModel &ff, { // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); int const token_dims[] = { - (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), + std::max(mode == TREE_SEARCH_MODE + ? 
BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()), 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -62,6 +63,11 @@ void FALCON::create_falcon_model(FFModel &ff, Tensor mha = nullptr, mlp_output = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; + ff.set_num_transformer_layers(falcon_config.n_layer); + ff.set_num_kv_heads(falcon_config.n_head_kv); + ff.set_qkv_dim(falcon_config.hidden_size / falcon_config.n_head * 2); + ff.set_size_dt(data_type_size(input->data_type)); + for (int i = 0; i < falcon_config.n_layer; i++) { // set transformer layer id ff.set_transformer_layer_id(i); @@ -76,7 +82,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.layer_norm_epsilon, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_layer_norm( @@ -90,14 +96,14 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.layer_norm_epsilon, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = res_ln_outputs[0]; att_norm = res_ln_outputs[1]; } switch (mode) { - case BEAM_SEARCH_MODE: { + case TREE_SEARCH_MODE: { mha = ff.spec_inc_multiquery_self_attention( att_norm, falcon_config.hidden_size, @@ -111,12 +117,13 @@ void FALCON::create_falcon_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + falcon_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + false, /*streaming_cache*/ + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -136,19 +143,19 @@ void FALCON::create_falcon_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + falcon_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; } case INC_DECODING_MODE: { - mha = ff.inc_multiquery_self_attention( + mha = ff.groupquery_self_attention( att_norm, falcon_config.hidden_size, falcon_config.n_head, @@ -161,12 +168,13 @@ void FALCON::create_falcon_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + falcon_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + false, /*streaming_cache*/ + std::string("layers." 
+ std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -187,7 +195,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_h_to_4h") + std::string("layers." + std::to_string(i) + ".mlp.dense_h_to_4h") .c_str()); dense_h_to_4h = ff.gelu(dense_h_to_4h); @@ -203,7 +211,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_4h_to_h") + std::string("layers." + std::to_string(i) + ".mlp.dense_4h_to_h") .c_str()); } // final normalization and linear @@ -233,7 +241,7 @@ void FALCON::create_falcon_model(FFModel &ff, "lm_head"); Tensor output; - if (mode == BEAM_SEARCH_MODE) { + if (mode == TREE_SEARCH_MODE) { Tensor softmax = ff.softmax(lm_head, -1); output = ff.argmax(softmax, /*beam_Search*/ true); } else { diff --git a/inference/models/falcon.h b/inference/models/falcon.h index fce2dade3..a15c28991 100644 --- a/inference/models/falcon.h +++ b/inference/models/falcon.h @@ -16,6 +16,7 @@ // #include "file_loader.h" #include "flexflow/batch_config.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" #include @@ -50,6 +51,26 @@ class FALCON { : model_config["num_hidden_layers"]; parallel_attn = model_config["parallel_attn"]; vocab_size = model_config["vocab_size"]; + rotary_embedding_meta.apply_rotary_embedding = true; + if (model_config.find("rope_theta") != model_config.end()) { + rotary_embedding_meta.rope_theta = model_config["rope_theta"]; + } else { + rotary_embedding_meta.rope_theta = 10000.0f; + } + if (model_config.find("scaling_factor") != model_config.end() && + !model_config["scaling_factor"].is_null()) { + rotary_embedding_meta.rope_type = + model_config["scaling_factor"]["rope_type"]; + rotary_embedding_meta.factor = + model_config["scaling_factor"]["factor"]; + rotary_embedding_meta.low_freq_factor = + model_config["scaling_factor"]["low_freq_factor"]; + rotary_embedding_meta.high_freq_factor = + model_config["scaling_factor"]["high_freq_factor"]; + rotary_embedding_meta.original_max_position_embeddings = + model_config["scaling_factor"] + ["original_max_position_embeddings"]; + } } catch (json::exception const &e) { std::cerr << "Error parsing JSON file: " << e.what() << std::endl; assert(false); @@ -61,8 +82,7 @@ class FALCON { } // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; - max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; - max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; + k_of_arg_topk = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; } void print() const { @@ -76,18 +96,19 @@ class FALCON { std::cout << "\tn_layer: " << n_layer << std::endl; std::cout << "\tparallel_attn: " << parallel_attn << std::endl; std::cout << "\tvocab_size: " << vocab_size << std::endl; - + std::cout << "\trotary_embedding_meta: " << rotary_embedding_meta + << std::endl; // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; - std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; - std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl; + std::cout << "\tk_of_arg_topk: " << k_of_arg_topk << std::endl; } bool bias, multi_query, parallel_attn; int hidden_size, n_head, n_head_kv, n_layer, vocab_size; float layer_norm_epsilon; + RotaryEmbeddingMeta rotary_embedding_meta; // int max_seq_len, max_num_tokens; - int 
max_beam_width, max_beam_depth; + int k_of_arg_topk; }; static void create_falcon_model(FFModel &ff, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 517f53443..414306877 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -25,6 +25,7 @@ void LLAMA::create_llama_model(FFModel &ff, std::string const &weight_file_path, InferenceMode mode, GenerationConfig generation_config, + bool streaming_cache, bool use_full_precision) { // do not apply cpu offload in beam search model. LLAMAConfig llama_config(model_config_file_path); @@ -42,9 +43,10 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor input; { int const token_dims[] = { - (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), + std::max(mode == TREE_SEARCH_MODE + ? BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()), 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -58,10 +60,17 @@ void LLAMA::create_llama_model(FFModel &ff, use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "tok_embeddings"); + "embed_tokens"); Tensor w2 = nullptr; + // metadata that needs to be sent to page manager in order to calculate the kv + // cache per layer + ff.set_num_transformer_layers(llama_config.num_hidden_layers); + ff.set_num_kv_heads(llama_config.num_key_value_heads); + int qkv_dim = llama_config.hidden_size / llama_config.num_attention_heads * 2; + ff.set_qkv_dim(qkv_dim); + ff.set_size_dt(data_type_size(input->data_type)); for (int i = 0; i < llama_config.num_hidden_layers; i++) { // set transformer layer id ff.set_transformer_layer_id(i); @@ -75,7 +84,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.rms_norm_eps, llama_config.hidden_size, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_rms_norm( @@ -85,7 +94,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.rms_norm_eps, llama_config.hidden_size, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = token_att_norm[0]; att_norm = token_att_norm[1]; @@ -93,11 +102,12 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor mha; switch (mode) { - case BEAM_SEARCH_MODE: { - mha = ff.spec_inc_multihead_self_attention( + case TREE_SEARCH_MODE: { + mha = ff.spec_inc_multiquery_self_attention( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -106,21 +116,23 @@ void LLAMA::create_llama_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + llama_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + streaming_cache, + std::string("layers." 
+ std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; } case TREE_VERIFY_MODE: { - mha = ff.inc_multihead_self_attention_verify( + mha = ff.inc_multiquery_self_attention_verify( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -129,21 +141,22 @@ void LLAMA::create_llama_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + llama_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; } case INC_DECODING_MODE: { - mha = ff.inc_multihead_self_attention( + mha = ff.groupquery_self_attention( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -152,12 +165,13 @@ void LLAMA::create_llama_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + llama_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + streaming_cache, /*streaming_cache*/ + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -176,53 +190,52 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.rms_norm_eps, llama_config.hidden_size, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ffn_norm").c_str()); + std::string("layers." + std::to_string(i) + ".post_attention_layernorm") + .c_str()); token = token_ff_norm[0]; Tensor ff_norm = token_ff_norm[1]; - Tensor w1 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w1") - .c_str()); + Tensor w1 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.gate_proj").c_str()); - Tensor w3 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w3") - .c_str()); + Tensor w3 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." 
+ std::to_string(i) + ".mlp.up_proj").c_str()); - Tensor multi = ff.sigmoid_silu_multi(w1, w3); + Tensor multi = + ff.sigmoid_silu_multi(w1, w3, llama_config.intermediate_size); - w2 = - ff.dense(multi, - llama_config.hidden_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w2") - .c_str()); + w2 = ff.dense( + multi, + llama_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; @@ -244,23 +257,29 @@ void LLAMA::create_llama_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "output"); + "lm_head"); Tensor output; - if (mode == BEAM_SEARCH_MODE) { + if (mode == TREE_SEARCH_MODE) { Tensor softmax = ff.softmax(dense, -1); - // output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); - // output = ff.argmax(softmax, /*beam_Search*/ true); - output = ff.arg_top_k(softmax, llama_config.max_beam_width, false, true); - // output = ff.top_k(softmax, ) + output = ff.arg_top_k(softmax, llama_config.k_of_arg_topk, false, false); + } else if (mode == INC_DECODING_MODE) { + if (generation_config.do_sample) { + Tensor softmax = ff.softmax(dense, -1); + output = ff.sampling(softmax, generation_config.topp); + } else { + output = ff.argmax(dense, /*beam_Search*/ false); + } } else { - // Tensor softmax = ff.softmax(dense, -1); if (generation_config.do_sample) { dense = ff.scalar_truediv(dense, generation_config.temperature, false); Tensor softmax = ff.softmax(dense, -1); - output = ff.sampling(softmax, generation_config.topp); + if (generation_config.spec_sample) { + output = ff.gumbel_top_k(softmax, generation_config.topk, false, true); + } else { + output = ff.sampling(softmax, generation_config.topp); + } } else { - // output = ff.arg_top_k(dense, /*k=*/1, false); output = ff.argmax(dense, /*beam_Search*/ false); } } @@ -269,7 +288,7 @@ void LLAMA::create_llama_model(FFModel &ff, "", weight_file_path, llama_config.num_attention_heads, - llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size, llama_config.hidden_size / llama_config.num_attention_heads, ff.config.tensor_parallelism_degree, diff --git a/inference/models/llama.h b/inference/models/llama.h index ba1f0236f..cd6f9c5cc 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -16,6 +16,7 @@ // #include "file_loader.h" #include "flexflow/batch_config.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" #include @@ -36,9 +37,34 @@ class LLAMA { num_hidden_layers = model_config["num_hidden_layers"]; vocab_size = model_config["vocab_size"]; num_attention_heads = model_config["num_attention_heads"]; + if (model_config.find("num_key_value_heads") != model_config.end()) { + num_key_value_heads = model_config["num_key_value_heads"]; + } else { + num_key_value_heads = num_attention_heads; + } hidden_size = model_config["hidden_size"]; rms_norm_eps = model_config["rms_norm_eps"]; intermediate_size = model_config["intermediate_size"]; + rotary_embedding_meta.apply_rotary_embedding = true; + if (model_config.find("rope_theta") != model_config.end()) { + rotary_embedding_meta.rope_theta = model_config["rope_theta"]; + } else { + rotary_embedding_meta.rope_theta = 10000.0f; + } + if 
(model_config.find("scaling_factor") != model_config.end() && + !model_config["scaling_factor"].is_null()) { + rotary_embedding_meta.rope_type = + model_config["scaling_factor"]["rope_type"]; + rotary_embedding_meta.factor = + model_config["scaling_factor"]["factor"]; + rotary_embedding_meta.low_freq_factor = + model_config["scaling_factor"]["low_freq_factor"]; + rotary_embedding_meta.high_freq_factor = + model_config["scaling_factor"]["high_freq_factor"]; + rotary_embedding_meta.original_max_position_embeddings = + model_config["scaling_factor"] + ["original_max_position_embeddings"]; + } } catch (json::exception const &e) { std::cerr << "Error parsing LLAMA config from JSON file: " << e.what() << std::endl; @@ -49,10 +75,7 @@ class LLAMA { << std::endl; assert(false); } - // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; - max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; - max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; + k_of_arg_topk = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; } void print() const { @@ -61,21 +84,24 @@ class LLAMA { std::cout << "\tvocab_size: " << vocab_size << std::endl; std::cout << "\tnum_attention_heads: " << num_attention_heads << std::endl; + std::cout << "\tnum_key_value_heads: " << num_key_value_heads + << std::endl; std::cout << "\thidden_size: " << hidden_size << std::endl; std::cout << "\trms_norm_eps: " << rms_norm_eps << std::endl; std::cout << "\tintermediate_size: " << intermediate_size << std::endl; - + std::cout << "\trotary_embedding_meta: " << rotary_embedding_meta + << std::endl; // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; - std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; - std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl; + std::cout << "\tk_of_arg_topk : " << k_of_arg_topk << std::endl; } // int max_seq_len, max_num_tokens; - int max_beam_width, max_beam_depth; - int num_hidden_layers, vocab_size, num_attention_heads, hidden_size, - intermediate_size; + int k_of_arg_topk; + int num_hidden_layers, vocab_size, num_attention_heads, num_key_value_heads, + hidden_size, intermediate_size; float rms_norm_eps; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_llama_model(FFModel &ff, @@ -83,6 +109,7 @@ class LLAMA { std::string const &weight_file_path, InferenceMode mode, GenerationConfig generation_config, + bool streaming_cache, bool use_full_precision = false); }; diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 70e2b5e9c..b95cb5c91 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -41,9 +41,10 @@ void MPT::create_mpt_model(FFModel &ff, Tensor input; { int const token_dims[] = { - (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), + std::max(mode == TREE_SEARCH_MODE + ? BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()), 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -58,11 +59,15 @@ void MPT::create_mpt_model(FFModel &ff, use_full_precision ? 
DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor intermediate_output = nullptr, layernorm_output = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; + ff.set_num_transformer_layers(mpt_config.n_layers); + ff.set_num_kv_heads(mpt_config.n_heads); + ff.set_qkv_dim(mpt_config.hidden_size / mpt_config.n_heads * 2); + ff.set_size_dt(data_type_size(input->data_type)); for (int i = 0; i < mpt_config.n_layers; i++) { ff.set_transformer_layer_id(i); @@ -74,7 +79,7 @@ void MPT::create_mpt_model(FFModel &ff, 1e-05, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); } else { ff.residual_layer_norm( intermediate_output, @@ -87,14 +92,14 @@ void MPT::create_mpt_model(FFModel &ff, 1e-05, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; } Tensor attn_outputs; switch (mode) { - case BEAM_SEARCH_MODE: { + case TREE_SEARCH_MODE: { attn_outputs = ff.spec_inc_multihead_self_attention( layernorm_output, mpt_config.hidden_size, @@ -107,13 +112,13 @@ void MPT::create_mpt_model(FFModel &ff, false, DT_NONE, /*data_type*/ NULL, - false, + mpt_config.rotary_embedding_meta, /*scaling query*/ true, /*scaling factor*/ pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -131,13 +136,13 @@ void MPT::create_mpt_model(FFModel &ff, false, DT_NONE, /*data_type*/ NULL, - false, + mpt_config.rotary_embedding_meta, /*scaling query*/ true, /*scaling factor*/ pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -155,13 +160,13 @@ void MPT::create_mpt_model(FFModel &ff, false, DT_NONE, /*data_type*/ NULL, - false, + mpt_config.rotary_embedding_meta, /*scaling query*/ true, /*scaling factor*/ pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -182,7 +187,7 @@ void MPT::create_mpt_model(FFModel &ff, 1e-05, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_2").c_str()); + std::string("layers." + std::to_string(i) + ".norm_2").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; @@ -198,7 +203,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_up_proj").c_str()); + std::string("layers." + std::to_string(i) + ".ffn.up_proj").c_str()); layernorm_output = ff.gelu(layernorm_output); intermediate_output = ff.dense( layernorm_output, @@ -211,7 +216,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_down_proj").c_str()); + std::string("layers." 
+ std::to_string(i) + ".ffn.down_proj").c_str()); } // final @@ -225,7 +230,7 @@ void MPT::create_mpt_model(FFModel &ff, 1e-05, false, DT_NONE, - "transformer_norm_f"); + "norm_f"); Tensor all_final_norm = res_ln_outputs[1]; Tensor lm_head = ff.dense(all_final_norm, @@ -241,7 +246,7 @@ void MPT::create_mpt_model(FFModel &ff, "lm_head"); Tensor output; - if (mode == BEAM_SEARCH_MODE) { + if (mode == TREE_SEARCH_MODE) { Tensor softmax = ff.softmax(lm_head, -1); output = ff.argmax(softmax, /*beam_Search*/ true); } else { diff --git a/inference/models/mpt.h b/inference/models/mpt.h index 08597e1d7..8466ea1cb 100644 --- a/inference/models/mpt.h +++ b/inference/models/mpt.h @@ -16,6 +16,7 @@ // #include "file_loader.h" #include "flexflow/batch_config.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" #include @@ -37,6 +38,7 @@ class MPT { n_heads = model_config["n_heads"]; n_layers = model_config["n_layers"]; vocab_size = model_config["vocab_size"]; + rotary_embedding_meta.apply_rotary_embedding = false; } catch (json::exception const &e) { std::cerr << "Error parsing JSON file: " << e.what() << std::endl; assert(false); @@ -48,8 +50,7 @@ class MPT { } // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; - max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; - max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; + k_of_arg_topk = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; } void print() const { @@ -61,8 +62,9 @@ class MPT { } // int max_seq_len, max_num_tokens; - int max_beam_width, max_beam_depth; + int k_of_arg_topk; int hidden_size, n_heads, n_layers, vocab_size; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_mpt_model(FFModel &ff, diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 5677d5658..352809ede 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -43,9 +43,10 @@ void OPT::create_opt_model(FFModel &ff, ff.set_position_offset(2); { int const token_dims[] = { - (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), + std::max(mode == TREE_SEARCH_MODE + ? BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()), 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); @@ -76,6 +77,10 @@ void OPT::create_opt_model(FFModel &ff, Tensor fc2 = nullptr, added = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; + ff.set_num_transformer_layers(opt_config.num_hidden_layers); + ff.set_num_kv_heads(opt_config.num_attention_heads); + ff.set_qkv_dim(opt_config.hidden_size / opt_config.num_attention_heads * 2); + ff.set_size_dt(data_type_size(input->data_type)); for (int i = 0; i < opt_config.num_hidden_layers; i++) { // set transformer layer id ff.set_transformer_layer_id(i); @@ -95,14 +100,14 @@ void OPT::create_opt_model(FFModel &ff, 1e-05, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_layer_norm") + std::string("layers." 
+ std::to_string(i) + ".self_attn_layer_norm") .c_str()); Tensor residual = res_ln_outputs[0]; Tensor hidden_states = res_ln_outputs[1]; Tensor mha; switch (mode) { - case BEAM_SEARCH_MODE: { + case TREE_SEARCH_MODE: { mha = ff.spec_inc_multihead_self_attention( hidden_states, opt_config.hidden_size, @@ -115,13 +120,13 @@ void OPT::create_opt_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - false, /*apply_rotary_embedding*/ - true, /*scaling query*/ + opt_config.rotary_embedding_meta, + true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -139,13 +144,13 @@ void OPT::create_opt_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - false, /*apply_rotary_embedding*/ - true, /*scaling query*/ + opt_config.rotary_embedding_meta, + true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -163,13 +168,13 @@ void OPT::create_opt_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - false, /*apply_rotary_embedding*/ - true, /*scaling query*/ + opt_config.rotary_embedding_meta, + true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -187,8 +192,8 @@ void OPT::create_opt_model(FFModel &ff, 1e-05, true, DT_NONE, - std::string("layers_" + std::to_string(i) + - "_add_bias_residual_layer_norm") + std::string("layers." + std::to_string(i) + + ".add_bias_residual_layer_norm") .c_str()); added = res_ln_outputs[0]; Tensor final_norm = res_ln_outputs[1]; @@ -205,7 +210,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc1").c_str()); + std::string("layers." + std::to_string(i) + ".fc1").c_str()); fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, @@ -216,7 +221,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc2").c_str()); + std::string("layers." 
+ std::to_string(i) + ".fc2").c_str()); } // final @@ -243,13 +248,12 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "embed_tokens_weight_lm_head"); + "lm_head"); Tensor output; - if (mode == BEAM_SEARCH_MODE) { + if (mode == TREE_SEARCH_MODE) { Tensor softmax = ff.softmax(lm_head, -1); - // output = ff.beam_top_k(softmax, opt_config.max_beam_width, false); - output = ff.argmax(softmax, /*beam_Search*/ true); + output = ff.arg_top_k(softmax, opt_config.k_of_arg_topk, false, false); } else { // output = ff.arg_top_k(lm_head, /*k=*/1, false); output = ff.argmax(lm_head, /*beam_Search*/ false); diff --git a/inference/models/opt.h b/inference/models/opt.h index 7c736a26d..23ba8888b 100644 --- a/inference/models/opt.h +++ b/inference/models/opt.h @@ -16,6 +16,7 @@ // #include "file_loader.h" #include "flexflow/batch_config.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" #include @@ -45,6 +46,7 @@ class OPT { num_hidden_layers = model_config["num_hidden_layers"]; vocab_size = model_config["vocab_size"]; word_embed_proj_dim = model_config["word_embed_proj_dim"]; + rotary_embedding_meta.apply_rotary_embedding = false; } catch (json::exception const &e) { std::cerr << "Error parsing JSON file: " << e.what() << std::endl; assert(false); @@ -56,8 +58,7 @@ class OPT { } // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; - max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; - max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; + k_of_arg_topk = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; } void print() const { @@ -78,19 +79,20 @@ class OPT { std::cout << "\tvocab_size: " << vocab_size << std::endl; std::cout << "\tword_embed_proj_dim: " << word_embed_proj_dim << std::endl; - + std::cout << "\trotary_embedding_meta: " << rotary_embedding_meta + << std::endl; // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; - std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; - std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl; + std::cout << "\tk_of_arg_topk : " << k_of_arg_topk << std::endl; } // int max_seq_len, max_num_tokens; - int max_beam_width, max_beam_depth; + int k_of_arg_topk; bool do_layer_norm_before, enable_bias, layer_norm_elementwise_affine; float dropout; int ffn_dim, hidden_size, max_position_embeddings, num_attention_heads, num_hidden_layers, vocab_size, word_embed_proj_dim; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_opt_model(FFModel &ff, diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 8b0dc1098..401a754d0 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -49,9 +49,10 @@ void STARCODER::create_starcoder_model( { // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); int const token_dims[] = { - (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), + std::max(mode == TREE_SEARCH_MODE + ? BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()), 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); @@ -66,7 +67,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? 
DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor positional_embedding = ff.embedding(position_input, @@ -76,11 +77,16 @@ void STARCODER::create_starcoder_model( use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wpe"); + "wpe"); Tensor residual = nullptr, c_proj = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; + ff.set_num_transformer_layers(startcoder_config.num_hidden_layers); + ff.set_num_kv_heads(startcoder_config.num_attention_heads); + ff.set_qkv_dim(startcoder_config.hidden_size / + startcoder_config.num_attention_heads * 2); + ff.set_size_dt(data_type_size(input->data_type)); for (int i = 0; i < startcoder_config.num_hidden_layers; i++) { // set transformer layer id ff.set_transformer_layer_id(i); @@ -97,14 +103,14 @@ void STARCODER::create_starcoder_model( startcoder_config.layer_norm_epsilon, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_1").c_str()); + std::string("layers." + std::to_string(i) + ".ln_1").c_str()); Tensor hidden_states = res_ln_outputs[0]; Tensor ln_1 = res_ln_outputs[1]; Tensor mha; switch (mode) { case INC_DECODING_MODE: { - mha = ff.inc_multiquery_self_attention( + mha = ff.groupquery_self_attention( ln_1, startcoder_config.hidden_size, startcoder_config.num_attention_heads, @@ -113,18 +119,19 @@ void STARCODER::create_starcoder_model( startcoder_config.num_attention_heads, startcoder_config.hidden_size / startcoder_config.num_attention_heads, - startcoder_config.dropout_p, /*dropout*/ - true, /*bias*/ - false, /*add_bias_kv*/ - false, /*add_zero_attn*/ - DT_NONE, /*data_type*/ - nullptr, /*kernel_initializer*/ - false, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + startcoder_config.dropout_p, /*dropout*/ + true, /*bias*/ + false, /*add_bias_kv*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + nullptr, /*kernel_initializer*/ + startcoder_config.rotary_embedding_meta, /*apply_rotary_embedding*/ + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + false, /*streaming_cache*/ + std::string("layers." + std::to_string(i) + ".attn.c_attn") .c_str() /*name*/ ); break; @@ -145,7 +152,7 @@ void STARCODER::create_starcoder_model( startcoder_config.layer_norm_epsilon, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_2").c_str()); + std::string("layers." + std::to_string(i) + ".ln_2").c_str()); residual = res_ln_outputs[0]; Tensor l2_norm = res_ln_outputs[1]; @@ -161,7 +168,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_fc").c_str()); + std::string("layers." + std::to_string(i) + ".mlp.c_fc").c_str()); c_fc = ff.gelu(c_fc); @@ -176,7 +183,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_proj").c_str()); + std::string("layers." 
+ std::to_string(i) + ".mlp.c_proj").c_str()); } // final normalization and linear ff.residual_layer_norm(residual, @@ -189,7 +196,7 @@ void STARCODER::create_starcoder_model( startcoder_config.layer_norm_epsilon, true, DT_NONE, - "transformer_ln_f"); + "ln_f"); Tensor ln_f = res_ln_outputs[1]; Tensor lm_head = ff.dense(ln_f, @@ -205,10 +212,10 @@ void STARCODER::create_starcoder_model( "lm_head"); Tensor output; - if (mode == BEAM_SEARCH_MODE) { + if (mode == TREE_SEARCH_MODE) { Tensor softmax = ff.softmax(lm_head, -1); - // output = ff.beam_top_k(softmax, startcoder_config.max_beam_width, false); - output = ff.argmax(softmax, /*beam_Search*/ true); + output = + ff.arg_top_k(softmax, startcoder_config.k_of_arg_topk, false, false); } else { // Tensor softmax = ff.softmax(dense, -1); if (generationConfig.do_sample) { diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h index 0e9577d56..57e1229f1 100644 --- a/inference/models/starcoder.h +++ b/inference/models/starcoder.h @@ -16,6 +16,7 @@ // #include "file_loader.h" #include "flexflow/batch_config.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" #include @@ -41,6 +42,7 @@ class STARCODER { intermediate_size = model_config["n_inner"]; dropout_p = model_config["attn_pdrop"]; max_position_embeddings = model_config["n_positions"]; + rotary_embedding_meta.apply_rotary_embedding = false; } catch (json::exception const &e) { std::cerr << "Error parsing STARCODER config from JSON file: " << e.what() << std::endl; @@ -53,17 +55,17 @@ class STARCODER { } // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; - max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; - max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; + k_of_arg_topk = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; } void print() const {} // int max_seq_len, max_num_tokens; - int max_beam_width, max_beam_depth; + int k_of_arg_topk; int num_hidden_layers, vocab_size, num_attention_heads, hidden_size, intermediate_size, max_position_embeddings; float layer_norm_epsilon, dropout_p; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_starcoder_model(FFModel &ff, diff --git a/inference/python/chat.py b/inference/python/chat.py new file mode 100644 index 000000000..95132443a --- /dev/null +++ b/inference/python/chat.py @@ -0,0 +1,103 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
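+# This example serves one chat-style request with incremental decoding, using the
+# 8-GPU, tensor-parallel (degree 8) configuration returned by get_configs() below.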
+ +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + + +def get_configs(): + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 8, + "memory_per_gpu": 34000, + "zero_copy_memory_per_node": 200000, + # optional parameters + "num_cpus": 16, + "legion_utility_processors": 16, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 8, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "profiling": False, + "benchmarking": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required parameters + "llm_model": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", + # optional parameters + "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "refresh_cache": False, + "full_precision": False, + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + +def main(): + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + # Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + ) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=2048, + max_tokens_per_batch=256, + ) + + llm.start_server() + + nemotron_system = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature." + llama_generic_system = "You are a helpful an honest programming assistant." 
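# --- Editorial sketch (not part of the patch): chat.py above discards the
# value returned by llm.generate(). A minimal way to inspect the generated
# text, assuming the Python binding mirrors the C++ GenerationResult used
# elsewhere in this patch (an `output_text` field per request); treat the
# exact attribute name and return shape as assumptions.
def print_results(results):
    # generate() may hand back a single result or a list of results
    for r in results if isinstance(results, list) else [results]:
        print(r.output_text)
# usage: print_results(llm.generate(messages, max_new_tokens=1024))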
+ + + messages=[ + {"role": "system", "content": nemotron_system}, + {"role": "user", "content": "Is Rust better than Python?"}, + ] + llm.generate(messages, max_new_tokens=1024) + + llm.stop_server() + + +if __name__ == "__main__": + print("flexflow inference example (incremental decoding)") + main() diff --git a/inference/simplified_infer/CMakeLists.txt b/inference/simplified_infer/CMakeLists.txt new file mode 100644 index 000000000..35ee40711 --- /dev/null +++ b/inference/simplified_infer/CMakeLists.txt @@ -0,0 +1,74 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlow_SpecInfer) +set(project_target1 specinfer) + + +set(CPU_SRC1 + ${FLEXFLOW_CPP_DRV_SRC} + specinfer.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target1} ${CPU_SRC1}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC1} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target1} ${CPU_SRC1}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target1} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target1} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target1} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target1} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target1} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target1} DESTINATION ${BIN_DEST}) + + +project(FlexFlow_IncrDecoding) +set(project_target3 incr_dec) + + +set(CPU_SRC3 + ${FLEXFLOW_CPP_DRV_SRC} + incr_dec.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target3} ${CPU_SRC3}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC3} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target3} ${CPU_SRC3}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target3} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target3} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target3} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target3} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target3} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target3} DESTINATION ${BIN_DEST}) diff --git a/inference/simplified_infer/incr_dec.cc b/inference/simplified_infer/incr_dec.cc new file mode 100644 index 000000000..ed6125d0f --- /dev/null +++ 
b/inference/simplified_infer/incr_dec.cc @@ -0,0 +1,473 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string trace_file_path; + std::string trace_output_path; + std::string log_file_path; + std::string csv_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + bool &use_full_precision, + bool &verbose, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_output_length, + bool &do_sample, + int &request_per_second, + bool &add_special_tokens, + std::string &target_partition) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // traces + if (!strcmp(argv[i], "-trace")) { + paths.trace_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-trace-output-path")) { + paths.trace_output_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-target-partition")) { + target_partition = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-log-output-path")) { + paths.log_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-csv-output-path")) { + paths.csv_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-output-length")) { + max_output_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--request-per-second")) { + request_per_second = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--add-special-tokens")) { + add_special_tokens = true; + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + 
paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const &regions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 512; + int max_output_length = 512; + int num_warmup_requests = 0; + double warmup_delay = 15.0; + RequestManager::DecodingMode decoding_mode = + RequestManager::INCREMENTAL_DECODING; + int sampling_seed = 0; + int request_per_second = -1; + bool add_special_tokens = false; + std::string target_partition = "FEATURE_EXTRACTION"; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + use_full_precision, + verbose, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_output_length, + do_sample, + request_per_second, + add_special_tokens, + target_partition); + + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + // Get dataset + std::ifstream input_file(file_paths.trace_file_path); + assert(input_file.good() && "Prompt file does not exist."); + nlohmann::ordered_json j = nlohmann::ordered_json::parse(input_file); + input_file.close(); + + // Find the partition with name "FEATURE_EXTRACTION" + auto &partitions = j["partitions"]; + auto it = + std::find_if(partitions.begin(), + partitions.end(), + [target_partition](nlohmann::ordered_json const &partition) { + return partition["partition_name"] == target_partition; + }); + if (it == partitions.end()) { + std::cerr << "Partition " << target_partition + << " not found in the trace file." << std::endl; + assert(false); + } + nlohmann::ordered_json &partition = *it; + // check that the max prompt + response length sum in the eval_entries in the + // partition does not exceed the max_sequence_length + int max_prompt_response_length = 0; + for (auto &eval_entry : partition["eval_entries"]) { + int prompt_length = eval_entry["prompt_length"]; + int response_length = eval_entry["response_length"]; + if (response_length >= max_output_length) { + std::cerr << "Error: A response length from the target partition in the " + "dataset (=" + << response_length + << ") exceeds the max_output_length(=" << max_output_length + << ")." << std::endl; + assert(false); + } + max_prompt_response_length = + std::max(max_prompt_response_length, prompt_length + response_length); + } + if (max_prompt_response_length >= max_sequence_length) { + std::cerr << "Error: max prompt + response length sum (=" + << max_prompt_response_length + << ") in the eval_entries in the partition exceeds the " + "max_sequence_length(=" + << max_sequence_length << ")."
<< std::endl; + assert(false); + } + + // Get model configs + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" || + str == "MistralForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + std::vector eos_token_ids; + if (model_config.find("eos_token_id") != model_config.end()) { + if (model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : model_config["eos_token_id"]) { + eos_token_ids.push_back(eos_token_id); + } + } else { + eos_token_ids.push_back(model_config["eos_token_id"]); + } + } else { + eos_token_ids.push_back(-1); + } + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // set request manager properties + srand(sampling_seed); + GenerationConfig generationConfig(do_sample, 0.8, 0.6, false, 16); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch(max_requests_per_batch); + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_tokens_per_ssm_batch(max_tokens_per_batch); + rm->set_max_tokens_per_prefilling_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->set_max_output_length(max_output_length); + rm->set_decoding_mode(decoding_mode); + rm->set_slo_violation_early_termination(false); + rm->set_baseline_latency(50); + rm->set_ssm_spec_latency(20); + rm->set_llm_verify_latency(50); + rm->set_spec_infer_old_version(true); + rm->set_greedy_schedule(false); + rm->set_equal_schedule(false); + rm->set_max_tree_depth(8); + rm->set_max_tree_width(16); + rm->set_verbose(verbose); + rm->set_streaming_cache(false); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_ids, tokenizer_filepath); + rm->register_output_filepath(file_paths.log_file_path); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + false, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + 
config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + rm->start_background_server(&model); + + int total_num_requests = 0; + { + // Iterate through eval_entries + std::vector requests; + std::vector timestamps, ratios; + if (partition.contains("num_warmup_requests")) { + num_warmup_requests = partition["num_warmup_requests"]; + } + for (auto &entry : partition["eval_entries"]) { + std::string text = entry["prompt"]; + int max_new_tokens_ = entry["response_length"]; + + bool is_warmup_request = total_num_requests < num_warmup_requests; + double request_delay = + 1000.0 * + (request_per_second > 0 ? (1.0 / (double)request_per_second) : 0); + double emission_time_ms = + is_warmup_request + ? 0.0 + : (warmup_delay + + request_delay * (total_num_requests - num_warmup_requests)); + + GenerationRequest inference_req(text, // prompt + -1.0, // slo_ratio + emission_time_ms, // emission_time_ms + add_special_tokens); + + requests.push_back(inference_req); + timestamps.push_back(emission_time_ms); + ratios.push_back(1.0); + total_num_requests++; + + if (verbose) { + break; + } + } + TraceEmissionMachine emission_machine(timestamps, ratios); + std::vector result = + model.generate(requests, emission_machine); + assert(result.size() == requests.size()); + assert(result.size() == total_num_requests); + assert(result.size() == partition["eval_entries"].size()); + int i = 0; + for (auto &entry : partition["eval_entries"]) { + entry["original_response"] = entry["response"]; + entry["original_response_length"] = entry["response_length"]; + std::string ff_out = result[i].output_text; + int tot_length = result[i].output_text.length(); + entry["response"] = ff_out; + entry["response_length"] = result[i].output_tokens.size(); + entry["specinfer_decoding_steps"] = result[i].decoding_steps; + i++; + } + + // Write the modified JSON to a file + std::ofstream output_file(file_paths.trace_output_path); + if (output_file.is_open()) { + output_file << j.dump(2); + output_file.close(); + std::cout << "Modified JSON has been saved to " + << file_paths.trace_output_path << std::endl; + } else { + std::cerr << "Unable to open file for writing." 
<< std::endl; + } + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + std::string header = + "llm,partition,max_requests_per_batch,max_tokens_per_" + "batch,request_per_second,is_warmup_request,request_guid," + "request_step_idx,timestamp,num_generated_tokens"; + // csv filepath + // create csv filepath and add header if it doesn't exist + + bool csv_file_exists = std::filesystem::exists(file_paths.csv_file_path); + if (!csv_file_exists) { + // Create new file and write header + std::ofstream file(file_paths.csv_file_path); + if (!file.is_open()) { + std::cerr << "Failed to open file: " << file_paths.csv_file_path + << std::endl; + assert(false); + } + file << header << "\n"; + file.close(); + } + + // Append the new row + std::ofstream file(file_paths.csv_file_path, std::ios::app); + if (!file.is_open()) { + std::cerr << "Failed to open file: " << file_paths.csv_file_path + << std::endl; + } + + std::vector new_profiling_info = rm->get_new_profiling_info(); + for (auto const &info : new_profiling_info) { + file << llm_model_name + ","; + file << target_partition + ","; + file << std::to_string(max_requests_per_batch) + ","; + file << std::to_string(max_tokens_per_batch) + ","; + file << std::to_string(request_per_second) + ","; + bool is_warmup_request = + (info.request_guid - 1000000) < num_warmup_requests; + file << std::to_string(is_warmup_request) + ","; + file << info.request_guid << "," << info.request_step_idx << "," + << info.timestamp << "," << info.num_generated_tokens << "\n"; + } + file.close(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/simplified_infer/specinfer.cc b/inference/simplified_infer/specinfer.cc new file mode 100644 index 000000000..58f302075 --- /dev/null +++ b/inference/simplified_infer/specinfer.cc @@ -0,0 +1,692 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include +#include +#include +#include + +using namespace FlexFlow; +using namespace Legion; +using RequestGuid = BatchConfig::RequestGuid; + +Legion::Logger log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string trace_file_path; + std::string trace_output_path; + std::string log_file_path; + std::string csv_file_path; +}; + +struct ModelNames { + std::string llm_model_name; + std::vector ssm_model_names; +}; + +struct ModelMeta { + ModelNames model_names; + + ModelType llm_model_type; + std::string llm_tokenizer_path; + std::string llm_weights_path; + std::string llm_model_config_path; + + int bos_token_id; + std::vector eos_token_ids; + + std::vector ssm_model_types; + std::vector ssm_model_config_paths; + std::vector ssm_model_weights_paths; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + ModelNames &model_names, + bool &use_full_precision, + bool &verbose, + int &ssm_tp_degree, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_output_length, + int &max_tree_width, + int &max_tree_depth, + int &expansion_degree, + bool &do_sample, + int &request_per_second, + bool &add_special_tokens, + std::string &target_partition) { + for (int i = 1; i < argc; i++) { + // llm model name + if (!strcmp(argv[i], "-llm-model")) { + model_names.llm_model_name = std::string(argv[++i]); + for (char &c : model_names.llm_model_name) { + c = std::tolower(c); + } + continue; + } + // ssm models names + if (!strcmp(argv[i], "-ssm-model")) { + std::string ssm_model_name = std::string(argv[++i]); + for (char &c : ssm_model_name) { + c = std::tolower(c); + } + model_names.ssm_model_names.push_back(ssm_model_name); + continue; + } + if (!strcmp(argv[i], "-ssm-tp-degree")) { + ssm_tp_degree = std::stoi(argv[++i]); + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // trace + if (!strcmp(argv[i], "-trace")) { + paths.trace_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-trace-output-path")) { + paths.trace_output_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-target-partition")) { + target_partition = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-log-output-path")) { + paths.log_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-csv-output-path")) { + paths.csv_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-output-length")) { + max_output_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tree-width")) { + max_tree_width = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tree-depth")) { + max_tree_depth = 
std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--expansion-degree")) { + expansion_degree = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--request-per-second")) { + request_per_second = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--add-special-tokens")) { + add_special_tokens = true; + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void get_model_meta(FilePaths &file_paths, + ModelMeta &model_metadata, + bool use_full_precision) { + if (model_metadata.model_names.llm_model_name.empty() || + model_metadata.model_names.ssm_model_names.size() == 0) { + assert(false && "SpecInfer needs at least one LLM and one SSM for " + "speculative inference"); + } + model_metadata.llm_model_config_path = + join_path({file_paths.cache_folder_path, + "configs", + model_metadata.model_names.llm_model_name, + "config.json"}); + model_metadata.llm_tokenizer_path = + join_path({file_paths.cache_folder_path, + "tokenizers", + model_metadata.model_names.llm_model_name}); + model_metadata.llm_weights_path = + join_path({file_paths.cache_folder_path, + "weights", + model_metadata.model_names.llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + + std::ifstream llm_config_file_handle(model_metadata.llm_model_config_path); + if (!llm_config_file_handle.good()) { + std::cout << "LLM Model config file " + << model_metadata.llm_model_config_path << " not found." + << std::endl; + assert(false); + } + nlohmann::ordered_json llm_model_config = + nlohmann::ordered_json::parse(llm_config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + model_metadata.llm_model_type = ModelType::UNKNOWN; + auto architectures = llm_model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" || + str == "MistralForCausalLM") { + model_metadata.llm_model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_metadata.llm_model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_metadata.llm_model_type = ModelType::FALCON; + break; + } else if (str == "MPTForCausalLM") { + model_metadata.llm_model_type = ModelType::MPT; + break; + } + } + model_metadata.bos_token_id = + llm_model_config.find("bos_token_id") == llm_model_config.end() + ? -1 + : (int)llm_model_config.at("bos_token_id"); + // model_metadata.eos_token_id = + // llm_model_config.find("eos_token_id") == llm_model_config.end() + // ? 
-1 + // : (int)llm_model_config.at("eos_token_id"); + if (llm_model_config.find("eos_token_id") != llm_model_config.end()) { + if (llm_model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : llm_model_config["eos_token_id"]) { + model_metadata.eos_token_ids.push_back(eos_token_id); + } + } else { + model_metadata.eos_token_ids.push_back(llm_model_config["eos_token_id"]); + } + } else { + model_metadata.eos_token_ids.push_back(-1); + } + + for (auto ssm_model_name : model_metadata.model_names.ssm_model_names) { + std::string ssm_config_path = join_path({file_paths.cache_folder_path, + "configs", + ssm_model_name, + "config.json"}); + std::string ssm_tokenizer_path = + join_path({file_paths.cache_folder_path, "tokenizers", ssm_model_name}); + std::string ssm_weights_path = + join_path({file_paths.cache_folder_path, + "weights", + ssm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + + std::ifstream ssm_config_file_handle(ssm_config_path); + if (!ssm_config_file_handle.good()) { + std::cout << "SSM Model config file " << ssm_config_path << " not found." + << std::endl; + assert(false); + } + nlohmann::ordered_json ssm_model_config = + nlohmann::ordered_json::parse(ssm_config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + ModelType ssm_model_type = ModelType::UNKNOWN; + auto architectures = ssm_model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" || + str == "MistralForCausalLM") { + ssm_model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + ssm_model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM") { + ssm_model_type = ModelType::FALCON; + break; + } else if (str == "MPTForCausalLM") { + ssm_model_type = ModelType::MPT; + break; + } + } + int ssm_bos_id = + ssm_model_config.find("bos_token_id") == ssm_model_config.end() + ? -1 + : (int)ssm_model_config.at("bos_token_id"); + // int ssm_eos_id = + // ssm_model_config.find("eos_token_id") == ssm_model_config.end() + // ? 
-1 + // : (int)ssm_model_config.at("eos_token_id"); + // if (ssm_bos_id != model_metadata.bos_token_id || + // ssm_eos_id != model_metadata.eos_token_id) { + // printf("Warning: bos/eos token id mismatch between LLM and one of the " + // "SSMs!\n"); + // } + model_metadata.ssm_model_types.push_back(ssm_model_type); + model_metadata.ssm_model_config_paths.push_back(ssm_config_path); + model_metadata.ssm_model_weights_paths.push_back(ssm_weights_path); + } + + assert(model_metadata.llm_model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + for (auto mt : model_metadata.ssm_model_types) { + if (mt == ModelType::UNKNOWN) { + assert(false && "One of the SSM model types passed is invalid."); + } + } +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + FilePaths file_paths; + ModelMeta model_metadata; + bool use_full_precision = false; + bool verbose = false; + int ssm_tp_degree = 1; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 512; + int max_output_length = 512; + int expansion_degree = 3; + int max_tree_depth = 8; + int max_tree_width = 16; + RequestManager::DecodingMode decoding_mode = + RequestManager::SPECULATIVE_DECODING; + bool do_sample = false; + int sampling_seed = 0; + int request_per_second = -1; + int num_warmup_requests = 0; + double warmup_delay = 15.0; + bool add_special_tokens = false; + std::string target_partition = "FEATURE_EXTRACTION"; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + model_metadata.model_names, + use_full_precision, + verbose, + ssm_tp_degree, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_output_length, + max_tree_width, + max_tree_depth, + expansion_degree, + do_sample, + request_per_second, + add_special_tokens, + target_partition); + + get_model_meta(file_paths, model_metadata, use_full_precision); + + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + assert(ssm_tp_degree >= 1 && + ssm_tp_degree <= ffconfig.numNodes * ffconfig.workersPerNode); + + std::ifstream input_file(file_paths.trace_file_path); + assert(input_file.good() && "Prompt file does not exist."); + nlohmann::ordered_json j = nlohmann::ordered_json::parse(input_file); + input_file.close(); + + // Find the partition with name "FEATURE_EXTRACTION" + auto &partitions = j["partitions"]; + auto it = + std::find_if(partitions.begin(), + partitions.end(), + [target_partition](nlohmann::ordered_json const &partition) { + return partition["partition_name"] == target_partition; + }); + nlohmann::ordered_json &partition = *it; + if (it == partitions.end()) { + std::cerr << "Partition " << target_partition + << " not found in the trace file." 
<< std::endl; + assert(false); + } + // check that the max prompt + response length sum in the eval_entries in the + // partition does not exceed the max_sequence_length + int max_prompt_response_length = 0; + for (auto &eval_entry : partition["eval_entries"]) { + int prompt_length = eval_entry["prompt_length"]; + int response_length = eval_entry["response_length"]; + if (response_length >= max_output_length) { + std::cerr << "Error: A response length from the targt partition in the " + "dataset (=" + << response_length + << ") exceeds the max_output_length(=" << max_output_length + << ")." << std::endl; + assert(false); + } + max_prompt_response_length = + std::max(max_prompt_response_length, prompt_length + response_length); + } + if (max_prompt_response_length >= max_sequence_length) { + std::cerr << "Error: max prompt + response length sum (=" + << max_prompt_response_length + << ") in the eval_entries in the partition exceeds the " + "max_sequence_length(=" + << max_sequence_length << ")." << std::endl; + assert(false); + } + + // Sanity check for SpecInfer old version + assert(max_tree_depth <= 8); + assert(max_tree_width >= 3); + // Total verified tokens + assert(max_tokens_per_batch >= max_requests_per_batch * 21); + + // Create SentencePiece tokenizer or OPT tokenizer + srand(sampling_seed); + GenerationConfig generationConfig(do_sample, 0.8, 0.6, false, 16); + InferenceManager *im = InferenceManager::get_inference_manager(); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch(max_requests_per_batch); + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_tokens_per_ssm_batch(max_tokens_per_batch); + rm->set_max_tokens_per_prefilling_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->set_max_output_length(max_output_length); + rm->set_max_tree_depth(max_tree_depth); + rm->set_max_tree_width(max_tree_width); + rm->set_expansion_degree(expansion_degree); + rm->set_verbose(verbose); + rm->set_streaming_cache(false); + rm->register_tokenizer(model_metadata.llm_model_type, + model_metadata.bos_token_id, + model_metadata.eos_token_ids, + model_metadata.llm_tokenizer_path); + rm->set_decoding_mode(decoding_mode); + rm->set_slo_violation_early_termination(false); + rm->set_baseline_latency(50); + rm->set_ssm_spec_latency(20); + rm->set_llm_verify_latency(50); + rm->set_spec_infer_old_version(true); + rm->set_greedy_schedule(false); + rm->set_equal_schedule(false); + rm->register_output_filepath(file_paths.log_file_path); + + // Create LLM model + FFModel tree_model(ffconfig, ffconfig.cpu_offload); + if (model_metadata.llm_model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(tree_model, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, + TREE_VERIFY_MODE, + generationConfig, + false, + use_full_precision); + } else if (model_metadata.llm_model_type == ModelType::OPT) { + OPT::create_opt_model(tree_model, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, + TREE_VERIFY_MODE, + use_full_precision); + } else if (model_metadata.llm_model_type == ModelType::FALCON) { + FALCON::create_falcon_model(tree_model, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, + TREE_VERIFY_MODE, + use_full_precision); + } else if (model_metadata.llm_model_type == ModelType::MPT) { + MPT::create_mpt_model(tree_model, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, + TREE_VERIFY_MODE, + generationConfig, + 
use_full_precision); + } else { + assert(false && "Invalid LLM model type passed (or no type was passed)."); + } + + // Create SSM models + int num_ssms = model_metadata.ssm_model_types.size(); + std::vector ssm_model_ids; + std::vector ssm_models; + FFConfig bm_config = ffconfig; + std::cout << "SSM TP Degree: " << ssm_tp_degree << std::endl; + // bm_config.data_parallelism_degree = bm_config.tensor_parallelism_degree = + // bm_config.pipeline_parallelism_degree = 1; + bm_config.data_parallelism_degree = 1; + bm_config.tensor_parallelism_degree = ssm_tp_degree; + bm_config.pipeline_parallelism_degree = 1; + for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { + FFModel beam_model(bm_config); + ssm_models.push_back(beam_model); + } + + for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { + FFModel &beam_model = ssm_models[ssm_id]; + if (model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA) { + LLAMA::create_llama_model(beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + TREE_SEARCH_MODE, + generationConfig, + false, + use_full_precision); + } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::OPT) { + OPT::create_opt_model(beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + TREE_SEARCH_MODE, + use_full_precision); + } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::FALCON) { + FALCON::create_falcon_model( + beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + TREE_SEARCH_MODE, + use_full_precision); + } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::MPT) { + MPT::create_mpt_model(beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + TREE_SEARCH_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "Invalid SSM model type passed."); + } + + rm->register_ssm_model(&beam_model); + } + + rm->start_background_server(&tree_model); + + int total_num_requests = 0; + { + // Iterate through eval_entries + std::vector requests; + std::vector timestamps, ratios; + if (partition.contains("num_warmup_requests")) { + num_warmup_requests = partition["num_warmup_requests"]; + } + for (auto &entry : partition["eval_entries"]) { + std::string text = entry["prompt"]; + int max_new_tokens_ = entry["response_length"]; + + bool is_warmup_request = total_num_requests < num_warmup_requests; + double request_delay = + 1000.0 * + (request_per_second > 0 ? (1.0 / (double)request_per_second) : 0); + double emission_time_ms = + is_warmup_request + ? 
0.0 + : (warmup_delay + + request_delay * (total_num_requests - num_warmup_requests)); + + GenerationRequest inference_req(text, // prompt + -1.0, // slo_ratio + emission_time_ms, // emission_time_ms + add_special_tokens); + requests.push_back(inference_req); + timestamps.push_back(emission_time_ms); + ratios.push_back(1.0); + total_num_requests++; + + if (verbose) { + break; + } + } + TraceEmissionMachine emission_machine(timestamps, ratios); + std::vector result = + tree_model.generate(requests, emission_machine); + assert(result.size() == requests.size()); + assert(result.size() == total_num_requests); + assert(result.size() == partition["eval_entries"].size()); + int i = 0; + for (auto &entry : partition["eval_entries"]) { + entry["original_response"] = entry["response"]; + entry["original_response_length"] = entry["response_length"]; + std::string ff_out = result[i].output_text; + int tot_length = result[i].output_text.length(); + entry["response"] = ff_out; + entry["response_length"] = result[i].output_tokens.size(); + entry["specinfer_decoding_steps"] = result[i].decoding_steps; + i++; + } + + // Write the modified JSON to a file + std::ofstream output_file(file_paths.trace_output_path); + if (output_file.is_open()) { + output_file << j.dump(2); + output_file.close(); + std::cout << "Modified JSON has been saved to " + << file_paths.trace_output_path << std::endl; + } else { + std::cerr << "Unable to open file for writing." << std::endl; + } + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + std::string header = + "llm,ssm,partition,expansion_degree,max_tree_depth,max_tree_width,max_" + "requests_per_batch,max_tokens_per_batch,request_per_second,is_warmup_" + "request,request_guid," + "request_step_idx," + "timestamp,speculation_start_timestamp,speculation_end_timestamp,num_" + "speculated_tokens,num_accepted_tokens,num_generated_tokens"; + // csv filepath + // create csv filepath and add header if it doesn't exist + + bool csv_file_exists = std::filesystem::exists(file_paths.csv_file_path); + if (!csv_file_exists) { + // Create new file and write header + std::ofstream file(file_paths.csv_file_path); + if (!file.is_open()) { + std::cerr << "Failed to open file: " << file_paths.csv_file_path + << std::endl; + assert(false); + } + file << header << "\n"; + file.close(); + } + + // Append the new row + std::ofstream file(file_paths.csv_file_path, std::ios::app); + if (!file.is_open()) { + std::cerr << "Failed to open file: " << file_paths.csv_file_path + << std::endl; + } + + std::vector new_profiling_info = rm->get_new_profiling_info(); + for (auto const &info : new_profiling_info) { + file << model_metadata.model_names.llm_model_name + ","; + file << model_metadata.model_names.ssm_model_names[0] + ","; + file << target_partition + ","; + file << std::to_string(expansion_degree) + ","; + file << std::to_string(max_tree_depth) + ","; + file << std::to_string(max_tree_width) + ","; + file << std::to_string(max_requests_per_batch) + ","; + file << std::to_string(max_tokens_per_batch) + ","; + file << std::to_string(request_per_second) + ","; + bool is_warmup_request = + (info.request_guid - 1000000) < num_warmup_requests; + file << std::to_string(is_warmup_request) + ","; + file << info.request_guid << "," << info.request_step_idx << "," + << info.timestamp << "," << info.speculation_start_timestamp << "," + << info.speculation_end_timestamp << "," << info.num_speculated_tokens + << "," << info.num_accepted_tokens << "," 
<< info.num_generated_tokens + << "\n"; + } + file.close(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index f7edfd769..ddf92cbf6 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -18,19 +18,21 @@ #include "models/llama.h" #include "models/mpt.h" #include "models/opt.h" +#include #include -#include +#include #include using namespace FlexFlow; using namespace Legion; using json = nlohmann::json; -LegionRuntime::Logger::Category log_app("llama"); +Legion::Logger log_app("llama"); struct FilePaths { std::string cache_folder_path; std::string prompt_file_path; + std::string trace_file_path; std::string output_file_path; }; @@ -47,7 +49,8 @@ struct ModelMeta { std::string llm_weights_path; std::string llm_model_config_path; - int bos_token_id, eos_token_id; + int bos_token_id; + std::vector eos_token_ids; std::vector ssm_model_types; std::vector ssm_model_config_paths; @@ -60,10 +63,32 @@ void parse_input_args(char **argv, ModelNames &model_names, bool &use_full_precision, bool &verbose, + int &ssm_tp_degree, int &max_requests_per_batch, int &max_tokens_per_batch, + int &max_tokens_per_ssm_batch, + int &max_tokens_per_prefilling_batch, int &max_sequence_length, - int &expansion_degree) { + int &max_output_length, + size_t &max_kv_cache_size, + int &max_tree_width, + int &max_tree_depth, + int &expansion_degree, + bool &spec_sampling, + bool &do_sample, + int &sampling_seed, + bool &streaming_cache, + bool &slo_attainment_early_termination, + double &baseline_latency_ms, + double &ssm_spec_latency_ms, + double &llm_verify_latency_ms, + double &request_per_second, + bool &spec_infer_old_version, + bool &greedy_schedule, + bool &equal_schedule, + std::string &emission_file_path, + bool &add_special_tokens, + bool &eval_overhead_breakdown) { for (int i = 1; i < argc; i++) { // llm model name if (!strcmp(argv[i], "-llm-model")) { @@ -82,6 +107,10 @@ void parse_input_args(char **argv, model_names.ssm_model_names.push_back(ssm_model_name); continue; } + if (!strcmp(argv[i], "-ssm-tp-degree")) { + ssm_tp_degree = std::stoi(argv[++i]); + continue; + } // cache folder if (!strcmp(argv[i], "-cache-folder")) { paths.cache_folder_path = std::string(argv[++i]); @@ -92,6 +121,11 @@ void parse_input_args(char **argv, paths.prompt_file_path = std::string(argv[++i]); continue; } + // traces + if (!strcmp(argv[i], "-trace")) { + paths.trace_file_path = std::string(argv[++i]); + continue; + } // output file if (!strcmp(argv[i], "-output-file")) { paths.output_file_path = std::string(argv[++i]); @@ -114,14 +148,99 @@ void parse_input_args(char **argv, max_tokens_per_batch = std::stoi(argv[++i]); continue; } + if (!strcmp(argv[i], "--max-tokens-per-ssm-batch")) { + max_tokens_per_ssm_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-prefilling-batch")) { + max_tokens_per_prefilling_batch = std::stoi(argv[++i]); + continue; + } if (!strcmp(argv[i], "--max-sequence-length")) { max_sequence_length = std::stoi(argv[++i]); continue; } + if (!strcmp(argv[i], "--max-output-length")) { + max_output_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-kv-cache-size")) { + max_kv_cache_size = std::stoi(argv[++i]); + continue; + } + 
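// --- Editorial sketch (not part of the patch): the prompt JSON layout that
// this driver's -prompt path expects after this change. Element 0 may carry
// an "slo_ratios" map whose values must sum to 1; each following element
// carries a "prompt" string. Field names come from the parsing code added
// later in this file; the concrete keys and values below are illustrative
// only, and <fstream> plus nlohmann/json are assumed to be available (both
// are already used by this file).
//
//   nlohmann::json prompt_json = nlohmann::json::array();
//   prompt_json.push_back({{"slo_ratios", {{"1.0", 0.5}, {"2.0", 0.5}}}});
//   prompt_json.push_back({{"prompt", "What is speculative decoding?"}});
//   std::ofstream("prompts.json") << prompt_json.dump(2);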
if (!strcmp(argv[i], "--max-tree-width")) { + max_tree_width = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tree-depth")) { + max_tree_depth = std::stoi(argv[++i]); + continue; + } if (!strcmp(argv[i], "--expansion-degree")) { expansion_degree = std::stoi(argv[++i]); continue; } + if (!strcmp(argv[i], "--sampling-seed")) { + sampling_seed = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--spec-sampling")) { + spec_sampling = true; + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--enable-streaming-cache")) { + streaming_cache = true; + continue; + } + if (!strcmp(argv[i], "--slo-attainment-early-termination")) { + slo_attainment_early_termination = true; + continue; + } + if (!strcmp(argv[i], "--baseline-latency-ms")) { + baseline_latency_ms = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--ssm-spec-latency-ms")) { + ssm_spec_latency_ms = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--llm-verify-latency-ms")) { + llm_verify_latency_ms = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--request-per-second")) { + request_per_second = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--spec-infer-old-version")) { + spec_infer_old_version = true; + continue; + } + if (!strcmp(argv[i], "--greedy-schedule")) { + greedy_schedule = true; + continue; + } + if (!strcmp(argv[i], "--equal-schedule")) { + equal_schedule = true; + continue; + } + if (!strcmp(argv[i], "--emission-file-path")) { + emission_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--no-special-tokens")) { + add_special_tokens = false; + continue; + } + if (!strcmp(argv[i], "--eval-overhead-breakdown")) { + eval_overhead_breakdown = true; + continue; + } } if (paths.cache_folder_path.empty()) { char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); @@ -173,7 +292,8 @@ void get_model_meta(FilePaths &file_paths, model_metadata.llm_model_type = ModelType::UNKNOWN; auto architectures = llm_model_config["architectures"]; for (auto const &str : architectures) { - if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" || + str == "MistralForCausalLM") { model_metadata.llm_model_type = ModelType::LLAMA; break; } else if (str == "OPTForCausalLM") { @@ -191,10 +311,21 @@ void get_model_meta(FilePaths &file_paths, llm_model_config.find("bos_token_id") == llm_model_config.end() ? -1 : (int)llm_model_config.at("bos_token_id"); - model_metadata.eos_token_id = - llm_model_config.find("eos_token_id") == llm_model_config.end() - ? -1 - : (int)llm_model_config.at("eos_token_id"); + // model_metadata.eos_token_id = + // llm_model_config.find("eos_token_id") == llm_model_config.end() + // ? 
-1 + // : (int)llm_model_config.at("eos_token_id"); + if (llm_model_config.find("eos_token_id") != llm_model_config.end()) { + if (llm_model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : llm_model_config["eos_token_id"]) { + model_metadata.eos_token_ids.push_back(eos_token_id); + } + } else { + model_metadata.eos_token_ids.push_back(llm_model_config["eos_token_id"]); + } + } else { + model_metadata.eos_token_ids.push_back(-1); + } for (auto ssm_model_name : model_metadata.model_names.ssm_model_names) { std::string ssm_config_path = join_path({file_paths.cache_folder_path, @@ -223,7 +354,8 @@ void get_model_meta(FilePaths &file_paths, ModelType ssm_model_type = ModelType::UNKNOWN; auto architectures = ssm_model_config["architectures"]; for (auto const &str : architectures) { - if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" || + str == "MistralForCausalLM") { ssm_model_type = ModelType::LLAMA; break; } else if (str == "OPTForCausalLM") { @@ -241,15 +373,15 @@ void get_model_meta(FilePaths &file_paths, ssm_model_config.find("bos_token_id") == ssm_model_config.end() ? -1 : (int)ssm_model_config.at("bos_token_id"); - int ssm_eos_id = - ssm_model_config.find("eos_token_id") == ssm_model_config.end() - ? -1 - : (int)ssm_model_config.at("eos_token_id"); - if (ssm_bos_id != model_metadata.bos_token_id || - ssm_eos_id != model_metadata.eos_token_id) { - printf("Warning: bos/eos token id mismatch between LLM and one of the " - "SSMs!\n"); - } + // int ssm_eos_id = + // ssm_model_config.find("eos_token_id") == ssm_model_config.end() + // ? -1 + // : (int)ssm_model_config.at("eos_token_id"); + // if (ssm_bos_id != model_metadata.bos_token_id || + // ssm_eos_id != model_metadata.eos_token_id) { + // printf("Warning: bos/eos token id mismatch between LLM and one of the " + // "SSMs!\n"); + // } model_metadata.ssm_model_types.push_back(ssm_model_type); model_metadata.ssm_model_config_paths.push_back(ssm_config_path); model_metadata.ssm_model_weights_paths.push_back(ssm_weights_path); @@ -274,11 +406,34 @@ void FlexFlow::top_level_task(Task const *task, ModelMeta model_metadata; bool use_full_precision = false; bool verbose = false; - int max_requests_per_batch = 16; - int max_tokens_per_batch = 256; - int max_sequence_length = 1024; - int max_spec_tree_token_num = 23; + int ssm_tp_degree = 1; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_tokens_per_ssm_batch = -1; + int max_tokens_per_prefilling_batch = -1; + int max_sequence_length = 512; + int max_output_length = 512; + size_t max_kv_cache_size = 0; // if 0, then use the default value int expansion_degree = 3; + int max_tree_depth = 8; + int max_tree_width = 16; + RequestManager::DecodingMode decoding_mode = + RequestManager::SPECULATIVE_DECODING; + bool spec_sampling = false; + bool do_sample = false; + int sampling_seed = 0; + bool streaming_cache = false; + bool slo_attainment_early_termination = false; + double baseline_latency_ms = 50; + double ssm_spec_latency_ms = 20; + double llm_verify_latency_ms = 50; + double request_per_second = 1.0; + bool spec_infer_old_version = false; + bool greedy_schedule = false; + bool equal_schedule = false; + bool add_special_tokens = true; + bool eval_overhead_breakdown = false; + std::string emission_file_path; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -289,37 +444,85 @@ void FlexFlow::top_level_task(Task const *task, 
model_metadata.model_names, use_full_precision, verbose, + ssm_tp_degree, max_requests_per_batch, max_tokens_per_batch, + max_tokens_per_ssm_batch, + max_tokens_per_prefilling_batch, max_sequence_length, - expansion_degree); + max_output_length, + max_kv_cache_size, + max_tree_width, + max_tree_depth, + expansion_degree, + spec_sampling, + do_sample, + sampling_seed, + streaming_cache, + slo_attainment_early_termination, + baseline_latency_ms, + ssm_spec_latency_ms, + llm_verify_latency_ms, + request_per_second, + spec_infer_old_version, + greedy_schedule, + equal_schedule, + emission_file_path, + add_special_tokens, + eval_overhead_breakdown); + if (max_tokens_per_ssm_batch == -1) { + max_tokens_per_ssm_batch = max_tokens_per_batch; + } + if (max_tokens_per_prefilling_batch == -1) { + max_tokens_per_prefilling_batch = max_tokens_per_batch; + } get_model_meta(file_paths, model_metadata, use_full_precision); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); + assert(ssm_tp_degree >= 1 && + ssm_tp_degree <= ffconfig.numNodes * ffconfig.workersPerNode); + + // Sanity check for SpecInfer old version + if (spec_infer_old_version) { + assert(max_tree_depth <= 8); + assert(max_tree_width >= 3); + // Total verified tokens + assert(max_tokens_per_batch >= max_requests_per_batch * 21); + } // Create SentencePiece tokenizer or OPT tokenizer - GenerationConfig generationConfig; + srand(sampling_seed); + GenerationConfig generationConfig(do_sample, 0.8, 0.6, spec_sampling, 16); InferenceManager *im = InferenceManager::get_inference_manager(); RequestManager *rm = RequestManager::get_request_manager(); rm->set_max_requests_per_batch(max_requests_per_batch); rm->set_max_tokens_per_batch(max_tokens_per_batch); - rm->set_max_spec_tree_token_num(max_spec_tree_token_num); + rm->set_max_tokens_per_ssm_batch(max_tokens_per_ssm_batch); + rm->set_max_tokens_per_prefilling_batch(max_tokens_per_prefilling_batch); rm->set_max_sequence_length(max_sequence_length); + rm->set_max_output_length(max_output_length); + rm->set_max_kv_cache_size(max_kv_cache_size); + rm->set_max_tree_depth(max_tree_depth); + rm->set_max_tree_width(max_tree_width); + rm->set_verbose(verbose); + rm->set_streaming_cache(streaming_cache); rm->register_tokenizer(model_metadata.llm_model_type, model_metadata.bos_token_id, - model_metadata.eos_token_id, + model_metadata.eos_token_ids, model_metadata.llm_tokenizer_path); + rm->set_decoding_mode(decoding_mode); + rm->set_slo_violation_early_termination(slo_attainment_early_termination); + rm->set_baseline_latency(baseline_latency_ms); + rm->set_ssm_spec_latency(ssm_spec_latency_ms); + rm->set_llm_verify_latency(llm_verify_latency_ms); + rm->set_spec_infer_old_version(spec_infer_old_version); + rm->set_greedy_schedule(greedy_schedule); + rm->set_equal_schedule(equal_schedule); rm->register_output_filepath(file_paths.output_file_path); - - // first decoding step: 3 results - if (expansion_degree != -1) { - rm->push_spec_infer_tree_width(1); - rm->push_spec_infer_tree_width(1); - rm->push_spec_infer_tree_width(expansion_degree); - } + rm->set_eval_overhead_breakdown(eval_overhead_breakdown); // Create LLM model FFModel tree_model(ffconfig, ffconfig.cpu_offload); @@ -329,6 +532,7 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.llm_weights_path, TREE_VERIFY_MODE, generationConfig, + false, use_full_precision); } else if (model_metadata.llm_model_type == ModelType::OPT) {
OPT::create_opt_model(tree_model, @@ -358,8 +562,12 @@ void FlexFlow::top_level_task(Task const *task, std::vector ssm_model_ids; std::vector ssm_models; FFConfig bm_config = ffconfig; - bm_config.data_parallelism_degree = bm_config.tensor_parallelism_degree = - bm_config.pipeline_parallelism_degree = 1; + std::cout << "SSM TP Degree: " << ssm_tp_degree << std::endl; + // bm_config.data_parallelism_degree = bm_config.tensor_parallelism_degree = + // bm_config.pipeline_parallelism_degree = 1; + bm_config.data_parallelism_degree = 1; + bm_config.tensor_parallelism_degree = ssm_tp_degree; + bm_config.pipeline_parallelism_degree = 1; for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { FFModel beam_model(bm_config); ssm_models.push_back(beam_model); @@ -371,27 +579,28 @@ void FlexFlow::top_level_task(Task const *task, LLAMA::create_llama_model(beam_model, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], - BEAM_SEARCH_MODE, + TREE_SEARCH_MODE, generationConfig, + streaming_cache, use_full_precision); } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::OPT) { OPT::create_opt_model(beam_model, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], - BEAM_SEARCH_MODE, + TREE_SEARCH_MODE, use_full_precision); } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::FALCON) { FALCON::create_falcon_model( beam_model, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], - BEAM_SEARCH_MODE, + TREE_SEARCH_MODE, use_full_precision); } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::MPT) { MPT::create_mpt_model(beam_model, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], - BEAM_SEARCH_MODE, + TREE_SEARCH_MODE, generationConfig, use_full_precision); } else { @@ -404,25 +613,79 @@ void FlexFlow::top_level_task(Task const *task, rm->start_background_server(&tree_model); // Register requests from prompt file - int total_num_requests = 0; { - using json = nlohmann::json; - std::ifstream file_handle(file_paths.prompt_file_path); - assert(file_handle.good() && "Prompt file does not exist."); - json prompt_json = json::parse(file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - - std::vector prompts; - for (auto &prompt : prompt_json) { - std::string text = prompt.get(); - printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - total_num_requests++; - prompts.push_back(text); - // tree_model.generate(text, 128 /*max_sequence_length*/); - } - tree_model.generate(prompts, 128 /*max_sequence_length*/); + std::vector requests; + std::vector results; + + if (!file_paths.prompt_file_path.empty()) { + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + // Parse slo_ratios + std::vector> slo_ratios; + if (prompt_json[0].contains("slo_ratios")) { + for (auto &[key, value] : prompt_json[0]["slo_ratios"].items()) { + slo_ratios.emplace_back(std::stod(key), value.get()); + } + } + double total = std::accumulate( + slo_ratios.begin(), + slo_ratios.end(), + 0.0, + [](double sum, std::pair const &pair) { + return sum + pair.second; + }); + if (std::abs(total - 1.0) > 1e-6) { + std::cerr << "Error: slo_ratios values do not sum to 1. 
Total sum: " + << total << std::endl; + assert(false); + } + for (size_t i = 1; i < prompt_json.size(); ++i) { + requests.push_back( + GenerationRequest(prompt_json[i]["prompt"].get(), + -1.0, + 0, + add_special_tokens)); + } + PoissonEmissionMachine emission_machine(request_per_second, slo_ratios); + // ConstantEmissionMachine emission_machine(-1, slo_ratios); + results = tree_model.generate(requests, emission_machine); + } else if (!file_paths.trace_file_path.empty()) { + std::ifstream file_handle(file_paths.trace_file_path); + assert(file_handle.good() && "Trace file does not exist."); + json trace_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + std::vector timestamps, ratios; + for (auto const &json_obj : trace_json) { + EmissionTrace trace(json_obj); + requests.push_back( + GenerationRequest(trace.prompt, -1.0, 0, add_special_tokens)); + timestamps.push_back(trace.emission_time_ms); + ratios.push_back(trace.slo_ratio); + } + timestamps.erase(timestamps.begin()); + timestamps.push_back(timestamps.back() + 1000.0); + TraceEmissionMachine emission_machine(timestamps, ratios); + results = tree_model.generate(requests, emission_machine); + } else { + assert(false && "No prompt or trace file provided."); + } + + // output generation results as json + if (!emission_file_path.empty()) { + json output_json; + for (size_t i = 0; i < results.size(); ++i) { + EmissionTrace trace(results[i]); + output_json.push_back(trace.to_json()); + } + std::ofstream emission_file_handle(emission_file_path); + emission_file_handle << output_json.dump(2) << std::endl; + } } // terminate the request manager by stopping the background thread diff --git a/inference/trace_generator/CMakeLists.txt b/inference/trace_generator/CMakeLists.txt new file mode 100644 index 000000000..f18eb712c --- /dev/null +++ b/inference/trace_generator/CMakeLists.txt @@ -0,0 +1,37 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlow_TraceGenerator) +set(project_target trace_generator) + + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + trace_generator.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target} ${CPU_SRC}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target} ${CPU_SRC}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) diff --git a/inference/trace_generator/Makefile b/inference/trace_generator/Makefile new file mode 100644 index 000000000..0e4b79f51 --- /dev/null +++ 
b/inference/trace_generator/Makefile @@ -0,0 +1,37 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= llama_pipeline +# List all the application source files here +ifndef CUDA_HOME +CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) +endif + + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/inference/trace_generator/trace_generator.cc b/inference/trace_generator/trace_generator.cc new file mode 100644 index 000000000..0b9285a0c --- /dev/null +++ b/inference/trace_generator/trace_generator.cc @@ -0,0 +1,558 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/inference.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +struct FilePaths { + std::string cache_folder_path; + std::string log_file_path; + std::string emission_file_path; +}; + +struct Prompts { + std::vector file_paths; + std::vector proportions; + std::vector slo_ratios; + + std::vector jsons; + std::vector idxs; +}; + +struct ModelNames { + std::string llm_model_name; + std::vector ssm_model_names; +}; + +struct ModelMeta { + ModelNames model_names; + + ModelType llm_model_type; + std::string llm_tokenizer_path; + std::string llm_weights_path; + std::string llm_model_config_path; + + int bos_token_id; + std::vector eos_token_ids; + + std::vector ssm_model_types; + std::vector ssm_model_config_paths; + std::vector ssm_model_weights_paths; +}; + +template +std::vector split_by_comma(std::string const &input) { + std::vector result; + std::stringstream ss(input); + std::string item; + while (std::getline(ss, item, ',')) { + std::stringstream item_stream(item); + if constexpr (std::is_same::value) { + double value; + if (item_stream >> value) { + result.push_back(value); + } + } else if constexpr (std::is_same::value) { + result.push_back(item); + } + } + return result; +} + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + Prompts &prompts, + ModelNames &model_names, + bool &use_full_precision, + bool &verbose, + int &max_sequence_length, + int &max_output_length, + size_t &max_kv_cache_size, + double &scaling_factor) { + for (int i = 1; i < argc; i++) { + // llm model name + if (!strcmp(argv[i], "-llm-model")) { + model_names.llm_model_name = std::string(argv[++i]); + for (char &c : model_names.llm_model_name) { + c = std::tolower(c); + } + continue; + } + // ssm models names + if (!strcmp(argv[i], "-ssm-model")) { + std::string ssm_model_name = std::string(argv[++i]); + for (char &c : ssm_model_name) { + c = std::tolower(c); + } + model_names.ssm_model_names.push_back(ssm_model_name); + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "--prompt-files")) { + prompts.file_paths = split_by_comma(std::string(argv[++i])); + continue; + } + if (!strcmp(argv[i], "--prompt-proportions")) { + prompts.proportions = split_by_comma(std::string(argv[++i])); + continue; + } + if (!strcmp(argv[i], "--prompt-slo-ratios")) { + prompts.slo_ratios = split_by_comma(std::string(argv[++i])); + continue; + } + // traces + if (!strcmp(argv[i], "-log")) { + paths.log_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--emission-file-path")) { + paths.emission_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-output-length")) { + max_output_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-kv-cache-size")) { + max_kv_cache_size = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--scaling-factor")) { + scaling_factor = 
std::stod(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void get_model_meta(FilePaths &file_paths, + ModelMeta &model_metadata, + bool use_full_precision) { + if (model_metadata.model_names.llm_model_name.empty() || + model_metadata.model_names.ssm_model_names.size() == 0) { + assert(false && "SpecInfer needs at least one LLM and one SSM for " + "speculative inference"); + } + model_metadata.llm_model_config_path = + join_path({file_paths.cache_folder_path, + "configs", + model_metadata.model_names.llm_model_name, + "config.json"}); + model_metadata.llm_tokenizer_path = + join_path({file_paths.cache_folder_path, + "tokenizers", + model_metadata.model_names.llm_model_name}); + model_metadata.llm_weights_path = + join_path({file_paths.cache_folder_path, + "weights", + model_metadata.model_names.llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + + std::ifstream llm_config_file_handle(model_metadata.llm_model_config_path); + if (!llm_config_file_handle.good()) { + std::cout << "LLM Model config file " + << model_metadata.llm_model_config_path << " not found." + << std::endl; + assert(false); + } + json llm_model_config = json::parse(llm_config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + model_metadata.llm_model_type = ModelType::UNKNOWN; + auto architectures = llm_model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" || + str == "MistralForCausalLM") { + model_metadata.llm_model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_metadata.llm_model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_metadata.llm_model_type = ModelType::FALCON; + break; + } else if (str == "MPTForCausalLM") { + model_metadata.llm_model_type = ModelType::MPT; + break; + } + } + model_metadata.bos_token_id = + llm_model_config.find("bos_token_id") == llm_model_config.end() + ? -1 + : (int)llm_model_config.at("bos_token_id"); + // model_metadata.eos_token_id = + // llm_model_config.find("eos_token_id") == llm_model_config.end() + // ? -1 + // : (int)llm_model_config.at("eos_token_id"); + if (llm_model_config.find("eos_token_id") != llm_model_config.end()) { + if (llm_model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : llm_model_config["eos_token_id"]) { + model_metadata.eos_token_ids.push_back(eos_token_id); + } + } else { + model_metadata.eos_token_ids.push_back(llm_model_config["eos_token_id"]); + } + } else { + model_metadata.eos_token_ids.push_back(-1); + } + + for (auto ssm_model_name : model_metadata.model_names.ssm_model_names) { + std::string ssm_config_path = join_path({file_paths.cache_folder_path, + "configs", + ssm_model_name, + "config.json"}); + std::string ssm_tokenizer_path = + join_path({file_paths.cache_folder_path, "tokenizers", ssm_model_name}); + std::string ssm_weights_path = + join_path({file_paths.cache_folder_path, + "weights", + ssm_model_name, + use_full_precision ? 
"full-precision" : "half-precision"}); + + std::ifstream ssm_config_file_handle(ssm_config_path); + if (!ssm_config_file_handle.good()) { + std::cout << "SSM Model config file " << ssm_config_path << " not found." + << std::endl; + assert(false); + } + json ssm_model_config = json::parse(ssm_config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + ModelType ssm_model_type = ModelType::UNKNOWN; + auto architectures = ssm_model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" || + str == "MistralForCausalLM") { + ssm_model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + ssm_model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM") { + ssm_model_type = ModelType::FALCON; + break; + } else if (str == "MPTForCausalLM") { + ssm_model_type = ModelType::MPT; + break; + } + } + int ssm_bos_id = + ssm_model_config.find("bos_token_id") == ssm_model_config.end() + ? -1 + : (int)ssm_model_config.at("bos_token_id"); + // int ssm_eos_id = + // ssm_model_config.find("eos_token_id") == ssm_model_config.end() + // ? -1 + // : (int)ssm_model_config.at("eos_token_id"); + // if (ssm_bos_id != model_metadata.bos_token_id || + // ssm_eos_id != model_metadata.eos_token_id) { + // printf("Warning: bos/eos token id mismatch between LLM and one of the " + // "SSMs!\n"); + // } + model_metadata.ssm_model_types.push_back(ssm_model_type); + model_metadata.ssm_model_config_paths.push_back(ssm_config_path); + model_metadata.ssm_model_weights_paths.push_back(ssm_weights_path); + } + + assert(model_metadata.llm_model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + for (auto mt : model_metadata.ssm_model_types) { + if (mt == ModelType::UNKNOWN) { + assert(false && "One of the SSM model types passed is invalid."); + } + } +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + FilePaths file_paths; + Prompts prompts; + ModelMeta model_metadata; + bool use_full_precision = false; + bool verbose = false; + int max_sequence_length = 256; + int max_output_length = 512; + size_t max_kv_cache_size = 0; + double scaling_factor = 1.0; + + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_tokens_per_ssm_batch = -1; + int max_tokens_per_prefilling_batch = -1; + int expansion_degree = 3; + int max_tree_depth = 8; + int max_tree_width = 16; + RequestManager::DecodingMode decoding_mode = + RequestManager::SPECULATIVE_DECODING; + bool spec_sampling = false; + bool do_sample = false; + int sampling_seed = 0; + bool streaming_cache = false; + bool slo_attainment_early_termination = false; + double baseline_latency_ms = 50; + double ssm_spec_latency_ms = 20; + double llm_verify_latency_ms = 50; + double request_per_second = 1.0; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + prompts, + model_metadata.model_names, + use_full_precision, + verbose, + max_sequence_length, + max_output_length, + max_kv_cache_size, + scaling_factor); + if (max_tokens_per_ssm_batch == -1) { + max_tokens_per_ssm_batch = max_tokens_per_batch; + } + if (max_tokens_per_prefilling_batch == -1) { + max_tokens_per_prefilling_batch = max_tokens_per_batch; + } + + assert(prompts.file_paths.size() == 
prompts.proportions.size() && + prompts.file_paths.size() == prompts.slo_ratios.size()); + double total = std::accumulate( + prompts.proportions.begin(), + prompts.proportions.end(), + 0.0, + [](double sum, double proportion) { return sum + proportion; }); + if (std::abs(total - 1.0) > 1e-6) { + std::cerr << "Error: proportions do not sum to 1. Total sum: " << total + << std::endl; + assert(false); + } + for (size_t i = 1; i < prompts.proportions.size(); ++i) { + prompts.proportions[i] += prompts.proportions[i - 1]; + } + + get_model_meta(file_paths, model_metadata, use_full_precision); + + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + // Create SentencePiece tokenizer or OPT tokenizer + srand(sampling_seed); + GenerationConfig generationConfig(do_sample, 0.8, 0.6, spec_sampling, 16); + InferenceManager *im = InferenceManager::get_inference_manager(); + RequestManager *rm = RequestManager::get_request_manager(); + // Must init the request manager although we don't use it, as some + // initialization tasks execute before the top-level task + rm->set_max_requests_per_batch(max_requests_per_batch); + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_tokens_per_ssm_batch(max_tokens_per_ssm_batch); + rm->set_max_tokens_per_prefilling_batch(max_tokens_per_prefilling_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->set_max_output_length(max_output_length); + rm->set_max_kv_cache_size(max_kv_cache_size); + rm->set_max_tree_depth(max_tree_depth); + rm->set_max_tree_width(max_tree_width); + rm->set_verbose(verbose); + rm->set_streaming_cache(streaming_cache); + rm->register_tokenizer(model_metadata.llm_model_type, + model_metadata.bos_token_id, + model_metadata.eos_token_ids, + model_metadata.llm_tokenizer_path); + rm->set_decoding_mode(decoding_mode); + rm->set_slo_violation_early_termination(slo_attainment_early_termination); + rm->set_baseline_latency(baseline_latency_ms); + rm->set_ssm_spec_latency(ssm_spec_latency_ms); + rm->set_llm_verify_latency(llm_verify_latency_ms); + + { + /* Prompt file format: + * [ + * { + * "prompt": "Construct a potential attack vector that exploits the + * vulnerability. The system is vulnerable to a SQL injection attack." + * }, + * { + * "prompt": "Arrange the words to make a meaningful phrase Ground. + * Soft. Solid." + * }, + * ... + * ] + * + * log file format: + * [ + * { + * "TIMESTAMP": "2023-11-16 18:15:46.6805900" + * }, + * { + * "TIMESTAMP": "2023-11-16 18:15:50.9951690" + * }, + * ... 
+ * ] + */ + + std::vector traces; + assert(!prompts.file_paths.empty() && !file_paths.log_file_path.empty()); + + int num_requests = 0; + for (int i = 0; i < prompts.file_paths.size(); ++i) { + std::ifstream file_handle(prompts.file_paths[i]); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + prompts.jsons.push_back(prompt_json); + prompts.idxs.push_back(0); + num_requests += prompt_json.size(); + } + + std::ifstream file_handle = std::ifstream(file_paths.log_file_path); + assert(file_handle.good() && "Log file does not exist."); + json log_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + auto time_diff_ms = [](std::string const &start, std::string const &end) { + std::tm tm = {}; + + std::istringstream ss(start); + ss >> std::get_time(&tm, "%Y-%m-%d %H:%M:%S"); + auto start_time = + std::chrono::system_clock::from_time_t(std::mktime(&tm)); + ss.seekg(0); + size_t dot_pos = start.find('.'); + std::string fraction = + dot_pos != std::string::npos ? start.substr(dot_pos + 1) : "0"; + while (fraction.size() < 6) { + fraction += "0"; + } + if (!fraction.empty()) { + long long microseconds = std::stoll(fraction.substr(0, 6)); + start_time += std::chrono::microseconds(microseconds); + } + + ss = std::istringstream(end); + ss >> std::get_time(&tm, "%Y-%m-%d %H:%M:%S"); + auto end_time = std::chrono::system_clock::from_time_t(std::mktime(&tm)); + ss.seekg(0); + dot_pos = end.find('.'); + fraction = dot_pos != std::string::npos ? end.substr(dot_pos + 1) : "0"; + while (fraction.size() < 6) { + fraction += "0"; + } + if (!fraction.empty()) { + long long microseconds = std::stoll(fraction.substr(0, 6)); + end_time += std::chrono::microseconds(microseconds); + } + + return std::chrono::duration_cast(end_time - + start_time) + .count() / + 1000.0; + }; + + num_requests = min((unsigned long)num_requests, log_json.size()); + std::string start_time = log_json[0]["TIMESTAMP"].get(); + srand(time(0)); + for (int i = 0; i < num_requests; ++i) { + // sample from proportions + double sample = (double)rand() / RAND_MAX; + int ptr = 0; + for (size_t j = 0; j < prompts.proportions.size(); ++j) { + if (sample < prompts.proportions[j]) { + ptr = j; + break; + } + } + int &idx = prompts.idxs[ptr]; + std::string prompt = prompts.jsons[ptr][idx]["prompt"].get(); + idx = (idx + 1) % prompts.jsons[ptr].size(); + std::vector input_tokens = rm->tokenize(prompt); + std::string timestamp = log_json[i]["TIMESTAMP"].get(); + EmissionTrace trace(prompt, + input_tokens.size(), + max_output_length, + prompts.slo_ratios[ptr], + time_diff_ms(start_time, timestamp) * scaling_factor); + traces.push_back(trace); + } + + // output generation results as json + assert(!file_paths.emission_file_path.empty()); + json output_json; + for (EmissionTrace const &trace : traces) { + output_json.push_back(trace.to_json()); + } + std::ofstream emission_file_handle(file_paths.emission_file_path); + emission_file_handle << output_json.dump(2) << std::endl; + } + + // float* data + std::cout << "----------trace generated--------------" << std::endl; +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/utils/mem_analysis.py b/inference/utils/mem_analysis.py new file mode 100644 index 000000000..5168e7003 --- /dev/null +++ b/inference/utils/mem_analysis.py @@ -0,0 +1,115 @@ +import pandas as pd 
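Note on the trace_generator inputs above: the prompt files passed via --prompt-files and the request log passed via -log must follow the JSON shapes documented in the format comment of trace_generator.cc (a list of {"prompt": ...} objects and a list of {"TIMESTAMP": "YYYY-MM-DD HH:MM:SS.ffffff"} objects). The following is a minimal Python sketch that writes a compatible pair of files; the file names prompts.json and requests_log.json, and the prompt strings, are illustrative and not part of this patch.

    # make_trace_inputs.py -- illustrative helper, not part of this patch
    import json
    from datetime import datetime, timedelta

    # Prompt file: a JSON array of {"prompt": ...} objects (see the format
    # comment in trace_generator.cc above).
    prompts = [{"prompt": "Construct a potential attack vector."},
               {"prompt": "Arrange the words to make a meaningful phrase."}]
    with open("prompts.json", "w") as f:
        json.dump(prompts, f, indent=2)

    # Log file: a JSON array of {"TIMESTAMP": ...} objects. trace_generator.cc
    # parses "%Y-%m-%d %H:%M:%S" plus an optional fractional-second suffix.
    start = datetime(2023, 11, 16, 18, 15, 46)
    log = [{"TIMESTAMP": (start + timedelta(seconds=4 * i)).strftime("%Y-%m-%d %H:%M:%S.%f")}
           for i in range(len(prompts))]
    with open("requests_log.json", "w") as f:
        json.dump(log, f, indent=2)

Passing these two files with --prompt-files prompts.json --prompt-proportions 1.0 --prompt-slo-ratios 1.0 -log requests_log.json --emission-file-path trace.json (together with the required -llm-model, -ssm-model, and -cache-folder arguments) would exercise the proportion-based sampling path in top_level_task; the flag names are those handled in parse_input_args above.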
+import re, os, math, argparse + +# Usage: +# Run FlexFlow code with --log-instance-creation flag and redirect the output to a file +# python mem_analysis.py --file_path /path/to/log_file.txt + +def extract_data(file_path): + # Define regex patterns + memory_allocator_pattern = re.compile(r'MemoryAllocator.*memory_kind: (\w+).*memory_id: (\w+).*size: (\d+).*capacity (\d+).*task_name: (.+)') + mapper_pattern = re.compile(r'Mapper.*memory_kind: (\w+).*memory_id: (\w+).*size: (\d+).*capacity (\d+).*task: (.+)') + parallel_tensor_pattern = re.compile(r'ParallelTensor.*memory_kind: (\w+).*memory_id: (\w+).*size: (\d+).*capacity (\d+).*task_name: (.+)') + + # Initialize lists to store extracted data + memory_kinds = [] + memory_ids = [] + sizes = [] + capacities = [] + tasks = [] + + # Read the file + with open(file_path, 'r') as file: + for line in file: + if 'MemoryAllocator' in line: + match = memory_allocator_pattern.search(line) + if match: + memory_kinds.append(match.group(1)) + memory_ids.append(match.group(2)) + sizes.append(int(match.group(3))) + capacities.append(int(match.group(4))) + tasks.append(match.group(5)) + elif 'Mapper' in line: + match = mapper_pattern.search(line) + if match: + memory_kinds.append(match.group(1)) + memory_ids.append(match.group(2)) + sizes.append(int(match.group(3))) + capacities.append(int(match.group(4))) + tasks.append(match.group(5)) + elif 'ParallelTensor' in line: + match = parallel_tensor_pattern.search(line) + if match: + memory_kinds.append(match.group(1)) + memory_ids.append(match.group(2)) + sizes.append(int(match.group(3))) + capacities.append(int(match.group(4))) + tasks.append(match.group(5)) + + # Create a DataFrame + df = pd.DataFrame({ + 'Memory Kind': memory_kinds, + 'Device ID': memory_ids, + 'Size': sizes, + 'Capacity': capacities, + 'Task': tasks + }) + + return df + +def human_readable_size(size_bytes): + if size_bytes == 0: + return "0B" + size_name = ("B", "KB", "MB", "GB", "TB") + i = int(math.floor(math.log(size_bytes, 1000))) + p = math.pow(1000, i) + s = round(size_bytes / p, 2) + return f"{s} {size_name[i]}" + +def print_grouped_by_device(df): + grouped_df = df.groupby(['Memory Kind', 'Device ID']).agg({'Size': 'sum', 'Capacity': 'first'}) + # Check that all entries that share the same memory id have the same capacity + for (memory_kind, memory_id), group in df.groupby(['Memory Kind', 'Device ID']): + capacities = group['Capacity'].unique() + if len(capacities) > 1: + print(f"Warning: Device ID {memory_id} in Memory Kind {memory_kind} has multiple capacities: {capacities}") + # Convert sizes to human-readable format + grouped_df['Size'] = grouped_df['Size'].apply(human_readable_size) + grouped_df['Capacity'] = grouped_df['Capacity'].apply(human_readable_size) + print("############## Memory usage (by device) ##############") + print(grouped_df) + +def print_grouped_by_task(df): + # Group by 'Memory Kind', 'Device ID', and 'Task', and sum the 'Size' column + task_grouped_df = df.groupby(['Memory Kind', 'Device ID', 'Task']).agg({'Size': 'sum'}).reset_index() + # Sort the DataFrame by 'Memory Kind', 'Device ID', and 'Size' in descending order + task_grouped_df = task_grouped_df.sort_values(by=['Memory Kind', 'Device ID', 'Size'], ascending=[True, True, False]) + print("\n\n############## Memory usage (by task) ##############") + for (memory_kind, memory_id), group in task_grouped_df.groupby(['Memory Kind', 'Device ID']): + print("\n-------------------------------------------------------------") + print(f"Memory Kind: {memory_kind}, 
Device ID: {memory_id}") + group['Size'] = group['Size'].apply(human_readable_size) + print(group[['Task', 'Size']].to_string(index=False)) + print("-------------------------------------------------------------") + +def print_notes(): + print("\n\n############## Notes ##############") + print("* Check that each GPU retains enough capacity in GPU_FB_MEM to hold the weights from Z_COPY_MEM (total size / tp_degree)") + print("* Check whether the memory usage is balanced across devices") + print("* `set_tensor` generally refers to the memory used to load the model weights") + print() + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Analyze memory usage from a FlexFlow log file.') + parser.add_argument('--file_path', '-fp', type=str, help='Path to the input log file') + args = parser.parse_args() + + # Change working directory to the directory holding the script + # script_dir = os.path.dirname(os.path.abspath(__file__)) + # os.chdir(script_dir) + + df = extract_data(args.file_path) + print_grouped_by_device(df) + print_grouped_by_task(df) + + print_notes() \ No newline at end of file diff --git a/inference/utils/process_prompts.py b/inference/utils/process_prompts.py new file mode 100644 index 000000000..902662191 --- /dev/null +++ b/inference/utils/process_prompts.py @@ -0,0 +1,28 @@ +import json +import argparse + +def read_prompts_from_json(file_path): + with open(file_path, 'r', encoding='utf-8') as file: + data = json.load(file) + return data + +def write_prompts_to_json(file_path, data): + with open(file_path, 'w', encoding='utf-8') as file: + json.dump(data, file, ensure_ascii=False, indent=4) + +def process_prompts(input_file, output_file): + prompts = read_prompts_from_json(input_file) + processed_prompts = [{"prompt": prompt, "slo_ratio": 1.0} for prompt in prompts] + write_prompts_to_json(output_file, processed_prompts) + +def main(): + parser = argparse.ArgumentParser(description="Process prompts JSON file and generate slo_ratio for each prompt.") + parser.add_argument('input_file', type=str, help="Input JSON file containing prompts.") + parser.add_argument('output_file', type=str, help="Output JSON file to save the processed prompts.") + + args = parser.parse_args() + + process_prompts(args.input_file, args.output_file) + +if __name__ == '__main__': + main() diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index 2820cf485..24bb15889 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -40,6 +40,7 @@ "zero_copy_memory_per_node": "-ll:zsize", "num_cpus": "-ll:cpu", "legion_utility_processors": "-ll:util", + "log_instance_creation": "--log-instance-creation", "profiling": "--profiling", "benchmarking": "--benchmarking", "inference_debugging": "--inference-debugging", diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 14cf4eebf..b17f36f72 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -15,6 +15,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals +from dataclasses import dataclass import warnings import numpy as np from .flexflow_logger import fflogger @@ -1241,6 +1242,21 @@ def get_weights(self, ffmodel): assert ret_val == True return np_array +# ----------------------------------------------------------------------- +# RotaryEmbeddingMeta +# ----------------------------------------------------------------------- + + +@dataclass +class RotaryEmbeddingMeta: + 
apply_rotary_embedding: bool = False + rope_theta: float = 10000.0 + rope_type: str = "default" + factor: float = 8.0 + low_freq_factor: float = 1.0 + high_freq_factor: float = 4.0 + original_max_position_embeddings: int = 8192 + # ----------------------------------------------------------------------- # FFModel @@ -2046,10 +2062,10 @@ def add_bias_residual_layer_norm( handles_array[0], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM ), Tensor(handles_array[1], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM) - def sigmoid_silu_multi(self, input1, input2, name=None): + def sigmoid_silu_multi(self, input1, input2, intermediate_size, name=None): c_name = get_c_name(name) handle = ffc().flexflow_model_add_sigmoid_silu_multi( - self.handle, input1.handle, input2.handle, c_name + self.handle, input1.handle, input2.handle, intermediate_size, c_name ) self.add_layer(OpType.SIGMOID_SILU_MULTI, name) return Tensor(handle, owner_op_type=OpType.SIGMOID_SILU_MULTI) @@ -2676,7 +2692,7 @@ def inc_multihead_self_attention( add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -2720,8 +2736,8 @@ def inc_multihead_self_attention( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. :type scaling_query: bool @@ -2756,7 +2772,13 @@ def inc_multihead_self_attention( add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -2779,11 +2801,12 @@ def spec_inc_multihead_self_attention( add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, position_bias=False, + streaming_cache=False, name=None, ): """Defines the MultiHead Attention operation as described in Attention Is All You Need @@ -2823,8 +2846,8 @@ def spec_inc_multihead_self_attention( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. 
:type scaling_query: bool @@ -2859,11 +2882,18 @@ def spec_inc_multihead_self_attention( add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, position_bias, + streaming_cache, c_name, ) self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) @@ -2882,7 +2912,7 @@ def inc_multihead_self_attention_verify( add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -2926,8 +2956,8 @@ def inc_multihead_self_attention_verify( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. :type scaling_query: bool @@ -2962,7 +2992,13 @@ def inc_multihead_self_attention_verify( add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -2972,7 +3008,7 @@ def inc_multihead_self_attention_verify( self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) - def inc_multiquery_self_attention( + def groupquery_self_attention( self, input, embed_dim, @@ -2986,11 +3022,12 @@ def inc_multiquery_self_attention( add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, position_bias=False, + streaming_cache=False, name=None, ): """Defines the multi-query head attention, which allows a different number of Q and KV heads, @@ -3033,8 +3070,8 @@ def inc_multiquery_self_attention( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. 
:type scaling_query: bool @@ -3056,7 +3093,7 @@ def inc_multiquery_self_attention( c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multiquery_self_attention( + handle = ffc().flexflow_model_add_groupquery_self_attention( self.handle, input.handle, embed_dim, @@ -3070,11 +3107,18 @@ def inc_multiquery_self_attention( add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, position_bias, + streaming_cache, c_name, ) self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) @@ -3094,7 +3138,7 @@ def spec_inc_multiquery_self_attention( add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -3141,8 +3185,8 @@ def spec_inc_multiquery_self_attention( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. :type scaling_query: bool @@ -3178,7 +3222,13 @@ def spec_inc_multiquery_self_attention( add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -3202,7 +3252,7 @@ def inc_multiquery_self_attention_verify( add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -3249,8 +3299,8 @@ def inc_multiquery_self_attention_verify( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. 
:type scaling_query: bool @@ -3286,7 +3336,13 @@ def inc_multiquery_self_attention_verify( add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -4208,6 +4264,14 @@ def set_max_sequence_length(self, max_length): return ffc().flexflow_request_manager_set_max_sequence_length( self.handle, max_length) + def set_max_output_length(self, max_length): + return ffc().flexflow_request_manager_set_max_output_length( + self.handle, max_length) + + def set_max_kv_cache_size(self, max_size): + return ffc().flexflow_request_manager_set_max_kv_cache_size( + self.handle, max_size) + def start_server(self, model): return ffc().flexflow_request_manager_start_background_server( self.handle, model.handle @@ -4257,7 +4321,7 @@ def __init__( num_q_heads, num_kv_heads, hidden_dim, - qkv_inner_dim, + head_dim, tensor_parallelism_degree, use_full_precision ): @@ -4267,7 +4331,7 @@ def __init__( num_q_heads, num_kv_heads, hidden_dim, - qkv_inner_dim, + head_dim, tensor_parallelism_degree, use_full_precision ) diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 5af077273..df630462a 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -200,7 +200,7 @@ def init( if configs_dict.get("offload", None) is None: configs_dict["offload"] = False if configs_dict.get("offload_reserve_space_size", None) is None: - configs_dict["offload_reserve_space_size"] = 1024**2 + configs_dict["offload_reserve_space_size"] = 8 * 1024**3 if configs_dict.get("use_4bit_quantization", None) is None: configs_dict["use_4bit_quantization"] = False if configs_dict.get("use_8bit_quantization", None) is None: diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 7a55da26e..ab3bc4623 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -41,6 +41,17 @@ def __init__(self, hf_config): ) self.parallel_attn = hf_config.parallel_attn self.vocab_size = hf_config.vocab_size + self.rotary_embedding_meta = RotaryEmbeddingMeta( + apply_rotary_embedding=True, + rope_theta=hf_config.rope_theta if "rope_theta" in hf_config.__dict__ else 10000.0, + ) + if "rope_scaling" in hf_config.__dict__: + if hf_config.rope_scaling is not None: + self.rotary_embedding_meta.rope_type = hf_config.rope_scaling["rope_type"] + self.rotary_embedding_meta.factor = hf_config.rope_scaling["factor"] + self.rotary_embedding_meta.low_freq_factor = hf_config.rope_scaling["low_freq_factor"] + self.rotary_embedding_meta.high_freq_factor = hf_config.rope_scaling["high_freq_factor"] + self.rotary_embedding_meta.original_max_position_embeddings = hf_config.rope_scaling["original_max_position_embeddings"] # Standardized FlexFlow num heads fields below self.num_attention_heads = self.n_head self.num_key_value_heads = self.n_head_kv @@ -54,8 +65,6 @@ def __init__( ffconfig, hf_config, data_type, - # max_batch_size=1, - # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -63,11 +72,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size 
self.data_type = data_type self.falcon_config = FalconConfig(hf_config) - # self.falcon_config.max_seq_length = max_seq_length - # self.falcon_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -124,7 +130,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.falcon_config.layer_norm_epsilon, - name=f"layers_{i}_input_layernorm", + name=f"layers.{i}.input_layernorm", ) else: token, att_norm = ffmodel.residual_layer_norm( @@ -135,7 +141,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.falcon_config.layer_norm_epsilon, - name=f"layers_{i}_input_layernorm", + name=f"layers.{i}.input_layernorm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -152,8 +158,8 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding - name=f"layers_{i}_attention", + self.falcon_config.rotary_embedding_meta, + name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -169,11 +175,11 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding - name=f"layers_{i}_attention", + self.falcon_config.rotary_embedding_meta, + name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.INC_DECODING_MODE: - mha = ffmodel.inc_multiquery_self_attention( + mha = ffmodel.groupquery_self_attention( att_norm, self.falcon_config.hidden_size, self.falcon_config.n_head, @@ -186,8 +192,8 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding - name=f"layers_{i}_attention", + self.falcon_config.rotary_embedding_meta, + name=f"layers.{i}.self_attention", ) else: assert False @@ -197,7 +203,7 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.hidden_size * 4, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_h_to_4h", + name=f"layers.{i}.mlp.dense_h_to_4h", ) dense_h_to_4h = ffmodel.gelu(dense_h_to_4h) mlp_output = ffmodel.dense( @@ -205,7 +211,7 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_4h_to_h", + name=f"layers.{i}.mlp.dense_4h_to_h", ) _, ln_f = ffmodel.residual_layer_norm( @@ -243,6 +249,13 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel + # TODO: finish this + def convert_hf_weight_name(name): + return (name.replace("transformer.h.", "layers.") + .replace("transformer.", "") + .replace("self_attention.dense", "self_attention.o_proj") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) n_head = ( @@ -258,10 +271,10 @@ def convert_hf_model(model, dst_folder): .replace("self_attention_dense", "attention_wo") ) # Split Q,K,V attention weights - if "self_attention_query_key_value" in name: - name_q = name.replace("self_attention_query_key_value", "attention_wq") - name_k = name.replace("self_attention_query_key_value", "attention_wk") - name_v = name.replace("self_attention_query_key_value", "attention_wv") + if "self_attention.query_key_value" in name: + name_q = name.replace("self_attention.query_key_value", "self_attention.q_proj") + name_k = name.replace("self_attention.query_key_value", "self_attention.k_proj") + 
name_v = name.replace("self_attention.query_key_value", "self_attention.v_proj") q, k, v = torch.split( params, [ @@ -278,5 +291,5 @@ def convert_hf_model(model, dst_folder): params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) # LM head weight model.lm_head.weight.detach().cpu().numpy().tofile( - os.path.join(dst_folder, "lm_head_weight") + os.path.join(dst_folder, "lm_head.weight") ) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 6b33030f6..e58ed57bc 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -19,8 +19,6 @@ class LLAMAConfig: def __init__(self, hf_config): - # self.max_seq_len = 256 - # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.max_spec_tree_token_num = 20 @@ -29,6 +27,17 @@ def __init__(self, hf_config): self.hidden_size = hf_config.hidden_size self.rms_norm_eps = hf_config.rms_norm_eps self.intermediate_size = hf_config.intermediate_size + self.rotary_embedding_meta = RotaryEmbeddingMeta( + apply_rotary_embedding=True, + rope_theta=hf_config.rope_theta if "rope_theta" in hf_config.__dict__ else 10000.0, + ) + if "rope_scaling" in hf_config.__dict__: + if hf_config.rope_scaling is not None: + self.rotary_embedding_meta.rope_type = hf_config.rope_scaling["rope_type"] + self.rotary_embedding_meta.factor = hf_config.rope_scaling["factor"] + self.rotary_embedding_meta.low_freq_factor = hf_config.rope_scaling["low_freq_factor"] + self.rotary_embedding_meta.high_freq_factor = hf_config.rope_scaling["high_freq_factor"] + self.rotary_embedding_meta.original_max_position_embeddings = hf_config.rope_scaling["original_max_position_embeddings"] # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.num_attention_heads self.num_key_value_heads = ( @@ -55,11 +64,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.llama_config = LLAMAConfig(hf_config) - # self.llama_config.max_seq_length = max_seq_length - # self.llama_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -106,7 +112,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="tok_embeddings", + name="embed_tokens", ) for i in range(self.llama_config.num_hidden_layers): @@ -117,7 +123,7 @@ def build_model(self, max_tokens_per_batch): token, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_attention_norm", + name=f"layers.{i}.input_layernorm", ) else: token, attn_norm = ffmodel.residual_rms_norm( @@ -125,7 +131,7 @@ def build_model(self, max_tokens_per_batch): w2, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_attention_norm", + name=f"layers.{i}.input_layernorm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -144,8 +150,8 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding - name=f"layers_{i}_attention", + self.llama_config.rotary_embedding_meta, + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -163,11 +169,11 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel 
initializer - True, # apply_rotary_embedding - name=f"layers_{i}_attention", + self.llama_config.rotary_embedding_meta, + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: - mha = ffmodel.inc_multiquery_self_attention( + mha = ffmodel.groupquery_self_attention( attn_norm, self.llama_config.hidden_size, self.llama_config.num_attention_heads, @@ -182,8 +188,8 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding - name=f"layers_{i}_attention", + self.llama_config.rotary_embedding_meta, + name=f"layers.{i}.self_attn", ) else: assert False @@ -193,29 +199,29 @@ def build_model(self, max_tokens_per_batch): mha, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_ffn_norm", + name=f"layers.{i}.post_attention_layernorm", ) w1 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w1", + name=f"layers.{i}.mlp.gate_proj", ) w3 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w3", + name=f"layers.{i}.mlp.up_proj", ) - multi = ffmodel.sigmoid_silu_multi(w1, w3) + multi = ffmodel.sigmoid_silu_multi(w1, w3, self.llama_config.intermediate_size) w2 = ffmodel.dense( multi, self.llama_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w2", + name=f"layers.{i}.mlp.down_proj", ) _, token = ffmodel.residual_rms_norm( @@ -230,7 +236,7 @@ def build_model(self, max_tokens_per_batch): self.llama_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="output", + name="lm_head", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -250,6 +256,9 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel + def convert_hf_weight_name(name): + return name.replace("model.", "") + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): @@ -271,3 +280,7 @@ def convert_hf_model(model, dst_folder): .replace("model_", "") ) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") + # LM head weight + model.lm_head.weight.detach().cpu().numpy().tofile( + os.path.join(dst_folder, "output_weight") + ) diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 92867fd49..a68bbd2a0 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -19,8 +19,6 @@ class MPTConfig: def __init__(self, hf_config): - # self.max_seq_len = 256 - # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.max_spec_tree_token_num = 20 @@ -28,6 +26,7 @@ def __init__(self, hf_config): self.n_heads = hf_config.n_heads self.n_layers = hf_config.n_layers self.vocab_size = hf_config.vocab_size + self.rotary_embedding_meta = RotaryEmbeddingMeta(apply_rotary_embedding=False) # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.n_heads self.num_key_value_heads = hf_config.n_heads @@ -50,11 +49,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.mpt_config = MPTConfig(hf_config) - # self.mpt_config.max_seq_length = max_seq_length - # self.mpt_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 
2**31 - 1 @@ -97,7 +93,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wte", + name="wte", ) axes = [ @@ -114,7 +110,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_1", + name=f"layers.{i}.norm_1", ) else: hidden_states, layernorm_output = ffmodel.residual_layer_norm( @@ -126,7 +122,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_1", + name=f"layers.{i}.norm_1", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -142,13 +138,13 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.mpt_config.rotary_embedding_meta, True, # scaling_query (self.mpt_config.hidden_size / self.mpt_config.n_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: attn_outputs = ffmodel.inc_multihead_self_attention_verify( @@ -163,13 +159,13 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.mpt_config.rotary_embedding_meta, True, # scaling_query (self.mpt_config.hidden_size / self.mpt_config.n_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: attn_outputs = ffmodel.inc_multihead_self_attention( @@ -184,13 +180,13 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.mpt_config.rotary_embedding_meta, True, # scaling_query (self.mpt_config.hidden_size / self.mpt_config.n_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) else: assert False @@ -204,7 +200,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_2", + name=f"layers.{i}.norm_2", ) # mlp layernorm_output = ffmodel.dense( @@ -212,7 +208,7 @@ def build_model(self, max_tokens_per_batch): 4 * self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_up_proj", + name=f"layers.{i}.ffn.up_proj", ) layernorm_output = ffmodel.gelu(layernorm_output) intermediate_output = ffmodel.dense( @@ -220,7 +216,7 @@ def build_model(self, max_tokens_per_batch): self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_down_proj", + name=f"layers.{i}.ffn.down_proj", ) _, all_final_norm = ffmodel.residual_layer_norm( @@ -232,7 +228,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"transformer_norm_f", + name=f"norm_f", ) lm_head = ffmodel.dense( all_final_norm, @@ -253,14 +249,22 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel + # TODO: finish this + def convert_hf_weight_name(name): + return ( + name.replace("transformer.blocks.", "layers.") + .replace("transformer.", "") + .replace("attn.out_proj", "attn.o_proj") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): name = name.replace("transformer.blocks.", "layers.").replace(".", "_") if "Wqkv" in name: - name_q = name.replace("attn_Wqkv", 
"attention_wq") - name_k = name.replace("attn_Wqkv", "attention_wk") - name_v = name.replace("attn_Wqkv", "attention_wv") + name_q = name.replace("attn.Wqkv", "attn.q_proj") + name_k = name.replace("attn.Wqkv", "attn.k_proj") + name_v = name.replace("attn.Wqkv", "attn.v_proj") q, k, v = torch.split( params, [ @@ -280,6 +284,6 @@ def convert_hf_model(model, dst_folder): params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) shutil.copy( - os.path.join(dst_folder, "transformer_wte_weight"), - os.path.join(dst_folder, "lm_head_weight"), + os.path.join(dst_folder, "wte.weight"), + os.path.join(dst_folder, "lm_head.weight"), ) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index b715f5f35..abf88b784 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -34,6 +34,7 @@ def __init__(self, hf_config): self.num_hidden_layers = hf_config.num_hidden_layers self.vocab_size = hf_config.vocab_size self.word_embed_proj_dim = hf_config.word_embed_proj_dim + self.rotary_embedding_meta = RotaryEmbeddingMeta(apply_rotary_embedding=False) # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.num_attention_heads self.num_key_value_heads = hf_config.num_attention_heads @@ -47,8 +48,6 @@ def __init__( ffconfig, hf_config, data_type, - # max_batch_size=1, - # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -56,11 +55,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.opt_config = OPTConfig(hf_config) - # self.opt_config.max_seq_length = max_seq_length - # self.opt_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -139,7 +135,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_attention_layer_norm", + name=f"layers.{i}.self_attn_layer_norm", ) else: hidden_states = ffmodel.add(token, positional_embedding) @@ -158,12 +154,12 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.opt_config.rotary_embedding_meta, True, # scaling_query (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multihead_self_attention_verify( @@ -178,12 +174,12 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.opt_config.rotary_embedding_meta, True, # scaling_query (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multihead_self_attention( @@ -198,12 +194,12 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.opt_config.rotary_embedding_meta, True, # scaling_query (self.opt_config.hidden_size / 
self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) else: assert False @@ -215,7 +211,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_add_bias_residual_layer_norm", + name=f"layers.{i}.add_bias_residual_layer_norm", ) if not self.opt_config.do_layer_norm_before: @@ -226,14 +222,14 @@ def build_model(self, max_tokens_per_batch): self.opt_config.ffn_dim, ActiMode.AC_MODE_RELU, True, - name=f"layers_{i}_fc1", + name=f"layers.{i}.fc1", ) fc2 = ffmodel.dense( fc1, self.opt_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_fc2", + name=f"layers.{i}.fc2", ) if not self.opt_config.do_layer_norm_before: @@ -245,7 +241,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_final_layer_norm", + name=f"layers.{i}.final_layer_norm", ) _, all_final_norm = ffmodel.residual_layer_norm( @@ -263,7 +259,7 @@ def build_model(self, max_tokens_per_batch): self.opt_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="embed_tokens_weight_lm_head", + name="lm_head", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -283,6 +279,17 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel + def convert_hf_weight_name(name): + return ( + name.replace("decoder.", "") + .replace("model.", "") + .replace("self_attn.out_proj", "self_attn.o_proj") + .replace("self_attn.o_proj.bias", "add_bias_residual_layer_norm.attn_bias") + .replace( + ".final_layer_norm", ".add_bias_residual_layer_norm" + ) # important to use the leading "_" to avoid matching the last LayerNorm + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): @@ -303,6 +310,6 @@ def convert_hf_model(model, dst_folder): params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") # copy embedding weights shutil.copy( - os.path.join(dst_folder, "embed_tokens_weight"), - os.path.join(dst_folder, "embed_tokens_weight_lm_head"), + os.path.join(dst_folder, "embed_tokens.weight"), + os.path.join(dst_folder, "lm_head.weight"), ) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 37edaa4c4..762ad24c4 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -19,8 +19,6 @@ class STARCODERConfig: def __init__(self, hf_config): - # self.max_seq_len = 256 - # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.max_spec_tree_token_num = 20 @@ -32,6 +30,7 @@ def __init__(self, hf_config): self.vocab_size = hf_config.vocab_size self.intermediate_size = hf_config.n_inner self.n_head_kv = 1 if hf_config.multi_query else hf_config.n_head + self.rotary_embedding_meta = RotaryEmbeddingMeta(apply_rotary_embedding=False) # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.n_head self.num_key_value_heads = self.n_head_kv @@ -45,8 +44,6 @@ def __init__( ffconfig, hf_config, data_type, - # max_batch_size=1, - # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -54,11 +51,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.starcoder_config = STARCODERConfig(hf_config) - # 
self.starcoder_config.max_seq_length = max_seq_length - # self.starcoder_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -111,7 +105,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wte", + name="wte", ) positional_embedding = ffmodel.embedding( position_tensor, @@ -121,7 +115,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wpe", + name="wpe", ) axes = [ @@ -139,11 +133,11 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_1", + name=f"layers.{i}.ln_1", ) assert self.mode == InferenceMode.INC_DECODING_MODE - mha = ffmodel.inc_multiquery_self_attention( + mha = ffmodel.groupquery_self_attention( ln_1, self.starcoder_config.hidden_size, self.starcoder_config.num_attention_heads, @@ -158,8 +152,8 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding - name=f"layers_{i}_attention", + self.starcoder_config.rotary_embedding_meta, + name=f"layers.{i}.attn.c_attn", ) residual, l2_norm = ffmodel.residual_layer_norm( @@ -171,7 +165,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_2", + name=f"layers.{i}.ln_2", ) # mlp @@ -181,7 +175,7 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.intermediate_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_fc", + name=f"layers.{i}.mlp.c_fc", ) activation = ffmodel.gelu(c_fc, False) c_proj = ffmodel.dense( @@ -189,7 +183,7 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_proj", + name=f"layers.{i}.mlp.c_proj", ) _, ln_f = ffmodel.residual_layer_norm( @@ -200,7 +194,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"transformer_ln_f", + name=f"ln_f", ) lm_head = ffmodel.dense( ln_f, @@ -224,11 +218,11 @@ def build_model(self, max_tokens_per_batch): def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = name.replace("transformer.h", "layers").replace(".", "_") - if "c_attn_weight" in name: - name_q = name.replace("attn_c_attn", "attention_wq") - name_k = name.replace("attn_c_attn", "attention_wk") - name_v = name.replace("attn_c_attn", "attention_wv") + name = name.replace("transformer.h", "layers").replace("transformer", "") + if "attn.c_attn.weight" in name: + name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") + name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj") + name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj") q, k, v = torch.split( params, [ @@ -241,10 +235,10 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "c_attn_bias" in name: - name_q = name.replace("attn_c_attn", "attention_wq") - name_k = name.replace("attn_c_attn", "attention_wk") - name_v = name.replace("attn_c_attn", "attention_wv") + elif "attn.c_attn.bias" in name: + name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") + name_k = 
name.replace("attn.c_attn", "attn.c_attn.k_proj") + name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj") q, k, v = torch.split( params, [ @@ -257,14 +251,14 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "c_proj_bias" in name: - name = name.replace("attn_c_proj", "attention_wo") + elif "attn.c_proj.bias" in name: + name = name.replace("attn.c_proj", "attn.c_attn.o_proj") params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) - elif "c_proj_weight" in name: - name = name.replace("attn_c_proj", "attention_wo") + elif "attn.c_proj.weight" in name: + name = name.replace("attn.c_proj", "attn.c_attn.o_proj") params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) else: params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) model.lm_head.weight.detach().cpu().numpy().tofile( - os.path.join(dst_folder, "lm_head_weight") + os.path.join(dst_folder, "lm_head.weight") ) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index ac622b333..37606e875 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -27,11 +27,11 @@ MPTConfig, ) from flexflow.core import * -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer +from transformers import AutoConfig, AutoModelForCausalLM from huggingface_hub import HfApi import sys, torch, shutil, hashlib from typing import Union, List - +from huggingface_hub import snapshot_download class GenerationConfig: """A class to store the sampling configs.""" @@ -95,6 +95,7 @@ def __init__( self.supported_models = { "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), + "MistralForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT, OPTConfig), "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), "FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), @@ -219,7 +220,13 @@ def download_hf_weights_if_needed(self): ) # Download model from HuggingFace, or load it from the local folder hf_model = AutoModelForCausalLM.from_pretrained( - self.model_name, trust_remote_code=True + self.model_name, + trust_remote_code=True, + torch_dtype=( + torch.float32 + if self.data_type == DataType.DT_FLOAT + else torch.float16 + ), ) # Print log message to notify user download of model has finished if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): @@ -261,29 +268,21 @@ def download_hf_tokenizer_if_needed(self): ) if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model - print( - f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..." - ) - else: - # Remote model - print( - f"'{self.model_name}' local tokenizer was updated! Saving new tokenizer now..." - ) - # Download tokenizer from HuggingFace, or load it from the local folder - if self.model_type == ModelType.LLAMA: - hf_tokenizer = LlamaTokenizer.from_pretrained( - self.model_name, use_fast=True - ) + print( + f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..." 
+ ) + # Load/download the tokenizer files + target_tokenizer_files = ["tokenizer.json", "tokenizer_config.json", "special_tokens_map.json", "vocab.json", "merges.txt", "tokenizer.model"] + if os.path.exists(self.model_name): + hf_tokenizer_path = self.model_name else: - hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name) - # Print log message to notify user download of tokenizer has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading tokenizer. Saving it now...") - # Save tokenizer - hf_tokenizer.save_pretrained(self.tokenizer_path) - print("Done saving HF tokenizer.") + hf_tokenizer_path = snapshot_download(repo_id=self.model_name, allow_patterns=target_tokenizer_files) + for file in target_tokenizer_files: + src_path = os.path.join(hf_tokenizer_path, file) + dst_path = os.path.join(self.tokenizer_path, file) + if os.path.exists(src_path): + shutil.copy(src_path, dst_path) + print("Done updating HF tokenizer.") # Save new revision hash to file with open(ff_revision_file, "w+") as f: f.write(latest_revision) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 5714c8fe3..8b3403653 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -18,6 +18,7 @@ #include "flexflow/mapper.h" #include "flexflow/request_manager.h" #include "flexflow/utils/file_loader.h" +#include using namespace Legion; using namespace FlexFlow; @@ -59,10 +60,9 @@ class FFCObjectWrapper { FF_NEW_OPAQUE_WRAPPER(flexflow_single_dataloader_t, SingleDataLoader *); // inference FF_NEW_OPAQUE_WRAPPER(flexflow_batch_config_t, BatchConfig *); - FF_NEW_OPAQUE_WRAPPER(flexflow_tree_verify_batch_config_t, - TreeVerifyBatchConfig *); - FF_NEW_OPAQUE_WRAPPER(flexflow_beam_search_batch_config_t, - BeamSearchBatchConfig *); + // FF_NEW_OPAQUE_WRAPPER(flexflow_tree_verify_batch_config_t, BatchConfig + // *); FF_NEW_OPAQUE_WRAPPER(flexflow_beam_search_batch_config_t, + // BatchConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_inference_manager_t, InferenceManager *); FF_NEW_OPAQUE_WRAPPER(flexflow_request_manager_t, RequestManager *); FF_NEW_OPAQUE_WRAPPER(flexflow_file_data_loader_t, FileDataLoader *); @@ -253,56 +253,56 @@ void flexflow_model_zero_gradients(flexflow_model_t handle_) { } flexflow_tensor_t flexflow_model_add_exp(flexflow_model_t handle_, - const flexflow_tensor_t x_, + flexflow_tensor_t const x_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); Tensor tensor = handle->exp(x, name); DEBUG_PRINT("[Exp] new Tensor %p, x %p, name %s", tensor, x, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_sin(flexflow_model_t handle_, - const flexflow_tensor_t x_, + flexflow_tensor_t const x_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); Tensor tensor = handle->sin(x, name); DEBUG_PRINT("[Sin] new Tensor %p, x %p, name %s", tensor, x, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_cos(flexflow_model_t handle_, - const flexflow_tensor_t x_, + flexflow_tensor_t const x_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); Tensor tensor = handle->cos(x, name); DEBUG_PRINT("[Cos] new Tensor %p, x 
%p, name %s", tensor, x, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_add(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->add(x, y, inplace_a, name); DEBUG_PRINT("[Add] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_subtract(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->subtract(x, y, inplace_a, name); DEBUG_PRINT( "[Subtract] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name); @@ -310,13 +310,13 @@ flexflow_tensor_t flexflow_model_add_subtract(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_multiply(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->multiply(x, y, inplace_a, name); DEBUG_PRINT( "[Multiply] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name); @@ -324,13 +324,13 @@ flexflow_tensor_t flexflow_model_add_multiply(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_divide(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->divide(x, y, inplace_a, name); DEBUG_PRINT( "[Divide] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name); @@ -338,33 +338,33 @@ flexflow_tensor_t flexflow_model_add_divide(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_max(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->max(x, y, inplace_a, name); DEBUG_PRINT("[Max] new Tensor %p, x %p, y %p, name %s", tensor, 
x, y, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_min(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->min(x, y, inplace_a, name); DEBUG_PRINT("[Min] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_reduce_sum(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int *axes, int n, bool keepdims, @@ -385,21 +385,21 @@ flexflow_tensor_t flexflow_model_add_reduce_sum(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_rsqrt(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); + Tensor const input = FFCObjectWrapper::unwrap(input_); Tensor tensor = handle->rsqrt(input, name); DEBUG_PRINT("[Rsqrt] new Tensor %p, input %p, name %s", tensor, input, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_pow(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float const exponent, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); + Tensor const input = FFCObjectWrapper::unwrap(input_); Tensor tensor = handle->pow(input, exponent, name); DEBUG_PRINT("[Pow] new Tensor %p, input %p, exponent %f, name %s", tensor, @@ -410,13 +410,13 @@ flexflow_tensor_t flexflow_model_add_pow(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_mean(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int *dims, int n, bool keepdims, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); + Tensor const input = FFCObjectWrapper::unwrap(input_); std::vector dims_vec; char cbuffer[256]; char *cbuffer_ptr = cbuffer; @@ -441,7 +441,7 @@ flexflow_tensor_t flexflow_model_add_mean(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_conv2d(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int out_channels, int kernel_h, int kernel_w, @@ -457,7 +457,7 @@ flexflow_tensor_t flexflow_initializer_t bias_initializer_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap_const(input_); + Tensor const input = FFCObjectWrapper::unwrap_const(input_); Layer *shared_op = FFCObjectWrapper::unwrap(shared_op_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); @@ -505,7 +505,7 @@ flexflow_tensor_t flexflow_tensor_t flexflow_model_add_embedding(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int num_entries, int out_dim, enum AggrMode aggr, @@ -514,7 +514,7 @@ flexflow_tensor_t flexflow_initializer_t kernel_initializer_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const 
Tensor input = FFCObjectWrapper::unwrap_const(input_); + Tensor const input = FFCObjectWrapper::unwrap_const(input_); Layer *shared_op = FFCObjectWrapper::unwrap(shared_op_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); @@ -588,7 +588,7 @@ flexflow_tensor_t } flexflow_tensor_t flexflow_model_add_batch_norm(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, bool relu, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -608,7 +608,7 @@ flexflow_tensor_t flexflow_model_add_batch_norm(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int n, int *axes, bool elementwise_affine, @@ -616,7 +616,7 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, bool use_bias, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); + Tensor const input = FFCObjectWrapper::unwrap(input_); std::vector axes_vec; for (int i = 0; i < n; i++) { axes_vec.push_back(axes[i]); @@ -640,9 +640,9 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, flexflow_tensor_t * flexflow_model_add_residual_layer_norm(flexflow_model_t handle_, - const flexflow_tensor_t input_, - const flexflow_tensor_t residual1_, - const flexflow_tensor_t residual2_, + flexflow_tensor_t const input_, + flexflow_tensor_t const residual1_, + flexflow_tensor_t const residual2_, bool use_two_residuals, int n, int *axes, @@ -651,9 +651,9 @@ flexflow_tensor_t * bool use_bias, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); - const Tensor residual1 = FFCObjectWrapper::unwrap(residual1_); - const Tensor residual2 = + Tensor const input = FFCObjectWrapper::unwrap(input_); + Tensor const residual1 = FFCObjectWrapper::unwrap(residual1_); + Tensor const residual2 = use_two_residuals ? 
FFCObjectWrapper::unwrap(residual2_) : nullptr; Tensor tensor_outputs[2]; std::vector axes_vec; @@ -699,8 +699,8 @@ flexflow_tensor_t * flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( flexflow_model_t handle_, - const flexflow_tensor_t input_, - const flexflow_tensor_t residual_, + flexflow_tensor_t const input_, + flexflow_tensor_t const residual_, int n, int *axes, bool elementwise_affine, @@ -708,8 +708,8 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( bool use_bias, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); - const Tensor residual = FFCObjectWrapper::unwrap(residual_); + Tensor const input = FFCObjectWrapper::unwrap(input_); + Tensor const residual = FFCObjectWrapper::unwrap(residual_); Tensor tensor_outputs[2]; std::vector axes_vec; for (int i = 0; i < n; i++) { @@ -746,14 +746,15 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( flexflow_tensor_t flexflow_model_add_sigmoid_silu_multi(flexflow_model_t handle_, - const flexflow_tensor_t input1_, - const flexflow_tensor_t input2_, + flexflow_tensor_t const input1_, + flexflow_tensor_t const input2_, + int intermediate_size, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input1 = FFCObjectWrapper::unwrap(input1_); - const Tensor input2 = FFCObjectWrapper::unwrap(input2_); - Tensor tensor = - handle->sigmoid_silu_multi(input1, input2, input1->data_type, name); + Tensor const input1 = FFCObjectWrapper::unwrap(input1_); + Tensor const input2 = FFCObjectWrapper::unwrap(input2_); + Tensor tensor = handle->sigmoid_silu_multi( + input1, input2, intermediate_size, input1->data_type, name); DEBUG_PRINT("[SigmoidSiluMulti] new Tensor %p, input1 %p, input2 %p, name %s", tensor, input1, @@ -763,8 +764,8 @@ flexflow_tensor_t } flexflow_tensor_t flexflow_model_add_batch_matmul(flexflow_model_t handle_, - const flexflow_tensor_t a_, - const flexflow_tensor_t b_, + flexflow_tensor_t const a_, + flexflow_tensor_t const b_, int a_seq_length_dim, int b_seq_length_dim) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -778,7 +779,7 @@ flexflow_tensor_t flexflow_model_add_batch_matmul(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_dense( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int out_dim, enum ActiMode activation /* AC_MODE_NONE */, bool use_bias /* true */, @@ -790,7 +791,7 @@ flexflow_tensor_t flexflow_model_add_dense( float kernel_reg_lambda, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap_const(input_); + Tensor const input = FFCObjectWrapper::unwrap_const(input_); Layer *shared_op = FFCObjectWrapper::unwrap(shared_op_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); @@ -896,8 +897,8 @@ flexflow_tensor_t flexflow_model_add_flat(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle_, - const flexflow_tensor_t input_, - const flexflow_tensor_t index_, + flexflow_tensor_t const input_, + flexflow_tensor_t const index_, int dim, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -914,7 +915,7 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int dim, char 
const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -926,7 +927,7 @@ flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int n, int *perm, char const *name) { @@ -946,7 +947,7 @@ flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_reshape(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int n, int *shape, char const *name) { @@ -966,7 +967,7 @@ flexflow_tensor_t flexflow_model_add_reshape(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_reverse(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int axis, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -982,7 +983,7 @@ flexflow_tensor_t flexflow_model_add_reverse(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_scalar_multiply(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float const scalar, bool inplace, char const *name) { @@ -998,7 +999,7 @@ flexflow_tensor_t } flexflow_tensor_t flexflow_model_add_scalar_add(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float const scalar, bool inplace, char const *name) { @@ -1014,7 +1015,7 @@ flexflow_tensor_t flexflow_model_add_scalar_add(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_scalar_sub(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float const scalar, bool inplace, char const *name) { @@ -1032,7 +1033,7 @@ flexflow_tensor_t flexflow_model_add_scalar_sub(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_scalar_truediv(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float const scalar, bool inplace, char const *name) { @@ -1049,7 +1050,7 @@ flexflow_tensor_t } flexflow_tensor_t flexflow_model_add_gelu(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1059,7 +1060,7 @@ flexflow_tensor_t flexflow_model_add_gelu(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_identity(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1070,7 +1071,7 @@ flexflow_tensor_t flexflow_model_add_identity(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_relu(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, bool inplace, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1081,7 +1082,7 @@ flexflow_tensor_t flexflow_model_add_relu(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_sigmoid(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1092,7 +1093,7 @@ flexflow_tensor_t flexflow_model_add_sigmoid(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_tanh(flexflow_model_t handle_, - const 
flexflow_tensor_t input_, + flexflow_tensor_t const input_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1102,7 +1103,7 @@ flexflow_tensor_t flexflow_model_add_tanh(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_elu(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, bool inplace, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1113,7 +1114,7 @@ flexflow_tensor_t flexflow_model_add_elu(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_dropout(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float rate, unsigned long long seed, char const *name) { @@ -1131,9 +1132,9 @@ flexflow_tensor_t flexflow_model_add_dropout(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_multihead_attention( flexflow_model_t handle_, - const flexflow_tensor_t query_, - const flexflow_tensor_t key_, - const flexflow_tensor_t value_, + flexflow_tensor_t const query_, + flexflow_tensor_t const key_, + flexflow_tensor_t const value_, int embed_dim, int num_heads, int kdim, @@ -1186,7 +1187,7 @@ flexflow_tensor_t flexflow_model_add_multihead_attention( flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_heads, int kdim, @@ -1198,15 +1199,29 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, + bool streaming_cache, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->inc_multihead_self_attention(input, embed_dim, num_heads, @@ -1218,18 +1233,19 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, position_bias, + streaming_cache, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_heads, int kdim, @@ -1241,15 +1257,29 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, + bool streaming_cache, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = 
FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->spec_inc_multihead_self_attention(input, embed_dim, @@ -1262,18 +1292,19 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, position_bias, + streaming_cache, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_heads, int kdim, @@ -1285,6 +1316,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -1294,6 +1331,13 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->inc_multihead_self_attention_verify(input, embed_dim, @@ -1306,7 +1350,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -1315,9 +1359,9 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( return FFCObjectWrapper::wrap(tensor); } -flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( +flexflow_tensor_t flexflow_model_add_groupquery_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_q_heads, int num_kv_heads, @@ -1330,39 +1374,54 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, + bool streaming_cache, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); - Tensor tensor = handle->inc_multiquery_self_attention(input, - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - data_type, - kernel_initializer, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - name); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + 
original_max_position_embeddings); + Tensor tensor = handle->groupquery_self_attention(input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + data_type, + kernel_initializer, + rotary_embedding_meta, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + streaming_cache, + name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_q_heads, int num_kv_heads, @@ -1375,15 +1434,29 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, + bool streaming_cache, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->spec_inc_multiquery_self_attention(input, embed_dim, @@ -1397,18 +1470,19 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, position_bias, + streaming_cache, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_q_heads, int num_kv_heads, @@ -1421,6 +1495,12 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -1430,6 +1510,13 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->inc_multiquery_self_attention_verify(input, embed_dim, @@ -1443,7 +1530,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -1453,7 +1540,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( } flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float eps, int dim, char const *name) { @@ -1465,8 +1552,8 @@ 
flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, flexflow_tensor_t * flexflow_model_add_residual_rms_norm(flexflow_model_t handle_, - const flexflow_tensor_t input1_, - const flexflow_tensor_t input2_, + flexflow_tensor_t const input1_, + flexflow_tensor_t const input2_, float eps, int dim, char const *name) { @@ -1486,31 +1573,30 @@ flexflow_tensor_t * } flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int k, bool sorted, - bool speculative_decoding, + bool renormalize, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); - Tensor tensor = - handle->arg_top_k(input, k, sorted, speculative_decoding, name); + Tensor tensor = handle->arg_top_k(input, k, sorted, name); return FFCObjectWrapper::wrap(tensor); } -flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_, - const flexflow_tensor_t input_, - int max_beam_size, - bool sorted, - char const *name) { - FFModel *handle = FFCObjectWrapper::unwrap(handle_); - Tensor input = FFCObjectWrapper::unwrap(input_); - Tensor tensor = handle->beam_top_k(input, max_beam_size, sorted, name); - return FFCObjectWrapper::wrap(tensor); -} +// flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_, +// flexflow_tensor_t const +// input_, int max_beam_size, +// bool sorted, +// char const *name) { +// FFModel *handle = FFCObjectWrapper::unwrap(handle_); +// Tensor input = FFCObjectWrapper::unwrap(input_); +// Tensor tensor = handle->beam_top_k(input, max_beam_size, sorted, name); +// return FFCObjectWrapper::wrap(tensor); +// } flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float top_p, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1520,7 +1606,7 @@ flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, bool beam_search, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1600,8 +1686,10 @@ void flexflow_model_generate(flexflow_model_t handle_, text_str.c_str(), max_seq_length); } + std::vector> slo_ratios = {std::pair(10.0, 1.0)}; + ConstantEmissionMachine emission_machine(1.0, slo_ratios); std::vector results = - handle->generate(prompts, max_seq_length); + handle->generate(prompts, emission_machine); // If the prompt exceeds max seq len, check that we return the prompt with no // additional token. Otherwise, check that the output does not exceed the max // sequence length. 
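To make the new serving wiring above easier to follow, here is the emission-machine call path from the flexflow_model_generate hunk as a self-contained sketch. It is an illustrative reconstruction, not the library's exact code: the element types (std::pair<double, double> for the SLO ratios, GenerationResult for the outputs), the prompts being plain strings, and the header that declares ConstantEmissionMachine are assumptions layered on top of what the hunk shows.

    // Sketch only: how generate() is now driven through an emission machine
    // instead of a per-call max_seq_length. The types and the include below are
    // assumptions; only the two calls mirror the hunk above.
    #include <string>
    #include <utility>
    #include <vector>
    #include "flexflow/request_manager.h" // assumed location of ConstantEmissionMachine

    std::vector<GenerationResult> generate_with_emission_machine(
        FFModel *handle, std::vector<std::string> const &prompts) {
      // One (SLO ratio, weight) entry, matching the hard-coded value in the hunk.
      std::vector<std::pair<double, double>> slo_ratios = {{10.0, 1.0}};
      // Requests are emitted at a constant rate (1.0 here) and tagged with the SLO ratios.
      ConstantEmissionMachine emission_machine(1.0, slo_ratios);
      // generate() now takes the emission machine; length limits are configured
      // once on the RequestManager (set_max_sequence_length, set_max_output_length)
      // rather than being passed per call.
      return handle->generate(prompts, emission_machine);
    }

As far as the diff shows, the design choice is to move request emission and SLO tagging into the emission machine, while sequence-length limits become RequestManager-level settings instead of per-generate() arguments.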
@@ -2525,37 +2613,37 @@ void flexflow_batch_config_destroy(flexflow_batch_config_t handle_) { // TreeVerifyBatchConfig // ----------------------------------------------------------------------- -flexflow_tree_verify_batch_config_t - flexflow_tree_verify_batch_config_create(void) { - TreeVerifyBatchConfig *config = new TreeVerifyBatchConfig(); - DEBUG_PRINT("[TreeVerifyBatchConfig] new %p", config); - return FFCObjectWrapper::wrap(config); -} +// flexflow_tree_verify_batch_config_t +// flexflow_tree_verify_batch_config_create(void) { +// BatchConfig *config = new BatchConfig(); +// DEBUG_PRINT("[BatchConfig] new %p", config); +// return FFCObjectWrapper::wrap(config); +// } -void flexflow_tree_verify_batch_config_destroy( - flexflow_tree_verify_batch_config_t handle_) { - TreeVerifyBatchConfig *handle = FFCObjectWrapper::unwrap(handle_); - DEBUG_PRINT("[TreeVerifyBatchConfig] delete %p", handle); - delete handle; -} +// void flexflow_tree_verify_batch_config_destroy( +// flexflow_tree_verify_batch_config_t handle_) { +// BatchConfig *handle = FFCObjectWrapper::unwrap(handle_); +// DEBUG_PRINT("[BatchConfig] delete %p", handle); +// delete handle; +// } // ----------------------------------------------------------------------- // BeamSearchBatchConfig // ----------------------------------------------------------------------- -flexflow_beam_search_batch_config_t - flexflow_beam_search_batch_config_create(void) { - BeamSearchBatchConfig *config = new BeamSearchBatchConfig(); - DEBUG_PRINT("[BeamSearchBatchConfig] new %p", config); - return FFCObjectWrapper::wrap(config); -} +// flexflow_beam_search_batch_config_t +// flexflow_beam_search_batch_config_create(void) { +// BatchConfig *config = new BatchConfig(); +// DEBUG_PRINT("[BeamSearchBatchConfig] new %p", config); +// return FFCObjectWrapper::wrap(config); +// } -void flexflow_beam_search_batch_config_destroy( - flexflow_beam_search_batch_config_t handle_) { - BeamSearchBatchConfig *handle = FFCObjectWrapper::unwrap(handle_); - DEBUG_PRINT("[BeamSearchBatchConfig] delete %p", handle); - delete handle; -} +// void flexflow_beam_search_batch_config_destroy( +// flexflow_beam_search_batch_config_t handle_) { +// BatchConfig *handle = FFCObjectWrapper::unwrap(handle_); +// DEBUG_PRINT("[BeamSearchBatchConfig] delete %p", handle); +// delete handle; +// } // ----------------------------------------------------------------------- // RequestManager @@ -2582,12 +2670,20 @@ void flexflow_request_manager_set_max_tokens_per_batch( DEBUG_PRINT("[RequestManager] set max_tokens_per_batch %d", max_num_tokens); } -void flexflow_request_manager_set_max_spec_tree_token_num( - flexflow_request_manager_t handle_, int max_num_tokens) { +void flexflow_request_manager_set_max_tokens_per_ssm_batch( + flexflow_request_manager_t handle_, int max_num_ssm_tokens) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_max_tokens_per_ssm_batch(max_num_ssm_tokens); + DEBUG_PRINT("[RequestManager] set max_tokens_per_ssm_batch %d", + max_num_ssm_tokens); +} + +void flexflow_request_manager_set_max_tokens_per_prefilling_batch( + flexflow_request_manager_t handle_, int max_num_prefilling_tokens) { RequestManager *handle = FFCObjectWrapper::unwrap(handle_); - handle->set_max_spec_tree_token_num(max_num_tokens); - DEBUG_PRINT("[RequestManager] set max_spec_tree_token_num %d", - max_num_tokens); + handle->set_max_tokens_per_prefilling_batch(max_num_prefilling_tokens); + DEBUG_PRINT("[RequestManager] set max_tokens_per_prefilling_batch %d", + 
max_num_prefilling_tokens); } void flexflow_request_manager_set_max_sequence_length( @@ -2597,6 +2693,20 @@ void flexflow_request_manager_set_max_sequence_length( DEBUG_PRINT("[RequestManager] set max_sequence_length %d", max_seq_length); } +void flexflow_request_manager_set_max_output_length( + flexflow_request_manager_t handle_, int max_output_length) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_max_output_length(max_output_length); + DEBUG_PRINT("[RequestManager] set max_output_length %d", max_output_length); +} + +void flexflow_request_manager_set_max_kv_cache_size( + flexflow_request_manager_t handle_, int max_kv_cache_size) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_max_kv_cache_size(max_kv_cache_size); + DEBUG_PRINT("[RequestManager] set max_kv_cache_size %d", max_kv_cache_size); +} + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, @@ -2608,7 +2718,7 @@ void flexflow_request_manager_register_tokenizer( "Cannot convert nullptr char * to std::string"); std::string const tokenizer_filepath_str(tokenizer_filepath); handle->register_tokenizer( - model_type, bos_token_id, eos_token_id, tokenizer_filepath_str); + model_type, bos_token_id, {eos_token_id}, tokenizer_filepath_str); DEBUG_PRINT( "[RequestManager] register tokenizer %p %s", handle, tokenizer_filepath); } @@ -2700,7 +2810,7 @@ flexflow_file_data_loader_t int num_q_heads, int num_kv_heads, int hidden_dim, - int qkv_inner_dim, + int head_dim, int tensor_parallelism_degree, bool use_full_precision) { assert(weight_file_path != nullptr && @@ -2711,7 +2821,7 @@ flexflow_file_data_loader_t num_q_heads, num_kv_heads, hidden_dim, - qkv_inner_dim, + head_dim, tensor_parallelism_degree, use_full_precision); DEBUG_PRINT("[FileDataLoader] new %p", handle); @@ -2728,5 +2838,7 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_) { FileDataLoader *handle = FFCObjectWrapper::unwrap(handle_); FFModel *model = FFCObjectWrapper::unwrap(model_handle_); - handle->load_weights(model); + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; + handle->load_weights_parallel(model, ctx, runtime); } diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index c293aecb1..38127a1cf 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -20,7 +20,7 @@ namespace FlexFlow { using namespace Legion; using namespace Mapping; -LegionRuntime::Logger::Category log_ff_mapper("Mapper"); +Legion::Logger log_ff_mapper("Mapper"); FFShardingFunctor::FFShardingFunctor(int _gpus_per_node, int _cpus_per_node, @@ -33,7 +33,7 @@ FFShardingFunctor::~FFShardingFunctor(void) {} ShardID FFShardingFunctor::shard(DomainPoint const &point, Domain const &full_space, - const size_t total_shards) { + size_t const total_shards) { assert(point.get_dim() == full_space.get_dim()); int device_id = machine_view.start_device_id; for (int i = 0; i < point.get_dim(); i++) { @@ -259,7 +259,7 @@ Mapper::MapperSyncModel FFMapper::get_mapper_sync_model(void) const { return SERIALIZED_REENTRANT_MAPPER_MODEL; } -void FFMapper::select_task_options(const MapperContext ctx, +void FFMapper::select_task_options(MapperContext const ctx, Task const &task, TaskOptions &output) { unsigned long long task_hash = compute_task_hash(task); @@ -285,9 +285,14 @@ void FFMapper::select_task_options(const MapperContext ctx, } if ((task.task_id == RM_PREPARE_NEXT_BATCH_TASK_ID) || 
(task.task_id == RM_PREPARE_NEXT_BATCH_INIT_TASK_ID) || - (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) || + (task.task_id == RM_PREPARE_NEXT_BATCH_SPEC_TASK_ID) || (task.task_id == RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID) || - (task.task_id == RM_BACKGROUND_SERVING_TASK_ID)) { + (task.task_id == RM_BACKGROUND_SERVING_TASK_ID) || + (task.task_id == RM_GET_NEXT_BATCH_CONFIG_TASK_ID)) { + output.initial_proc = all_cpus[0]; + return; + } + if (task.task_id == LOAD_WEIGHT_TASK_ID) { output.initial_proc = all_cpus[0]; return; } @@ -296,6 +301,7 @@ void FFMapper::select_task_options(const MapperContext ctx, // control replicate top level task if (enable_control_replication) { output.replicate = true; + output.map_locally = false; } return; } @@ -374,7 +380,7 @@ void FFMapper::select_task_options(const MapperContext ctx, assert(task.is_index_space); } -void FFMapper::slice_task(const MapperContext ctx, +void FFMapper::slice_task(MapperContext const ctx, Task const &task, SliceTaskInput const &input, SliceTaskOutput &output) { @@ -480,7 +486,7 @@ void FFMapper::slice_task(const MapperContext ctx, } } -void FFMapper::premap_task(const MapperContext ctx, +void FFMapper::premap_task(MapperContext const ctx, Task const &task, PremapTaskInput const &input, PremapTaskOutput &output) { @@ -506,7 +512,7 @@ std::string humanReadableSize(size_t size, bool mb = false) { return std::string(buffer); } -void FFMapper::map_task(const MapperContext ctx, +void FFMapper::map_task(MapperContext const ctx, Task const &task, MapTaskInput const &input, MapTaskOutput &output) { @@ -560,6 +566,10 @@ void FFMapper::map_task(const MapperContext ctx, assert(output.target_procs[i].address_space() == node_id); } } + if (input.shard_processor.exists()) { + output.target_procs = std::vector{input.shard_processor}; + } + // Find instances that still need to be mapped std::vector> missing_fields(task.regions.size()); runtime->filter_instances(ctx, @@ -643,17 +653,18 @@ void FFMapper::map_task(const MapperContext ctx, task.regions[idx], created, &footprint)) { - if (log_instance_creation) { - for (size_t idx = 0; idx < created_instances.size(); idx++) { - log_ff_mapper.print("Instance[%zu]: memory:" IDFMT " proc:" IDFMT - " size:%zu task:%s", - idx, - created_instances[idx].memory.id, - created_instances[idx].processor.id, - created_instances[idx].size, - created_instances[idx].task_name.c_str()); - } - } + // if (log_instance_creation) { + // for (size_t idx = 0; idx < created_instances.size(); idx++) { + // log_ff_mapper.print("Instance[%zu]: memory: " IDFMT " proc: " + // IDFMT + // " size: %zu task: %s", + // idx, + // created_instances[idx].memory.id, + // created_instances[idx].processor.id, + // created_instances[idx].size, + // created_instances[idx].task_name.c_str()); + // } + // } // Report failed to creation log_ff_mapper.error( "Out of memory! 
FlexFlow failed to reserve block of size %s" @@ -681,17 +692,27 @@ void FFMapper::map_task(const MapperContext ctx, clog.memory = target_mem; clog.processor = task.target_proc; created_instances.push_back(clog); + log_ff_mapper.print( + "Created Instance[%lu]: memory_kind: %s memory_id: %llx " + "proc: " IDFMT " size: %zu (capacity %lu) task: %s", + created_instances.size() - 1, + Legion::Mapping::Utilities::to_string(clog.memory.kind()), + clog.memory.id, + clog.processor.id, + clog.size, + clog.memory.capacity(), + clog.task_name.c_str()); } } // for idx } -void FFMapper::replicate_task(const MapperContext ctx, +void FFMapper::replicate_task(MapperContext const ctx, Task const &task, ReplicateTaskInput const &input, ReplicateTaskOutput &output) { // Should only be replicated for the top-level task assert((task.get_depth() == 0) && (task.regions.size() == 0)); - const Processor::Kind target_kind = task.target_proc.kind(); + Processor::Kind const target_kind = task.target_proc.kind(); VariantID vid; { std::vector variant_ids; @@ -707,7 +728,7 @@ void FFMapper::replicate_task(const MapperContext ctx, procs.only_kind(target_kind); for (Machine::ProcessorQuery::iterator it = procs.begin(); it != procs.end(); it++) { - const AddressSpace space = it->address_space(); + AddressSpace const space = it->address_space(); if (handled[space]) { continue; } @@ -718,21 +739,21 @@ void FFMapper::replicate_task(const MapperContext ctx, assert(count == total_nodes); } -void FFMapper::select_task_variant(const MapperContext ctx, +void FFMapper::select_task_variant(MapperContext const ctx, Task const &task, SelectVariantInput const &input, SelectVariantOutput &output) { assert(false); } -void FFMapper::postmap_task(const MapperContext ctx, +void FFMapper::postmap_task(MapperContext const ctx, Task const &task, PostMapInput const &input, PostMapOutput &output) { assert(false); } -void FFMapper::select_task_sources(const MapperContext ctx, +void FFMapper::select_task_sources(MapperContext const ctx, Task const &task, SelectTaskSrcInput const &input, SelectTaskSrcOutput &output) { @@ -817,26 +838,26 @@ void FFMapper::default_policy_select_sources( } void FFMapper::create_task_temporary_instance( - const MapperContext ctx, + MapperContext const ctx, Task const &task, CreateTaskTemporaryInput const &input, CreateTaskTemporaryOutput &output) { assert(false); } -void FFMapper::speculate(const MapperContext ctx, +void FFMapper::speculate(MapperContext const ctx, Task const &task, SpeculativeOutput &output) { assert(false); } -void FFMapper::report_profiling(const MapperContext ctx, +void FFMapper::report_profiling(MapperContext const ctx, Task const &task, TaskProfilingInfo const &input) { assert(false); } -void FFMapper::select_sharding_functor(const MapperContext ctx, +void FFMapper::select_sharding_functor(MapperContext const ctx, Task const &task, SelectShardingFunctorInput const &input, SelectShardingFunctorOutput &output) { @@ -865,7 +886,7 @@ void FFMapper::select_sharding_functor(const MapperContext ctx, } } -void FFMapper::map_inline(const MapperContext ctx, +void FFMapper::map_inline(MapperContext const ctx, InlineMapping const &inline_op, MapInlineInput const &input, MapInlineOutput &output) { @@ -968,7 +989,7 @@ void FFMapper::map_inline(const MapperContext ctx, } } -void FFMapper::select_inline_sources(const MapperContext ctx, +void FFMapper::select_inline_sources(MapperContext const ctx, InlineMapping const &inline_op, SelectInlineSrcInput const &input, SelectInlineSrcOutput &output) { @@ -978,27 
+999,27 @@ void FFMapper::select_inline_sources(const MapperContext ctx, } void FFMapper::create_inline_temporary_instance( - const MapperContext ctx, + MapperContext const ctx, InlineMapping const &inline_op, CreateInlineTemporaryInput const &input, CreateInlineTemporaryOutput &output) { assert(false); } -void FFMapper::report_profiling(const MapperContext ctx, +void FFMapper::report_profiling(MapperContext const ctx, InlineMapping const &inline_op, InlineProfilingInfo const &input) { assert(false); } -void FFMapper::map_copy(const MapperContext ctx, +void FFMapper::map_copy(MapperContext const ctx, Copy const ©, MapCopyInput const &input, MapCopyOutput &output) { assert(false); } -void FFMapper::select_copy_sources(const MapperContext ctx, +void FFMapper::select_copy_sources(MapperContext const ctx, Copy const ©, SelectCopySrcInput const &input, SelectCopySrcOutput &output) { @@ -1006,26 +1027,26 @@ void FFMapper::select_copy_sources(const MapperContext ctx, } void FFMapper::create_copy_temporary_instance( - const MapperContext ctx, + MapperContext const ctx, Copy const ©, CreateCopyTemporaryInput const &input, CreateCopyTemporaryOutput &output) { assert(false); } -void FFMapper::speculate(const MapperContext ctx, +void FFMapper::speculate(MapperContext const ctx, Copy const ©, SpeculativeOutput &output) { assert(false); } -void FFMapper::report_profiling(const MapperContext ctx, +void FFMapper::report_profiling(MapperContext const ctx, Copy const ©, CopyProfilingInfo const &input) { assert(false); } -void FFMapper::select_sharding_functor(const MapperContext ctx, +void FFMapper::select_sharding_functor(MapperContext const ctx, Copy const ©, SelectShardingFunctorInput const &input, SelectShardingFunctorOutput &output) { @@ -1033,14 +1054,14 @@ void FFMapper::select_sharding_functor(const MapperContext ctx, assert(false); } -void FFMapper::map_close(const MapperContext ctx, +void FFMapper::map_close(MapperContext const ctx, Close const &close, MapCloseInput const &input, MapCloseOutput &output) { assert(false); } -void FFMapper::select_close_sources(const MapperContext ctx, +void FFMapper::select_close_sources(MapperContext const ctx, Close const &close, SelectCloseSrcInput const &input, SelectCloseSrcOutput &output) { @@ -1048,20 +1069,20 @@ void FFMapper::select_close_sources(const MapperContext ctx, } void FFMapper::create_close_temporary_instance( - const MapperContext ctx, + MapperContext const ctx, Close const &close, CreateCloseTemporaryInput const &input, CreateCloseTemporaryOutput &output) { assert(false); } -void FFMapper::report_profiling(const MapperContext ctx, +void FFMapper::report_profiling(MapperContext const ctx, Close const &close, CloseProfilingInfo const &input) { assert(false); } -void FFMapper::select_sharding_functor(const MapperContext ctx, +void FFMapper::select_sharding_functor(MapperContext const ctx, Close const &close, SelectShardingFunctorInput const &input, SelectShardingFunctorOutput &output) { @@ -1069,26 +1090,26 @@ void FFMapper::select_sharding_functor(const MapperContext ctx, assert(false); } -void FFMapper::map_acquire(const MapperContext ctx, +void FFMapper::map_acquire(MapperContext const ctx, Acquire const &acquire, MapAcquireInput const &input, MapAcquireOutput &output) { assert(false); } -void FFMapper::speculate(const MapperContext ctx, +void FFMapper::speculate(MapperContext const ctx, Acquire const &acquire, SpeculativeOutput &output) { assert(false); } -void FFMapper::report_profiling(const MapperContext ctx, +void 
FFMapper::report_profiling(MapperContext const ctx, Acquire const &acquire, AcquireProfilingInfo const &input) { assert(false); } -void FFMapper::select_sharding_functor(const MapperContext ctx, +void FFMapper::select_sharding_functor(MapperContext const ctx, Acquire const &acquire, SelectShardingFunctorInput const &input, SelectShardingFunctorOutput &output) { @@ -1096,14 +1117,14 @@ void FFMapper::select_sharding_functor(const MapperContext ctx, assert(false); } -void FFMapper::map_release(const MapperContext ctx, +void FFMapper::map_release(MapperContext const ctx, Release const &release, MapReleaseInput const &input, MapReleaseOutput &output) { assert(false); } -void FFMapper::select_release_sources(const MapperContext ctx, +void FFMapper::select_release_sources(MapperContext const ctx, Release const &release, SelectReleaseSrcInput const &input, SelectReleaseSrcOutput &output) { @@ -1111,26 +1132,26 @@ void FFMapper::select_release_sources(const MapperContext ctx, } void FFMapper::create_release_temporary_instance( - const MapperContext ctx, + MapperContext const ctx, Release const &release, CreateReleaseTemporaryInput const &input, CreateReleaseTemporaryOutput &output) { assert(false); } -void FFMapper::speculate(const MapperContext ctx, +void FFMapper::speculate(MapperContext const ctx, Release const &release, SpeculativeOutput &output) { assert(false); } -void FFMapper::report_profiling(const MapperContext ctx, +void FFMapper::report_profiling(MapperContext const ctx, Release const &release, ReleaseProfilingInfo const &input) { assert(false); } -void FFMapper::select_sharding_functor(const MapperContext ctx, +void FFMapper::select_sharding_functor(MapperContext const ctx, Release const &release, SelectShardingFunctorInput const &input, SelectShardingFunctorOutput &output) { @@ -1138,21 +1159,21 @@ void FFMapper::select_sharding_functor(const MapperContext ctx, } void FFMapper::select_partition_projection( - const MapperContext ctx, + MapperContext const ctx, Partition const &partition, SelectPartitionProjectionInput const &input, SelectPartitionProjectionOutput &output) { assert(false); } -void FFMapper::map_partition(const MapperContext ctx, +void FFMapper::map_partition(MapperContext const ctx, Partition const &partition, MapPartitionInput const &input, MapPartitionOutput &output) { assert(false); } -void FFMapper::select_partition_sources(const MapperContext ctx, +void FFMapper::select_partition_sources(MapperContext const ctx, Partition const &partition, SelectPartitionSrcInput const &input, SelectPartitionSrcOutput &output) { @@ -1160,34 +1181,34 @@ void FFMapper::select_partition_sources(const MapperContext ctx, } void FFMapper::create_partition_temporary_instance( - const MapperContext ctx, + MapperContext const ctx, Partition const &partition, CreatePartitionTemporaryInput const &input, CreatePartitionTemporaryOutput &output) { assert(false); } -void FFMapper::report_profiling(const MapperContext ctx, +void FFMapper::report_profiling(MapperContext const ctx, Partition const &partition, PartitionProfilingInfo const &input) { assert(false); } -void FFMapper::select_sharding_functor(const MapperContext ctx, +void FFMapper::select_sharding_functor(MapperContext const ctx, Partition const &partition, SelectShardingFunctorInput const &input, SelectShardingFunctorOutput &output) { assert(false); } -void FFMapper::select_sharding_functor(const MapperContext ctx, +void FFMapper::select_sharding_functor(MapperContext const ctx, Fill const &fill, SelectShardingFunctorInput const 
&input, SelectShardingFunctorOutput &output) { assert(false); } -void FFMapper::configure_context(const MapperContext ctx, +void FFMapper::configure_context(MapperContext const ctx, Task const &task, ContextConfigOutput &output) { // Increase max_window_size to allow Legion tracing to capture larger traces @@ -1195,21 +1216,21 @@ void FFMapper::configure_context(const MapperContext ctx, // Use the default values and do nothing else } -void FFMapper::select_tunable_value(const MapperContext ctx, +void FFMapper::select_tunable_value(MapperContext const ctx, Task const &task, SelectTunableInput const &input, SelectTunableOutput &output) { assert(false); } -void FFMapper::select_sharding_functor(const MapperContext ctx, +void FFMapper::select_sharding_functor(MapperContext const ctx, MustEpoch const &epoch, SelectShardingFunctorInput const &input, MustEpochShardingFunctorOutput &output) { assert(false); } -void FFMapper::map_must_epoch(const MapperContext ctx, +void FFMapper::map_must_epoch(MapperContext const ctx, MapMustEpochInput const &input, MapMustEpochOutput &output) { // Directly assign each task to its target_proc @@ -1220,13 +1241,13 @@ void FFMapper::map_must_epoch(const MapperContext ctx, assert(input.constraints.size() == 0); } -void FFMapper::map_dataflow_graph(const MapperContext ctx, +void FFMapper::map_dataflow_graph(MapperContext const ctx, MapDataflowGraphInput const &input, MapDataflowGraphOutput &output) { assert(false); } -void FFMapper::memoize_operation(const MapperContext ctx, +void FFMapper::memoize_operation(MapperContext const ctx, Mappable const &mappable, MemoizeInput const &input, MemoizeOutput &output) { @@ -1240,7 +1261,7 @@ void FFMapper::memoize_operation(const MapperContext ctx, } // Mapping control and stealing -void FFMapper::select_tasks_to_map(const MapperContext ctx, +void FFMapper::select_tasks_to_map(MapperContext const ctx, SelectMappingInput const &input, SelectMappingOutput &output) { // Just map all the ready tasks @@ -1251,13 +1272,13 @@ void FFMapper::select_tasks_to_map(const MapperContext ctx, } } -void FFMapper::select_steal_targets(const MapperContext ctx, +void FFMapper::select_steal_targets(MapperContext const ctx, SelectStealingInput const &input, SelectStealingOutput &output) { // Nothing to do, no stealing in FFMapper } -void FFMapper::permit_steal_request(const MapperContext ctx, +void FFMapper::permit_steal_request(MapperContext const ctx, StealRequestInput const &intput, StealRequestOutput &output) { // Nothing to do, no stealing in FFMapper diff --git a/src/ops/add_bias_residual_layer_norm.cpp b/src/ops/add_bias_residual_layer_norm.cpp index 1add43ecd..ae66d9b86 100644 --- a/src/ops/add_bias_residual_layer_norm.cpp +++ b/src/ops/add_bias_residual_layer_norm.cpp @@ -38,7 +38,8 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( eps = ln->eps; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "AddBiasResidualLayerNormMeta"); mean_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index ceb1a6514..2ce5605b6 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -37,7 +37,8 @@ 
AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( eps = ln->eps; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "AddBiasResidualLayerNormMeta"); mean_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index 780a77450..ebed5ab0c 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -48,10 +48,10 @@ using PCG::Node; // For an input tensor, computes the top k entries in each row // (resp. vector along the last dimension). Thus, // values.shape = indices.shape = input.shape[:-1] + [k] -Tensor FFModel::arg_top_k(const Tensor input, +Tensor FFModel::arg_top_k(Tensor const input, int k, bool sorted, - bool speculative_decoding, + bool renormalize, char const *name) { Layer *li = new Layer(this, OP_ARG_TOPK, @@ -59,7 +59,7 @@ Tensor FFModel::arg_top_k(const Tensor input, name, 1 /*inputs*/, 0 /*weights*/, - speculative_decoding ? 2 : 1 /*outputs*/, + 2 /*outputs*/, input); { int numdims = input->num_dims; @@ -72,14 +72,12 @@ Tensor FFModel::arg_top_k(const Tensor input, // numdims, dims, input->data_type, li, 0, true /*create_grad*/); li->outputs[0] = create_tensor_legion_ordering( numdims, dims, DT_INT32, li, 0, false /*create_grad*/); - if (speculative_decoding) { - li->outputs[1] = create_tensor_legion_ordering( - numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/); - } + li->outputs[1] = create_tensor_legion_ordering( + numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/); } li->add_int_property("k", k); li->add_int_property("sorted", sorted); - li->add_int_property("speculative_decoding", speculative_decoding); + li->add_int_property("renormalize", renormalize); layers.push_back(li); // outputs[0] = li->outputs[0]; // outputs[1] = li->outputs[1]; @@ -95,23 +93,18 @@ Op *ArgTopK::create_operator_from_layer( int k = value; layer->get_int_property("sorted", value); bool sorted = (bool)value; - layer->get_int_property("speculative_decoding", value); - bool speculative_decoding = (bool)value; - - return new ArgTopK(model, - layer->layer_guid, - inputs[0], - k, - sorted, - speculative_decoding, - layer->name); + layer->get_int_property("renormalize", value); + bool renormalize = (bool)value; + + return new ArgTopK( + model, layer->layer_guid, inputs[0], k, sorted, renormalize, layer->name); } ArgTopKParams ArgTopK::get_params() const { ArgTopKParams params; params.k = this->k; params.sorted = this->sorted; - params.speculative_decoding = this->speculative_decoding; + params.renormalize = this->renormalize; if (this->name != nullptr) { strcpy(params.name, this->name); } @@ -125,15 +118,15 @@ bool ArgTopKParams::is_valid(ParallelTensorShape const &) const { bool operator==(ArgTopKParams const &lhs, ArgTopKParams const &rhs) { return lhs.k == rhs.k && lhs.sorted == rhs.sorted && - lhs.speculative_decoding == rhs.speculative_decoding; + lhs.renormalize == rhs.renormalize; } ArgTopK::ArgTopK(FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int _k, bool _sorted, - bool _speculative_decoding, + bool _renormalize, char const *name) : Op(model, OP_ARG_TOPK, @@ -141,9 +134,9 @@ ArgTopK::ArgTopK(FFModel &model, name, 1 /*inputs*/, 0 /*weights*/, - _speculative_decoding ? 
2 : 1 /*outputs*/, + 2 /*outputs*/, _input), - k(_k), sorted(_sorted), speculative_decoding(_speculative_decoding) { + k(_k), sorted(_sorted), renormalize(_renormalize) { // overwrite layer_guid layer_guid = _layer_guid; int numdim = inputs[0]->num_dims; @@ -158,22 +151,20 @@ ArgTopK::ArgTopK(FFModel &model, outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, DT_INT32, this, 0 /*owner_idx*/); - if (_speculative_decoding) { - outputs[1] = model.create_parallel_tensor_legion_ordering( - numdim, dims, DT_FLOAT, this, 1 /*owner_idx*/); - } + outputs[1] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_FLOAT, this, 1 /*owner_idx*/); } ArgTopK::ArgTopK(FFModel &model, LayerID const &layer_guid, ArgTopK const &other, - const ParallelTensor input) + ParallelTensor const input) : ArgTopK(model, layer_guid, input, other.k, other.sorted, - other.speculative_decoding, + other.renormalize, other.name) {} ArgTopK::ArgTopK(FFModel &model, @@ -185,7 +176,7 @@ ArgTopK::ArgTopK(FFModel &model, input, params.k, params.sorted, - params.speculative_decoding, + params.renormalize, params.name) {} void ArgTopK::init_inference(FFModel const &ff, @@ -275,14 +266,19 @@ OpMeta *ArgTopK::init_task(Task const *task, Runtime *runtime) { ArgTopK *topk = (ArgTopK *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - ArgTopKMeta *m = new ArgTopKMeta(handle, topk); + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + ArgTopKMeta *m = new ArgTopKMeta(handle, topk, gpu_mem_allocator); m->profiling = topk->profiling; m->inference_debugging = topk->inference_debugging; m->sorted = topk->sorted; m->k = topk->k; std::strcpy(m->op_name, topk->name); m->layer_guid = topk->layer_guid; - m->speculative_decoding = topk->speculative_decoding; + m->renormalize = topk->renormalize; return m; } @@ -305,66 +301,38 @@ FutureMap ArgTopK::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "ArgTopK op machine_view: " << *(MachineView const *)mv << std::endl; */ - if (speculative_decoding) { - IndexLauncher launcher(ARG_TOPK_INF_SPECULATIVE_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - - launcher.add_region_requirement( - RegionRequirement(batch_outputs[1]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[1]->region)); - launcher.add_field(2, FID_DATA); - return runtime->execute_index_space(ctx, launcher); - - } else { - IndexLauncher launcher(ARG_TOPK_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part, - 0 
/*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - return runtime->execute_index_space(ctx, launcher); - } + IndexLauncher launcher(ARG_TOPK_INF_SPECULATIVE_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); } +// just output the indices InferenceResult ArgTopK::inference_task(Task const *task, std::vector const ®ions, @@ -399,23 +367,23 @@ InferenceResult } InferenceResult ir; + ir.num_token_ids = batch_size * m->k; download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; } -BeamInferenceResult ArgTopK::inference_speculative_task( +InferenceResult ArgTopK::inference_speculative_task( Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { assert(regions.size() == 3); assert(task->regions.size() == 3); - BeamSearchBatchConfig const &bc = - Future(task->futures[0]).get_result(); - if (bc.num_active_tokens() == 0) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { // Directly return for empty batch config - BeamInferenceResult ir; + InferenceResult ir; return ir; } ArgTopKMeta *m = *((ArgTopKMeta **)task->local_args); @@ -427,10 +395,11 @@ BeamInferenceResult ArgTopK::inference_speculative_task( GenericTensorAccessorW probs = helperGetGenericTensorAccessorWO( DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); - int batch_size = bc.num_active_tokens(); - ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc); + int batch_size = bc->num_active_tokens(); + ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, bc); - BeamInferenceResult ir; + InferenceResult ir; + ir.num_token_ids = batch_size * m->k; download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size * m->k); download_tensor(probs.get_float_ptr(), ir.probs, batch_size * m->k); @@ -448,7 +417,7 @@ void ArgTopK::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->k); sez.serialize(this->sorted); - sez.serialize(this->speculative_decoding); + sez.serialize(this->renormalize); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -465,10 +434,10 @@ Node ArgTopK::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); int k; bool sorted; - bool speculative_decoding; + bool renormalize; dez.deserialize(k); dez.deserialize(sorted); - dez.deserialize(speculative_decoding); + dez.deserialize(renormalize); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -477,7 +446,7 @@ Node ArgTopK::deserialize(FFModel &ff, params.layer_guid = layer_guid; params.k = k; params.sorted = sorted; - params.speculative_decoding = speculative_decoding; + 
params.renormalize = renormalize; strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } @@ -504,7 +473,7 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.k); hash_combine(key, params.sorted); - hash_combine(key, params.speculative_decoding); + hash_combine(key, params.renormalize); return key; } }; // namespace std diff --git a/src/ops/arg_topk.cpp b/src/ops/arg_topk.cpp index f431d3d4b..90dbb5909 100644 --- a/src/ops/arg_topk.cpp +++ b/src/ops/arg_topk.cpp @@ -379,7 +379,7 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, int length, int k, bool sorted, - BeamSearchBatchConfig const *bc, + BatchConfig const *bc, hipStream_t stream) { // Adopted from TensorFlow's ArgTopK implementation // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h @@ -398,29 +398,17 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry
); // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; size_t num_blocks = batch_size; - // all requests are in the same beam stages + // all requests share the same number of branches if (m->speculative_decoding) { assert(bc->num_active_requests() >= 0); - // check - int beam_size = -1; - for (int i = 1; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } else if (beam_size == -1) { - beam_size = bc->beamRequestsInfo[i].beam_size; - } else { - assert(beam_size == bc->beamRequestsInfo[i].beam_size); - } - } - - assert(num_shards >= (size_t)beam_size); + assert(num_shards >= (size_t)BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); num_shards = k; arg_topk_forward_kernel<<>>( input_ptr, shared_memory_size, length, - beam_size, + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, sorted, output_ptr, indices_ptr, @@ -448,7 +436,7 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, // float *output_ptr, GenericTensorAccessorW const &indices, int batch_size, - BeamSearchBatchConfig const *bc) { + BatchConfig const *bc) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); // Domain in1_domain = runtime->get_index_space_domain( diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 5b7978812..0d4ea2045 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -15,446 +15,125 @@ #include "flexflow/ops/arg_topk.h" #include "flexflow/utils/cuda_helper.h" +#include "raft/matrix/detail/select_k.cuh" namespace FlexFlow { // declare Legion names using Legion::coord_t; -enum class HeapType { kMinHeap, kMaxHeap }; -enum class PreferIndices { kLower, kHigher }; +__global__ void half2float_kernel(half const *__restrict__ in, + float *__restrict__ out, + int size) { + // int stride = blockDim.x * gridDim.x, + // tid = blockIdx.x * blockDim.x + threadIdx.x; -template -struct Entry { - int index; - T value; -}; - -template -struct LinearData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index]; - } - - __device__ int get_index(int i) const { - return data[i].index; - } - __device__ T get_value(int i) const { - return data[i].value; - } - - Entry *const data; -}; - -template -struct IndirectLinearData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index]; - } - - __device__ int get_index(int i) const { - return backing_data[data[i].index].index; - } - __device__ T get_value(int i) const { - return data[i].value; - } - - Entry *const data; - Entry *const backing_data; -}; - -template -struct StridedData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index * blockDim.x + threadIdx.x]; - } - - __device__ int get_index(int i) const { - return (*this)[i].index; - } - __device__ T get_value(int i) const { - return (*this)[i].value; - } - - Entry *const data; -}; - -// A heap of Entry that can either work as a min-heap or as a max-heap. 
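// A minimal, self-contained sketch of what the new half2float_kernel above is
// for: the rewritten top-k path produces its values in half precision, and this
// conversion widens them into the float `probs` output tensor. The grid-stride
// loop is written out explicitly instead of using FlexFlow's CUDA_KERNEL_LOOP
// helper, and the kernel name below is illustrative only, not part of the patch.
#include <cuda_fp16.h>

__global__ void half_to_float_sketch(half const *__restrict__ in,
                                     float *__restrict__ out,
                                     int size) {
  // Each thread walks the array with a stride equal to the full grid size.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
       i += blockDim.x * gridDim.x) {
    out[i] = __half2float(in[i]); // standard CUDA half->float intrinsic
  }
}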
-template - class Data, - typename T> -struct IndexedHeap { - typedef typename Data::Entry Entry; - Data const data; - __device__ IndexedHeap(Data const &d) : data(d) {} - - __device__ bool is_above(int left, int right) { - T left_value = data.get_value(left); - T right_value = data.get_value(right); - if (left_value == right_value) { - if (preferIndices == PreferIndices::kLower) { - return data.get_index(left) < data.get_index(right); - } else { - return data.get_index(left) > data.get_index(right); - } - } - if (heapType == HeapType::kMinHeap) { - return left_value < right_value; - } else { - return left_value > right_value; - } - } - - __device__ void assign(int i, Entry const &entry) { - data[i] = entry; - } - - __device__ void push_up(int i) { - int child = i; - int parent; - for (; child > 0; child = parent) { - parent = (child - 1) / 2; - if (!is_above(child, parent)) { - // Heap property satisfied. - break; - } - swap(child, parent); - } - } - - __device__ void swap(int a, int b) { - auto tmp = data[b]; - data[b] = data[a]; - data[a] = tmp; - } - - __device__ void push_root_down(int k) { - push_down(0, k); - } - - // MAX-HEAPIFY in Cormen - __device__ void push_down(int node, int k) { - while (true) { - int const left = 2 * node + 1; - int const right = left + 1; - int smallest = node; - if (left < k && is_above(left, smallest)) { - smallest = left; - } - if (right < k && is_above(right, smallest)) { - smallest = right; - } - if (smallest == node) { - break; - } - swap(smallest, node); - node = smallest; - } - } - - // BUILD-MAX-HEAPIFY in Cormen - __device__ void build(int k) { - for (int node = (k - 1) / 2; node >= 0; node--) { - push_down(node, k); - } - } - - // HEAP-EXTRACT-MAX in Cormen - __device__ void remove_root(int k) { - data[0] = data[k - 1]; - push_root_down(k - 1); - } - - // in-place HEAPSORT in Cormen - // This method destroys the heap property. - __device__ void sort(int k) { - for (int slot = k - 1; slot > 0; slot--) { - // This is like remove_root but we insert the element at the end. - swap(slot, 0); - // Heap is now an element smaller. - push_root_down(/*k=*/slot); - } - } - - __device__ void replace_root(Entry const &entry, int k) { - data[0] = entry; - push_root_down(k); - } - - __device__ Entry const &root() { - return data[0]; - } -}; - -template - class Data, - typename T> -__device__ IndexedHeap - make_indexed_heap(typename Data::Entry *data) { - return IndexedHeap{Data{data}}; -} - -// heapArgTopK walks over [input, input+length) with `step_size` stride starting -// at `start_index`. It builds a top-`k` heap that is stored in `heap_entries` -// using `Accessor` to access elements in `heap_entries`. If sorted=true, the -// elements will be sorted at the end. -template class Data = LinearData> -__device__ void heapArgTopK(T const *__restrict__ input, - int length, - int k, - Entry *__restrict__ heap_entries, - bool sorted = false, - int start_index = 0, - int step_size = 1) { - assert(k <= length); - - auto heap = - make_indexed_heap( - heap_entries); - - int heap_end_index = start_index + k * step_size; - if (heap_end_index > length) { - heap_end_index = length; - } - // Initialize the min-heap. - for (int index = start_index, slot = 0; index < heap_end_index; - index += step_size, slot++) { - heap.assign(slot, {index, input[index]}); - } - - heap.build(k); - - // Now iterate over the remaining items. - // If an item is smaller than the min element, it is not amongst the top k. - // Otherwise, replace the min element with it and push upwards. 
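// The heap/shard machinery being deleted here is replaced by a single batched
// top-k selection plus an optional renormalization of the selected
// probabilities. A host-side reference of the intended per-row semantics,
// assuming `renormalize` simply rescales the k selected values to sum to one
// (with the same 1e-6 epsilon used by renormalize_kernel below); the function
// name is hypothetical and k is assumed to be <= row.size().
#include <algorithm>
#include <numeric>
#include <vector>

void arg_topk_row_reference(std::vector<float> const &row, int k,
                            bool renormalize,
                            std::vector<float> &values,
                            std::vector<int> &indices) {
  indices.resize(row.size());
  std::iota(indices.begin(), indices.end(), 0);
  // Keep the k indices with the largest values, ordered descending.
  std::partial_sort(indices.begin(), indices.begin() + k, indices.end(),
                    [&](int a, int b) { return row[a] > row[b]; });
  indices.resize(k);
  values.resize(k);
  for (int i = 0; i < k; i++) {
    values[i] = row[indices[i]];
  }
  if (renormalize) {
    float sum = std::accumulate(values.begin(), values.end(), 0.0f) + 1e-6f;
    for (float &v : values) {
      v /= sum;
    }
  }
}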
- for (int index = heap_end_index; index < length; index += step_size) { - // We prefer elements with lower indices. This is given here. - // Later elements automatically have higher indices, so can be discarded. - if (input[index] > heap.root().value) { - // This element should replace the min. - heap.replace_root({index, input[index]}, k); - } - } - - // Sort if wanted. - if (sorted) { - heap.sort(k); + // for (int i = tid; i < size; i += stride) { + // out[i] = __half2float(in[i]); + // } + CUDA_KERNEL_LOOP(i, size) { + out[i] = __half2float(in[i]); } } -// mergeShards performs a top-k merge on `num_shards` many sorted streams that -// are sorted and stored in `entries` in a strided way: -// |s_1 1st|s_2 1st|...s_{num_shards} 1st|s_1 2nd|s_2 2nd|... -// The overall top k elements are written to `top_k_values` and their indices -// to top_k_indices. -// `top_k_heap` is used as temporary storage for the merge heap. -template -__device__ void mergeShards(int num_shards, - int k, - Entry *__restrict__ entries, - Entry *__restrict__ top_k_heap, - float *top_k_values, - int *top_k_indices, - bool speculative_decoding) { - // If k < num_shards, we can use a min-heap with k elements to get the top k - // of the sorted blocks. - // If k > num_shards, we can initialize a min-heap with the top element from - // each sorted block. - int const heap_size = k < num_shards ? k : num_shards; - - // Min-heap part. - { - auto min_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; - // Initialize the heap as a min-heap. - for (int slot = 0; slot < heap_size; slot++) { - min_heap.assign(slot, {slot, entries[slot].value}); - } - min_heap.build(heap_size); - - // Now perform top k with the remaining shards (if num_shards > heap_size). - for (int shard = heap_size; shard < num_shards; shard++) { - auto const entry = entries[shard]; - auto const root = min_heap.root(); - if (entry.value < root.value) { - continue; - } - if (entry.value == root.value && - entry.index > entries[root.index].index) { - continue; - } - // This element should replace the min. - min_heap.replace_root({shard, entry.value}, heap_size); - } - } - - // Max-part. - { - // Turn the min-heap into a max-heap in-place. - auto max_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; - // Heapify into a max heap. - max_heap.build(heap_size); - - // Now extract the minimum k-1 times. - // k is treated specially. - int const last_k = k - 1; - for (int rank = 0; rank < last_k; rank++) { - Entry const &max_element = max_heap.root(); - if (speculative_decoding) { - assert(top_k_values != nullptr); - top_k_values[rank] = static_cast(max_element.value); +template +__global__ void insertion_sort_kernel(DT *topk_values, + int *topk_indices, + int batch_size, + int k) { + int batch_index = blockIdx.x * blockDim.x + threadIdx.x; + if (batch_index < batch_size) { + DT *values = topk_values + batch_index * k; + int *indices = topk_indices + batch_index * k; + + for (int i = 1; i < k; i++) { + DT key_val = values[i]; + int key_idx = indices[i]; + int j = i - 1; + while (j >= 0 && values[j] < key_val) { + values[j + 1] = values[j]; + indices[j + 1] = indices[j]; + j = j - 1; } - - int shard_index = max_element.index; - top_k_indices[rank] = entries[shard_index].index; - int next_shard_index = shard_index + num_shards; - // For rank < k-1, each top k heap still contains at least 1 element, - // so we can draw a replacement. 
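// The merge path being deleted here is superseded later in this patch: the new
// ArgTopK::forward_kernel delegates batched selection to RAFT. As shown in the
// replacement code further below, the call has roughly this shape
// (device_resources handle caching omitted; exact template parameters are not
// reproduced here):
//
//   raft::matrix::detail::select_k(*handle,
//                                  input_ptr,
//                                  (int *)nullptr,
//                                  batch_size,
//                                  (size_t)length,
//                                  k,
//                                  output_ptr,
//                                  indices_ptr,
//                                  /*select_min=*/false,
//                                  sorted);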
- max_heap.replace_root({next_shard_index, entries[next_shard_index].value}, - heap_size); + values[j + 1] = key_val; + indices[j + 1] = key_idx; } - - // rank == last_k. - Entry const &max_element = max_heap.root(); - // top_k_values[last_k] = max_element.value; - int shard_index = max_element.index; - top_k_indices[last_k] = entries[shard_index].index; } } -template -__global__ void arg_topk_forward_kernel(T const *__restrict__ input, - size_t shared_memory_size, - int length, - int k, - bool sorted, - float *__restrict__ output, - int *__restrict__ indices, - bool speculative_decoding) { - __shared__ char shared_memory[48 << 10]; - int const batch_index = blockIdx.x; - T const *batch_input = input + batch_index * length; - int const thread_index = threadIdx.x; - int const thread_count = blockDim.x; - Entry *shared_entries = (Entry *)shared_memory; - heapArgTopK( - batch_input, length, k, shared_entries, true, thread_index, thread_count); - __syncthreads(); - if (thread_index == 0) { - int const offset = batch_index * k; - auto batch_output = output + offset; - auto batch_indices = indices + offset; - Entry *top_k_heap = shared_entries + thread_count * k; - mergeShards(thread_count, - k, - shared_entries, - top_k_heap, - batch_output, - batch_indices, - speculative_decoding); +template +__global__ void renormalize_kernel(DT *topk_values, + int batch_size, + int k, + float epsilon = 1e-6) { + int batch_index = blockIdx.x * blockDim.x + threadIdx.x; + assert(batch_index < batch_size); + DT *values = topk_values + batch_index * k; + DT sum = 0; + for (int i = 0; i < k; i++) { + sum += values[i]; + } + sum += epsilon; + for (int i = 0; i < k; i++) { + values[i] /= sum; } } +// Adopted from Raft's select_k +// https://github.com/rapidsai/raft/blob/branch-24.10/cpp/include/raft/matrix/detail/select_k.cuh + /*static*/ template -void ArgTopK::forward_kernel(ArgTopKMeta const *m, +void ArgTopK::forward_kernel(ArgTopKMeta *m, DT const *input_ptr, - float *output_ptr, + DT *output_ptr, int *indices_ptr, size_t batch_size, int length, int k, bool sorted, - BeamSearchBatchConfig const *bc, + bool renormalize, + BatchConfig const *bc, cudaStream_t stream) { - // Adopted from TensorFlow's ArgTopK implementation - // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h - int num_shards = 0; - { - constexpr auto shared_memory_size = 48 << 10; - auto const heap_size = k * sizeof(Entry
); - // shared_memory_size = (num_shards + 1) * heap_size <=> - num_shards = shared_memory_size / heap_size - 1; - assert(num_shards > 0); - if (num_shards > CUDA_NUM_THREADS) { - num_shards = CUDA_NUM_THREADS; - } - } - // We are limited by the amount of shared memory we have per block. - size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry
); - // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; - size_t num_blocks = batch_size; - - // all requests are in the same beam stages - if (m->speculative_decoding) { - assert(bc->num_active_requests() >= 0); - - // check - // allow last request different with others - int beam_size = -1; - int num_activate_requests = bc->num_active_requests(); - int last_request_idx = - bc->requestsInfo[num_activate_requests - 1].batch_config_request_id; - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } else if (beam_size == -1) { - beam_size = bc->beamRequestsInfo[i].beam_size; - - } else if (i != last_request_idx) { - assert(beam_size == bc->beamRequestsInfo[i].beam_size); - } else if (i == last_request_idx) { - } - } - assert(num_shards >= (size_t)beam_size); - num_shards = k; - arg_topk_forward_kernel<<>>( - input_ptr, - shared_memory_size, - length, - beam_size, - sorted, - output_ptr, - indices_ptr, - m->speculative_decoding); - } else { - - assert(num_shards >= (size_t)k); - num_shards = k; - arg_topk_forward_kernel<<>>( - input_ptr, - shared_memory_size, - length, - k, - sorted, - nullptr, - indices_ptr, - false); + assert(bc->num_active_requests() >= 0); + if (m->device_resources.find(stream) == m->device_resources.end()) { + m->device_resources[stream] = new raft::device_resources(stream); + } + raft::device_resources *handle = m->device_resources[stream]; + raft::matrix::detail::select_k(*handle, + input_ptr, + (int *)nullptr, + batch_size, + (size_t)length, + k, + output_ptr, + indices_ptr, + /*select_min=*/false, + sorted); + // if (sorted) { + // assert(output_ptr != nullptr); + // insertion_sort_kernel<<>>(output_ptr, indices_ptr, batch_size, + // k); + // } + if (renormalize) { + assert(output_ptr != nullptr); + renormalize_kernel<<>>(output_ptr, batch_size, k); } } /*static*/ -void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, +void ArgTopK::forward_kernel_wrapper(ArgTopKMeta *m, GenericTensorAccessorR const &input, // float *output_ptr, GenericTensorAccessorW const &probs, GenericTensorAccessorW const &indices, int batch_size, - BeamSearchBatchConfig const *bc) { + BatchConfig const *bc) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -497,28 +176,36 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, } if (input.data_type == DT_HALF) { + // printf("ArgTopK: length = %d, batch_size = %d\n", length, batch_size); ArgTopK::forward_kernel(m, input.get_half_ptr(), - m->speculative_decoding ? probs.get_float_ptr() - : nullptr, + (half *)m->half_precision_output, indices.get_int32_ptr(), batch_size, length, k, m->sorted, - m->speculative_decoding ? bc : nullptr, + m->renormalize, + bc, stream); + // transfer data from half to float (half_precision_output to output) + int size = k * batch_size; + half2float_kernel<<>>( + (half const *)m->half_precision_output, probs.get_float_ptr(), size); } else if (input.data_type == DT_FLOAT) { ArgTopK::forward_kernel(m, input.get_float_ptr(), - m->speculative_decoding ? probs.get_float_ptr() - : nullptr, + probs.get_float_ptr(), indices.get_int32_ptr(), batch_size, length, k, m->sorted, - m->speculative_decoding ? 
bc : nullptr, + m->renormalize, + bc, stream); } else { assert(false && "Unsupported data type"); @@ -535,7 +222,23 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, } } -ArgTopKMeta::ArgTopKMeta(FFHandler handler, Op const *op) - : OpMeta(handler, op) {} +ArgTopKMeta::ArgTopKMeta(FFHandler handler, + Op const *op, + MemoryAllocator &gpu_mem_allocator) + : OpMeta(handler, op) { + max_output_size = BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_K_LOGITS; + gpu_mem_allocator.create_legion_instance( + reserveInst, sizeof(half) * max_output_size, "ArgTopKMeta"); + half_precision_output = gpu_mem_allocator.allocate_instance_untyped( + sizeof(half) * max_output_size); +} +ArgTopKMeta::~ArgTopKMeta() { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } + for (auto &kv : device_resources) { + delete kv.second; + } +} }; // namespace FlexFlow diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index a52ce1886..0524defce 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -44,7 +44,7 @@ using Legion::TaskArgument; using Legion::TaskLauncher; using PCG::Node; -Tensor FFModel::argmax(const Tensor input, bool beam_search, char const *name) { +Tensor FFModel::argmax(Tensor const input, bool beam_search, char const *name) { Layer *li = new Layer(this, OP_ARGMAX, input->data_type, @@ -106,7 +106,7 @@ bool operator==(ArgMaxParams const &lhs, ArgMaxParams const &rhs) { } ArgMax::ArgMax(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, bool _beam_search, char const *name) : Op(model, @@ -136,12 +136,12 @@ ArgMax::ArgMax(FFModel &model, } } -ArgMax::ArgMax(FFModel &model, ArgMax const &other, const ParallelTensor input) +ArgMax::ArgMax(FFModel &model, ArgMax const &other, ParallelTensor const input) : ArgMax(model, input, other.beam_search, other.name) {} ArgMax::ArgMax(FFModel &model, ArgMaxParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) : ArgMax(model, input, params.beam_search, params.name) {} @@ -332,7 +332,7 @@ FutureMap ArgMax::inference(FFModel const &ff, } } -BeamInferenceResult +InferenceResult ArgMax::inference_task_beam(Task const *task, std::vector const ®ions, Context ctx, @@ -342,7 +342,7 @@ BeamInferenceResult BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { // Directly return for empty batch config - BeamInferenceResult ir; + InferenceResult ir; return ir; } ArgMaxMeta *m = *((ArgMaxMeta **)task->local_args); @@ -355,17 +355,17 @@ BeamInferenceResult GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( DT_INT32, regions[2], task->regions[2], FID_DATA, ctx, runtime); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); - BeamInferenceResult ir; + InferenceResult ir; + ir.num_token_ids = batch_size; download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); download_tensor(m->probs, ir.probs, batch_size); - download_tensor(parent.get_int32_ptr(), ir.parent_id, batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; ArgMax::save_inference_tensors_to_file( - m, shard_id, bc, {}, {}, {input, indices, parent}); + m, shard_id, bc, {}, {}, {input, indices}); } return ir; @@ -394,6 +394,7 @@ InferenceResult int batch_size = bc->num_active_tokens(); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); InferenceResult ir; + ir.num_token_ids = batch_size; if (m->inference_debugging) { 
assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index 8a1cf0b3b..bd0b2bd19 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -493,7 +493,8 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, size_t prob_size = batch_size; assert(data_type == DT_FLOAT || data_type == DT_HALF); size_t total_size = prob_size * sizeof(float); - gpu_mem_allocator.create_legion_instance(reserveInst, total_size); + gpu_mem_allocator.create_legion_instance( + reserveInst, total_size, "ArgMaxMeta"); probs = gpu_mem_allocator.allocate_instance(prob_size); } ArgMaxMeta::~ArgMaxMeta(void) { diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 05c84719c..42d1a96f3 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -23,10 +23,11 @@ __global__ void init_offset(int batch_size, int vocab_size, int total_eles, int *d_offsets) { - CUDA_KERNEL_LOOP(i, total_eles) { - if (i % vocab_size == 0) { - d_offsets[i / vocab_size] = i; - } + CUDA_KERNEL_LOOP(i, (total_eles) / vocab_size + 1) { + // if (i % vocab_size == 0) { + // d_offsets[i / vocab_size] = i; + // } + d_offsets[i] = i * vocab_size; } } @@ -83,7 +84,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, prob_ptr, batch_size, m->beam_search); - // print_tensor(indices_ptr, 32, "argmax op"); + // print_tensor(indices_ptr, 4, "argmax op"); } /*static*/ @@ -151,7 +152,7 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - size_t d_offsets_size = batch_size; + size_t d_offsets_size = batch_size + 1; size_t prob_size = batch_size; assert(data_type == DT_FLOAT || data_type == DT_HALF); size_t total_size = @@ -160,7 +161,8 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, ? sizeof(cub::KeyValuePair) * batch_size : sizeof(cub::KeyValuePair) * batch_size) + prob_size * sizeof(float); - gpu_mem_allocator.create_legion_instance(reserveInst, total_size); + gpu_mem_allocator.create_legion_instance( + reserveInst, total_size, "ArgMaxMeta"); d_offsets = gpu_mem_allocator.allocate_instance(d_offsets_size); d_out = data_type == DT_FLOAT ? gpu_mem_allocator.allocate_instance_untyped( @@ -199,7 +201,8 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, stream)); } - gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes); + gpu_mem_allocator.create_legion_instance( + reserveInst, temp_storage_bytes, "ArgMaxMeta"); d_temp_storage = gpu_mem_allocator.allocate_instance_untyped(temp_storage_bytes); } diff --git a/src/ops/attention_impl.cu b/src/ops/attention_impl.cu new file mode 100644 index 000000000..f3cc8df92 --- /dev/null +++ b/src/ops/attention_impl.cu @@ -0,0 +1,818 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
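// The new attention_impl.cu translation unit below consists entirely of
// explicit template instantiations of the flashinfer attention kernels, so the
// heavy template bodies are compiled once here rather than in every file that
// includes the headers. A minimal illustration of the mechanism, using
// placeholder names that are not part of flashinfer:
#include <cuda_runtime.h>

// Template definition (in flashinfer's case this lives in the .cuh headers).
template <int HeadDim, typename T>
__global__ void scale_rows(T *data, T alpha, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n * HeadDim) {
    data[i] *= alpha;
  }
}

// Explicit instantiations: force device code generation for the shapes used at
// runtime, exactly once, in this translation unit.
template __global__ void scale_rows<64, float>(float *, float, int);
template __global__ void scale_rows<128, float>(float *, float, int);
template __global__ void scale_rows<256, float>(float *, float, int);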
+ */ +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "cuComplex.h" +#endif +#include "flashinfer/attention_impl.cuh" + +// This is for instantiating the template attention kernels +namespace flashinfer { + +// warp_layout_literal[] = { +// "WarpLayout::k4x1x2", +// "WarpLayout::k4x1x1", +// "WarpLayout::k1x4x1", +// } +// head_dim[] = {64, 128, 256}; + +/********** batch append instantiations for half precision **********/ + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float 
logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +/********** batch prefill instantiations for half precision **********/ + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t 
*q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + 
int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +/********** batch decode instantiations for half precision **********/ +template cudaError_t + BatchDecodeWithPagedKVCacheDispatched<64, + PageStorage::kIndices, + LogitsPostHook::kNone, + PosEncodingMode::kNone, + half, + half, + half, + int32_t>( + half *q, + int32_t *q_offset, + paged_kv_t paged_kv, + kv_partition_info_t kv_partition_info, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + bool *block_valid_mask, + uint32_t padded_batch_size, + uint32_t num_qo_heads, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchDecodeWithPagedKVCacheDispatched<128, + PageStorage::kIndices, + LogitsPostHook::kNone, + PosEncodingMode::kNone, + half, + half, + half, + int32_t>( + half *q, + int32_t *q_offset, + paged_kv_t paged_kv, + kv_partition_info_t kv_partition_info, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + bool *block_valid_mask, + uint32_t padded_batch_size, + uint32_t num_qo_heads, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchDecodeWithPagedKVCacheDispatched<256, + PageStorage::kIndices, + LogitsPostHook::kNone, + PosEncodingMode::kNone, + half, + half, + half, + int32_t>( + half *q, + int32_t *q_offset, + paged_kv_t paged_kv, + kv_partition_info_t kv_partition_info, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + bool *block_valid_mask, + uint32_t padded_batch_size, + uint32_t num_qo_heads, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +} // namespace flashinfer diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc deleted file mode 100644 index d2054cacb..000000000 --- a/src/ops/beam_topk.cc +++ /dev/null @@ -1,476 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "flexflow/ops/beam_topk.h" -#include "flexflow/model.h" -#include "flexflow/utils/hash_utils.h" -#include "legion/legion_utilities.h" -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) -#include "flexflow/utils/cuda_helper.h" -#else -#include "flexflow/utils/hip_helper.h" -#endif - -namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::Future; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::InlineLauncher; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; -using PCG::Node; - -// For an input tensor, computes the top k entries in each row -// (resp. vector along the last dimension). Thus, -// values.shape = indices.shape = input.shape[:-1] + [k] -Tensor FFModel::beam_top_k(const Tensor input, - int max_beam_width, - bool sorted, - char const *name) { - Layer *li = new Layer(this, - OP_BEAM_TOPK, - input->data_type, - name, - 1 /*inputs*/, - 0 /*weights*/, - 3 /*outputs*/, - input); - { - int numdims = input->num_dims; - - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = input->dims[i]; - } - dims[0] = max_beam_width; - - std::cout << "beam input dimen:" << numdims << "\n"; - for (int i = 0; i < numdims; i++) { - std::cout << input->dims[i] << ", "; - } - - // beam width is dynamic - li->outputs[0] = create_tensor_legion_ordering( - numdims, dims, DT_INT32, li, 0, false /*create_grad*/); - li->outputs[1] = create_tensor_legion_ordering( - numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/); - li->outputs[2] = create_tensor_legion_ordering( - numdims, dims, DT_INT32, li, 1, false /*create_grad*/); - } - li->add_int_property("sorted", sorted); - li->add_int_property("max_beam_width", max_beam_width); - layers.push_back(li); - // outputs[0] = li->outputs[0]; - // outputs[1] = li->outputs[1]; - return li->outputs[1]; -} - -Op *BeamTopK::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - long long value; - layer->get_int_property("sorted", value); - bool sorted = (bool)value; - layer->get_int_property("max_beam_width", value); - int max_beam_width = value; - return new BeamTopK( - model, inputs[0], layer->layer_guid, max_beam_width, sorted, layer->name); -} - -BeamTopKParams BeamTopK::get_params() const { - BeamTopKParams params; - params.layer_guid = this->layer_guid; - params.sorted = this->sorted; - params.max_beam_width = this->max_beam_width; - return params; -} - -bool BeamTopKParams::is_valid(ParallelTensorShape const &) const { - // topk is always valid - return true; -} - -bool operator==(BeamTopKParams const &lhs, BeamTopKParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid && lhs.sorted == rhs.sorted && - lhs.max_beam_width == rhs.max_beam_width; -} - -BeamTopK::BeamTopK(FFModel &model, - const ParallelTensor _input, - LayerID 
const &_layer_guid, - int _max_beam_width, - bool _sorted, - char const *name) - : Op(model, - OP_BEAM_TOPK, - _input->data_type, - name, - 1 /*inputs*/, - 0 /*weights*/, - 3 /*outputs*/, - _input) { - sorted = _sorted; - max_beam_width = _max_beam_width; - layer_guid = _layer_guid; - int numdim = inputs[0]->num_dims; - assert(inputs[0]->dims[0].degree == 1); - assert(inputs[0]->dims[0].parallel_idx == -1); - // outputs[0] = model.create_parallel_tensor_legion_ordering( - // numdim, dims, _input->data_type, this, 0 /*owner_idx*/); - outputs[0] = model.create_parallel_tensor_legion_ordering( - numdim, inputs[0]->dims, DT_INT32, this, 0 /*owner_idx*/); - outputs[1] = model.create_parallel_tensor_legion_ordering( - numdim, inputs[0]->dims, DT_FLOAT, this, 1 /*owner_idx*/); - outputs[2] = model.create_parallel_tensor_legion_ordering( - numdim, inputs[0]->dims, DT_INT32, this, 2 /*owner_idx*/); -} - -BeamTopK::BeamTopK(FFModel &model, - BeamTopK const &other, - const ParallelTensor input) - : BeamTopK(model, - input, - other.layer_guid, - other.max_beam_width, - other.sorted, - other.name) {} - -BeamTopK::BeamTopK(FFModel &model, - BeamTopKParams const ¶ms, - const ParallelTensor input, - char const *name) - : BeamTopK(model, - input, - params.layer_guid, - params.max_beam_width, - params.sorted, - params.name) {} - -void BeamTopK::init_inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = batch_outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); - IndexLauncher launcher(BEAM_TOPK_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(BeamTopK)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[1]->region)); - launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[2]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[2]->region)); - launcher.add_field(3, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); -} - -void BeamTopK::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - IndexLauncher launcher(BEAM_TOPK_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(BeamTopK)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - 
READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[1]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[1]->region)); - launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[2]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[2]->region)); - launcher.add_field(3, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); -} - -OpMeta *BeamTopK::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - BeamTopK *topk = (BeamTopK *)task->args; - FFHandler handle = *((FFHandler *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); - MemoryAllocator gpu_mem_allocator(gpu_mem); - BeamTopKMeta *m = new BeamTopKMeta(handle, topk, gpu_mem_allocator); - m->profiling = topk->profiling; - m->inference_debugging = topk->inference_debugging; - std::strcpy(m->op_name, topk->name); - m->layer_guid = topk->layer_guid; - m->sorted = topk->sorted; - m->max_beam_width = topk->max_beam_width; - m->input_type[0] = topk->inputs[0]->data_type; - return m; -} - -void BeamTopK::forward(FFModel const &ff) { - assert(false); -} - -FutureMap BeamTopK::inference(FFModel const &ff, - BatchConfigFuture const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - parallel_is = batch_outputs[0]->parallel_is; - MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); - size_t machine_view_hash = view->hash(); - - IndexLauncher launcher(BEAM_TOPK_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[1]->region)); - launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[2]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[2]->region)); - launcher.add_field(3, FID_DATA); - - return runtime->execute_index_space(ctx, launcher); -} - -BeamInferenceResult - BeamTopK::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - - assert(regions.size() == 4); - assert(task->regions.size() == 4); - - BeamTopKMeta *m = *((BeamTopKMeta **)task->local_args); - BeamSearchBatchConfig const &bc = - Future(task->futures[0]).get_result(); - - if (bc.num_tokens == 0) { - BeamInferenceResult ir; - return ir; - } - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW index = helperGetGenericTensorAccessorWO( - DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW value = helperGetGenericTensorAccessorWO( - DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( - DT_INT32, regions[3], task->regions[3], FID_DATA, ctx, runtime); - - Domain input_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - - int *index_ptr = index.get_int32_ptr(); - float *value_ptr = value.get_float_ptr(); - int *parent_ptr = parent.get_int32_ptr(); - - // embedding size: eg. 
4096 - int length = input_domain.hi()[0] - input_domain.lo()[0] + 1; - // total token nums - size_t batch_size = bc.num_active_tokens(); - - // need meta for: how many sub requests in a main request - BeamTopK::forward_kernel_wrapper(m, - &bc, - input, - value_ptr, - index_ptr, - parent_ptr, - batch_size, - length, - m->sorted); - - BeamInferenceResult ir; - - download_tensor(index_ptr, ir.token_ids, batch_size * m->max_beam_width); - download_tensor(value_ptr, ir.probs, batch_size * m->max_beam_width); - download_tensor( - parent_ptr, ir.parent_id, batch_size * m->max_beam_width); - - if (m->inference_debugging) { - assert(task->index_point.get_dim() == 1); - int shard_id = task->index_point.point_data[0]; - BeamTopK::save_inference_tensors_to_file( - m, shard_id, &bc, {input}, {}, {index, value, parent}); - } - - return ir; -} - -void BeamTopK::backward(FFModel const &ff) { - assert(false); -} - -void BeamTopK::serialize(Legion::Serializer &sez) const { - sez.serialize(this->layer_guid.id); - sez.serialize(this->layer_guid.transformer_layer_id); - sez.serialize(this->layer_guid.model_id); - sez.serialize(this->sorted); - sez.serialize(this->max_beam_width); - sez.serialize(strlen(this->name)); - sez.serialize(this->name, strlen(this->name)); -} - -Node BeamTopK::deserialize(FFModel &ff, - Legion::Deserializer &dez, - ParallelTensor inputs[], - int num_inputs) { - assert(num_inputs == 1); - bool sorted; - size_t id, transformer_layer_id, deserialized_model_id; - int max_beam_width; - dez.deserialize(id); - dez.deserialize(transformer_layer_id); - dez.deserialize(deserialized_model_id); - LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); - dez.deserialize(sorted); - dez.deserialize(max_beam_width); - size_t name_len; - char name[MAX_OPNAME] = {0}; - dez.deserialize(name_len); - dez.deserialize(name, name_len); - - BeamTopKParams params; - params.layer_guid = layer_guid; - params.sorted = sorted; - params.max_beam_width = max_beam_width; - strcpy(params.name, name); - return ff.get_or_create_node(inputs[0], params); -} - -Op *BeamTopK::materialize(FFModel &ff, - ParallelTensor inputs[], - int num_inputs) const { - BeamTopKParams params = get_params(); - return new BeamTopK(ff, params, inputs[0], this->name); -} - -bool BeamTopK::measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const { - return false; -} - -}; // namespace FlexFlow - -namespace std { -size_t hash::operator()( - FlexFlow::BeamTopKParams const ¶ms) const { - size_t key = 0; - hash_combine(key, params.layer_guid.id); - hash_combine(key, params.sorted); - hash_combine(key, params.max_beam_width); - return key; -} -}; // namespace std diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp deleted file mode 100644 index 18534455a..000000000 --- a/src/ops/beam_topk.cpp +++ /dev/null @@ -1,724 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "flexflow/ops/beam_topk.h" -#include "flexflow/ffconst_utils.h" -#include "flexflow/utils/hip_helper.h" -#include - -namespace FlexFlow { -// declare Legion names -using Legion::coord_t; - -enum class HeapType { kMinHeap, kMaxHeap }; -enum class PreferIndices { kLower, kHigher }; - -LegionRuntime::Logger::Category log_beam_topk("BeamTopK"); - -template -struct Entry { - int index; - T value; -}; - -template -struct LinearData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index]; - } - - __device__ int get_index(int i) const { - return data[i].index; - } - __device__ T get_value(int i) const { - return data[i].value; - } - - Entry *const data; -}; - -template -struct IndirectLinearData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index]; - } - - __device__ int get_index(int i) const { - return backing_data[data[i].index].index; - } - __device__ T get_value(int i) const { - return data[i].value; - } - - Entry *const data; - Entry *const backing_data; -}; - -template -struct StridedData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index * blockDim.x + threadIdx.x]; - } - - __device__ int get_index(int i) const { - return (*this)[i].index; - } - __device__ T get_value(int i) const { - return (*this)[i].value; - } - - Entry *const data; -}; - -// A heap of Entry that can either work as a min-heap or as a max-heap. -template - class Data, - typename T> -struct IndexedHeap { - typedef typename Data::Entry Entry; - Data const data; - __device__ IndexedHeap(Data const &d) : data(d) {} - - __device__ bool is_above(int left, int right) { - T left_value = data.get_value(left); - T right_value = data.get_value(right); - if (left_value == right_value) { - if (preferIndices == PreferIndices::kLower) { - return data.get_index(left) < data.get_index(right); - } else { - return data.get_index(left) > data.get_index(right); - } - } - if (heapType == HeapType::kMinHeap) { - return left_value < right_value; - } else { - return left_value > right_value; - } - } - - __device__ void assign(int i, Entry const &entry) { - data[i] = entry; - } - - __device__ void push_up(int i) { - int child = i; - int parent; - for (; child > 0; child = parent) { - parent = (child - 1) / 2; - if (!is_above(child, parent)) { - // Heap property satisfied. - break; - } - swap(child, parent); - } - } - - __device__ void swap(int a, int b) { - auto tmp = data[b]; - data[b] = data[a]; - data[a] = tmp; - } - - __device__ void push_root_down(int k) { - push_down(0, k); - } - - // MAX-HEAPIFY in Cormen - __device__ void push_down(int node, int k) { - while (true) { - int const left = 2 * node + 1; - int const right = left + 1; - int smallest = node; - if (left < k && is_above(left, smallest)) { - smallest = left; - } - if (right < k && is_above(right, smallest)) { - smallest = right; - } - if (smallest == node) { - break; - } - swap(smallest, node); - node = smallest; - } - } - - // BUILD-MAX-HEAPIFY in Cormen - __device__ void build(int k) { - for (int node = (k - 1) / 2; node >= 0; node--) { - push_down(node, k); - } - } - - // HEAP-EXTRACT-MAX in Cormen - __device__ void remove_root(int k) { - data[0] = data[k - 1]; - push_root_down(k - 1); - } - - // in-place HEAPSORT in Cormen - // This method destroys the heap property. 
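// [Editor's note] The in-place HEAPSORT described above (and implemented by the
// __device__ sort() that follows) is the classic "min-heap of k entries, then heapsort"
// top-k pattern used throughout this deleted file. A minimal, self-contained host-side
// sketch of the same idea, for illustration only; HostEntry and host_top_k are
// hypothetical names and are not part of FlexFlow:
#include <algorithm>
#include <vector>
struct HostEntry {
  int index;
  float value;
};
// Keep the k largest values of input[0..length); return them sorted largest-first.
inline std::vector<HostEntry> host_top_k(float const *input, int length, int k) {
  // Min-heap on value: the root is the smallest of the current top-k candidates.
  auto by_value_desc = [](HostEntry const &a, HostEntry const &b) {
    return a.value > b.value;
  };
  std::vector<HostEntry> heap;
  heap.reserve(k);
  for (int i = 0; i < length; ++i) {
    if ((int)heap.size() < k) {
      heap.push_back({i, input[i]});
      std::push_heap(heap.begin(), heap.end(), by_value_desc);
    } else if (input[i] > heap.front().value) {
      // New element beats the current minimum: replace the root.
      std::pop_heap(heap.begin(), heap.end(), by_value_desc);
      heap.back() = {i, input[i]};
      std::push_heap(heap.begin(), heap.end(), by_value_desc);
    }
  }
  // In-place heapsort; with a greater-than comparator this yields descending values.
  std::sort_heap(heap.begin(), heap.end(), by_value_desc);
  return heap;
}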
- __device__ void sort(int k) { - for (int slot = k - 1; slot > 0; slot--) { - // This is like remove_root but we insert the element at the end. - swap(slot, 0); - // Heap is now an element smaller. - push_root_down(/*k=*/slot); - } - } - - __device__ void replace_root(Entry const &entry, int k) { - data[0] = entry; - push_root_down(k); - } - - __device__ Entry const &root() { - return data[0]; - } -}; - -template - class Data, - typename T> -__device__ IndexedHeap - make_indexed_heap(typename Data::Entry *data) { - return IndexedHeap{Data{data}}; -} - -// heapBeamTopK walks over [input, input+length) with `step_size` stride -// starting at `start_index`. It builds a top-`k` heap that is stored in -// `heap_entries` using `Accessor` to access elements in `heap_entries`. If -// sorted=true, the elements will be sorted at the end. -template class Data = LinearData> -__device__ void heapBeamTopK(T const *__restrict__ input, - int batch_index, - int length, - int k, - Entry *__restrict__ heap_entries, - bool sorted = false, - int start_index = 0, - int step_size = 1) { - assert(k <= length); - auto heap = - make_indexed_heap( - heap_entries); - - int heap_end_index = start_index + k * step_size; - if (heap_end_index > length) { - heap_end_index = length; - } - // Initialize the min-heap. - for (int index = start_index, slot = 0; index < heap_end_index; - index += step_size, slot++) { - heap.assign(slot, {index, input[index]}); - } - - heap.build(k); - - // Now iterate over the remaining items. - // If an item is smaller than the min element, it is not amongst the top k. - // Otherwise, replace the min element with it and push upwards. - for (int index = heap_end_index; index < length; index += step_size) { - // We prefer elements with lower indices. This is given here. - // Later elements automatically have higher indices, so can be discarded. - if (input[index] > heap.root().value) { - // This element should replace the min. - heap.replace_root({index, input[index]}, k); - } - } - - // Sort if wanted. - if (sorted) { - heap.sort(k); - } - - // if(batch_index == 0){ - // printf("top elemmments: %d, value %.15f\n", start_index, - // heap.root().value); - // } -} - -template -__device__ void mergeBeamShards(int num_shards, - int batch_index, - int k, - int max_heap_size, - int request_id, - int *parent_id, - T *probs, - Entry *__restrict__ entries, - Entry *__restrict__ top_k_heap, - float *top_k_values, - int *top_k_indices, - int *top_k_parents, - bool verbose) { - // If k < num_shards, we can use a min-heap with k elements to get the top k - // of the sorted blocks. - // If k > num_shards, we can initialize a min-heap with the top element from - // each sorted block. - int const heap_size = k < num_shards ? k : num_shards; - // printf("see value: %f", entries[0].value); - // Min-heap part. - - { - auto min_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; - // Initialize the heap as a min-heap. - for (int slot = 0; slot < heap_size; slot++) { - // int beam = (slot % max_heap_size) / k; - T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((slot % max_heap_size) / k)]; - min_heap.assign(slot, {slot, (entries[slot].value * prob)}); - } - min_heap.build(heap_size); - - // Now perform top k with the remaining shards (if num_shards > heap_size). 
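// [Editor's note] Throughout mergeBeamShards, each shard's candidate logit is rescaled
// by the accumulated probability of the sub-request (beam) it came from:
// probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + beam], where the beam index
// is recovered from the flat slot/shard id as (slot % max_heap_size) / k. For example,
// with k = 4 and max_heap_size = 8 (two sub-requests), slots 0-3 map to beam 0 and
// slots 4-7 map to beam 1.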
- for (int shard = heap_size; shard < num_shards; shard++) { - auto const entry = entries[shard]; - auto const root = min_heap.root(); - - T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((shard % max_heap_size) / k)]; - if (entry.value * prob < root.value) { - continue; - } - if (entry.value * prob == root.value && - entry.index > entries[root.index].index) { - continue; - } - // This element should replace the min. - min_heap.replace_root({shard, entry.value * prob}, heap_size); - } - } - - // Max-part. - { - // Turn the min-heap into a max-heap in-place. - auto max_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; - // Heapify into a max heap. - max_heap.build(heap_size); - - // Now extract the minimum k-1 times. - // k is treated specially. - int const last_k = k - 1; - for (int rank = 0; rank < last_k; rank++) { - Entry const &max_element = max_heap.root(); - top_k_values[rank] = __half2float(max_element.value); - int shard_index = max_element.index; - top_k_indices[rank] = entries[shard_index].index; - top_k_parents[rank] = - parent_id[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((shard_index % max_heap_size) / k)]; - int next_shard_index = shard_index + num_shards; - - T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((next_shard_index % max_heap_size) / k)]; - - max_heap.replace_root( - {next_shard_index, entries[next_shard_index].value * prob}, - heap_size); - } - - // rank == last_k. - Entry const &max_element = max_heap.root(); - top_k_values[last_k] = __half2float(max_element.value); - int shard_index = max_element.index; - top_k_indices[last_k] = entries[shard_index].index; - top_k_parents[last_k] = - parent_id[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((shard_index % max_heap_size) / k)]; - } -} - -template -__global__ void - mergeSubRequestsKernel(int64_t N, T const *X, T const *rstd, T *Y) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rstd[i]); - } -} - -template -__global__ void beam_topk_forward_kernel(T const *__restrict__ input, - size_t shared_memory_size, - int length, - int k, - int max_heap_size, - int *parent_ids, - T *acc_probs, - int *gpu_block_start_index, - int *gpu_request_id, - int *tokens_per_request, - bool sorted, - float *__restrict__ output, - int *__restrict__ indices, - int *__restrict__ parents, - bool verbose) { - __shared__ char shared_memory[48 << 10]; - int const batch_index = blockIdx.x; - // T const *batch_input = input + batch_index * length; - int const thread_index = threadIdx.x; - int const thread_count = blockDim.x; - int const request_id = gpu_request_id[batch_index]; - int const token_nums = tokens_per_request[batch_index]; - Entry *shared_entries = (Entry *)shared_memory; - - int sub_request_id = thread_index / k; - // if (verbose) { - // printf("beam kernel: batch_index: %d, thread_index %d, sub_request_id %d, - // " - // "request_id %d, token_nums %d\n", - // batch_index, - // thread_index, - // sub_request_id, - // request_id, - // token_nums); - // } - - T const *batch_input = input + gpu_block_start_index[batch_index] + - (sub_request_id * token_nums * length); - - // printf("thread index %d, thread_count %d, batch_index %d\n", thread_index, - // thread_count, batch_index); - heapBeamTopK(batch_input, - batch_index, - length, - k, - shared_entries, - true, - thread_index % k, - k); - __syncthreads(); - // 
printf("beam thread index %d, thread_count %d, thread index %d, batch_index - // " - // "%d, k %d, parent_id %d, acc_prob: %f, sub id: %d, request_id: %d, - // offset: %d, offset2 %d, sub_request_id %d\n", thread_index, - // thread_count, - // thread_index, - // batch_index, - // k, - // parent_ids[request_id * BatchConfig::MAX_NUM_BEAMS + - // sub_request_id], acc_probs[request_id * BatchConfig::MAX_NUM_BEAMS + - // sub_request_id], sub_request_id, request_id, - // gpu_block_start_index[batch_index], - // batch_index * length, - // sub_request_id); - - if (thread_index == 0) { - // merge beam_width heaps and store the parent - // find which req it belongs to, replace the offset - // printf("merge heaps, batch index: %d, sub_request_id %d, value %f\n", - // batch_index, - // sub_request_id, - // acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - // sub_request_id]); - int const offset = batch_index * k; - auto batch_output = output + offset; - auto batch_indices = indices + offset; - auto batch_parents = parents + offset; - Entry *top_k_heap = shared_entries + thread_count * k; - - // if(batch_index == 0 && verbose) { - // for(int i = 0; i < 18; i++){ - // printf("see value: %.15f\n", shared_entries[i].value); - // } - // } - - // get parent/acc based on the sub request and main request - mergeBeamShards(thread_count, - batch_index, - k, - max_heap_size, - request_id, - parent_ids, - acc_probs, - shared_entries, - top_k_heap, - batch_output, - batch_indices, - batch_parents, - verbose /*verbose prints*/); - } -} - -/*static*/ -template -void BeamTopK::forward_kernel(BeamTopKMeta const *m, - BeamSearchBatchConfig const *bc, - DT const *input_ptr, - float *output_ptr, - int *indices_ptr, - int *parent_ptr, - int batch_size, - int length, - bool sorted, - hipStream_t stream) { - // Adopted from TensorFlow's BeamTopK implementation - // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h - - int num_shards = 0; - int max_heap_size = 0; - int max_beam_width = 0; - int req_index = 0; - - // sub request - int const *sub_requests = bc->sub_requests; - - // std::vector beam_slots = bc->beam_slots; - // assert(bc->beam_slots.size() > 0); - - int beam_num_blocks = 0; - std::vector beam_block_start_index; - std::vector request_id; - std::vector tokens_per_request; - - int block_start_index = 0; - - // a data structure for prob, parent_id, - int max_total_requests = - BeamSearchBatchConfig::MAX_BEAM_WIDTH * bc->num_active_requests(); - int parent_ids[max_total_requests]; - DT acc_probs[max_total_requests]; - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - assert(bc->beamRequestsInfo[i].beam_size > 0); - - // int num_new_tokens = bc->num_processing_tokens[i]; - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - - // get beam size; - int beam_size = bc->beamRequestsInfo[i].beam_size; - - // initial request - log_beam_topk.debug() << "sub_requests: " << i << ", " << sub_requests[i] - << "\n"; - assert(sub_requests[i] > 0); - // process sub requests - for (int j = 0; j < sub_requests[i]; j++) { - parent_ids[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = j; - // beam_slots[i].parent_id[j]; - acc_probs[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = - bc->beamRequestsInfo[i].probs[j]; - log_beam_topk.debug() - << "probbbb req: " << i - << ", sub req probability : " << bc->beamRequestsInfo[i].probs[j] - << ", sub request id " << j << ", parent id " - << 
bc->beamRequestsInfo[i].parent_id[j] << ", data inddd" - << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j << "\n"; - } - - // process tokens - for (int k = 0; k < num_new_tokens; k++) { - beam_block_start_index.push_back(block_start_index); - request_id.push_back(i); - tokens_per_request.push_back(num_new_tokens); - block_start_index += length; - beam_num_blocks++; - } - - max_heap_size = std::max(max_heap_size, beam_size * sub_requests[i]); - max_beam_width = std::max(max_beam_width, beam_size); - req_index += 1; - block_start_index += (sub_requests[i] - 1) * num_new_tokens * length; - } - log_beam_topk.debug() << "what index: " << block_start_index - << ", block num: " << beam_num_blocks << "\n"; - - assert(batch_size >= beam_num_blocks); - assert(bc->num_active_requests() == req_index); - - { - constexpr auto shared_memory_size = 48 << 10; - auto const heap_size = max_heap_size * sizeof(Entry
); - // shared_memory_size = (num_shards + 1) * heap_size <=> - num_shards = shared_memory_size / heap_size - 1; - assert(num_shards > 0); - if (num_shards > CUDA_NUM_THREADS) { - num_shards = CUDA_NUM_THREADS; - } - log_beam_topk.debug() << "maxheap size: " << max_heap_size << "\n"; - log_beam_topk.debug() << "maxbeam width: " << max_beam_width - << ", heap size: " << heap_size << "\n"; - } - // We are limited by the amount of shared memory we have per block. - size_t shared_memory_size = - (num_shards + 1) * max_heap_size * sizeof(Entry
); - - assert(num_shards >= (size_t)max_heap_size); - num_shards = max_heap_size; - - checkCUDA(hipMemcpy(m->parent_ids, - parent_ids, - sizeof(int) * max_total_requests, - hipMemcpyHostToDevice)); - checkCUDA(hipMemcpy(m->acc_probs, - acc_probs, - sizeof(DT) * max_total_requests, - hipMemcpyHostToDevice)); - checkCUDA(hipMemcpy(m->block_start_index, - beam_block_start_index.data(), - sizeof(int) * beam_num_blocks, - hipMemcpyHostToDevice)); - checkCUDA(hipMemcpy(m->request_id, - request_id.data(), - sizeof(int) * beam_num_blocks, - hipMemcpyHostToDevice)); - checkCUDA(hipMemcpy(m->tokens_per_request, - tokens_per_request.data(), - sizeof(int) * beam_num_blocks, - hipMemcpyHostToDevice)); - // int depth = - // bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; - beam_topk_forward_kernel<<>>( - input_ptr, - shared_memory_size, - length, - max_beam_width, - max_heap_size, - m->parent_ids, - static_cast
(m->acc_probs), - m->block_start_index, - m->request_id, - m->tokens_per_request, - sorted, - output_ptr, - indices_ptr, - parent_ptr, - false /*verbose*/ // depth == 1 - ); - - // merge sub -} - -/*static*/ -void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, - BeamSearchBatchConfig const *bc, - GenericTensorAccessorR const &input, - float *output_ptr, - int *indices_ptr, - int *parent_ptr, - int batch_size, - int length, - bool sorted) { - hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); - - hipEvent_t t_start, t_end; - if (m->profiling) { - checkCUDA(hipEventCreate(&t_start)); - checkCUDA(hipEventCreate(&t_end)); - checkCUDA(hipEventRecord(t_start, stream)); - } - - if (input.data_type == DT_HALF) { - BeamTopK::forward_kernel(m, - bc, - input.get_half_ptr(), - output_ptr, - indices_ptr, - parent_ptr, - batch_size, - length, - sorted, - stream); - } else if (input.data_type == DT_FLOAT) { - BeamTopK::forward_kernel(m, - bc, - input.get_float_ptr(), - output_ptr, - indices_ptr, - parent_ptr, - batch_size, - length, - sorted, - stream); - } - - if (m->profiling) { - checkCUDA(hipEventRecord(t_end, stream)); - checkCUDA(hipEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - checkCUDA(hipEventDestroy(t_start)); - checkCUDA(hipEventDestroy(t_end)); - printf("[BeamTopK] forward time = %.2lfms\n", elapsed); - } -} - -BeamTopKMeta::BeamTopKMeta(FFHandler handler, - Op const *op, - MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler) { - DataType data_type = op->inputs[0]->data_type; - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - int max_requests_per_batch = BatchConfig::max_requests_per_batch(); - size_t parent_id_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch; - size_t acc_probs_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch; - size_t block_start_index_size = max_tokens_per_batch * max_requests_per_batch; - size_t request_id_size = max_tokens_per_batch * max_requests_per_batch; - size_t tokens_per_request_size = - max_tokens_per_batch * max_requests_per_batch; - size_t totalSize = sizeof(int) * parent_id_size + - data_type_size(data_type) * acc_probs_size + - sizeof(int) * block_start_index_size + - sizeof(int) * request_id_size + - sizeof(int) * tokens_per_request_size; - - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); - parent_ids = gpu_mem_allocator.allocate_instance(parent_id_size); - if (data_type == DT_FLOAT) { - acc_probs = gpu_mem_allocator.allocate_instance(acc_probs_size); - } else if (data_type == DT_HALF) { - acc_probs = gpu_mem_allocator.allocate_instance(acc_probs_size); - } else { - assert(false); - } - - block_start_index = - gpu_mem_allocator.allocate_instance(block_start_index_size); - request_id = gpu_mem_allocator.allocate_instance(request_id_size); - tokens_per_request = - gpu_mem_allocator.allocate_instance(tokens_per_request_size); -} - -BeamTopKMeta::~BeamTopKMeta(void) { - if (reserveInst != Realm::RegionInstance::NO_INST) { - reserveInst.destroy(); - } -} -}; // namespace FlexFlow diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu deleted file mode 100644 index a958786be..000000000 --- a/src/ops/beam_topk.cu +++ /dev/null @@ -1,766 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "flexflow/ffconst_utils.h" -#include "flexflow/ops/beam_topk.h" -#include "flexflow/request_manager.h" -#include "flexflow/utils/cuda_helper.h" - -namespace FlexFlow { -// declare Legion names -using Legion::coord_t; - -enum class HeapType { kMinHeap, kMaxHeap }; -enum class PreferIndices { kLower, kHigher }; - -LegionRuntime::Logger::Category log_beam_topk("BeamTopK"); - -template -struct Entry { - int index; - T value; -}; - -template -struct LinearData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index]; - } - - __device__ int get_index(int i) const { - return data[i].index; - } - __device__ T get_value(int i) const { - return data[i].value; - } - - Entry *const data; -}; - -template -struct IndirectLinearData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index]; - } - - __device__ int get_index(int i) const { - return backing_data[data[i].index].index; - } - __device__ T get_value(int i) const { - return data[i].value; - } - - Entry *const data; - Entry *const backing_data; -}; - -template -struct StridedData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index * blockDim.x + threadIdx.x]; - } - - __device__ int get_index(int i) const { - return (*this)[i].index; - } - __device__ T get_value(int i) const { - return (*this)[i].value; - } - - Entry *const data; -}; - -// A heap of Entry that can either work as a min-heap or as a max-heap. -template - class Data, - typename T> -struct IndexedHeap { - typedef typename Data::Entry Entry; - Data const data; - __device__ IndexedHeap(Data const &d) : data(d) {} - - __device__ bool is_above(int left, int right) { - T left_value = data.get_value(left); - T right_value = data.get_value(right); - if (left_value == right_value) { - if (preferIndices == PreferIndices::kLower) { - return data.get_index(left) < data.get_index(right); - } else { - return data.get_index(left) > data.get_index(right); - } - } - if (heapType == HeapType::kMinHeap) { - return left_value < right_value; - } else { - return left_value > right_value; - } - } - - __device__ void assign(int i, Entry const &entry) { - data[i] = entry; - } - - __device__ void push_up(int i) { - int child = i; - int parent; - for (; child > 0; child = parent) { - parent = (child - 1) / 2; - if (!is_above(child, parent)) { - // Heap property satisfied. 
- break; - } - swap(child, parent); - } - } - - __device__ void swap(int a, int b) { - auto tmp = data[b]; - data[b] = data[a]; - data[a] = tmp; - } - - __device__ void push_root_down(int k) { - push_down(0, k); - } - - // MAX-HEAPIFY in Cormen - __device__ void push_down(int node, int k) { - while (true) { - int const left = 2 * node + 1; - int const right = left + 1; - int smallest = node; - if (left < k && is_above(left, smallest)) { - smallest = left; - } - if (right < k && is_above(right, smallest)) { - smallest = right; - } - if (smallest == node) { - break; - } - swap(smallest, node); - node = smallest; - } - } - - // BUILD-MAX-HEAPIFY in Cormen - __device__ void build(int k) { - for (int node = (k - 1) / 2; node >= 0; node--) { - push_down(node, k); - } - } - - // HEAP-EXTRACT-MAX in Cormen - __device__ void remove_root(int k) { - data[0] = data[k - 1]; - push_root_down(k - 1); - } - - // in-place HEAPSORT in Cormen - // This method destroys the heap property. - __device__ void sort(int k) { - for (int slot = k - 1; slot > 0; slot--) { - // This is like remove_root but we insert the element at the end. - swap(slot, 0); - // Heap is now an element smaller. - push_root_down(/*k=*/slot); - } - } - - __device__ void replace_root(Entry const &entry, int k) { - data[0] = entry; - push_root_down(k); - } - - __device__ Entry const &root() { - return data[0]; - } -}; - -template - class Data, - typename T> -__device__ IndexedHeap - make_indexed_heap(typename Data::Entry *data) { - return IndexedHeap{Data{data}}; -} - -// heapBeamTopK walks over [input, input+length) with `step_size` stride -// starting at `start_index`. It builds a top-`k` heap that is stored in -// `heap_entries` using `Accessor` to access elements in `heap_entries`. If -// sorted=true, the elements will be sorted at the end. -template class Data = LinearData> -__device__ void heapBeamTopK(T const *__restrict__ input, - int batch_index, - int length, - int k, - Entry *__restrict__ heap_entries, - bool sorted = false, - int start_index = 0, - int step_size = 1) { - assert(k <= length); - auto heap = - make_indexed_heap( - heap_entries); - - int heap_end_index = start_index + k * step_size; - if (heap_end_index > length) { - heap_end_index = length; - } - // Initialize the min-heap. - for (int index = start_index, slot = 0; index < heap_end_index; - index += step_size, slot++) { - heap.assign(slot, {index, input[index]}); - } - - heap.build(k); - - // Now iterate over the remaining items. - // If an item is smaller than the min element, it is not amongst the top k. - // Otherwise, replace the min element with it and push upwards. - for (int index = heap_end_index; index < length; index += step_size) { - // We prefer elements with lower indices. This is given here. - // Later elements automatically have higher indices, so can be discarded. - if (input[index] > heap.root().value) { - // This element should replace the min. - heap.replace_root({index, input[index]}, k); - } - } - - // Sort if wanted. 
- if (sorted) { - heap.sort(k); - } - - // if(batch_index == 0){ - // printf("top elemmments: %d, value %.15f\n", start_index, - // heap.root().value); - // } -} - -template -__device__ void mergeBeamShards(int num_shards, - int batch_index, - int k, - int max_heap_size, - int request_id, - int *parent_id, - T *probs, - Entry *__restrict__ entries, - Entry *__restrict__ top_k_heap, - float *top_k_values, - int *top_k_indices, - int *top_k_parents, - bool verbose) { - // If k < num_shards, we can use a min-heap with k elements to get the top k - // of the sorted blocks. - // If k > num_shards, we can initialize a min-heap with the top element from - // each sorted block. - int const heap_size = k < num_shards ? k : num_shards; - // printf("see value: %f", entries[0].value); - // Min-heap part. - - { - auto min_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; - // Initialize the heap as a min-heap. - for (int slot = 0; slot < heap_size; slot++) { - // int beam = (slot % max_heap_size) / k; - T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((slot % max_heap_size) / k)]; - min_heap.assign(slot, {slot, (entries[slot].value * prob)}); - if (verbose && batch_index == 0) { - printf("slot %d, value %.15f, prob %15f\n", - slot, - static_cast(entries[slot].value), - static_cast(prob)); - } - } - min_heap.build(heap_size); - - // Now perform top k with the remaining shards (if num_shards > heap_size). - for (int shard = heap_size; shard < num_shards; shard++) { - auto const entry = entries[shard]; - auto const root = min_heap.root(); - - T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((shard % max_heap_size) / k)]; - if (verbose && batch_index == 0) { - printf("shard %d, index %d, value %.15f, prob %.15f\n", - shard, - entry.index, - static_cast(entry.value), - static_cast(prob)); - } - if (entry.value * prob < root.value) { - continue; - } - if (entry.value * prob == root.value && - entry.index > entries[root.index].index) { - continue; - } - // This element should replace the min. - min_heap.replace_root({shard, entry.value * prob}, heap_size); - } - } - - // Max-part. - { - // Turn the min-heap into a max-heap in-place. - auto max_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; - // Heapify into a max heap. - max_heap.build(heap_size); - - // Now extract the minimum k-1 times. - // k is treated specially. - int const last_k = k - 1; - for (int rank = 0; rank < last_k; rank++) { - Entry const &max_element = max_heap.root(); - top_k_values[rank] = __half2float(max_element.value); - int shard_index = max_element.index; - top_k_indices[rank] = entries[shard_index].index; - top_k_parents[rank] = - parent_id[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((shard_index % max_heap_size) / k)]; - int next_shard_index = shard_index + num_shards; - - T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((next_shard_index % max_heap_size) / k)]; - // if (batch_index == 0) { - // printf("next_shard_index %d, value %.15f, prob %.15f\n", - // next_shard_index, - // entries[next_shard_index].value, - // prob); - // } - max_heap.replace_root( - {next_shard_index, entries[next_shard_index].value * prob}, - heap_size); - } - - // rank == last_k. 
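// [Editor's note] Ranks 0..k-2 were emitted by the loop above, each time refilling the
// max-heap root with the next candidate from the same shard (stride num_shards in the
// shared-memory entry layout); the final rank below only reads the root, with no
// replacement, again converting the half-precision value with __half2float and
// recording the token index and the parent beam id.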
- Entry const &max_element = max_heap.root(); - top_k_values[last_k] = __half2float(max_element.value); - int shard_index = max_element.index; - top_k_indices[last_k] = entries[shard_index].index; - top_k_parents[last_k] = - parent_id[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((shard_index % max_heap_size) / k)]; - } -} - -template -__global__ void - mergeSubRequestsKernel(int64_t N, T const *X, T const *rstd, T *Y) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rstd[i]); - } -} - -template -__global__ void beam_topk_forward_kernel(T const *__restrict__ input, - size_t shared_memory_size, - int length, - int k, - int max_heap_size, - int *parent_ids, - T *acc_probs, - int *gpu_block_start_index, - int *gpu_request_id, - int *tokens_per_request, - bool sorted, - float *__restrict__ output, - int *__restrict__ indices, - int *__restrict__ parents, - bool verbose) { - __shared__ char shared_memory[48 << 10]; - int const batch_index = blockIdx.x; - // T const *batch_input = input + batch_index * length; - int const thread_index = threadIdx.x; - int const thread_count = blockDim.x; - int const request_id = gpu_request_id[batch_index]; - int const token_nums = tokens_per_request[batch_index]; - Entry *shared_entries = (Entry *)shared_memory; - - int sub_request_id = thread_index / k; - // if (verbose) { - // printf("beam kernel: batch_index: %d, thread_index %d, sub_request_id %d, - // " - // "request_id %d, token_nums %d\n", - // batch_index, - // thread_index, - // sub_request_id, - // request_id, - // token_nums); - // } - - T const *batch_input = input + gpu_block_start_index[batch_index] + - (sub_request_id * token_nums * length); - - if (verbose && batch_index == 0) { - printf("request 0 start index: thread index %d, offset %d, batch_input %p, " - "acc index %d acc " - "prob %f, thread_count %d, request_id %d\n", - thread_index, - gpu_block_start_index[batch_index] + - (sub_request_id * token_nums * length), - batch_input, - request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + sub_request_id, - static_cast( - acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - sub_request_id]), - thread_count, - request_id); - } - // printf("thread index %d, thread_count %d, batch_index %d\n", thread_index, - // thread_count, batch_index); - heapBeamTopK(batch_input, - batch_index, - length, - k, - shared_entries, - true, - thread_index % k, - k); - __syncthreads(); - // printf("beam thread index %d, thread_count %d, thread index %d, batch_index - // " - // "%d, k %d, parent_id %d, acc_prob: %f, sub id: %d, request_id: %d, - // offset: %d, offset2 %d, sub_request_id %d\n", thread_index, - // thread_count, - // thread_index, - // batch_index, - // k, - // parent_ids[request_id * BatchConfig::MAX_NUM_BEAMS + - // sub_request_id], acc_probs[request_id * BatchConfig::MAX_NUM_BEAMS + - // sub_request_id], sub_request_id, request_id, - // gpu_block_start_index[batch_index], - // batch_index * length, - // sub_request_id); - - if (thread_index == 0) { - // merge beam_width heaps and store the parent - // find which req it belongs to, replace the offset - // printf("merge heaps, batch index: %d, sub_request_id %d, value %f\n", - // batch_index, - // sub_request_id, - // acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - // sub_request_id]); - int const offset = batch_index * k; - auto batch_output = output + offset; - auto 
batch_indices = indices + offset; - auto batch_parents = parents + offset; - Entry *top_k_heap = shared_entries + thread_count * k; - - // if(batch_index == 0 && verbose) { - // for(int i = 0; i < 18; i++){ - // printf("see value: %.15f\n", shared_entries[i].value); - // } - // } - - // get parent/acc based on the sub request and main request - mergeBeamShards(thread_count, - batch_index, - k, - max_heap_size, - request_id, - parent_ids, - acc_probs, - shared_entries, - top_k_heap, - batch_output, - batch_indices, - batch_parents, - verbose /*verbose prints*/); - } -} - -/*static*/ -template -void BeamTopK::forward_kernel(BeamTopKMeta const *m, - BeamSearchBatchConfig const *bc, - DT const *input_ptr, - float *output_ptr, - int *indices_ptr, - int *parent_ptr, - int batch_size, - int length, - bool sorted, - cudaStream_t stream) { - // Adopted from TensorFlow's BeamTopK implementation - // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h - - int num_shards = 0; - int max_heap_size = 0; - int max_beam_width = 0; - int req_index = 0; - - // sub request - int const *sub_requests = bc->sub_requests; - - // std::vector beam_slots = bc->beam_slots; - // assert(bc->beam_slots.size() > 0); - - int beam_num_blocks = 0; - std::vector beam_block_start_index; - std::vector request_id; - std::vector tokens_per_request; - - int block_start_index = 0; - - // a data structure for prob, parent_id, - int max_total_requests = - BeamSearchBatchConfig::MAX_BEAM_WIDTH * bc->num_active_requests(); - int parent_ids[max_total_requests]; - DT acc_probs[max_total_requests]; - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - assert(bc->beamRequestsInfo[i].beam_size > 0); - - // int num_new_tokens = bc->num_processing_tokens[i]; - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - - // get beam size; - int beam_size = bc->beamRequestsInfo[i].beam_size; - - // initial request - assert(sub_requests[i] > 0); - // process sub requests - for (int j = 0; j < sub_requests[i]; j++) { - parent_ids[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = j; - // beam_slots[i].parent_id[j]; - acc_probs[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = - bc->beamRequestsInfo[i].probs[j]; - // std::cout << "probbbb req: " << i << ", sub req probability : " - // << bc->beamRequestsInfo[i].probs[j] << ", sub request id " << - // j - // << ", parent id " << bc->beamRequestsInfo[i].parent_id[j] - // << ", data inddd" - // << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j - // << "\n"; - } - - // process tokens - for (int k = 0; k < num_new_tokens; k++) { - beam_block_start_index.push_back(block_start_index); - request_id.push_back(i); - tokens_per_request.push_back(num_new_tokens); - block_start_index += length; - beam_num_blocks++; - } - - max_heap_size = std::max(max_heap_size, beam_size * sub_requests[i]); - max_beam_width = std::max(max_beam_width, beam_size); - - req_index += 1; - block_start_index += (sub_requests[i] - 1) * num_new_tokens * length; - } - log_beam_topk.debug() << "what index: " << block_start_index - << ", block num: " << beam_num_blocks << "\n"; - - assert(batch_size >= beam_num_blocks); - assert(bc->num_active_requests() == req_index); - - { - constexpr auto shared_memory_size = 48 << 10; - auto const heap_size = max_heap_size * sizeof(Entry
); - // shared_memory_size = (num_shards + 1) * heap_size <=> - num_shards = shared_memory_size / heap_size - 1; - assert(num_shards > 0); - if (num_shards > CUDA_NUM_THREADS) { - num_shards = CUDA_NUM_THREADS; - } - log_beam_topk.debug() << "maxheap size: " << max_heap_size << "\n"; - log_beam_topk.debug() << "maxbeam width: " << max_beam_width - << ", heap size: " << heap_size << "\n"; - } - // We are limited by the amount of shared memory we have per block. - size_t shared_memory_size = - (num_shards + 1) * max_heap_size * sizeof(Entry
); - - assert(num_shards >= (size_t)max_heap_size); - num_shards = max_heap_size; - - checkCUDA(cudaMemcpyAsync(m->parent_ids, - parent_ids, - sizeof(int) * max_total_requests, - cudaMemcpyHostToDevice, - stream)); - checkCUDA(cudaMemcpyAsync(m->acc_probs, - acc_probs, - sizeof(DT) * max_total_requests, - cudaMemcpyHostToDevice, - stream)); - // trick, set acc_probs to 0; - checkCUDA(cudaMemsetAsync( - m->acc_probs, 1.0, max_total_requests * sizeof(DT), stream)); - checkCUDA(cudaMemcpyAsync(m->block_start_index, - beam_block_start_index.data(), - sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice, - stream)); - checkCUDA(cudaMemcpyAsync(m->request_id, - request_id.data(), - sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice, - stream)); - checkCUDA(cudaMemcpyAsync(m->tokens_per_request, - tokens_per_request.data(), - sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice, - stream)); - // int depth = - // bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; - beam_num_blocks = bc->num_active_tokens(); - beam_topk_forward_kernel<<>>( - input_ptr, - shared_memory_size, - length, - max_beam_width, - max_heap_size, - m->parent_ids, - static_cast
(m->acc_probs), - m->block_start_index, - m->request_id, - m->tokens_per_request, - sorted, - output_ptr, - indices_ptr, - parent_ptr, - false /*verbose*/ // depth == 1 - ); - - // merge sub -} - -/*static*/ -void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, - BeamSearchBatchConfig const *bc, - GenericTensorAccessorR const &input, - float *output_ptr, - int *indices_ptr, - int *parent_ptr, - int batch_size, - int length, - bool sorted) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - - cudaEvent_t t_start, t_end; - if (m->profiling) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start, stream); - } - - if (input.data_type == DT_HALF) { - BeamTopK::forward_kernel(m, - bc, - input.get_half_ptr(), - output_ptr, - indices_ptr, - parent_ptr, - batch_size, - length, - sorted, - stream); - } else if (input.data_type == DT_FLOAT) { - BeamTopK::forward_kernel(m, - bc, - input.get_float_ptr(), - output_ptr, - indices_ptr, - parent_ptr, - batch_size, - length, - sorted, - stream); - } - - if (m->profiling) { - cudaEventRecord(t_end, stream); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - printf("[BeamTopK] forward time = %.2lfms\n", elapsed); - } -} - -BeamTopKMeta::BeamTopKMeta(FFHandler handler, - Op const *op, - MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler) { - DataType data_type = op->inputs[0]->data_type; - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - int max_requests_per_batch = BatchConfig::max_requests_per_batch(); - size_t parent_id_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch; - size_t acc_probs_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch; - size_t block_start_index_size = max_tokens_per_batch * max_requests_per_batch; - size_t request_id_size = max_tokens_per_batch * max_requests_per_batch; - size_t tokens_per_request_size = - max_tokens_per_batch * max_requests_per_batch; - size_t totalSize = sizeof(int) * parent_id_size + - data_type_size(data_type) * acc_probs_size + - sizeof(int) * block_start_index_size + - sizeof(int) * request_id_size + - sizeof(int) * tokens_per_request_size; - - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); - parent_ids = gpu_mem_allocator.allocate_instance(parent_id_size); - if (data_type == DT_FLOAT) { - acc_probs = gpu_mem_allocator.allocate_instance(acc_probs_size); - } else if (data_type == DT_HALF) { - acc_probs = gpu_mem_allocator.allocate_instance(acc_probs_size); - } else { - assert(false); - } - - block_start_index = - gpu_mem_allocator.allocate_instance(block_start_index_size); - request_id = gpu_mem_allocator.allocate_instance(request_id_size); - tokens_per_request = - gpu_mem_allocator.allocate_instance(tokens_per_request_size); -} - -BeamTopKMeta::~BeamTopKMeta(void) { - if (reserveInst != Realm::RegionInstance::NO_INST) { - reserveInst.destroy(); - } -} -}; // namespace FlexFlow diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index e630563b6..3cc8ceea0 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -469,7 +469,7 @@ FutureMap Embedding::inference(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(EMBED_FWD_TASK_ID, + IndexLauncher launcher(EMBED_INF_TASK_ID, parallel_is, TaskArgument(NULL, 0), argmap, @@ -625,6 +625,8 @@ void 
Embedding::inference_task(Task const *task, effective_batch_size = output.domain.get_volume() / out_dim; assert(effective_batch_size * in_dim == input.domain.get_volume()); } + // use active batch size + effective_batch_size = bc->num_active_tokens(); forward_kernel_wrapper( m, input, output, kernel, in_dim, out_dim, effective_batch_size); if (m->inference_debugging) { diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 9ad5c4dc9..a22873847 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -355,6 +355,7 @@ void FusedOp::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + // launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); switch (domain.get_dim()) { @@ -445,6 +446,7 @@ void FusedOp::init_inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + // launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); switch (domain.get_dim()) { @@ -479,6 +481,7 @@ void FusedOp::forward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; int offset = 0; for (int i = 0; i < numInputs; i++) { assert(inputs[i]->part != LogicalPartition::NO_PART); @@ -538,6 +541,7 @@ FutureMap FusedOp::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); int offset = 0; for (int i = 0; i < numInputs; i++) { @@ -589,6 +593,7 @@ void FusedOp::backward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; int idx = 0; for (int i = 0; i < numInputs; i++) { launcher.add_region_requirement(RegionRequirement(inputs[i]->part, diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 3282bc57d..6111a8fd0 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -871,8 +871,8 @@ __host__ void assert(fused->op_num_outputs[op] == 1); TreeIncMultiHeadSelfAttentionMeta *m = (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - TreeVerifyBatchConfig const &tree_bc = - Future(task->futures[0]).get_result(); + BatchConfig const &verify_bc = + Future(task->futures[0]).get_result(); assert(fused->op_num_weights[op] == (1 + (int)(*m->qkv_bias || *m->final_bias))); GenericTensorAccessorR biases; @@ -882,7 +882,7 @@ __host__ void } TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - &tree_bc, + &verify_bc, task->index_point.point_data[0], my_input_accessor[0], my_weight_accessor[0], @@ -895,10 +895,10 @@ __host__ void assert(fused->op_num_outputs[op] == 1); SpecIncMultiHeadSelfAttentionMeta const *m = (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // BeamSearchBatchConfig const *beam_bc = - // (BeamSearchBatchConfig *)task->args; - BeamSearchBatchConfig const &beam_bc = - Future(task->futures[0]).get_result(); + // TreeSearchBatchConfig const *search_bc = + // (TreeSearchBatchConfig *)task->args; + BatchConfig const &search_bc = + Future(task->futures[0]).get_result(); assert(fused->op_num_weights[op] == (1 + (int)(*m->qkv_bias || *m->final_bias))); GenericTensorAccessorR biases; @@ -908,7 +908,7 @@ __host__ void } SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - &beam_bc, + &search_bc, task->index_point.point_data[0], my_input_accessor[0], my_weight_accessor[0], @@ -1047,7 +1047,7 @@ __host__ void assert(fused->op_num_outputs[op] == 1); AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; 
Kernels::AllReduce::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + ctx, runtime, m, bc, my_input_accessor[0], my_output_accessor[0]); break; } default: { diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 483028599..78983d579 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -13,7 +13,9 @@ * limitations under the License. */ +#include "cuda.h" #include "flexflow/accessor.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/batch_norm.h" @@ -45,17 +47,6 @@ #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -// declare Legion names -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::Future; -using Legion::LogicalPartition; -using Legion::LogicalRegion; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; OpMeta *FusedOp::init_task(Task const *task, std::vector const ®ions, @@ -142,6 +133,8 @@ __host__ void FusedOp::forward_task(Task const *task, for (int op = start + 1; op < fused->numOperators; op++) { if (metas->meta[op] != NULL) { assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.blasLt == + metas->meta[op]->handle.blasLt); assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); } } @@ -539,7 +532,6 @@ __host__ void // const FusedOp* fused = (FusedOp*) task->args; FusedOpMeta *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; - // BatchConfig const *bc = (BatchConfig *)task->args; BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); // Return if no active tokens if (bc->num_tokens == 0) { @@ -604,534 +596,596 @@ __host__ void for (int op = start + 1; op < fused->numOperators; op++) { if (metas->meta[op] != NULL) { assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.blasLt == + metas->meta[op]->handle.blasLt); assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); } } - int ioff = 0, woff = 0, ooff = 0; - for (int op = 0; op < fused->numOperators; op++) { - // Domain my_id[MAX_NUM_INPUTS]; - // Domain my_wd[MAX_NUM_WEIGHTS]; - // Domain my_od[MAX_NUM_OUTPUTS]; - GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - // my_id[i] = input_domain[my_off]; - assert(my_off < fused->numInputs); - my_input_accessor[i] = input_accessor[my_off]; - } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - // my_id[i] = output_domain[my_off]; - assert(my_off < fused->numOutputs); - my_input_accessor[i] = output_accessor[my_off]; - } else { - assert(false); - } - } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; - // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; - assert(fused->op_weight_idx[i + woff] < fused->numWeights); - my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; - } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - int my_off = fused->op_output_idx[i + ooff]; - assert(fused->op_output_source[i + ooff] == 
SOURCE_OUTPUT); - assert(my_off < fused->numOutputs); - // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; - // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; - my_output_accessor[i] = output_accessor[my_off]; - } - switch (fused->op_op_type[op]) { - case OP_CONCAT: { - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - ConcatMeta *m = (ConcatMeta *)metas->meta[op]; - int num_inputs = fused->op_num_inputs[op]; - Kernels::Concat::forward_kernel_wrapper(m, - my_output_accessor[0], - my_input_accessor, - num_inputs, - m->legion_axis); - break; - } - case OP_BATCHNORM: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 2); - assert(my_weight_accessor[1].domain.get_dim() == 2); - BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; - BatchNorm::forward_kernel(m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_accessor[1].get_float_ptr()); - break; + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + // create new cuda graph + cudaGraphExec_t instance; + + GraphParams graph_params = { + bc->num_active_requests(), bc->num_active_tokens(), bc->prompt_phase}; + // int shard_id = task->index_point.point_data[0]; + + // bool use_cuda_graph = (bc->get_mode() == TREE_SEARCH_MODE or bc->get_mode() + // == TREE_VERIFY_MODE); + bool use_cuda_graph = + (bc->get_mode() == TREE_SEARCH_MODE && bc->prompt_phase == 0); + // bool use_cuda_graph = (bc->get_mode() == TREE_VERIFY_MODE); + // bool use_cuda_graph = false; + bool captured = false; + + if (use_cuda_graph && metas->graph_collections.count(graph_params) != 0) { + captured = true; + instance = metas->graph_collections[graph_params]; + } + + if (!captured) { + cudaGraph_t graph; + { + if (use_cuda_graph) { + cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal); } - case OP_LINEAR: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - Domain kernel_domain = my_weight_accessor[0].domain; - int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; - int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; - int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; - assert(my_output_accessor[0].domain.get_volume() == - out_dim * batch_size); - assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - void const *bias_ptr = nullptr; - LinearMeta *m = (LinearMeta *)metas->meta[op]; - if (fused->op_num_weights[op] == 2) { - assert(my_weight_accessor[1].domain.get_volume() == out_dim); - if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].ptr; + int ioff = 0, woff = 0, ooff = 0; + for (int op = 0; op < fused->numOperators; op++) { + // Domain my_id[MAX_NUM_INPUTS]; + // Domain my_wd[MAX_NUM_WEIGHTS]; + // Domain my_od[MAX_NUM_OUTPUTS]; + GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + // my_id[i] = input_domain[my_off]; + assert(my_off < fused->numInputs); + my_input_accessor[i] = 
input_accessor[my_off]; + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + // my_id[i] = output_domain[my_off]; + assert(my_off < fused->numOutputs); + my_input_accessor[i] = output_accessor[my_off]; + } else { + assert(false); } - } else { - assert(fused->op_num_weights[op] == 1); } - assert(m->input_type[0] == my_input_accessor[0].data_type); - assert(m->input_type[0] == my_output_accessor[0].data_type); - batch_size = bc->num_active_tokens(); - Kernels::Linear::forward_kernel_wrapper(m, - my_input_accessor[0].ptr, - my_output_accessor[0].ptr, - my_weight_accessor[0].ptr, - bias_ptr, - in_dim, - out_dim, - batch_size); - break; - } - case OP_BATCHMATMUL: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - Domain out_domain = my_output_accessor[0].domain; - Domain a_domain = my_input_accessor[0].domain; - Domain b_domain = my_input_accessor[1].domain; - int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; - assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); - int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; - assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); - int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; - assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); - assert(a_domain.get_dim() == b_domain.get_dim()); - assert(a_domain.get_dim() == out_domain.get_dim()); - int batch = 1; - for (int i = 2; i < a_domain.get_dim(); i++) { - int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; - assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); - assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); - batch *= dim_size; + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; + // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; + assert(fused->op_weight_idx[i + woff] < fused->numWeights); + my_weight_accessor[i] = + weight_accessor[fused->op_weight_idx[i + woff]]; } - BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; - Kernels::BatchMatmul::forward_kernel_wrapper( - meta, - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].get_float_ptr(), - my_input_accessor[1].get_float_ptr(), - (float const *)nullptr, - m, - n, - k, - batch, - meta->a_seq_length_dim, - meta->b_seq_length_dim, - fused->iter_config.seq_length); - break; - } - case OP_EW_ADD: - case OP_EW_SUB: - case OP_EW_MUL: - case OP_EW_DIV: - case OP_EW_MAX: - case OP_EW_MIN: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; - Kernels::ElementBinary::forward_kernel_wrapper(m, + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + assert(my_off < fused->numOutputs); + // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; + // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; + my_output_accessor[i] = output_accessor[my_off]; + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + int num_inputs = fused->op_num_inputs[op]; + 
Kernels::Concat::forward_kernel_wrapper(m, + my_output_accessor[0], + my_input_accessor, + num_inputs, + m->legion_axis); + break; + } + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + BatchNorm::forward_kernel(m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_accessor[1].get_float_ptr()); + break; + } + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == + in_dim * batch_size); + void const *bias_ptr = nullptr; + LinearMeta *m = (LinearMeta *)metas->meta[op]; + if (fused->op_num_weights[op] == 2) { + assert(my_weight_accessor[1].domain.get_volume() == out_dim); + if (!m->add_bias_only_once || + task->index_point.point_data[0] == 0) { + bias_ptr = my_weight_accessor[1].ptr; + } + } else { + assert(fused->op_num_weights[op] == 1); + } + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->input_type[0] == my_output_accessor[0].data_type); + batch_size = bc->num_active_tokens(); + Kernels::Linear::forward_kernel_wrapper(m, + my_input_accessor[0].ptr, + my_output_accessor[0].ptr, + my_weight_accessor[0].ptr, + bias_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_accessor[0].domain; + Domain a_domain = my_input_accessor[0].domain; + Domain b_domain = my_input_accessor[1].domain; + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + Kernels::BatchMatmul::forward_kernel_wrapper( + meta, + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_accessor[1].get_float_ptr(), + (float const *)nullptr, + m, + n, + k, + batch, + meta->a_seq_length_dim, + meta->b_seq_length_dim, + fused->iter_config.seq_length); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); 
+ assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + Kernels::ElementBinary::forward_kernel_wrapper( + m, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + } + case OP_EMBEDDING: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; + if (m->aggr == AGGR_MODE_NONE) { + // assert(kernel_domain.get_dim() == 2); + assert(my_input_accessor[0].domain.get_dim() + 1 == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); + i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i + 1]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i + 1]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } else { + assert(my_input_accessor[0].domain.get_dim() == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); + i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } else { + assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM); + in_dim = my_input_accessor[0].domain.hi()[0] - + my_input_accessor[0].domain.lo()[0] + 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } + // use active batch size + effective_batch_size = bc->num_active_tokens(); + + assert(my_input_accessor[0].data_type == DT_INT32 || + my_input_accessor[0].data_type == DT_INT64); + Kernels::Embedding::forward_kernel_wrapper(m, + my_input_accessor[0], + my_output_accessor[0], + my_weight_accessor[0], + in_dim, + out_dim, + effective_batch_size); + break; + } + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + if (m->data_type == DT_HALF) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_FLOAT) { + ElementUnary::forward_kernel_wrapper( + m, + 
my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type in ElementUnary forward"); + } + break; + } + case OP_RMS_NORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::forward_kernel_wrapper(m, + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); + break; + } + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta const *m = + (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::forward_kernel_wrapper( + m, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1], + bc->num_active_tokens()); + break; + } + case OP_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + IncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); + break; + } + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + TreeIncMultiHeadSelfAttentionMeta *m = + (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + BatchConfig const *verify_bc = + BatchConfig::from_future(task->futures[0]); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + verify_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); + break; + } + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + SpecIncMultiHeadSelfAttentionMeta *m = + (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + BatchConfig const *search_bc = + BatchConfig::from_future(task->futures[0]); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + search_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + if (m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if 
(m->use_bias) { + beta = my_weight_accessor[1]; + } + } + LayerNorm::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + break; + } + case OP_RESIDUAL_LAYERNORM: { + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta const *m = + (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorR residual2; + if (m->use_two_residuals) { + residual2 = my_input_accessor[2]; + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + ResidualLayerNorm::inference_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + residual2, + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta const *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == + 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + if (m->use_bias) { + beta = my_weight_accessor[2]; + } + } + Domain attn_bias_domain = my_weight_accessor[0].domain; + Domain residual_domain = my_input_accessor[1].domain; + int attn_bias_dim = + attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; + int residual_volume = residual_domain.get_volume(); + AddBiasResidualLayerNorm::inference_kernel_wrapper( + m, + attn_bias_dim, + residual_volume, + my_input_accessor[0], + my_output_accessor[0], + my_output_accessor[1], + my_input_accessor[1], + my_weight_accessor[0], + gamma, + beta); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta const *m = + (SigmoidSiluMultiMeta *)metas->meta[op]; + // use active number of tokens + SigmoidSiluMulti::inference_kernel_wrapper(m, my_input_accessor[0], my_input_accessor[1], - my_output_accessor[0]); - break; - } - case OP_EMBEDDING: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 1); - EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; - if (m->aggr == AGGR_MODE_NONE) { - // assert(kernel_domain.get_dim() == 2); - assert(my_input_accessor[0].domain.get_dim() + 1 == - my_output_accessor[0].domain.get_dim()); - for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) { - assert(my_input_accessor[0].domain.hi()[i] == - my_output_accessor[0].domain.hi()[i + 1]); - assert(my_input_accessor[0].domain.lo()[i] == - my_output_accessor[0].domain.lo()[i + 1]); + my_output_accessor[0], + bc->num_active_tokens()); + break; } - assert(my_weight_accessor[0].domain.hi()[0] - - my_weight_accessor[0].domain.lo()[0] == - my_output_accessor[0].domain.hi()[0] - - my_output_accessor[0].domain.lo()[0]); - } else { - 
assert(my_input_accessor[0].domain.get_dim() == - my_output_accessor[0].domain.get_dim()); - for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) { - assert(my_input_accessor[0].domain.hi()[i] == - my_output_accessor[0].domain.hi()[i]); - assert(my_input_accessor[0].domain.lo()[i] == - my_output_accessor[0].domain.lo()[i]); + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (m->input_type == DT_HALF) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr()); + } else if (m->input_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + } + break; } - assert(my_weight_accessor[0].domain.hi()[0] - - my_weight_accessor[0].domain.lo()[0] == - my_output_accessor[0].domain.hi()[0] - - my_output_accessor[0].domain.lo()[0]); - } - int in_dim, out_dim, effective_batch_size; - if (m->aggr == AGGR_MODE_NONE) { - in_dim = 1; - out_dim = my_output_accessor[0].domain.hi()[0] - - my_output_accessor[0].domain.lo()[0] + 1; - effective_batch_size = - my_output_accessor[0].domain.get_volume() / out_dim; - assert(effective_batch_size * in_dim == - my_input_accessor[0].domain.get_volume()); - } else { - assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM); - in_dim = my_input_accessor[0].domain.hi()[0] - - my_input_accessor[0].domain.lo()[0] + 1; - out_dim = my_output_accessor[0].domain.hi()[0] - - my_output_accessor[0].domain.lo()[0] + 1; - effective_batch_size = - my_output_accessor[0].domain.get_volume() / out_dim; - assert(effective_batch_size * in_dim == - my_input_accessor[0].domain.get_volume()); - } - - assert(my_input_accessor[0].data_type == DT_INT32 || - my_input_accessor[0].data_type == DT_INT64); - Kernels::Embedding::forward_kernel_wrapper(m, - my_input_accessor[0], - my_output_accessor[0], - my_weight_accessor[0], - in_dim, - out_dim, - effective_batch_size); - break; - } - case OP_GELU: - case OP_RELU: - case OP_SIGMOID: - case OP_TANH: - case OP_ELU: - case OP_SCALAR_TRUE_DIV: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - if (m->data_type == DT_HALF) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr(), - my_input_accessor[0].domain.get_volume()); - } else if (m->data_type == DT_FLOAT) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - } else { - assert(false && "Unsupported data type in ElementUnary forward"); - } - break; - } - case OP_RMS_NORM: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 1); - RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; - Kernels::RMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); - break; - } - case OP_RESIDUAL_RMS_NORM: { - assert(fused->op_num_inputs[op] == 2); - 
assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 2); - ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::inference_kernel_wrapper(ctx, + runtime, + m, + bc, my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - my_output_accessor[0], - my_output_accessor[1]); - break; - } - case OP_INC_MULTIHEAD_SELF_ATTENTION: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - IncMultiHeadSelfAttentionMeta const *m = - (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - IncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); - break; - } - case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - TreeIncMultiHeadSelfAttentionMeta *m = - (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // TreeVerifyBatchConfig const *tree_bc = - // (TreeVerifyBatchConfig *)task->args; - TreeVerifyBatchConfig const &tree_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - &tree_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); - break; - } - case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - SpecIncMultiHeadSelfAttentionMeta const *m = - (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // BeamSearchBatchConfig const *beam_bc = - // (BeamSearchBatchConfig *)task->args; - BeamSearchBatchConfig const &beam_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - &beam_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); - break; - } - case OP_LAYERNORM: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; - if (m->elementwise_affine) { - assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[0]; - if (m->use_bias) { - beta = my_weight_accessor[1]; + my_output_accessor[0]); + break; } - } - LayerNorm::forward_kernel_wrapper( - m, my_input_accessor[0], my_output_accessor[0], gamma, beta); - break; - } - case OP_RESIDUAL_LAYERNORM: { - 
assert(fused->op_num_outputs[op] == 2); - ResidualLayerNormMeta const *m = - (ResidualLayerNormMeta *)metas->meta[op]; - if (m->use_two_residuals) { - assert(fused->op_num_inputs[op] == 3); - } else { - assert(fused->op_num_inputs[op] == 2); - } - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 0); - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 1); // weight - } else { - assert(fused->op_num_weights[op] == 2); // weight + bias + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); } } - GenericTensorAccessorR residual2; - if (m->use_two_residuals) { - residual2 = my_input_accessor[2]; - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[0]; - if (m->use_bias) { - beta = my_weight_accessor[1]; + if (metas->meta[op]->inference_debugging) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + input_accessors_to_save.push_back(input_accessor[my_off]); + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + input_accessors_to_save.push_back(output_accessor[my_off]); + } else { + assert(false); + } } - } - ResidualLayerNorm::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - residual2, - my_output_accessor[0], - my_output_accessor[1], - gamma, - beta); - break; - } - case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 2); - AddBiasResidualLayerNormMeta const *m = - (AddBiasResidualLayerNormMeta *)metas->meta[op]; - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 1); // attn bias - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 2); // attn bias + weight - } else { - assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + weight_accessors_to_save.push_back( + weight_accessor[fused->op_weight_idx[i + woff]]); } - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[1]; - if (m->use_bias) { - beta = my_weight_accessor[2]; + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(output_accessor[i + ooff]); } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save); } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); - AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - attn_bias_dim, - residual_volume, - my_input_accessor[0], - my_output_accessor[0], - my_output_accessor[1], - my_input_accessor[1], - my_weight_accessor[0], - gamma, - beta); - break; + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; } - case OP_SIGMOID_SILU_MULTI: { - assert(fused->op_num_inputs[op] 
== 2); - assert(fused->op_num_outputs[op] == 1); - SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; - SigmoidSiluMulti::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_output_accessor[0]); - break; - } - case OP_SOFTMAX: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - } - break; - } - case OP_ALLREDUCE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; - Kernels::AllReduce::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); - break; - } - default: { - fprintf(stderr, - "Fusion currently does not support type = %d\n", - fused->op_op_type[op]); - assert(false && "Fusion currently does not support type"); + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_ptr[i], output_domain[i].get_volume(), + // "[Fused:forward:output]"); + if (use_cuda_graph) { + cudaStreamEndCapture(stream, &graph); } } - if (metas->meta[op]->inference_debugging) { - std::vector input_accessors_to_save; - std::vector weight_accessors_to_save; - std::vector output_accessors_to_save; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - input_accessors_to_save.push_back(input_accessor[my_off]); - } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - input_accessors_to_save.push_back(output_accessor[my_off]); - } else { - assert(false); - } - } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - weight_accessors_to_save.push_back( - weight_accessor[fused->op_weight_idx[i + woff]]); - } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - output_accessors_to_save.push_back(output_accessor[i + ooff]); - } - assert(task->index_point.get_dim() == 1); - int shard_id = task->index_point.point_data[0]; - FusedOp::save_inference_tensors_to_file(metas->meta[op], - shard_id, - bc, - input_accessors_to_save, - weight_accessors_to_save, - output_accessors_to_save); + if (use_cuda_graph) { + cudaGraphInstantiate(&instance, graph, NULL, NULL, 0); + metas->graph_collections[graph_params] = instance; + cudaGraphDestroy(graph); } - ioff += fused->op_num_inputs[op]; - woff += fused->op_num_weights[op]; - ooff += fused->op_num_outputs[op]; } - // for (int i = 0; i < fused->numOutputs; i++) - // print_tensor(output_ptr[i], output_domain[i].get_volume(), - // "[Fused:forward:output]"); + + if (use_cuda_graph) { + assert(metas->graph_collections.find(graph_params) != + metas->graph_collections.end()); + cudaGraphLaunch(instance, stream); + } } /* @@ -1255,6 +1309,8 @@ __host__ void FusedOp::backward_task(Task const *task, for (int op = start + 1; op < fused->numOperators; op++) { if (metas->meta[op] != NULL) { assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + 
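Note: the rewritten FusedOp inference path in src/ops/fused.cu above records its per-operator kernel launches into a CUDA graph keyed by GraphParams (num_active_requests, num_active_tokens, prompt_phase) and replays the instantiated graph on later batches with the same shape, so launch overhead is paid only once per shape. The standalone sketch below shows the same capture-or-replay pattern in isolation; it is not FlexFlow code, and GraphKey, graph_cache, dummy_kernel, and run_step are illustrative names only (the cudaGraphInstantiate call mirrors the one used in the patch).

#include <cuda_runtime.h>
#include <map>
#include <tuple>

// Cache of instantiated graphs, keyed by the batch shape that was captured.
using GraphKey = std::tuple<int, int, int>; // (num_requests, num_tokens, prompt_phase)
static std::map<GraphKey, cudaGraphExec_t> graph_cache;

__global__ void dummy_kernel(float *x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    x[i] *= 2.0f;
  }
}

void run_step(cudaStream_t stream, float *x, int n, GraphKey key) {
  auto it = graph_cache.find(key);
  if (it == graph_cache.end()) {
    // First time this shape is seen: record the launches instead of running them.
    cudaGraph_t graph;
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
    dummy_kernel<<<(n + 255) / 256, 256, 0, stream>>>(x, n);
    cudaStreamEndCapture(stream, &graph);
    cudaGraphExec_t instance;
    cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
    cudaGraphDestroy(graph);
    it = graph_cache.emplace(key, instance).first;
  }
  // Replay the captured work on the stream.
  cudaGraphLaunch(it->second, stream);
}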
assert(metas->meta[start]->handle.blasLt == + metas->meta[op]->handle.blasLt); assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); } } diff --git a/src/ops/gumbel_topk.cc b/src/ops/gumbel_topk.cc new file mode 100644 index 000000000..fb7f8a978 --- /dev/null +++ b/src/ops/gumbel_topk.cc @@ -0,0 +1,536 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/gumbel_topk.h" +#include "flexflow/model.h" +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif + +namespace FlexFlow { +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; +using PCG::Node; + +// For an input tensor, computes the top k entries in each row +// (resp. vector along the last dimension) using Gumbel trick +// (https://arxiv.org/abs/1903.06059). Thus, values.shape = indices.shape = +// input.shape[:-1] + [k] +Tensor FFModel::gumbel_top_k(Tensor const input, + int k, + bool sorted, + bool speculative_decoding, + char const *name) { + Layer *li = new Layer(this, + OP_GUMBEL_TOPK, + input->data_type, + name, + 1, + 0, + speculative_decoding ? 
3 : 1 /*outputs*/, + input); + { + int numdims = input->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[i]; + } + dims[0] = k; + // token_ids + li->outputs[0] = create_tensor_legion_ordering( + numdims, dims, DT_INT32, li, 0, false /*create_grad*/); + if (speculative_decoding) { + // log_probs + li->outputs[1] = create_tensor_legion_ordering( + numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/); + // perturbed_log_probs + li->outputs[2] = create_tensor_legion_ordering( + numdims, dims, DT_FLOAT, li, 2, false /*create_grad*/); + } + } + li->add_int_property("k", k); + li->add_int_property("sorted", sorted); + li->add_int_property("speculative_decoding", speculative_decoding); + layers.push_back(li); + return li->outputs[0]; +} + +Op *GumbelTopK::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + long long value; + layer->get_int_property("k", value); + int k = value; + layer->get_int_property("sorted", value); + bool sorted = (bool)value; + layer->get_int_property("speculative_decoding", value); + bool speculative_decoding = (bool)value; + + return new GumbelTopK(model, + layer->layer_guid, + inputs[0], + k, + sorted, + speculative_decoding, + layer->name); +} + +GumbelTopKParams GumbelTopK::get_params() const { + GumbelTopKParams params; + params.k = this->k; + params.sorted = this->sorted; + params.speculative_decoding = this->speculative_decoding; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } + return params; +} + +bool GumbelTopKParams::is_valid(ParallelTensorShape const &) const { + // gumbel_topk is always valid + return true; +} + +bool operator==(GumbelTopKParams const &lhs, GumbelTopKParams const &rhs) { + return lhs.k == rhs.k && lhs.sorted == rhs.sorted && + lhs.speculative_decoding == rhs.speculative_decoding; +} + +GumbelTopK::GumbelTopK(FFModel &model, + LayerID const &_layer_guid, + ParallelTensor const _input, + int _k, + bool _sorted, + bool _speculative_decoding, + char const *name) + : Op(model, + OP_GUMBEL_TOPK, + _input->data_type, + name, + 1 /*inputs*/, + 0 /*weights*/, + _speculative_decoding ? 
3 : 1 /*outputs*/, + _input), + k(_k), sorted(_sorted), speculative_decoding(_speculative_decoding) { + // overwrite layer_guid + layer_guid = _layer_guid; + int numdim = inputs[0]->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = inputs[0]->dims[i]; + } + + dims[0].size = k; + assert(inputs[0]->dims[0].degree == 1); + assert(inputs[0]->dims[0].parallel_idx == -1); + + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_INT32, this, 0 /*owner_idx*/); + if (_speculative_decoding) { + outputs[1] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_FLOAT, this, 1 /*owner_idx*/); + outputs[2] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_FLOAT, this, 2 /*owner_idx*/); + } +} + +GumbelTopK::GumbelTopK(FFModel &model, + LayerID const &layer_guid, + GumbelTopK const &other, + ParallelTensor const input) + : GumbelTopK(model, + layer_guid, + input, + other.k, + other.sorted, + other.speculative_decoding, + other.name) {} + +GumbelTopK::GumbelTopK(FFModel &model, + GumbelTopKParams const ¶ms, + ParallelTensor const input, + char const *name) + : GumbelTopK(model, + params.layer_guid, + input, + params.k, + params.sorted, + params.speculative_decoding, + params.name) {} + +void GumbelTopK::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(GUMBEL_TOPK_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(GumbelTopK)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + // launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + // 0 /*projection id*/, + // WRITE_ONLY, + // EXCLUSIVE, + // batch_outputs[1]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +void GumbelTopK::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(GUMBEL_TOPK_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(GumbelTopK)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + 
outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +OpMeta *GumbelTopK::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + GumbelTopK *gumbel_topk = (GumbelTopK *)task->args; + FFHandler handle = *((FFHandler *)task->local_args); + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + GumbelTopKMeta *m = + new GumbelTopKMeta(handle, gumbel_topk, gpu_mem_allocator); + m->profiling = gumbel_topk->profiling; + m->inference_debugging = gumbel_topk->inference_debugging; + m->sorted = gumbel_topk->sorted; + m->k = gumbel_topk->k; + std::strcpy(m->op_name, gumbel_topk->name); + m->layer_guid = gumbel_topk->layer_guid; + m->speculative_decoding = gumbel_topk->speculative_decoding; + return m; +} + +void GumbelTopK::forward(FFModel const &ff) { + // GumbelTopK does not support forward + assert(false); +} + +FutureMap + GumbelTopK::inference(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "GumbelTopK op machine_view: " << *(MachineView const *)mv + << std::endl; */ + if (speculative_decoding) { + IndexLauncher launcher(GUMBEL_TOPK_INF_SPECULATIVE_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(2, FID_DATA); + + launcher.add_region_requirement( + RegionRequirement(batch_outputs[2]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[2]->region)); + launcher.add_field(3, FID_DATA); + + return runtime->execute_index_space(ctx, launcher); + } else { + IndexLauncher launcher(GUMBEL_TOPK_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + + return runtime->execute_index_space(ctx, launcher); + } +} + +InferenceResult + GumbelTopK::inference_task(Task const *task, + 
std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + // const GumbelTopK* topk = (const GumbelTopK*) task->args; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + // Directly return for empty batch config + InferenceResult ir; + return ir; + } + GumbelTopKMeta *m = *((GumbelTopKMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( + DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW log_probs; + GenericTensorAccessorW perturbed_log_probs; + + int batch_size = bc->num_active_tokens(); + GumbelTopK::forward_kernel_wrapper( + m, input, log_probs, perturbed_log_probs, indices, batch_size, nullptr); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + GumbelTopK::save_inference_tensors_to_file( + m, shard_id, bc, {input}, {}, {indices}); + } + + InferenceResult ir; + ir.num_token_ids = batch_size * m->k; + ir.num_gumbel_logits = batch_size * m->k; + download_tensor( + indices.get_int32_ptr(), ir.token_ids, batch_size); + return ir; +} + +InferenceResult GumbelTopK::inference_speculative_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 4); + assert(task->regions.size() == 4); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + // Directly return for empty batch config + InferenceResult ir; + return ir; + } + GumbelTopKMeta *m = *((GumbelTopKMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( + DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW log_probs = helperGetGenericTensorAccessorWO( + DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorW perturbed_log_probs = helperGetGenericTensorAccessorWO( + DT_FLOAT, regions[3], task->regions[3], FID_DATA, ctx, runtime); + + int batch_size = bc->num_active_tokens(); + GumbelTopK::forward_kernel_wrapper( + m, input, log_probs, perturbed_log_probs, indices, batch_size, bc); + + InferenceResult ir; + ir.num_token_ids = batch_size * m->k; + ir.num_gumbel_logits = batch_size * m->k; + download_tensor( + indices.get_int32_ptr(), ir.token_ids, batch_size * m->k); + download_tensor( + log_probs.get_float_ptr(), ir.probs, batch_size * m->k); + download_tensor( + perturbed_log_probs.get_float_ptr(), ir.gumbel_logits, batch_size * m->k); + return ir; +} + +void GumbelTopK::backward(FFModel const &ff) { + // GumbelTopK does not support backward + assert(false); +} + +void GumbelTopK::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); + sez.serialize(this->k); + sez.serialize(this->sorted); + sez.serialize(this->speculative_decoding); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); +} + +Node GumbelTopK::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int 
num_inputs) { + assert(num_inputs == 1); + size_t id, transformer_layer_id, deserialized_model_id; + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); + int k; + bool sorted; + bool speculative_decoding; + dez.deserialize(k); + dez.deserialize(sorted); + dez.deserialize(speculative_decoding); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); + GumbelTopKParams params; + params.layer_guid = layer_guid; + params.k = k; + params.sorted = sorted; + params.speculative_decoding = speculative_decoding; + strcpy(params.name, name); + return ff.get_or_create_node<GumbelTopK>(inputs[0], params); +} + +Op *GumbelTopK::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + GumbelTopKParams params = get_params(); + return new GumbelTopK(ff, params, inputs[0], this->name); +} + +bool GumbelTopK::measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const { + return false; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash<FlexFlow::GumbelTopKParams>::operator()( + FlexFlow::GumbelTopKParams const &params) const { + size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.k); + hash_combine(key, params.sorted); + hash_combine(key, params.speculative_decoding); + return key; +} +}; // namespace std diff --git a/src/ops/gumbel_topk.cu b/src/ops/gumbel_topk.cu new file mode 100644 index 000000000..1af6c5eab --- /dev/null +++ b/src/ops/gumbel_topk.cu @@ -0,0 +1,618 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
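The new src/ops/gumbel_topk.cu added below implements this operator's sampling step on the GPU. As a standalone illustration only (this sketch is not part of the diff, and the helper name gumbel_top_k is invented), the Gumbel-top-k trick the kernels rely on can be written on the host in a few lines: keeping the k largest values of log(p_i) - log(-log(U_i)), with U_i drawn uniformly from (0, 1), yields k distinct indices distributed as k draws without replacement from the categorical distribution p.

#include <algorithm>
#include <cassert>
#include <cmath>
#include <numeric>
#include <random>
#include <vector>

// Host-side sketch of the Gumbel-top-k trick (illustration, not FlexFlow code).
std::vector<int> gumbel_top_k(std::vector<float> const &probs, int k,
                              std::mt19937 &gen) {
  assert(k <= (int)probs.size());
  std::uniform_real_distribution<float> unif(0.0f, 1.0f);
  std::vector<float> perturbed(probs.size());
  for (size_t i = 0; i < probs.size(); i++) {
    float u = 1.0f - unif(gen); // in (0, 1], mirroring curand_uniform
    perturbed[i] = std::log(probs[i]) - std::log(-std::log(u));
  }
  std::vector<int> idx(probs.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                    [&](int a, int b) { return perturbed[a] > perturbed[b]; });
  idx.resize(k); // indices of the k largest perturbed log-probs
  return idx;
}

Taking k = 1 reduces this to ordinary categorical sampling (the classic Gumbel-max trick).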
+ */ + +#include "flexflow/ops/gumbel_topk.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { +// declare Legion names +using Legion::coord_t; + +enum class HeapType { kMinHeap, kMaxHeap }; +enum class PreferIndices { kLower, kHigher }; + +template +struct GumbelEntry { + int index; + T value; + T perturbed_value; +}; + +template +struct LinearData { + typedef GumbelEntry GumbelEntry; + + __device__ GumbelEntry &operator[](std::size_t i) const { + return data[i]; + } + + __device__ int get_index(int i) const { + return data[i].index; + } + __device__ T get_value(int i) const { + return data[i].value; + } + __device__ T get_perturbed_value(int i) const { + return data[i].perturbed_value; + } + + GumbelEntry *const data; +}; + +template +struct IndirectLinearData { + typedef GumbelEntry GumbelEntry; + + __device__ GumbelEntry &operator[](std::size_t i) const { + return data[i]; + } + + __device__ int get_index(int i) const { + return backing_data[data[i].index].index; + } + __device__ T get_value(int i) const { + return data[i].value; + } + __device__ T get_perturbed_value(int i) const { + return data[i].perturbed_value; + } + + GumbelEntry *const data; + GumbelEntry *const backing_data; +}; + +template +struct StridedData { + typedef GumbelEntry GumbelEntry; + + __device__ GumbelEntry &operator[](std::size_t i) const { + return data[i * blockDim.x + threadIdx.x]; + } + + __device__ int get_index(int i) const { + return (*this)[i].index; + } + __device__ T get_value(int i) const { + return (*this)[i].value; + } + __device__ T get_perturbed_value(int i) const { + return (*this)[i].perturbed_value; + } + + GumbelEntry *const data; +}; + +// A heap of GumbelEntry that can either work as a min-heap or as a max-heap. +template + class Data, + typename T> +struct IndexedHeap { + typedef typename Data::GumbelEntry GumbelEntry; + Data const data; + __device__ IndexedHeap(Data const &d) : data(d) {} + + __device__ bool is_above(int left, int right) { + T left_perturbed_value = data.get_perturbed_value(left); + T right_perturbed_value = data.get_perturbed_value(right); + if (left_perturbed_value == right_perturbed_value) { + if (preferIndices == PreferIndices::kLower) { + return data.get_index(left) < data.get_index(right); + } else { + return data.get_index(left) > data.get_index(right); + } + } + if (heapType == HeapType::kMinHeap) { + return left_perturbed_value < right_perturbed_value; + } else { + return left_perturbed_value > right_perturbed_value; + } + } + + __device__ void assign(int i, GumbelEntry const &entry) { + data[i] = entry; + } + + __device__ void push_up(int i) { + int child = i; + int parent; + for (; child > 0; child = parent) { + parent = (child - 1) / 2; + if (!is_above(child, parent)) { + // Heap property satisfied. 
+ break; + } + swap(child, parent); + } + } + + __device__ void swap(int a, int b) { + auto tmp = data[b]; + data[b] = data[a]; + data[a] = tmp; + } + + __device__ void push_root_down(int k) { + push_down(0, k); + } + + // MAX-HEAPIFY in Cormen + __device__ void push_down(int node, int k) { + while (true) { + int const left = 2 * node + 1; + int const right = left + 1; + int smallest = node; + if (left < k && is_above(left, smallest)) { + smallest = left; + } + if (right < k && is_above(right, smallest)) { + smallest = right; + } + if (smallest == node) { + break; + } + swap(smallest, node); + node = smallest; + } + } + + // BUILD-MAX-HEAPIFY in Cormen + __device__ void build(int k) { + for (int node = (k - 1) / 2; node >= 0; node--) { + push_down(node, k); + } + } + + // HEAP-EXTRACT-MAX in Cormen + __device__ void remove_root(int k) { + data[0] = data[k - 1]; + push_root_down(k - 1); + } + + // in-place HEAPSORT in Cormen + // This method destroys the heap property. + __device__ void sort(int k) { + for (int slot = k - 1; slot > 0; slot--) { + // This is like remove_root but we insert the element at the end. + swap(slot, 0); + // Heap is now an element smaller. + push_root_down(/*k=*/slot); + } + } + + __device__ void replace_root(GumbelEntry const &entry, int k) { + data[0] = entry; + push_root_down(k); + } + + __device__ GumbelEntry const &root() { + return data[0]; + } +}; + +template + class Data, + typename T> +__device__ IndexedHeap + make_indexed_heap(typename Data::GumbelEntry *data) { + return IndexedHeap{Data{data}}; +} + +__global__ void + init_random_state_kernel(curandState *state, int batch_size, long rand) { + CUDA_KERNEL_LOOP(i, batch_size) { + curand_init(rand, i, 0, &state[i]); + } +} + +// Unified log function for float +__device__ inline float unified_log(float x) { + return logf(x); +} + +// Unified log function for half +__device__ inline __half unified_log(__half x) { + return hlog(x); +} + +// heapGumbelTopK walks over [input, input+length) with `step_size` stride +// starting at `start_index`. It builds a top-`k` heap that is stored in +// `heap_entries` using `Accessor` to access elements in `heap_entries`. If +// sorted=true, the elements will be sorted at the end. NOTE that it applies +// Gumbel trick on `input`, which is, input -> log(input) - log(-log(U)), where +// U is a uniform random number in (0, 1). +template class Data = LinearData> +__device__ void heapGumbelTopK(curandState state, + T const *__restrict__ input, + int length, + int k, + GumbelEntry *__restrict__ heap_entries, + bool sorted = false, + int start_index = 0, + int step_size = 1) { + assert(k <= length); + + auto heap = + make_indexed_heap( + heap_entries); + + int heap_end_index = start_index + k * step_size; + if (heap_end_index > length) { + heap_end_index = length; + } + // Initialize the min-heap. + for (int index = start_index, slot = 0; index < heap_end_index; + index += step_size, slot++) { + T value = unified_log(input[index]); + T perturbed_value = + value - unified_log(-unified_log((T)curand_uniform(&state))); + heap.assign(slot, {index, value, perturbed_value}); + } + + heap.build(k); + + // Now iterate over the remaining items. + // If an item is smaller than the min element, it is not amongst the top k. + // Otherwise, replace the min element with it and push upwards. + for (int index = heap_end_index; index < length; index += step_size) { + // We prefer elements with lower indices. This is given here. 
+ // Later elements automatically have higher indices, so can be discarded. + T value = unified_log(input[index]); + T perturbed_value = + value - unified_log(-unified_log((T)curand_uniform(&state))); + if (perturbed_value > heap.root().perturbed_value) { + // This element should replace the min. + heap.replace_root({index, value, perturbed_value}, k); + } + } + + // Sort if wanted. + if (sorted) { + heap.sort(k); + } +} + +// mergeShards performs a top-k merge on `num_shards` many sorted streams that +// are sorted and stored in `entries` in a strided way: +// |s_1 1st|s_2 1st|...s_{num_shards} 1st|s_1 2nd|s_2 2nd|... +// The overall top k elements are written to `top_k_values` and +// `top_k_perturbed_values`, and their indices to `top_k_indices`. `top_k_heap` +// is used as temporary storage for the merge heap. +template +__device__ void mergeShards(int num_shards, + int k, + GumbelEntry *__restrict__ entries, + GumbelEntry *__restrict__ top_k_heap, + float *top_k_values, + float *top_k_perturbed_values, + int *top_k_indices, + bool speculative_decoding) { + // If k < num_shards, we can use a min-heap with k elements to get the top k + // of the sorted blocks. + // If k > num_shards, we can initialize a min-heap with the top element from + // each sorted block. + int const heap_size = k < num_shards ? k : num_shards; + + // Min-heap part. + { + auto min_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; + // Initialize the heap as a min-heap. + for (int slot = 0; slot < heap_size; slot++) { + min_heap.assign( + slot, {slot, entries[slot].value, entries[slot].perturbed_value}); + } + min_heap.build(heap_size); + + // Now perform top k with the remaining shards (if num_shards > heap_size). + for (int shard = heap_size; shard < num_shards; shard++) { + auto const entry = entries[shard]; + auto const root = min_heap.root(); + if (entry.perturbed_value < root.perturbed_value) { + continue; + } + if (entry.perturbed_value == root.perturbed_value && + entry.index > entries[root.index].index) { + continue; + } + // This element should replace the min. + min_heap.replace_root({shard, entry.value, entry.perturbed_value}, + heap_size); + } + } + + // Max-part. + { + // Turn the min-heap into a max-heap in-place. + auto max_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; + // Heapify into a max heap. + max_heap.build(heap_size); + + // Now extract the minimum k-1 times. + // k is treated specially. + int const last_k = k - 1; + for (int rank = 0; rank < last_k; rank++) { + GumbelEntry const &max_element = max_heap.root(); + int shard_index = max_element.index; + top_k_indices[rank] = entries[shard_index].index; + if (speculative_decoding) { + assert(top_k_values != nullptr); + top_k_values[rank] = static_cast(max_element.value); + top_k_perturbed_values[rank] = + static_cast(max_element.perturbed_value); + } + int next_shard_index = shard_index + num_shards; + // For rank < k-1, each top k heap still contains at least 1 element, + // so we can draw a replacement. + max_heap.replace_root({next_shard_index, + entries[next_shard_index].value, + entries[next_shard_index].perturbed_value}, + heap_size); + } + + // rank == last_k. 
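heapGumbelTopK and mergeShards implement the two-stage top-k pattern adopted from TensorFlow's topk_op_gpu.h (credited in forward_kernel further down): each thread keeps the k best perturbed log-probs of its strided slice of the row in a per-thread heap held in shared memory, and thread 0 then merges the num_shards sorted shards into the final top k. A rough CPU analogue of that shard-and-merge structure, purely for illustration (the name sharded_top_k is invented):

#include <algorithm>
#include <functional>
#include <vector>

// CPU analogue of the shard-and-merge top-k used by the kernels here: each
// "worker" keeps the k best values of its strided slice, then the
// num_shards * k candidates are reduced to the global top k.
std::vector<float> sharded_top_k(std::vector<float> const &row, int k,
                                 int num_shards) {
  std::vector<float> candidates;
  for (int s = 0; s < num_shards; s++) {
    std::vector<float> shard;
    for (size_t i = s; i < row.size(); i += (size_t)num_shards) {
      shard.push_back(row[i]); // strided slice owned by worker s
    }
    int kk = std::min(k, (int)shard.size());
    std::partial_sort(shard.begin(), shard.begin() + kk, shard.end(),
                      std::greater<float>());
    candidates.insert(candidates.end(), shard.begin(), shard.begin() + kk);
  }
  int kk = std::min(k, (int)candidates.size());
  std::partial_sort(candidates.begin(), candidates.begin() + kk,
                    candidates.end(), std::greater<float>());
  candidates.resize(kk); // global top k of the whole row
  return candidates;
}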
+ GumbelEntry const &max_element = max_heap.root(); + int shard_index = max_element.index; + top_k_indices[last_k] = entries[shard_index].index; + if (speculative_decoding) { + assert(top_k_values != nullptr); + top_k_values[last_k] = static_cast(max_element.value); + top_k_perturbed_values[last_k] = + static_cast(max_element.perturbed_value); + } + } +} + +template +__global__ void + gumbel_topk_forward_kernel(curandState *state, + T const *__restrict__ input, + size_t shared_memory_size, + int length, + int k, + bool sorted, + float *__restrict__ log_probs_ptr, + float *__restrict__ perturbed_log_probs_ptr, + int *__restrict__ indices, + bool speculative_decoding) { + __shared__ char shared_memory[48 << 10]; // block-wise shared memory + int const batch_index = blockIdx.x; + T const *batch_input = input + batch_index * length; + int const thread_index = threadIdx.x; + int const thread_count = blockDim.x; + GumbelEntry *shared_entries = (GumbelEntry *)shared_memory; + heapGumbelTopK( + state[thread_index + batch_index * thread_count], + batch_input, + length, + k, + shared_entries, + true, + thread_index, + thread_count); + __syncthreads(); + if (thread_index == 0) { + int const offset = batch_index * k; + auto batch_log_probs_ptr = log_probs_ptr + offset; + auto batch_perturbed_log_probs_ptr = perturbed_log_probs_ptr + offset; + auto batch_indices = indices + offset; + GumbelEntry *top_k_heap = shared_entries + thread_count * k; + mergeShards(thread_count, + k, + shared_entries, + top_k_heap, + batch_log_probs_ptr, + batch_perturbed_log_probs_ptr, + batch_indices, + speculative_decoding); + } +} + +/*static*/ +template +void GumbelTopK::forward_kernel(GumbelTopKMeta const *m, + DT const *input_ptr, + float *log_probs_ptr, + float *perturbed_log_probs_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted, + BatchConfig const *bc, + cudaStream_t stream) { + // Adopted from TensorFlow's ArgTopK implementation + // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h + int num_shards = 0; + { + constexpr auto shared_memory_size = 48 << 10; + auto const heap_size = k * sizeof(GumbelEntry
); + // shared_memory_size = (num_shards + 1) * heap_size <=> + num_shards = shared_memory_size / heap_size - 1; + assert(num_shards > 0); + if (num_shards > CUDA_NUM_THREADS) { + num_shards = CUDA_NUM_THREADS; + } + } + // We are limited by the amount of shared memory we have per block. + size_t shared_memory_size = (num_shards + 1) * k * sizeof(GumbelEntry
); + // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; + size_t num_blocks = batch_size; + + // all requests share the same number of branches + if (m->speculative_decoding) { + assert(bc->num_active_requests() >= 0); + assert(num_shards >= (size_t)BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); + num_shards = k; + + int state_length = batch_size * num_shards; + init_random_state_kernel<<>>(m->state, state_length, rand()); + + gumbel_topk_forward_kernel<<>>( + m->state, + input_ptr, + shared_memory_size, + length, + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, + sorted, + log_probs_ptr, + perturbed_log_probs_ptr, + indices_ptr, + m->speculative_decoding); + } else { + assert(num_shards >= (size_t)k); + num_shards = k; + + int state_length = batch_size * num_shards; + init_random_state_kernel<<>>(m->state, state_length, rand()); + + gumbel_topk_forward_kernel<<>>( + m->state, + input_ptr, + shared_memory_size, + length, + k, + sorted, + nullptr, + nullptr, + indices_ptr, + false); + } +} + +/*static*/ +void GumbelTopK::forward_kernel_wrapper( + GumbelTopKMeta const *m, + GenericTensorAccessorR const &input, + // float *output_ptr, + GenericTensorAccessorW const &log_probs, + GenericTensorAccessorW const &perturbed_log_probs, + GenericTensorAccessorW const &indices, + int batch_size, + BatchConfig const *bc) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + // Domain in1_domain = runtime->get_index_space_domain( + // ctx, task->regions[0].region.get_index_space()); + // Domain out1_domain = runtime->get_index_space_domain( + // ctx, task->regions[1].region.get_index_space()); + // Domain out2_domain = runtime->get_index_space_domain( + // ctx, task->regions[1].region.get_index_space()); + int numdims = input.domain.get_dim(); + assert(indices.domain.get_dim() == numdims); + + int in_cols = input.domain.hi()[0] - input.domain.lo()[0] + 1; + // int out1_cols = out1_domain.hi()[0] - out1_domain.lo()[0] + 1; + int out2_cols = indices.domain.hi()[0] - indices.domain.lo()[0] + 1; + + // assert(out1_domain == out2_domain); + for (int i = 1; i < input.domain.get_dim(); i++) { + assert(input.domain.lo()[i] == indices.domain.lo()[i]); + assert(input.domain.hi()[i] == indices.domain.hi()[i]); + } + // float const *in_ptr = helperGetTensorPointerRO( + // regions[0], task->regions[0], FID_DATA, ctx, runtime); + // float *value_ptr = helperGetTensorPointerWO( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + // int *index_ptr = helperGetTensorPointerWO( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + + int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int k = indices.domain.hi()[0] - indices.domain.lo()[0] + + 1; /*TODO: This prints to 5*/ + + // batch_size = input.domain.get_volume() / length; + // assert(indices.domain.get_volume() / k == batch_size); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (input.data_type == DT_HALF) { + GumbelTopK::forward_kernel( + m, + input.get_half_ptr(), + m->speculative_decoding ? log_probs.get_float_ptr() : nullptr, + m->speculative_decoding ? perturbed_log_probs.get_float_ptr() : nullptr, + indices.get_int32_ptr(), + batch_size, + length, + k, + m->sorted, + m->speculative_decoding ? bc : nullptr, + stream); + } else if (input.data_type == DT_FLOAT) { + GumbelTopK::forward_kernel( + m, + input.get_float_ptr(), + m->speculative_decoding ? 
log_probs.get_float_ptr() : nullptr, + m->speculative_decoding ? perturbed_log_probs.get_float_ptr() : nullptr, + indices.get_int32_ptr(), + batch_size, + length, + k, + m->sorted, + m->speculative_decoding ? bc : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[GumbelTopK] forward time = %.2lfms\n", elapsed); + } +} + +GumbelTopKMeta::GumbelTopKMeta(FFHandler handler, + Op const *op, + MemoryAllocator &gpu_mem_allocator) + : OpMeta(handler, op) { + state_max_length = + BatchConfig::MAX_NUM_TOKENS * + max(BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, CUDA_NUM_THREADS); + gpu_mem_allocator.create_legion_instance( + reserveInst, sizeof(curandState) * state_max_length, "GumbelTopKMeta"); + state = gpu_mem_allocator.allocate_instance(state_max_length); +} + +GumbelTopKMeta::~GumbelTopKMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} +}; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 7aa350377..b819b4936 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -46,7 +46,7 @@ using Legion::TaskArgument; using Legion::TaskLauncher; using PCG::Node; -LegionRuntime::Logger::Category log_inc_mha("IncrementalMHA"); +Legion::Logger log_inc_mha("IncrementalMHA"); bool IncMultiHeadSelfAttentionParams::is_valid( ParallelTensorShape const &input) const { @@ -54,61 +54,66 @@ bool IncMultiHeadSelfAttentionParams::is_valid( return is_valid; } -Tensor FFModel::inc_multihead_self_attention(const Tensor input, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { - return inc_multiquery_self_attention(input, - embed_dim, - num_heads, - num_heads, - kdim, - vdim, - dropout, - qkv_bias, - final_bias, - add_zero_attn, - data_type, - kernel_initializer, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - name); +Tensor FFModel::inc_multihead_self_attention( + const Tensor input, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool qkv_bias, + bool final_bias, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + RotaryEmbeddingMeta rotary_embedding_meta, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + bool streaming_cache, + char const *name) { + return groupquery_self_attention(input, + embed_dim, + num_heads, + num_heads, + kdim, + vdim, + dropout, + qkv_bias, + final_bias, + add_zero_attn, + data_type, + kernel_initializer, + rotary_embedding_meta, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + streaming_cache, + name); } -Tensor FFModel::inc_multiquery_self_attention(const Tensor input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool 
apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { +Tensor FFModel::groupquery_self_attention( + const Tensor input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim, + int vdim, + float dropout, + bool qkv_bias, + bool final_bias, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + RotaryEmbeddingMeta rotary_embedding_meta, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + bool streaming_cache, + char const *name) { if (data_type == DT_NONE) { data_type = input->data_type; } @@ -147,13 +152,12 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, numdims, dims, data_type, li, 0, true /*create_grad*/); } // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); + int qk_dim = kdim, v_dim = kdim, o_dim = embed_dim; + int hidden_size = input->dims[0]; + int qParas = qk_dim * hidden_size; + int kParas = qk_dim * hidden_size; + int vParas = v_dim * hidden_size; + int oParas = o_dim * (v_dim > 0 ? v_dim : hidden_size); // allocate num_q_heads for key, value for replication int weight_size = qParas * num_q_heads + kParas * num_q_heads + @@ -178,10 +182,8 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, } if (qkv_bias || final_bias) { // q, k, v, o - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + - (final_bias ? oProjSize : 0)}; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; + int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
o_dim : 0)}; li->weights[1] = create_weight_legion_ordering(1, dims, data_type, @@ -200,13 +202,24 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, li->add_int_property("final_bias", final_bias); li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); - li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + li->add_int_property("apply_rotary_embedding", + rotary_embedding_meta.apply_rotary_embedding); + li->add_float_property("rope_theta", rotary_embedding_meta.rope_theta); + li->add_string_property("rope_type", rotary_embedding_meta.rope_type); + li->add_float_property("factor", rotary_embedding_meta.factor); + li->add_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + li->add_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + li->add_int_property("original_max_position_embeddings", + rotary_embedding_meta.original_max_position_embeddings); li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("qk_prod_scaling", qk_prod_scaling); li->add_int_property("position_bias", position_bias); li->add_int_property("quantization_type", quantization_type); li->add_int_property("offload", offload); + li->add_int_property("streaming_cache", streaming_cache); li->add_int_property("tensor_parallelism_degree", config.tensor_parallelism_degree); layers.push_back(li); @@ -237,8 +250,18 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( bool final_bias = (bool)value; layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; + RotaryEmbeddingMeta rotary_embedding_meta; layer->get_int_property("apply_rotary_embedding", value); - bool apply_rotary_embedding = (bool)value; + rotary_embedding_meta.apply_rotary_embedding = (bool)value; + layer->get_float_property("rope_theta", rotary_embedding_meta.rope_theta); + layer->get_string_property("rope_type", rotary_embedding_meta.rope_type); + layer->get_float_property("factor", rotary_embedding_meta.factor); + layer->get_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + layer->get_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + layer->get_int_property("original_max_position_embeddings", value); + rotary_embedding_meta.original_max_position_embeddings = (int)value; layer->get_int_property("scaling_query", value); bool scaling_query = (bool)value; float scaling_factor; @@ -252,6 +275,8 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( DataType quantization_type = (DataType)value; layer->get_int_property("offload", value); bool offload = (bool)value; + layer->get_int_property("streaming_cache", value); + bool streaming_cache = (bool)value; layer->get_int_property("tensor_parallelism_degree", value); int tensor_parallelism_degree = (int)value; @@ -267,7 +292,7 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( qkv_bias, final_bias, add_zero_attn, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -275,6 +300,7 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( false /*allocate_weights*/, quantization_type, offload, + streaming_cache, tensor_parallelism_degree, layer->name); } @@ -292,7 +318,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool 
_scaling_query, float _scaling_factor, bool _qk_prod_scaling, @@ -300,6 +326,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool allocate_weights, DataType _quantization_type, bool _offload, + bool _streaming_cache, int _tensor_parallelism_degree, char const *name) // Initializer* _bias_initializer) @@ -314,14 +341,13 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), - scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), - quantization_type(_quantization_type), offload(_offload), + rotary_embedding_meta(_rotary_embedding_meta), + hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim), + o_dim(_embed_dim), qoSeqLength(_input->dims[1].size), + kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), + scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), + position_bias(_position_bias), quantization_type(_quantization_type), + offload(_offload), streaming_cache(_streaming_cache), tensor_parallelism_degree(_tensor_parallelism_degree) { // overwrite layer_guid layer_guid = _layer_guid; @@ -340,11 +366,11 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( // Create weight tensor int num_dims = inputs[0]->num_dims; // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; + int qParas = this->qk_dim * this->hidden_size; + int kParas = this->qk_dim * this->hidden_size; + int vParas = this->v_dim * this->hidden_size; int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); + this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size); ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; @@ -368,10 +394,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
o_dim : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -406,7 +431,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, @@ -414,6 +439,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool allocate_weights, DataType _quantization_type, bool _offload, + bool _streaming_cache, int _tensor_parallelism_degree, char const *name) // Initializer* _bias_initializer) @@ -429,14 +455,13 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), - scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), - quantization_type(_quantization_type), offload(_offload), + rotary_embedding_meta(_rotary_embedding_meta), + hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim), + o_dim(_embed_dim), qoSeqLength(_input->dims[1].size), + kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), + scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), + position_bias(_position_bias), quantization_type(_quantization_type), + offload(_offload), streaming_cache(_streaming_cache), tensor_parallelism_degree(_tensor_parallelism_degree) // bias_initializer(_bias_initializer) { @@ -453,11 +478,11 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( // Create weight tensor int num_dims = inputs[0]->num_dims; // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; + int qParas = this->qk_dim * this->hidden_size; + int kParas = this->qk_dim * this->hidden_size; + int vParas = this->v_dim * this->hidden_size; int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); + this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size); ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; @@ -482,10 +507,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
o_dim : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -518,16 +542,16 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( : IncMultiHeadSelfAttention(model, other.layer_guid, input, - other.oProjSize, + other.o_dim, other.num_q_heads, other.num_kv_heads, - other.qProjSize, - other.vProjSize, + other.qk_dim, + other.v_dim, other.dropout, other.qkv_bias, other.final_bias, other.add_zero_attn, - other.apply_rotary_embedding, + other.rotary_embedding_meta, other.scaling_query, other.scaling_factor, other.qk_prod_scaling, @@ -535,6 +559,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( allocate_weights, other.quantization_type, other.offload, + other.streaming_cache, other.tensor_parallelism_degree, other.name) {} @@ -556,7 +581,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( params.qkv_bias, params.final_bias, params.add_zero_attn, - params.apply_rotary_embedding, + params.rotary_embedding_meta, params.scaling_query, params.scaling_factor, params.qk_prod_scaling, @@ -564,6 +589,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( allocate_weights, params.quantization_type, params.offload, + params.streaming_cache, params.tensor_parallelism_degree, params.name) {} @@ -696,7 +722,7 @@ OpMeta *IncMultiHeadSelfAttention::init_task( attn->num_kv_heads / attn->tensor_parallelism_degree + (attn->num_kv_heads % attn->tensor_parallelism_degree != 0); - assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); + assert(attn->o_dim == output.domain.hi()[0] - output.domain.lo()[0] + 1); Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::GPU_FB_MEM) @@ -902,25 +928,38 @@ bool operator==(IncMultiHeadSelfAttentionParams const &lhs, lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && lhs.add_zero_attn == rhs.add_zero_attn && - lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && + lhs.rotary_embedding_meta.apply_rotary_embedding == + rhs.rotary_embedding_meta.apply_rotary_embedding && + lhs.rotary_embedding_meta.rope_theta == + rhs.rotary_embedding_meta.rope_theta && + lhs.rotary_embedding_meta.rope_type == + rhs.rotary_embedding_meta.rope_type && + lhs.rotary_embedding_meta.factor == rhs.rotary_embedding_meta.factor && + lhs.rotary_embedding_meta.low_freq_factor == + rhs.rotary_embedding_meta.low_freq_factor && + lhs.rotary_embedding_meta.high_freq_factor == + rhs.rotary_embedding_meta.high_freq_factor && + lhs.rotary_embedding_meta.original_max_position_embeddings == + rhs.rotary_embedding_meta.original_max_position_embeddings && lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && lhs.qk_prod_scaling == rhs.qk_prod_scaling && - lhs.position_bias == rhs.position_bias; + lhs.position_bias == rhs.position_bias && + lhs.streaming_cache == rhs.streaming_cache; } IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { IncMultiHeadSelfAttentionParams params; params.layer_guid = this->layer_guid; - params.embed_dim = this->oProjSize; + params.embed_dim = this->o_dim; params.num_q_heads = this->num_q_heads; - params.kdim = this->kProjSize; - params.vdim = this->vProjSize; + params.kdim = this->qk_dim; + params.vdim = this->v_dim; params.dropout = this->dropout; params.qkv_bias = this->qkv_bias; params.final_bias = this->final_bias; params.add_zero_attn = this->add_zero_attn; - params.apply_rotary_embedding = 
this->apply_rotary_embedding; + params.rotary_embedding_meta = this->rotary_embedding_meta; params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; @@ -928,6 +967,7 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { params.tensor_parallelism_degree = this->tensor_parallelism_degree, params.quantization_type = this->quantization_type; params.offload = this->offload; + params.streaming_cache = this->streaming_cache; params.num_kv_heads = this->num_kv_heads; if (this->name != nullptr) { strcpy(params.name, this->name); @@ -952,13 +992,21 @@ size_t hash::operator()( hash_combine(key, params.qkv_bias); hash_combine(key, params.final_bias); hash_combine(key, params.add_zero_attn); - hash_combine(key, params.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.rope_theta); + hash_combine(key, params.rotary_embedding_meta.rope_type); + hash_combine(key, params.rotary_embedding_meta.factor); + hash_combine(key, params.rotary_embedding_meta.low_freq_factor); + hash_combine(key, params.rotary_embedding_meta.high_freq_factor); + hash_combine(key, + params.rotary_embedding_meta.original_max_position_embeddings); hash_combine(key, params.scaling_query); hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); hash_combine(key, params.position_bias); hash_combine(key, params.quantization_type); hash_combine(key, params.offload); + hash_combine(key, params.streaming_cache); hash_combine(key, params.tensor_parallelism_degree); return key; } diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index d60386f92..449940155 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/ops/inc_multihead_self_attention.h" +#include "flexflow/ffconst.h" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/utils/hip_helper.h" #include #include +#include namespace FlexFlow { @@ -123,56 +125,17 @@ __global__ void scaling_query_kernel(DT *input_ptr, } } -template -__global__ void - apply_rotary_embedding_native(DT *input_ptr, - hipFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_q_heads, - int num_tokens, - int num_kv_heads, - int q_block_size, - int k_block_size, - int q_array_size) { - CUDA_KERNEL_LOOP( - i, - num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) { - // create complex number - bool q_tensor = i < (q_array_size / 2); - int proj_size = q_tensor ? qProjSize : kProjSize; - int real_i = q_tensor ? i : i - q_array_size / 2; - - int head_idx = real_i / (num_tokens * proj_size / 2); - int idx = real_i % (num_tokens * proj_size / 2); - int real_part_index = idx * 2 + - head_idx * (q_tensor ? q_block_size : k_block_size) + - (q_tensor ? 
0 : q_array_size); - - int complex_part_index = real_part_index + 1; - - complex_input[i] = {input_ptr[real_part_index], - input_ptr[complex_part_index]}; - - int token_idx = - (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); - size_t pos = tokenInfos[token_idx].abs_depth_in_request; - int pos_i = real_i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); - hipFloatComplex complex_pos = {cos(freq), sin(freq)}; - - complex_input[i] = hipCmulf(complex_input[i], complex_pos); - input_ptr[real_part_index] = complex_input[i].x; - input_ptr[complex_part_index] = complex_input[i].y; - } -} - template __global__ void apply_rotary_embedding_hf(DT *input_ptr, hipFloatComplex *complex_input, BatchConfig::PerTokenInfo const *tokenInfos, + float rope_theta, + bool llama3_rope, + float factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, int qProjSize, int kProjSize, int num_tokens, @@ -207,7 +170,29 @@ __global__ void // float before_real = complex_input[i].x, before_complex = int pos_i = real_i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); + + float freq = + pos * (1.0 / pow(rope_theta, (float)2 * pos_i / proj_size)); // θ_i + + if (llama3_rope) { + float pi = CUDART_PI_F; + float wavelen = 2 * pi / freq; + float low_freq_wavelen = + original_max_position_embeddings / low_freq_factor; + float high_freq_wavelen = + original_max_position_embeddings / high_freq_factor; + if (wavelen < high_freq_wavelen) { + } else if (wavelen > low_freq_wavelen) { + freq = freq / factor; + } else { + assert(low_freq_wavelen != high_freq_wavelen); + float smooth = + (original_max_position_embeddings / wavelen - low_freq_factor) / + (high_freq_factor - low_freq_factor); + freq = ((1 - smooth) * freq / factor + smooth * freq); + } + } + hipFloatComplex complex_pos = {cos(freq), sin(freq)}; complex_input[i] = hipCmulf(complex_input[i], complex_pos); @@ -232,7 +217,7 @@ __global__ void store_kv_cache(DT const *devQKVProjArray, DT vVal = devQKVProjArray[val_idx + hidden_size]; int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int const tok_id = tokenInfos[token_idx].abs_index_in_request; // key cache kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + @@ -243,14 +228,14 @@ __global__ void store_kv_cache(DT const *devQKVProjArray, } template -void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT const *input_ptr, - DT const *weight_ptr, - DT *output_ptr, - DT const *bias_ptr, - hipStream_t stream) { +void compute_qkv(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + hipStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); @@ -319,7 +304,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, *m->scaling_query, m->scaling_factor, - m->hidden_size); + m->local_hidden_size); } else if (m->scaling_query) { hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel
), GET_BLOCKS(parallelism), @@ -331,24 +316,31 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, m->qProjSize, m->scaling_factor, - m->hidden_size); + m->local_hidden_size); } - if (*m->apply_rotary_embedding) { + if (m->rotary_embedding_meta->apply_rotary_embedding) { /*q&k*/ parallelism = num_tokens * m->hidden_size; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_hf
), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - m->complex_input, - m->token_infos, - m->qProjSize, - m->kProjSize, - num_tokens, - q_array_size, - m->hidden_size); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(apply_rotary_embedding_hf), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + output_ptr, + m->complex_input, + m->token_infos, + m->rotary_embedding_meta->rope_theta, + (m->rotary_embedding_meta->rope_type == "llama3"), + m->rotary_embedding_meta->factor, + m->rotary_embedding_meta->low_freq_factor, + m->rotary_embedding_meta->high_freq_factor, + m->rotary_embedding_meta->original_max_position_embeddings, + m->qProjSize, + m->kProjSize, + num_tokens, + q_array_size, + m->hidden_size); } } @@ -358,7 +350,7 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, hipStream_t stream) { int num_tokens = bc->num_active_tokens(); if (num_tokens > 0) { - int parallelism = m->hidden_size * num_tokens; + int parallelism = m->local_hidden_size * num_tokens; hipLaunchKernelGGL(HIP_KERNEL_NAME(store_kv_cache
), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), @@ -370,15 +362,15 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, m->token_infos, num_tokens, BatchConfig::max_sequence_length(), - m->hidden_size); + m->local_hidden_size); } } template -void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - hipStream_t stream) { +void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + hipStream_t stream) { // additional processing for weight uploading // Note that we update weight_ptr and bias_ptr when uploading weight and // bias @@ -457,14 +449,14 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, hipMemcpyHostToDevice, stream)); // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
(m->devQKVProjArray), - bias_ptr, - stream); + compute_qkv(m, + bc, + shard_id, + input_ptr, + weight_ptr, + static_cast
(m->devQKVProjArray), + bias_ptr, + stream); // phase 2: Update key/val cache update_kv_cache_kernel
(m, bc, stream); @@ -530,11 +522,11 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (!bc->request_available[i]) { continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + int total_tokens = bc->requestsInfo[i].first_token_index_in_request + bc->requestsInfo[i].num_tokens_in_batch; // bc->token_last_available_idx[i] + 1; // Compute (QK^T/sqrt(d_k)) @@ -773,7 +765,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( if (input.data_type == DT_HALF) { if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); + pre_build_weight(m, weight, input.data_type, stream); } half const *bias_ptr = use_bias ? bias.get_half_ptr() : static_cast(nullptr); @@ -788,7 +780,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( stream); } else if (input.data_type == DT_FLOAT) { if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); + pre_build_weight(m, weight, input.data_type, stream); } float const *bias_ptr = use_bias ? bias.get_float_ptr() : static_cast(nullptr); @@ -838,7 +830,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, + attn->rotary_embedding_meta, attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, @@ -866,7 +858,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( int _kProjSize, int _vProjSize, int _oProjSize, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _qkv_bias, bool _scaling_query, bool _qk_prod_scaling, @@ -906,7 +898,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( global_num_kv_heads = _global_num_kv_heads; num_q_heads = _num_q_heads; num_kv_heads = _num_kv_heads; - hidden_size = num_q_heads * qProjSize; + local_hidden_size = num_q_heads * qProjSize; weightSize = ((qSize * qProjSize + oProjSize * (vProjSize > 0 ? vProjSize : vSize)) * @@ -927,8 +919,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // has_load_weights = (bool *)calloc(1, sizeof(bool)); //*has_load_weights = false; - apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); - *apply_rotary_embedding = _apply_rotary_embedding; + rotary_embedding_meta = + (RotaryEmbeddingMeta *)calloc(1, sizeof(RotaryEmbeddingMeta)); + *rotary_embedding_meta = _rotary_embedding_meta; qkv_bias = (bool *)calloc(1, sizeof(bool)); *qkv_bias = _qkv_bias; scaling_query = (bool *)calloc(1, sizeof(bool)); @@ -949,7 +942,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + int max_tokens_per_batch = std::max( + infer_mode == TREE_SEARCH_MODE ? 
BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()); size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads + kProjSize * num_q_heads + vProjSize * num_q_heads); @@ -965,15 +961,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( BatchConfig::max_sequence_length(); break; } - case BEAM_SEARCH_MODE: { + case TREE_SEARCH_MODE: { key_cache_size = num_q_heads * kProjSize * - BeamSearchBatchConfig::max_requests_per_batch() * + BatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; value_cache_size = num_q_heads * vProjSize * - BeamSearchBatchConfig::max_requests_per_batch() * + BatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; break; } default: @@ -1014,9 +1010,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( assert(gpu_mem_allocator.reserved_total_size - gpu_mem_allocator.reserved_allocated_size >= totalSharedSize); - gpu_mem_allocator.create_legion_instance(reserveInst, instance_size); + gpu_mem_allocator.create_legion_instance( + reserveInst, instance_size, "IncMultiHeadSelfAttentionMeta"); } else { - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "IncMultiHeadSelfAttentionMeta"); } // in tree_verify, enable devQKVProjArray; @@ -1086,13 +1084,13 @@ IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) { } } -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( +template void Kernels::IncMultiHeadAttention::pre_build_weight( IncMultiHeadSelfAttentionMeta const *m, GenericTensorAccessorR const weight, DataType data_type, hipStream_t stream); -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( +template void Kernels::IncMultiHeadAttention::pre_build_weight( IncMultiHeadSelfAttentionMeta const *m, GenericTensorAccessorR const weight, DataType data_type, diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index a0d31bb6e..30c0586a5 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -15,12 +15,16 @@ #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" #endif +#include "flashinfer/decode_attention_decl.cuh" +#include "flashinfer/prefill_attention_decl.cuh" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" +#include "flexflow/page_manager.h" #include "flexflow/utils/cuda_helper.h" +#include namespace FlexFlow { @@ -33,773 +37,198 @@ using Legion::Memory; namespace Kernels { namespace IncMultiHeadAttention { -// gridDim = num_heads -// blockDim = num_tokens/num_request * head_size -// QKV tensor layout: |QKV| * num_new_tokens. 
|Q=K=V=head_size * num_heads| -// one thread process one head_size -template -__global__ void compute_attention_kernel_generation_kernel( - DT const *query, - DT const *key_cache, - DT const *value_cache, - DT *output_ptr, - float const scale, - int max_seq_length, - int per_head_size, - int hidden_size, - BatchConfig::PerRequestInfo *request_infos) { - - // q, k - using Q_vec = typename VEC_K::Type; - using K_vec = typename VEC_K::Type; - using V_vec = typename VEC_V
::Type; - using Out_sum = typename Vec_fp32_::Type; - - constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; - - // eg. if head_size = 128, thread_per_key = 4, with float32 precision - // then K_VEC_SIZE = 1, QK_VEC_SIZE = 4 - // K_ELTS_PER_THREAD = 128 / 4 = 32 - // K_VECS_PER_THREAD = 32 / 1 = 32 - constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); - // constexpr int QK_VEC_SIZE = 16 / sizeof(DT); - // // constexpr int QK_VEC_SIZE = sizeof(Qk_vec_k) / sizeof(DT); - constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; - constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; - // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); - - // thread id - int const tidx = threadIdx.x; - // head id - int const head_idx = blockIdx.x; - // request idx - int const request_idx = blockIdx.y; - - int const batch_config_request_id = - request_infos[request_idx].batch_config_request_id; - - int const first_step = 0; - - int const tlength = - request_infos[batch_config_request_id].first_token_depth_in_request + - request_infos[batch_config_request_id].num_tokens_in_batch; - - // shared memory objects - extern __shared__ char smem_[]; - - float *qk_smem = reinterpret_cast(smem_); - float *out_smem = reinterpret_cast(smem_); - - float qk_max = -FLT_MAX; - - // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum - __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - - const DT *q_ptr = query + request_idx * hidden_size * QKV_WEIGHT_NUM + - head_idx * per_head_size; - __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; - // DT const *q_ptr = - // query + request_idx * Dh * QKV_WEIGHT_NUM + head_idx * per_head_size; - - // q tensor in this thread - // if THREADS_PER_KEY is 4, first thread load 0, 4, 8, 12..., total - // K_VECS_PER_THREAD elements - // QK_vec_k: 32->1, 64->2, 128->4... head_size - // K_vec_k: 4->1, 2->2, 1->4 threads_per_key - - // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE - int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; - int ki_o = tidx % THREADS_PER_KEY; - // the first key's offset for this thread - // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... - int ko = tidx / THREADS_PER_KEY; - // load q tensor - Q_vec q_vec[K_VECS_PER_THREAD]; -#pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - q_vecs[ki_o][ii] = *reinterpret_cast( - q_ptr + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); - } - __syncthreads(); - // first iter = 128 / 4 = 32 - // K_VECS_PER_THREAD = 32 - // K_PER_ITER how many keys in this loop - // The number of timesteps loaded per iteration. - constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; - // // The number of keys per warp. - constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; - - DT const *k_cache_batch = - key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; - - int ti_end = - div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; - // get k, perform qk proj - - for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { - K_vec k[K_VECS_PER_THREAD]; - int const ti_circ = ti % max_seq_length; -#pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; - if (ti < tlength) { - k[ii] = *reinterpret_cast(k_cache_batch + - ti_circ * hidden_size + - head_idx * per_head_size + jj); - } - // Compute dot product. - // This includes a reduction across the threads in the same thread group. 
- } - float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); - // // todo add positional embedding to the qk production - // // Store the product to shared memory. There's one qk value per - // timestep. - // // Update the max. - if (ti < tlength && tidx % THREADS_PER_KEY == 0) { - // todo add alobi here - bool const mask = ti_circ >= tlength; - if (mask) { - assert(false); - } - qk_max = mask ? qk_max : fmaxf(qk_max, qk); - qk_smem[ti - first_step] = mask ? 0.f : qk; - } - } - - __syncthreads(); - -#pragma unroll - for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Decompose the thread index into warp and lane. - int const warp = tidx / WARP_SIZE; - int const lane = tidx % WARP_SIZE; - - // The warp leader writes the max to shared memory. - if (lane == 0) { - red_smem[warp] = qk_max; - } - - // Make sure the products are in shared memory. - __syncthreads(); - - // The warps finalize the reduction. - qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; -#pragma unroll - for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Broadcast to all the threads in the warp. - qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - - float exp_sum = 0.f; - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - float logit = __expf(qk_smem[ti - first_step] - qk_max); - exp_sum += logit; - qk_smem[ti - first_step] = logit; - } - - // Compute the sum. - exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); - - // softmax - float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - qk_smem[ti - first_step] *= inv_sum; - } - - __syncthreads(); - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("softmax %.10f\n", qk_smem[0]); - // } - - // value projection - constexpr int V_VEC_SIZE = 16 / sizeof(DT); - // A vector of V elements for the current timestep. - // using V_vec_k = typename V_vec_k_::Type; - // using V_vec_acum = typename V_vec_acum_fp32_::Type; - - // The value computed by this thread. - int vo = tidx / THREADS_PER_VALUE; - // The hidden dimensions computed by this particular thread. - int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; - constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; - - Out_sum out; - zero(out); - - // The base pointer for the value in the cache buffer. - DT const *v_cache_batch = - value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; - - if (Dh == Dh_MAX || vi < Dh) { - for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { - // Load the values from the cache. - int const ti_circ = ti % max_seq_length; - - V_vec v = *reinterpret_cast( - v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); - float logit = qk_smem[ti - first_step]; - out = FlexFlow::fma(logit, cast_to_float(v), out); - } - } - - // // Make sure we can start writing to shared memory. - __syncthreads(); - - // Run the final reduction amongst the different groups computing different - // partial outputs. - if (Dh == Dh_MAX || vi < Dh) { -#pragma unroll - for (int active_groups = V_PER_ITER; active_groups >= 2; - active_groups /= 2) { - - // The midpoint in the number of active groups. - int midpoint = active_groups / 2; - - // The upper part of active threads store to shared memory. 
- if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { - *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = - out; - } - __syncthreads(); - - // The bottom warps update their values. - if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { - out = add(*reinterpret_cast(out_smem + vo * Dh + vi), - out); - } - __syncthreads(); - } - } - - // Output the final values. - if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { - convert_from_float( - *reinterpret_cast(output_ptr + request_idx * hidden_size + - head_idx * per_head_size + vi), - out); - } -} - -// only used by MPT model. https://arxiv.org/abs/2108.12409 -template -__global__ void apply_position_bias_qkprd(DT *input_ptr, - int num_tokens, - int num_total_tokens, - int num_heads, - int global_num_q_heads, - int shard_id) { - CUDA_KERNEL_LOOP(i, num_tokens * num_total_tokens * num_heads) { - // get head_idx, - int head_idx = i / (num_tokens * num_total_tokens) + (num_heads * shard_id); - int position_idx = (i / num_tokens) % num_total_tokens; - position_idx = position_idx + 1 - num_total_tokens; - // 8 is alibi_bias_max in - // https://huggingface.co/mosaicml/mpt-30b/blob/main/config.json - float base = (float)(head_idx + 1) * 8 / global_num_q_heads; - float slopes = 1.0 / pow(2, base); - // if(i == 0){ - // printf("see position: %d, %f, %f, %f\n", position_idx, base, slopes, - // position_idx * slopes); - // } - input_ptr[i] += static_cast
(position_idx * slopes); - } -} - -template -__global__ void apply_proj_bias_w(DT *input_ptr, - DT const *bias_ptr, - int num_tokens, - int qkv_weight_size, - int oProjSize) { - CUDA_KERNEL_LOOP(i, num_tokens * oProjSize) { - int bias_idx = qkv_weight_size + i % oProjSize; - input_ptr[i] += bias_ptr[bias_idx]; - } -} - -template -__global__ void apply_proj_bias_qkv(DT *input_ptr, - DT const *bias_ptr, - int shard_id, - int num_tokens, - int qProjSize, - int kProjSize, - int vProjSize, - int global_num_q_heads, - int num_q_heads, - bool scaling_query, - float scaling_factor, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * QKV_WEIGHT_NUM) { - // for simplicity, assume q, k, v is in same shape - // 0->q, 1->k, 2->v - // int qkv_index = i / (num_tokens * qProjSize) % 3; - - int token_idx = i / (hidden_size * QKV_WEIGHT_NUM); - size_t in_token_idx = i - token_idx * hidden_size * QKV_WEIGHT_NUM; - - int qkv_index = in_token_idx / hidden_size; - - int proj_size = qkv_index == 0 ? qProjSize : kProjSize; - - int head_idx = - (in_token_idx - qkv_index * num_q_heads * proj_size) / proj_size; - int global_head_idx = head_idx + shard_id * num_q_heads; - - size_t pre_length = - qkv_index == 0 - ? 0 - : (qkv_index == 1 ? qProjSize * global_num_q_heads - : qProjSize * global_num_q_heads * KV_WEIGHT_NUM); - - size_t bias_idx = pre_length + global_head_idx * proj_size + i % proj_size; - - input_ptr[i] += bias_ptr[bias_idx]; - - if (scaling_query && qkv_index == 0) { - input_ptr[i] *= scaling_factor; - } - } -} - -template -__global__ void scaling_query_kernel(DT *input_ptr, - int qProjSize, - int num_tokens, - int num_q_heads, - float scaling_factor, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - input_ptr[i % hidden_size + token_idx * hidden_size * QKV_WEIGHT_NUM] *= - scaling_factor; - } -} - -template -__global__ void - apply_rotary_embedding_native(DT *input_ptr, - cuFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_q_heads, - int num_tokens, - int num_kv_heads, - int q_block_size, - int k_block_size, - int q_array_size) { - CUDA_KERNEL_LOOP( - i, - num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) { - // create complex number - bool q_tensor = i < (q_array_size / 2); - int proj_size = q_tensor ? qProjSize : kProjSize; - int real_i = q_tensor ? i : i - q_array_size / 2; - - int head_idx = real_i / (num_tokens * proj_size / 2); - int idx = real_i % (num_tokens * proj_size / 2); - int real_part_index = idx * 2 + - head_idx * (q_tensor ? q_block_size : k_block_size) + - (q_tensor ? 
0 : q_array_size); - - int complex_part_index = real_part_index + 1; - - complex_input[i] = {input_ptr[real_part_index], - input_ptr[complex_part_index]}; - - int token_idx = - (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); - size_t pos = tokenInfos[token_idx].abs_depth_in_request; - - // float before_real = complex_input[i].x, before_complex = - // complex_input[i].y; - - int pos_i = real_i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); - cuFloatComplex complex_pos = {cos(freq), sin(freq)}; - - complex_input[i] = cuCmulf(complex_input[i], complex_pos); - input_ptr[real_part_index] = complex_input[i].x; - input_ptr[complex_part_index] = complex_input[i].y; - } -} +using flashinfer::BatchDecodeHandler; +using flashinfer::BatchDecodeWithPagedKVCacheWrapperDispatched; +using flashinfer::BatchPrefillHandler; +using flashinfer::BatchPrefillWithPagedKVCacheWrapperDispatched; +using flashinfer::LogitsPostHook; +using flashinfer::MaskMode; +using flashinfer::paged_kv_t; +using flashinfer::PageStorage; +using flashinfer::PosEncodingMode; +using flashinfer::QKVLayout; template -__global__ void - apply_rotary_embedding_hf(DT *input_ptr, - cuFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_tokens, - size_t q_array_size, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - // create complex number - bool q_tensor = i < (q_array_size / 2); - int proj_size = q_tensor ? qProjSize : kProjSize; - int real_i = q_tensor ? i : i - q_array_size / 2; - - int token_idx = real_i / (hidden_size / 2); - int idx = real_i % (proj_size / 2); - int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2); - - int real_part_index = idx + head_idx * proj_size + - token_idx * hidden_size * QKV_WEIGHT_NUM + - hidden_size * (q_tensor ? 
0 : 1); - int complex_part_index = real_part_index + (proj_size / 2); - - complex_input[i] = {input_ptr[real_part_index], - input_ptr[complex_part_index]}; - - // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64 - // apply a Cartesian coordinate transformation - // multiple with input & /copy back to q/k - - // get position of token - - // size_t pos = id_map[token_idx].token_position; - size_t pos = tokenInfos[token_idx].abs_depth_in_request; - - // float before_real = complex_input[i].x, before_complex = - int pos_i = real_i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); - cuFloatComplex complex_pos = {cos(freq), sin(freq)}; - - complex_input[i] = cuCmulf(complex_input[i], complex_pos); - input_ptr[real_part_index] = complex_input[i].x; - input_ptr[complex_part_index] = complex_input[i].y; - } -} - -template -void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT const *input_ptr, - DT const *weight_ptr, - DT *output_ptr, - DT const *bias_ptr, - cudaStream_t stream) { - - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - assert(m->qSize == m->vSize && m->qSize == m->kSize); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif - - // Step 1: Compute QKV projections - { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_q = m->qProjSize * m->num_q_heads; - int m_k = m->kProjSize * m->num_q_heads; - int m_v = m->vProjSize * m->num_q_heads; - assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); - int k = m->qSize; - int m_ = m_q * QKV_WEIGHT_NUM; - // before transpositions - int lda = k, ldb = k, ldc = m_; - // matrix A: QKV weights - // matrix A's layout: [qSize (hidden_dim), qProjSize, num_heads, 3] - // matrix B: input - // matrix B's layout: [qSize (hidden_dim), num_new_tokens] - // matrix C: devQKVProjArray - // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens] - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - weight_ptr, - cublas_data_type, - lda, - input_ptr, - cublas_data_type, - ldb, - &beta, - output_ptr, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - - int num_tokens = bc->num_active_tokens(); - int parallelism = m->kProjSize * num_tokens * m->num_q_heads; - size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; - - // Step 2: apply bias for QKV, or scale the query - if (*m->qkv_bias) { - apply_proj_bias_qkv<<>>(output_ptr, - bias_ptr, - shard_id, - num_tokens, - m->qProjSize, - m->kProjSize, - m->vProjSize, - m->global_num_q_heads, - m->num_q_heads, - *m->scaling_query, - m->scaling_factor, - m->hidden_size); - } else if (m->scaling_query) { - scaling_query_kernel<<>>(output_ptr, - num_tokens, - m->num_q_heads, - m->qProjSize, - m->scaling_factor, - m->hidden_size); - } - - // Step 3: apply rotary embedding if needed - if (*m->apply_rotary_embedding) { - /*q&k*/ - parallelism = num_tokens * m->hidden_size; - 
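For reference, a minimal host-side sketch of the rotation that the apply_rotary_embedding_hf kernel above applies (and which the launch just below invokes): each head is treated as proj_size/2 pairs (x[j], x[j + proj_size/2]) and rotated by the angle pos / 10000^(2j / proj_size). This is an editor's illustration, not part of the patch; rope_rotate_pair and the concrete values in main are assumed for the example.

#include <cmath>
#include <cstdio>

// Rotate one (x[j], x[j + d/2]) pair of a head of width proj_size for the
// token at absolute position pos, using the same frequency formula as the
// kernel: freq = pos / 10000^(2j / proj_size).
static void rope_rotate_pair(
    float &x_real, float &x_imag, int pos, int j, int proj_size) {
  float freq = pos * (1.0f / std::pow(10000.0f, 2.0f * j / proj_size));
  float c = std::cos(freq), s = std::sin(freq);
  float rotated_real = x_real * c - x_imag * s; // complex multiply by (cos, sin)
  float rotated_imag = x_real * s + x_imag * c;
  x_real = rotated_real;
  x_imag = rotated_imag;
}

int main() {
  float q_lo = 1.0f, q_hi = 0.0f; // toy query pair (assumed values)
  rope_rotate_pair(q_lo, q_hi, /*pos=*/3, /*j=*/0, /*proj_size=*/128);
  std::printf("rotated pair: (%.4f, %.4f)\n", q_lo, q_hi);
  return 0;
}

The kernel performs the same multiplication with cuCmulf on the device and writes the rotated values back into the packed QKV buffer in place.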
apply_rotary_embedding_hf<<>>(output_ptr, - m->complex_input, - m->token_infos, - m->qProjSize, - m->kProjSize, - num_tokens, - q_array_size, - m->hidden_size); - } -} - -template -void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); - if (num_tokens > 0) { - int parallelism = m->hidden_size * num_tokens; - store_kv_cache<<>>(static_cast
<DT *>(m->devQKVProjArray), - static_cast<DT *>
<DT *>(m->keyCache), - static_cast<DT *>
(m->valueCache), - m->token_infos, - num_tokens, - BatchConfig::max_sequence_length(), - m->hidden_size); - } -} - -template -void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *weight_ptr, - DT const *bias_ptr, - int num_tokens, - cudaStream_t stream) { - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = cublas_data_type; -#endif - // Project to output, save result directly on output tensor - { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_ = m->oProjSize; - int k = m->vProjSize * m->num_q_heads; - int n = num_tokens; - // before transpositions - int lda = k, ldb = k, ldc = m_; - // matrix A: output projection weight - // matrix A's layout: [vProjSize * num_heads, oProjSize] - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - // matrix B: attn heads - // matrix B's layout: [vProjSize * num_heads, num_new_tokens] - DT const *B = static_cast
<DT *>(m->attn_heads); - // matrix B: output - // matrix B's layout: [oProjSize, num_new_tokens] - DT *C = static_cast<DT *>
(output_ptr); - - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - // Add final output bias - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - apply_proj_bias_w<<>>( - output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize); - } -} - -#define LAUNCH_ATTENTION_SCORE_KERNEL( \ - DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ - smem_sz = smem_size_in_bytes
<DT>(m->qProjSize, \ - BatchConfig::max_sequence_length(), \ - THREADS_PER_VALUE, \ - THDS_PER_BLOCK); \ - compute_attention_kernel_generation_kernel \ - <<>>( \ - static_cast<DT *>
<DT *>(m->devQKVProjArray), \ - static_cast<DT *>
<DT *>(m->keyCache), \ - static_cast<DT *>
(m->valueCache), \ - output_ptr, \ - scale, \ - BatchConfig::max_sequence_length(), \ - m->qProjSize, \ - m->hidden_size, \ - m->request_infos) - -template -void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - DT *output_ptr, - cudaStream_t stream) { - dim3 grid(m->num_q_heads, bc->num_generation_tokens); - int const per_head_size = m->qProjSize; - float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; - size_t smem_sz; - if (per_head_size == 64) { - constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; - LAUNCH_ATTENTION_SCORE_KERNEL( - DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); - } else if (per_head_size == 128) { - constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; - LAUNCH_ATTENTION_SCORE_KERNEL( - DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); +void incr_attention(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + // int device; + // checkCUDA(cudaGetDevice(&device)); + // cudaEvent_t t_start, t_end; + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + // global constant parameters + uint32_t const num_q_heads = m->num_q_heads; + uint32_t const num_kv_heads = m->num_kv_heads; + uint32_t const head_dim = m->qk_dim; + uint32_t const batch_size = bc->num_active_requests(); + float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->qk_dim) : 1.0f; + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "Update custom mask time: " << elapsed << " ms\n"; + // } + + half *q = static_cast(m->queryTmp), + *kv = static_cast(m->kvCache), + *o = static_cast(m->outputTmp); + paged_kv_t paged_kv( + num_kv_heads, + kPagesize, + head_dim, + batch_size, + QKVLayout::kNHD, + kv, + m->handle.incr_attention_metadata->kv_indices, + m->handle.incr_attention_metadata->kv_indptr, + m->handle.incr_attention_metadata->kv_last_page_len); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" attn prep time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + void *handler = nullptr; + + if (!bc->prompt_phase) { + assert(m->handle.incr_attention_metadata->decode_handler_collections.count( + batch_size) != 0 && + "Handler is not initialized"); + handler = m->handle.incr_attention_metadata + ->decode_handler_collections[batch_size]; } else { - assert(false && "a unsupported head size"); - } -} - -template -void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - cudaStream_t stream) { - // additional processing for weight uploading - // Note that we update weight_ptr and bias_ptr when uploading weight and - // bias - if (m->quantization_type != DT_NONE) { - // copy weight_ptr to quantized_weight_ptr, do compression and store in - // m->weight_ptr - cudaMemcpyAsync(m->quantized_weight_ptr, - weight.get_byte_ptr(), - 
m->quantized_weightSize, - cudaMemcpyHostToDevice, - stream); - - if (m->quantization_type == DT_INT4) { - int parallelism = m->qProjSize * m->qSize * m->num_q_heads / 2; - decompress_int4_attention_weights<<>>( - m->quantized_weight_ptr, - static_cast<DT *>
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); + assert(m->handle.incr_attention_metadata->prompt_handler_collections.count( + batch_size) != 0 && + "Handler is not initialized"); + handler = m->handle.incr_attention_metadata + ->prompt_handler_collections[batch_size]; + } + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" BeginForward time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + DISPATCH_HEADDIM(head_dim, HEAD_DIM, { + cudaError_t result; + if (bc->prompt_phase) { + result = + BatchPrefillWithPagedKVCacheWrapperDispatched( + static_cast(handler), + q, + m->handle.incr_attention_metadata->q_indptr, + /*q_offset=*/nullptr, + paged_kv, + /*custom_mask=*/nullptr, + /*qk_indptr=*/nullptr, + o, + /*lse=*/nullptr, + num_q_heads, + /*window_left=*/-1, + /*logits_soft_cap=*/0.f, + sm_scale, + /*rope_scale=*/1.f, + /*rope_theta=*/static_cast(1e4), + stream); } else { - assert(m->quantization_type == DT_INT8); - int parallelism = m->qProjSize * m->qSize * m->num_q_heads; - decompress_int8_attention_weights<<>>( - m->quantized_weight_ptr, - static_cast
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); + result = + BatchDecodeWithPagedKVCacheWrapperDispatched( + static_cast(handler), + q, + /*q_offset=*/nullptr, + paged_kv, + o, + /*lse=*/nullptr, + num_q_heads, + /*window_left=*/-1, + /*logits_soft_cap=*/0.f, + sm_scale, + /*rope_scale=*/1.f, + /*rope_theta=*/static_cast(1e4), + stream); } - } else { - if (data_type == DT_FLOAT) { - cudaMemcpyAsync(m->weight_ptr, - weight.get_float_ptr(), - m->weightSize, - cudaMemcpyHostToDevice, - stream); - } else if (data_type == DT_HALF) { - cudaMemcpyAsync(m->weight_ptr, - weight.get_half_ptr(), - m->weightSize, - cudaMemcpyHostToDevice, - stream); - } else { - assert(false); + if (result != cudaSuccess) { + throw std::runtime_error("Failed to run " + "IncrementalDecodingAttentionForwardKernel: " + + std::string(cudaGetErrorString(result))); } - } + }); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" actual attn time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + produce_output(m, bc, output_ptr, stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" produce_output_kernel time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); } template -void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, +void inference_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *input_ptr, @@ -808,36 +237,51 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *bias_ptr, cudaStream_t stream) { + // cudaEvent_t t_start, t_end; + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + if (m->offload && m->biasSize > 0) { cudaMemcpyAsync( m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); bias_ptr = static_cast
<DT *>(m->bias_ptr); - } - // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast<DT *>
<DT *>(m->devQKVProjArray), - bias_ptr, - stream); - update_kv_cache_kernel<DT>
(m, bc, stream); - - if (bc->num_generation_tokens > 0) { - // phase 3: Compute attention score for generation tokens - compute_attention_kernel_generation<DT>
( - m, bc, static_cast<DT *>
<DT *>(m->attn_heads), stream); + // phase 1: Compute QKV Projections of the batch + compute_qkv(m, + bc, + shard_id, + input_ptr, + weight_ptr, + static_cast<DT *>
(m->devQKVProjArray), + bias_ptr, + stream); + + // phase 2: First maintain the streaming cache, because it need + // pre-pos-encoding values + if (m->streaming_cache) { + // Move pre-pos-encoding cache to where took by attention + update_kv_in_streaming_cache<DT>
(m, bc, stream); + // Apply pos-encoding to those k values + apply_pos_encoding_to_streaming_proj<DT>
(m, bc, stream); + // Commit to the streaming cache + commit_kv<DT>
(m, bc, stream); + } + + // phase 3: Take care of the batch + { + // Apply pos-encoding to the batch + apply_pos_encoding_to_tokens_in_batch( + m, bc, static_cast<DT *>
(m->devQKVProjArray), stream); + // Move the batch qkv values to where took by attention + update_qkv_in_batch_paged<DT>
(m, bc, stream, false); + } - if (bc->num_tokens > bc->num_generation_tokens) { - // phase 4: Compute attention score for prompt tokens; - compute_attention_kernel_prompt( - m, bc, shard_id, bias_ptr, weight_ptr, stream); - } + // phase 4: Attention computation + incr_attention<DT>
(m, bc, static_cast<DT *>
(m->attn_heads), stream); - // compute output production and bias together for all tokens + // phase 5: Compute output production and bias together for all tokens int num_tokens = bc->num_active_tokens(); compute_o_prod_bias( m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); @@ -848,295 +292,9 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, using namespace Kernels::IncMultiHeadAttention; -template -__global__ void store_kv_cache(DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - BatchConfig::PerTokenInfo const *tokenInfos, - int num_tokens, - int max_seq_len, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - int offset = i % hidden_size; - - size_t val_idx = - token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; - - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - - // key cache - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; - } -} - -template -__global__ void fill_entries_above_diagonal(DT *matrix, - size_t num_rows, - size_t num_cols, - size_t num_q_heads, - size_t entries_above_diagonal, - DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; - } -} - -template -void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif - // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); - int tokens_previous_requests = 0; - int q_block_size = m->qProjSize; - int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); - int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); - assert(m->qProjSize == m->kProjSize); - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase)) { - continue; - } - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - 
bc->requestsInfo[i].num_tokens_in_batch; - // Step 1: compute query-key product QK.T/sqrt(d_k) - { - // Scale by sqrt(d_k) as per the original attention paper - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); - } - // after transpositions - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - // before transpositions - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - // N.B. strides are applied before transpose operations - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - - // matrix A: devQKVProjArray - // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] - // To get query projection, skip over Q entries from previous requests - DT const *A = static_cast
<DT *>(m->devQKVProjArray) + - bc->requestsInfo[i].first_token_offset_in_batch * - m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // matrix B: key cache - // matrix B's layout: [kProjSize * num_heads, total_tokens] - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast<DT *>
<DT *>(m->keyCache) + i * kt_req_block_size; - // matrix C: qk_prods - // matrix C's layout: [num_new_tokens, total_tokens, num_heads] - // To get C, skip over QK.T products from previous requests - DT *C = static_cast<DT *>
(m->qk_prods); - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - // Step 2: Add alibi position bias to qk production - // matrix C: qk_prods - // matrix C's layout: [num_new_tokens, total_tokens, num_heads] - // To get C, skip over QK.T products from previous requests - DT *C = static_cast
(m->qk_prods); - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - - // Step 3: Apply causal mask. Fill all elements above diagonal in qk prods - // with -inf to force causal attention. - assert(num_new_tokens <= total_tokens); - size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; - if (entries_above_diagonal > 0) { - size_t parallelism = m->num_q_heads * entries_above_diagonal; - fill_entries_above_diagonal<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - entries_above_diagonal, - static_cast
(-INFINITY)); - } - - // Step 4: Compute Softmax(QK.T/sqrt(d_k)) - { - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); - } - // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ - // softmax(QK.T/sqrt(d_k)).T - { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_ = m->vProjSize; - int n = num_new_tokens; - int k = total_tokens; - // before transpositions - int lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - // N.B. strides are applied before transpose operations - int strideA = vt_block_size; - int strideB = num_new_tokens * total_tokens; - int strideC = m->vProjSize; - // matrix A: value cache - // matrix A's layout: [vProjSize, num_heads, total_tokens] - // To get A, skip over V.T entries from previous requests (all heads + - // padding) - DT *A = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size; - // matrix B: qk_prods_softmax - // matrix B's layout: [num_new_tokens, total_tokens, num_heads] - // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous - // requests (all heads) - DT *B = static_cast<DT *>
<DT *>(m->qk_prods_softmax); - ; - // matrix C: attn heads - // matrix C's layout: [vProjSize, num_heads, num_new_tokens] - // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous - // requests - // store the result attn heads, also skip the genration tokens - DT *C = static_cast<DT *>
(m->attn_heads) + - (bc->requestsInfo[i].first_token_offset_in_batch) * - m->num_q_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - tokens_previous_requests += num_new_tokens; - } - if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { - bc->print(); - printf("tokens_previous_requests: %i\n", tokens_previous_requests); - printf("num_tokens: %i\n", num_tokens); - printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); - } - assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); -} - /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( - IncMultiHeadSelfAttentionMeta const *m, + IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, @@ -1147,12 +305,10 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(get_legion_stream(&stream)); bool use_bias = *m->qkv_bias || *m->final_bias; - cudaEvent_t t_start, t_end; - if (m->profiling) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start, stream); - } + // cudaEvent_t t_start, t_end; + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); @@ -1162,11 +318,11 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( if (input.data_type == DT_HALF) { if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); + pre_build_weight(m, weight, input.data_type, stream); } half const *bias_ptr = use_bias ? bias.get_half_ptr() : static_cast(nullptr); - Kernels::IncMultiHeadAttention::inference_kernel( + Kernels::IncMultiHeadAttention::inference_kernel( m, bc, shard_id, @@ -1177,11 +333,11 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( stream); } else if (input.data_type == DT_FLOAT) { if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); + pre_build_weight(m, weight, input.data_type, stream); } float const *bias_ptr = use_bias ? 
bias.get_float_ptr() : static_cast(nullptr); - Kernels::IncMultiHeadAttention::inference_kernel( + Kernels::IncMultiHeadAttention::inference_kernel( m, bc, shard_id, @@ -1195,15 +351,13 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( assert(false && "Unspported data type"); } - if (m->profiling) { - cudaEventRecord(t_end, stream); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - printf("IncMultiHeadSelfAttention forward time = %.9fms\n", elapsed); - } + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // printf("IncMultiHeadSelfAttention forward time = %.9fms\n", elapsed); } IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( @@ -1217,14 +371,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( : IncMultiHeadSelfAttentionMeta(handler, INC_DECODING_MODE, attn, - attn->qSize, - attn->kSize, - attn->vSize, - attn->qProjSize, - attn->kProjSize, - attn->vProjSize, - attn->oProjSize, - attn->apply_rotary_embedding, + attn->hidden_size, + attn->qk_dim, + attn->v_dim, + attn->o_dim, + attn->rotary_embedding_meta, attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, @@ -1239,20 +390,18 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( _num_q_heads, _num_kv_heads, attn->quantization_type, - attn->offload) {} + attn->offload, + attn->streaming_cache) {} IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, InferenceMode infer_mode, Op const *attn, - int _qSize, - int _kSize, - int _vSize, - int _qProjSize, - int _kProjSize, - int _vProjSize, - int _oProjSize, - bool _apply_rotary_embedding, + int _hidden_size, + int _qk_dim, + int _v_dim, + int _o_dim, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _qkv_bias, bool _scaling_query, bool _qk_prod_scaling, @@ -1267,54 +416,49 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( int _num_q_heads, int _num_kv_heads, DataType _quantization_type, - bool _offload) + bool _offload, + bool _streaming_cache) : OpMeta(handler, attn), weight_ptr(nullptr), bias_ptr(nullptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); checkCUDNN(cudnnCreateTensorDescriptor(&qk_tensor)); - qSize = _qSize; - kSize = _kSize; - vSize = _vSize; - // assume dimensions match for now - assert(qSize == kSize); - assert(kSize == vSize); - qProjSize = _qProjSize; - kProjSize = _kProjSize; - assert(qProjSize == kProjSize); // required for attention QK.T matmul - vProjSize = _vProjSize; - oProjSize = _oProjSize; + hidden_size = _hidden_size; + qk_dim = _qk_dim; + v_dim = _v_dim; + o_dim = _o_dim; size_t size_of_dt = data_type_size(attn->data_type); quantization_type = _quantization_type; offload = _offload; + streaming_cache = _streaming_cache; global_num_q_heads = _global_num_q_heads; global_num_kv_heads = _global_num_kv_heads; num_q_heads = _num_q_heads; num_kv_heads = _num_kv_heads; - hidden_size = num_q_heads * qProjSize; + local_hidden_size = num_q_heads * qk_dim; weightSize = - ((qSize * qProjSize + oProjSize * (vProjSize > 0 ? vProjSize : vSize)) * + ((hidden_size * qk_dim + o_dim * (v_dim > 0 ? 
v_dim : hidden_size)) * num_q_heads + - (kSize * kProjSize + vSize * vProjSize) * num_q_heads) * + (hidden_size * qk_dim + hidden_size * v_dim) * num_q_heads) * size_of_dt; if (quantization_type != DT_NONE) { quantized_weightSize = get_quantization_to_byte_size( attn->data_type, quantization_type, weightSize); } - // biasSize = _bias ? oProjSize * size_of_dt * 4 : 0; + // biasSize = _bias ? o_dim * size_of_dt * 4 : 0; - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int final_bias_size = oProjSize; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; + int final_bias_size = o_dim; biasSize = (_qkv_bias ? qkv_bias_size : 0) + (final_bias ? final_bias_size : 0); // has_load_weights = (bool *)calloc(1, sizeof(bool)); //*has_load_weights = false; - apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); - *apply_rotary_embedding = _apply_rotary_embedding; + rotary_embedding_meta = + (RotaryEmbeddingMeta *)calloc(1, sizeof(RotaryEmbeddingMeta)); + *rotary_embedding_meta = _rotary_embedding_meta; qkv_bias = (bool *)calloc(1, sizeof(bool)); *qkv_bias = _qkv_bias; scaling_query = (bool *)calloc(1, sizeof(bool)); @@ -1335,51 +479,83 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = infer_mode == TREE_VERIFY_MODE - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(); - size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads + - kProjSize * num_q_heads + - vProjSize * num_q_heads); - size_t key_cache_size = 0, value_cache_size = 0; + int max_tokens_per_batch = std::max( + infer_mode == TREE_SEARCH_MODE ? BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()); + size_t qkv_max_proj_size = + max_tokens_per_batch * + (qk_dim * num_q_heads + qk_dim * num_q_heads + v_dim * num_q_heads); + size_t query_tmp_size = 0, key_cache_size = 0, value_cache_size = 0; + size_t streaming_pre_pos_enc_size = 0; + // assert((BatchConfig::max_sequence_length() + + // BatchConfig::max_spec_tree_token_num()) % + // kPagesize == + // 0); + size_t max_num_pages = + round_up_pages(BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); + PageManager *pm = PageManager::get_page_manager(); + size_t total_kv_cache_size_per_layer = pm->get_kv_cache_size_per_layer(); switch (infer_mode) { - case INC_DECODING_MODE: { - key_cache_size = num_q_heads * kProjSize * - BatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length(); - value_cache_size = num_q_heads * vProjSize * + case INC_DECODING_MODE: + case TREE_VERIFY_MODE: { + query_tmp_size = num_q_heads * qk_dim * max_tokens_per_batch; + // a K-ary tree max node is (k^n - 1) / 2 + if (total_kv_cache_size_per_layer == 0) { + key_cache_size = num_kv_heads * qk_dim * BatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length(); + max_num_pages * kPagesize; + value_cache_size = num_kv_heads * v_dim * + BatchConfig::max_requests_per_batch() * + max_num_pages * kPagesize; + } else { + key_cache_size = total_kv_cache_size_per_layer / 2 / size_of_dt; + value_cache_size = total_kv_cache_size_per_layer / 2 / size_of_dt; + } break; } - case BEAM_SEARCH_MODE: - case TREE_VERIFY_MODE: { + case TREE_SEARCH_MODE: { + query_tmp_size = num_q_heads * qk_dim * max_tokens_per_batch; // a K-ary tree max node is (k^n - 1) / 2 - key_cache_size = num_q_heads * 
kProjSize * - BeamSearchBatchConfig::max_requests_per_batch() * - (BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num()); - value_cache_size = num_q_heads * vProjSize * - BeamSearchBatchConfig::max_requests_per_batch() * - (BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num()); + key_cache_size = num_kv_heads * qk_dim * + BatchConfig::max_requests_per_batch() * max_num_pages * + kPagesize; + value_cache_size = num_kv_heads * v_dim * + BatchConfig::max_requests_per_batch() * + max_num_pages * kPagesize; break; } default: assert(false && "Unkown inference mode"); } - size_t requestinfo_size = BatchConfig::max_requests_per_batch(); - // size_t tokeninfo_size = max_tokens_per_batch; - size_t qk_prod_size = - max_tokens_per_batch * BatchConfig::max_sequence_length() * num_q_heads; - size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize; - size_t complex_size = (max_tokens_per_batch * (qProjSize * num_q_heads + - kProjSize * num_q_heads)) / - 2; + if (streaming_cache) { + size_t max_post_pos_enc_pages = round_up_pages( + BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth() + + max(max_tokens_per_batch, BatchConfig::max_spec_tree_token_num())); + key_cache_size = num_kv_heads * qk_dim * + BatchConfig::max_requests_per_batch() * + max_post_pos_enc_pages * kPagesize; + value_cache_size = num_kv_heads * v_dim * + BatchConfig::max_requests_per_batch() * + max_post_pos_enc_pages * kPagesize; + streaming_pre_pos_enc_size = + num_kv_heads * (qk_dim + v_dim) * + BatchConfig::max_requests_per_batch() * + round_up_pages(BatchConfig::MAX_STREAMING_POS - + BatchConfig::get_max_tree_depth()) * + kPagesize; + } + size_t attn_heads_size = max_tokens_per_batch * num_q_heads * v_dim; + size_t output_tmp_size = max_tokens_per_batch * num_q_heads * v_dim; + size_t complex_size = + (max_tokens_per_batch * (qk_dim * num_q_heads + qk_dim * num_q_heads)) / + 2; size_t totalSize = - (qkv_max_proj_size + key_cache_size + value_cache_size + - 2 * qk_prod_size + attn_heads_size) * + (qkv_max_proj_size + query_tmp_size + key_cache_size + + value_cache_size + streaming_pre_pos_enc_size + attn_heads_size) * size_of_dt + + output_tmp_size * data_type_size(DT_HALF) + complex_size * sizeof(cuFloatComplex); // more components will // be added here later if (offload) { @@ -1387,15 +563,20 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_t totalSharedSize = infer_mode == TREE_VERIFY_MODE ? totalSize - - (key_cache_size + value_cache_size + qkv_max_proj_size) * + (query_tmp_size + key_cache_size + value_cache_size + + streaming_pre_pos_enc_size + qkv_max_proj_size) * size_of_dt - : totalSize - (key_cache_size + value_cache_size) * size_of_dt; + : totalSize - (query_tmp_size + key_cache_size + + value_cache_size + streaming_pre_pos_enc_size) * + size_of_dt; size_t instance_size = size_of_dt * (infer_mode == TREE_VERIFY_MODE - ? key_cache_size + value_cache_size + qkv_max_proj_size - : key_cache_size + value_cache_size); + ? 
query_tmp_size + key_cache_size + value_cache_size + + streaming_pre_pos_enc_size + qkv_max_proj_size + : query_tmp_size + key_cache_size + value_cache_size + + streaming_pre_pos_enc_size); if (quantization_type != DT_NONE) { totalSharedSize += quantized_weightSize; @@ -1403,44 +584,54 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( assert(gpu_mem_allocator.reserved_total_size - gpu_mem_allocator.reserved_allocated_size >= totalSharedSize); - gpu_mem_allocator.create_legion_instance(reserveInst, instance_size); + gpu_mem_allocator.create_legion_instance( + reserveInst, instance_size, "IncMultiHeadSelfAttentionMeta"); } else { - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "IncMultiHeadSelfAttentionMeta"); } // in tree_verify, enable devQKVProjArray; - if (!offload || infer_mode == TREE_VERIFY_MODE) { - devQKVProjArray = gpu_mem_allocator.allocate_instance_untyped( + if (offload) { + devQKVProjArray = gpu_mem_allocator.allocate_reserved_untyped( qkv_max_proj_size * size_of_dt); } else { - devQKVProjArray = gpu_mem_allocator.allocate_reserved_untyped( + devQKVProjArray = gpu_mem_allocator.allocate_instance_untyped( qkv_max_proj_size * size_of_dt); - // offset += qkv_max_proj_size * size_of_dt; } // use key value cache in all mode. - keyCache = gpu_mem_allocator.allocate_instance_untyped(key_cache_size * - size_of_dt); - valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * + if (query_tmp_size > 0) { + queryTmp = gpu_mem_allocator.allocate_instance_untyped(query_tmp_size * size_of_dt); + } + kvCache = gpu_mem_allocator.allocate_instance_untyped( + (key_cache_size + value_cache_size) * size_of_dt); + if (streaming_pre_pos_enc_size > 0) { + streamingPrePosEncBuf = gpu_mem_allocator.allocate_instance_untyped( + streaming_pre_pos_enc_size * size_of_dt); + } + outputTmp = gpu_mem_allocator.allocate_instance(output_tmp_size); token_infos = static_cast(handler.batch_config_metadata); request_infos = reinterpret_cast( reinterpret_cast(handler.batch_config_metadata) + sizeof(BatchConfig::tokensInfo)); + request_available = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo)); + streaming_cache_infos = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::request_available) + + sizeof(BatchConfig::causalMask)); if (offload) { // token_infos = // gpu_mem_allocator.allocate_reserved( // tokeninfo_size); // offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; - qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size * - size_of_dt); - // offset += qk_prod_size * size_of_dt; - qk_prods_softmax = gpu_mem_allocator.allocate_reserved_untyped( - qk_prod_size * size_of_dt); - // offset += qk_prod_size * size_of_dt; attn_heads = gpu_mem_allocator.allocate_reserved_untyped(attn_heads_size * size_of_dt); // offset += attn_heads_size * size_of_dt; @@ -1454,10 +645,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // token_infos = // gpu_mem_allocator.allocate_instance( // tokeninfo_size); - qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size * - size_of_dt); - qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( - qk_prod_size * size_of_dt); attn_heads = gpu_mem_allocator.allocate_instance_untyped(attn_heads_size * size_of_dt); 
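To make the paged KV-cache sizing in this constructor concrete, here is a small standalone sketch that mirrors the key_cache_size / value_cache_size products above. It is an editor's illustration, not part of the patch: the head counts, page size, and batch limits are assumed example values, and round_up_pages is re-derived inline.

#include <cstddef>
#include <cstdio>

int main() {
  // All values below are assumed examples, not the patch's defaults.
  const std::size_t kPagesize = 64;        // tokens stored per KV page
  const std::size_t max_seq_len = 1024;    // BatchConfig::max_sequence_length()
  const std::size_t spec_tree_tokens = 64; // BatchConfig::max_spec_tree_token_num()
  const std::size_t num_kv_heads = 8, qk_dim = 128, v_dim = 128;
  const std::size_t max_requests = 16;     // BatchConfig::max_requests_per_batch()
  const std::size_t bytes_per_elem = 2;    // half-precision cache entries

  // round_up_pages(): round the per-request token budget up to whole pages.
  std::size_t max_num_pages =
      (max_seq_len + spec_tree_tokens + kPagesize - 1) / kPagesize;

  // Same products as key_cache_size and value_cache_size above.
  std::size_t key_elems =
      num_kv_heads * qk_dim * max_requests * max_num_pages * kPagesize;
  std::size_t value_elems =
      num_kv_heads * v_dim * max_requests * max_num_pages * kPagesize;

  std::printf("pages per request: %zu, KV cache per layer: %zu MiB\n",
              max_num_pages,
              (key_elems + value_elems) * bytes_per_elem / (1024 * 1024));
  return 0;
}

In the patch itself the key and value regions are carved out of a single kvCache allocation, and a nonzero total_kv_cache_size_per_layer reported by the PageManager overrides this per-request bound.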
complex_input = @@ -1478,6 +665,13 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( gpu_mem_allocator.reserved_allocated_size); } } + + // set attention constants + handler.incr_attention_metadata->set_enabled(true); + handler.incr_attention_metadata->set_num_q_heads(num_q_heads); + handler.incr_attention_metadata->set_num_kv_heads(num_kv_heads); + handler.incr_attention_metadata->set_head_dim(qk_dim); + cudaStreamSynchronize(stream); } @@ -1487,49 +681,4 @@ IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) { } } -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( - IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - cudaStream_t stream); - -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( - IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - cudaStream_t stream); - -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - float *output_ptr, - float const *weight_ptr, - float const *bias_ptr, - int num_tokens, - cudaStream_t stream); - -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - half *output_ptr, - half const *weight_ptr, - half const *bias_ptr, - int num_tokens, - cudaStream_t stream); - -template void - Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - float *output_ptr, - cudaStream_t stream); - -template void - Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - half *output_ptr, - cudaStream_t stream); }; // namespace FlexFlow diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu index 22d8161ff..f58d2dde9 100644 --- a/src/ops/kernels/embedding_kernels.cu +++ b/src/ops/kernels/embedding_kernels.cu @@ -48,7 +48,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else if (weight.data_type == DT_FLOAT) { Internal::forward_kernel(input.get_int32_ptr(), @@ -58,7 +58,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else if (weight.data_type == DT_DOUBLE) { Internal::forward_kernel(input.get_int32_ptr(), @@ -68,7 +68,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else { assert(false && "Unsupported DataType in Embedding"); @@ -82,7 +82,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else if (weight.data_type == DT_FLOAT) { Internal::forward_kernel(input.get_int64_ptr(), @@ -92,7 +92,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else if (weight.data_type == DT_DOUBLE) { Internal::forward_kernel(input.get_int64_ptr(), @@ -102,7 +102,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else { assert(false && 
"Unsupported DataType in Embedding"); @@ -139,7 +139,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else if (m->output_type[0] == DT_FLOAT) { Internal::backward_kernel(input.get_int32_ptr(), @@ -149,7 +149,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else if (m->output_type[0] == DT_DOUBLE) { Internal::backward_kernel(input.get_int32_ptr(), @@ -159,7 +159,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else { assert(false && "Unsupported DataType in Embedding"); @@ -173,7 +173,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else if (m->output_type[0] == DT_FLOAT) { Internal::backward_kernel(input.get_int64_ptr(), @@ -183,7 +183,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else if (m->output_type[0] == DT_DOUBLE) { Internal::backward_kernel(input.get_int64_ptr(), @@ -193,7 +193,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else { assert(false && "Unsupported DataType in Embedding"); diff --git a/src/ops/kernels/gemm_impl.cu b/src/ops/kernels/gemm_impl.cu new file mode 100644 index 000000000..939eaeb3b --- /dev/null +++ b/src/ops/kernels/gemm_impl.cu @@ -0,0 +1,559 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/kernels/gemm_impl.h" +#include "flexflow/utils/cuda_helper.h" +#include + +namespace Internal { + +GemmEngine::GemmEngine(cublasHandle_t blas_, + cublasLtHandle_t blasLt_, + cudaDeviceProp *device_prop_, + size_t workspace_size_) { + blas = blas_; + blasLt = blasLt_; + if (device_prop_ == nullptr) { + device_prop = new cudaDeviceProp; + } else { + device_prop = device_prop_; + } + workspace_size = workspace_size_; + workspace = nullptr; +} + +void GemmEngine::assign_workspace(void *workspace_, size_t workspace_size_) { + assert(workspace_size_ >= workspace_size); + workspace = workspace_; +} + +template +void GemmEngine::gemm_internal(cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + Dtype alpha, + Dtype const *a, + int64_t lda, + Dtype const *b, + int64_t ldb, + Dtype beta, + Dtype *c, + int64_t ldc, + cudaStream_t stream) { + static_assert(false && sizeof(Dtype), "gemm_internal: not implemented"); +} + +#ifdef USE_CUBLASLT +/* Implementations for gemm_internal_cublaslt */ +template +struct CuBlasLtDeleter { + void operator()(T *x) { + if (x != nullptr) { + checkCUDA(destructor(x)); + } + } +}; + +template +class CuBlasLtDescriptor { +public: + T *descriptor() const { + return descriptor_.get(); + } + T *descriptor() { + return descriptor_.get(); + } + +protected: + std::unique_ptr> descriptor_; +}; + +class CuBlasLtMatmulDescriptor + : public CuBlasLtDescriptor { +public: + CuBlasLtMatmulDescriptor(cublasComputeType_t compute_type, + cudaDataType_t scale_type) { + cublasLtMatmulDesc_t raw_descriptor = nullptr; + checkCUDA( + cublasLtMatmulDescCreate(&raw_descriptor, compute_type, scale_type)); + descriptor_.reset(raw_descriptor); + } + template + inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) { + // NOLINTNEXTLINE(bugprone-sizeof-expression) + checkCUDA(::cublasLtMatmulDescSetAttribute( + descriptor(), attr, &value, sizeof(T))); + } +}; + +class CuBlasLtMatrixLayout + : public CuBlasLtDescriptor { +public: + CuBlasLtMatrixLayout(cudaDataType_t type, + uint64_t rows, + uint64_t cols, + int64_t ld, + bool t = false) { + cublasLtMatrixLayout_t raw_descriptor = nullptr; + checkCUDA(cublasLtMatrixLayoutCreate( + &raw_descriptor, type, t ? cols : rows, t ? 
rows : cols, ld)); + descriptor_.reset(raw_descriptor); + } + template + inline void setAttribute(cublasLtMatrixLayoutAttribute_t attr, + const T value) { + checkCUDA(::cublasLtMatrixLayoutSetAttribute( + descriptor(), attr, &value, sizeof(T))); + } +}; + +class CuBlasLtMatmulPreference + : public CuBlasLtDescriptor { +public: + CuBlasLtMatmulPreference() { + cublasLtMatmulPreference_t raw_descriptor = nullptr; + checkCUDA(cublasLtMatmulPreferenceCreate(&raw_descriptor)); + descriptor_.reset(raw_descriptor); + } + template + inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, + const T value) { + checkCUDA(::cublasLtMatmulPreferenceSetAttribute( + descriptor(), attr, &value, sizeof(T))); + } +}; + +inline uint32_t _getAlignment(uintptr_t address) { + // alignment are in bytes + uint32_t alignment = 256; + for (;; alignment /= 2) { + if (!(address % alignment)) { + return alignment; + } + } +} + +template +inline void gemm_internal_cublaslt(cublasLtHandle_t handle, + cudaDeviceProp *prop, + void *workspace, + size_t workspace_size, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + Dtype alpha, + Dtype const *a, + int64_t lda, + Dtype const *b, + int64_t ldb, + Dtype beta, + Dtype *c, + int64_t ldc, + cudaStream_t stream) { + assert(workspace != nullptr && "workspace must be provided."); + cudaDataType_t abcType = CUDA_R_32F; + cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; + cudaDataType_t scaleType = CUDA_R_32F; + if constexpr (std::is_same_v) { + abcType = CUDA_R_64F; + computeType = CUBLAS_COMPUTE_64F; + scaleType = CUDA_R_64F; + } else if constexpr (std::is_same_v) { + computeType = CUBLAS_COMPUTE_32F_FAST_TF32; + } else if constexpr (std::is_same_v) { + abcType = CUDA_R_16F; + computeType = CUBLAS_COMPUTE_16F; + } else { + static_assert(false && sizeof(Dtype), + "bgemm_internal_cublaslt: not implemented"); + } + + CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, transa); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, transb); + CuBlasLtMatrixLayout Adesc(abcType, m, k, lda, transa == CUBLAS_OP_T); + CuBlasLtMatrixLayout Bdesc(abcType, k, n, ldb, transb == CUBLAS_OP_T); + CuBlasLtMatrixLayout Cdesc(abcType, m, n, ldc); + + CuBlasLtMatmulPreference preference; + preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + workspace_size); + + uint32_t a_alignment = _getAlignment(reinterpret_cast(a)); + uint32_t b_alignment = _getAlignment(reinterpret_cast(b)); + uint32_t c_alignment = _getAlignment(reinterpret_cast(c)); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES, + a_alignment); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES, + b_alignment); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES, + c_alignment); + + cublasLtMatmulHeuristicResult_t heuristicResult = {}; + int returnedResult = 0; + checkCUDA(cublasLtMatmulAlgoGetHeuristic(handle, + computeDesc.descriptor(), + Adesc.descriptor(), + Bdesc.descriptor(), + Cdesc.descriptor(), + Cdesc.descriptor(), + preference.descriptor(), + 1, + &heuristicResult, + &returnedResult)); + if (returnedResult == 0) { + assert(false && "cuBLASLt failed to find a valid algorithm."); + } + + checkCUDA(cublasLtMatmul(handle, + computeDesc.descriptor(), + &alpha, + a, + Adesc.descriptor(), + b, + Bdesc.descriptor(), + &beta, + c, + Cdesc.descriptor(), + c, + Cdesc.descriptor(), + &heuristicResult.algo, + workspace, + workspace_size, + 
stream)); +} +#else +/* Implementations for gemm_internal_cublas */ +template +inline void gemm_internal_cublas(cublasHandle_t handle, + cudaDeviceProp *prop, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + Dtype alpha, + Dtype const *a, + int64_t lda, + Dtype const *b, + int64_t ldb, + Dtype beta, + Dtype *c, + int64_t ldc, + cudaStream_t stream) { + static_assert(false && sizeof(Dtype), + "gemm_internal_cublas: not implemented"); +} + +template <> +void gemm_internal_cublas(cublasHandle_t handle, + cudaDeviceProp *prop, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + double alpha, + double const *a, + int64_t lda, + double const *b, + int64_t ldb, + double beta, + double *c, + int64_t ldc, + cudaStream_t stream) { + checkCUDA(cublasDgemm( + handle, transa, transb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)); +} + +template <> +void gemm_internal_cublas(cublasHandle_t handle, + cudaDeviceProp *prop, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + float const *a, + int64_t lda, + float const *b, + int64_t ldb, + float beta, + float *c, + int64_t ldc, + cudaStream_t stream) { + checkCUDA(cublasSgemm( + handle, transa, transb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)); + // checkCUDA(cublasGemmEx( + // handle, + // transa, + // transb, + // m, + // n, + // k, + // &alpha, + // a, + // CUDA_R_32F, + // lda, + // b, + // CUDA_R_32F, + // ldb, + // &beta, + // c, + // CUDA_R_32F, + // ldc, + // CUBLAS_COMPUTE_32F_FAST_16F, + // CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +} + +template <> +void gemm_internal_cublas(cublasHandle_t handle, + cudaDeviceProp *prop, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + half alpha, + half const *a, + int64_t lda, + half const *b, + int64_t ldb, + half beta, + half *c, + int64_t ldc, + cudaStream_t stream) { + if (prop->major >= 5) { + // Disallow fp16 reductions that could lead to unexpected overflow issues. 
+ // cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH; + // if (!at::globalContext().allowFP16ReductionCuBLAS()) { + // cublas_flags = static_cast(cublas_flags | + // CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); + // } + // checkCUDA(cublasSetMathMode(handle, cublas_flags)); + checkCUDA(cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + &alpha, + a, + CUDA_R_16F, + lda, + b, + CUDA_R_16F, + ldb, + &beta, + c, + CUDA_R_16F, + ldc, + CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // checkCUDA(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); + } else { + float falpha = alpha; + float fbeta = beta; + checkCUDA(cublasSgemmEx(handle, + transa, + transb, + m, + n, + k, + &falpha, + a, + CUDA_R_16F, + lda, + b, + CUDA_R_16F, + ldb, + &fbeta, + c, + CUDA_R_16F, + ldc)); + } +} +#endif + +template <> +void GemmEngine::gemm_internal(cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + double alpha, + double const *a, + int64_t lda, + double const *b, + int64_t ldb, + double beta, + double *c, + int64_t ldc, + cudaStream_t stream) { +#ifdef USE_CUBLASLT + gemm_internal_cublaslt(blasLt, + device_prop, + workspace, + workspace_size, + transa, + transb, + m, + n, + k, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc, + stream); +#else + gemm_internal_cublas(blas, + device_prop, + transa, + transb, + m, + n, + k, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc, + stream); +#endif +} + +template <> +void GemmEngine::gemm_internal(cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + float const *a, + int64_t lda, + float const *b, + int64_t ldb, + float beta, + float *c, + int64_t ldc, + cudaStream_t stream) { +#ifdef USE_CUBLASLT + gemm_internal_cublaslt(blasLt, + device_prop, + workspace, + workspace_size, + transa, + transb, + m, + n, + k, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc, + stream); +#else + gemm_internal_cublas(blas, + device_prop, + transa, + transb, + m, + n, + k, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc, + stream); +#endif +} + +template <> +void GemmEngine::gemm_internal(cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + half alpha, + half const *a, + int64_t lda, + half const *b, + int64_t ldb, + half beta, + half *c, + int64_t ldc, + cudaStream_t stream) { +#ifdef USE_CUBLASLT + gemm_internal_cublaslt(blasLt, + device_prop, + workspace, + workspace_size, + transa, + transb, + m, + n, + k, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc, + stream); +#else + gemm_internal_cublas(blas, + device_prop, + transa, + transb, + m, + n, + k, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc, + stream); +#endif +} +} // namespace Internal diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu new file mode 100644 index 000000000..9bb58794a --- /dev/null +++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu @@ -0,0 +1,1118 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "flexflow/batch_config.h" +#include +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "cuComplex.h" +#endif +#include "flashinfer/pos_enc.cuh" +#include "flexflow/attention_config.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/inc_multihead_self_attention.h" +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" +#include "flexflow/utils/cuda_helper.h" +#include + +namespace FlexFlow { + +// declare Legion names +using Legion::coord_t; +using Legion::Memory; + +using flashinfer::BatchQKApplyLlama31Rotary; +using flashinfer::BatchQKApplyRotary; + +#define WARP_SIZE 32 + +namespace Kernels { +namespace IncMultiHeadAttention { + +// only used by MPT model. https://arxiv.org/abs/2108.12409 +template +__global__ void apply_position_bias_qkprd(DT *input_ptr, + int num_tokens, + int num_total_tokens, + int num_heads, + int global_num_q_heads, + int shard_id) { + CUDA_KERNEL_LOOP(i, num_tokens * num_total_tokens * num_heads) { + // get head_idx, + int head_idx = i / (num_tokens * num_total_tokens) + (num_heads * shard_id); + int position_idx = (i / num_tokens) % num_total_tokens; + position_idx = position_idx + 1 - num_total_tokens; + // 8 is alibi_bias_max in + // https://huggingface.co/mosaicml/mpt-30b/blob/main/config.json + float base = (float)(head_idx + 1) * 8 / global_num_q_heads; + float slopes = 1.0 / pow(2, base); + // if(i == 0){ + // printf("see position: %d, %f, %f, %f\n", position_idx, base, slopes, + // position_idx * slopes); + // } + input_ptr[i] += static_cast
(position_idx * slopes); + } +} + +template +__global__ void apply_proj_bias_w(DT *input_ptr, + DT const *bias_ptr, + int num_tokens, + int qkv_weight_size, + int o_dim) { + CUDA_KERNEL_LOOP(i, num_tokens * o_dim) { + int bias_idx = qkv_weight_size + i % o_dim; + input_ptr[i] += bias_ptr[bias_idx]; + } +} + +template +__global__ void apply_proj_bias_qkv(DT *input_ptr, + DT const *bias_ptr, + int shard_id, + int num_tokens, + int qk_dim, + int v_dim, + int global_num_q_heads, + int num_q_heads, + bool scaling_query, + float scaling_factor, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * QKV_WEIGHT_NUM) { + // for simplicity, assume q, k, v is in same shape + // 0->q, 1->k, 2->v + // int qkv_index = i / (num_tokens * qk_dim) % 3; + + int token_idx = i / (hidden_size * QKV_WEIGHT_NUM); + size_t in_token_idx = i - token_idx * hidden_size * QKV_WEIGHT_NUM; + + int qkv_index = in_token_idx / hidden_size; + + int proj_size = qkv_index == 0 ? qk_dim : qk_dim; + + int head_idx = + (in_token_idx - qkv_index * num_q_heads * proj_size) / proj_size; + int global_head_idx = head_idx + shard_id * num_q_heads; + + size_t pre_length = + qkv_index == 0 + ? 0 + : (qkv_index == 1 ? qk_dim * global_num_q_heads + : qk_dim * global_num_q_heads * KV_WEIGHT_NUM); + + size_t bias_idx = pre_length + global_head_idx * proj_size + i % proj_size; + + input_ptr[i] += bias_ptr[bias_idx]; + + if (scaling_query && qkv_index == 0) { + input_ptr[i] *= scaling_factor; + } + } +} + +template +__global__ void scaling_query_kernel(DT *input_ptr, + int qk_dim, + int num_tokens, + int num_q_heads, + float scaling_factor, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + input_ptr[i % hidden_size + token_idx * hidden_size * QKV_WEIGHT_NUM] *= + scaling_factor; + } +} + +template +void compute_qkv(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + cudaStream_t stream) { + + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + // int device; + // checkCUDA(cudaGetDevice(&device)); + // cudaEvent_t t_start, t_end; + // checkCUDA(cudaEventCreate(&t_start)); + // checkCUDA(cudaEventCreate(&t_end)); + // checkCUDA(cudaEventRecord(t_start, stream)); + + // Step 1: Compute QKV projections + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_q = m->qk_dim * m->num_q_heads; + int m_k = m->qk_dim * m->num_q_heads; + int m_v = m->v_dim * m->num_q_heads; + assert(m_q == m_k && m_k == m_v); // keep things simple for now + int n = bc->num_active_tokens(); + int k = m->hidden_size; + int m_ = m_q * QKV_WEIGHT_NUM; + // before transpositions + int lda = k, ldb = k, ldc = m_; + // matrix A: QKV weights + // matrix A's layout: [hidden_size (hidden_dim), qk_dim, num_heads, 3] + // matrix B: input + // matrix B's layout: [hidden_size (hidden_dim), num_new_tokens] + // matrix C: devQKVProjArray + // matrix B's layout: [qk_dim, num_heads, 3, num_new_tokens] + m->handle.gemm_engine->gemm_internal(CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + alpha, + weight_ptr, + lda, + input_ptr, + ldb, + beta, + output_ptr, + ldc, + stream); + } + + // checkCUDA(cudaEventRecord(t_end, stream)); + // checkCUDA(cudaEventSynchronize(t_end)); + // float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // 
cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (bc->inference_mode == TREE_VERIFY_MODE and device == 0) { + // std::cout << "GEMM time: " << elapsed << " ms\n"; + // } + + int num_tokens = bc->num_active_tokens(); + if (num_tokens == 0) { + return; + } + int parallelism = m->qk_dim * num_tokens * m->num_q_heads; + + // Step 2: apply bias for QKV, or scale the query + if (*m->qkv_bias) { + apply_proj_bias_qkv<<>>(output_ptr, + bias_ptr, + shard_id, + num_tokens, + m->qk_dim, + m->v_dim, + m->global_num_q_heads, + m->num_q_heads, + *m->scaling_query, + m->scaling_factor, + m->local_hidden_size); + } else if (m->scaling_query) { + scaling_query_kernel<<>>(output_ptr, + num_tokens, + m->num_q_heads, + m->qk_dim, + m->scaling_factor, + m->local_hidden_size); + } +} + +template +__global__ void apply_pos_encoding_to_tokens_in_batch_kernel( + DT *input_ptr, + BatchConfig::PerTokenInfo const *tokenInfos, + float rope_theta, + bool llama3_rope, + float factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, + int qk_dim, + int num_tokens, + size_t q_array_size, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + // create complex number + bool q_tensor = i < (q_array_size / 2); + int proj_size = q_tensor ? qk_dim : qk_dim; + int real_i = q_tensor ? i : i - q_array_size / 2; + + int token_idx = real_i / (hidden_size / 2); + int idx = real_i % (proj_size / 2); + int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2); + + int real_part_index = idx + head_idx * proj_size + + token_idx * hidden_size * QKV_WEIGHT_NUM + + hidden_size * (q_tensor ? 0 : 1); + int complex_part_index = real_part_index + (proj_size / 2); + + cuFloatComplex cii = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + // get the freq_cis: shape 1 * (qk_dim/2) = 1 * 64 + // apply a Cartesian coordinate transformation + // multiple with input & /copy back to q/k + + // get position of token + + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + float freq = pos * (1.0 / pow(rope_theta, (float)2 * idx / proj_size)); + + if (llama3_rope) { + float pi = CUDART_PI_F; + float wavelen = 2 * pi / freq; + float low_freq_wavelen = + original_max_position_embeddings / low_freq_factor; + float high_freq_wavelen = + original_max_position_embeddings / high_freq_factor; + if (wavelen < high_freq_wavelen) { + } else if (wavelen > low_freq_wavelen) { + freq = freq / factor; + } else { + assert(low_freq_wavelen != high_freq_wavelen); + float smooth = + (original_max_position_embeddings / wavelen - low_freq_factor) / + (high_freq_factor - low_freq_factor); + freq = ((1 - smooth) * freq / factor + smooth * freq); + } + } + + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + cii = cuCmulf(cii, complex_pos); + input_ptr[real_part_index] = cii.x; + input_ptr[complex_part_index] = cii.y; + } +} + +template +void apply_pos_encoding_to_tokens_in_batch( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + // apply rotary embedding if needed + if (!m->rotary_embedding_meta->apply_rotary_embedding) { + return; + } + int num_tokens = bc->num_active_tokens(); + if (num_tokens == 0) { + return; + } + int parallelism = num_tokens * m->local_hidden_size; + size_t q_array_size = m->qk_dim * num_tokens * m->num_q_heads; + bool llama3_rope = (m->rotary_embedding_meta->rope_type == "llama3"); + apply_pos_encoding_to_tokens_in_batch_kernel<<>>( + output_ptr, + m->token_infos, + 
m->rotary_embedding_meta->rope_theta, + llama3_rope, + m->rotary_embedding_meta->factor, + m->rotary_embedding_meta->low_freq_factor, + m->rotary_embedding_meta->high_freq_factor, + m->rotary_embedding_meta->original_max_position_embeddings, + m->qk_dim, + num_tokens, + q_array_size, + m->local_hidden_size); +} + +__global__ void apply_pos_encoding_to_streaming_proj_kernel( + half *kv_cache, + BatchConfig::PerRequestInfo const *requestInfos, + bool const *request_available, + int const max_num_pages, + int num_kv_heads, + int head_dim, + float rope_theta, + bool llama3_rope, + float factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, + StreamingCacheInfo const *streaming_cache_infos, + uint32_t const max_num_requests) { + int const kv_hidden_size = num_kv_heads * head_dim; + int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + int token_idx = thread_idx / (kv_hidden_size / 2); + // Each complex is consist of (i, i + head_dim / 2) wuthin the same head. + int const head_idx = (thread_idx % (kv_hidden_size / 2)) / (head_dim / 2); + int const offset_in_head = thread_idx % (head_dim / 2); + // Get the corresponding request index and token index in the request. + int request_idx = 0; + while (token_idx >= 0 && request_idx < max_num_requests) { + if (request_available[request_idx]) { + token_idx -= streaming_cache_infos[request_idx].commit_len; + } + request_idx++; + } + if (token_idx >= 0) { + return; + } + request_idx--; + token_idx += streaming_cache_infos[request_idx].commit_len; + + // Get the real and complex part index for the current complex. + int const real_part_idx = + get_k_entry_offset( + request_idx, token_idx, max_num_pages, num_kv_heads, head_dim) + + head_idx * head_dim + offset_in_head; + int const complex_part_idx = real_part_idx + head_dim / 2; + + // Apply the rotary position encoding. 
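+  // The pair (kv_cache[real_part_idx], kv_cache[complex_part_idx]) is treated
+  // as one complex number x + i*y; the cuCmulf call below rotates it by the
+  // angle `freq`, i.e. x' = x*cos(freq) - y*sin(freq) and
+  // y' = x*sin(freq) + y*cos(freq).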
+ cuFloatComplex cii = {kv_cache[real_part_idx], kv_cache[complex_part_idx]}; + size_t pos = token_idx; + float freq = + pos * (1.0 / pow(rope_theta, (float)2 * offset_in_head / head_dim)); + + if (llama3_rope) { + float pi = CUDART_PI_F; + float wavelen = 2 * pi / freq; + float low_freq_wavelen = original_max_position_embeddings / low_freq_factor; + float high_freq_wavelen = + original_max_position_embeddings / high_freq_factor; + if (wavelen < high_freq_wavelen) { + } else if (wavelen > low_freq_wavelen) { + freq = freq / factor; + } else { + assert(low_freq_wavelen != high_freq_wavelen); + float smooth = + (original_max_position_embeddings / wavelen - low_freq_factor) / + (high_freq_factor - low_freq_factor); + freq = ((1 - smooth) * freq / factor + smooth * freq); + } + } + + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + cii = cuCmulf(cii, complex_pos); + kv_cache[real_part_idx] = cii.x; + kv_cache[complex_part_idx] = cii.y; +} + +template +void apply_pos_encoding_to_streaming_proj( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream) { + assert(m->streaming_cache); + // apply rotary embedding if needed + if (!m->rotary_embedding_meta->apply_rotary_embedding) { + return; + } + int const kv_hidden_size = m->num_kv_heads * m->qk_dim; + int num_tokens = 0; + for (int req_idx = 0; req_idx < BatchConfig::max_requests_per_batch(); + req_idx++) { + if (!bc->request_available[req_idx]) { + continue; + } + num_tokens += bc->streamingCacheInfo[req_idx].commit_len; + } + if (num_tokens == 0) { + return; + } + int parallelism = num_tokens * kv_hidden_size / 2; + int const max_num_pages = round_up_pages( + BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth() + + BatchConfig::max_spec_tree_token_num()); + bool llama3_rope = (m->rotary_embedding_meta->rope_type == "llama3"); + apply_pos_encoding_to_streaming_proj_kernel<<>>( + static_cast(m->kvCache), + m->request_infos, + m->request_available, + max_num_pages, + m->num_kv_heads, + m->qk_dim, + m->rotary_embedding_meta->rope_theta, + llama3_rope, + m->rotary_embedding_meta->factor, + m->rotary_embedding_meta->low_freq_factor, + m->rotary_embedding_meta->high_freq_factor, + m->rotary_embedding_meta->original_max_position_embeddings, + m->streaming_cache_infos, + bc->max_requests_per_batch()); +} + +template +__global__ void + update_qkv_in_batch_kernel(DT *qkv_proj_array, + half *qTmp_ptr, + half *kvCache_ptr, + BatchConfig::PerTokenInfo const *tokenInfos, + int const max_num_pages, + int num_q_heads, + int num_kv_heads, + int head_dim, + int num_new_tokens) { + int const q_hidden_size = num_q_heads * head_dim; + int const temp_kv_hidden_size = num_q_heads * head_dim; // temporary hard code + int const kv_hidden_size = num_kv_heads * head_dim; + int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + int const token_idx = thread_idx / q_hidden_size; + int const offset = thread_idx % q_hidden_size; + if (token_idx >= num_new_tokens) { + return; + } + + int const req_idx = tokenInfos[token_idx].request_index; + int token_abs_idx = tokenInfos[token_idx].abs_index_in_request; + + size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2); + qTmp_ptr[token_idx * q_hidden_size + offset] = + static_cast(qkv_proj_array[from_idx + offset]); + + if (offset < kv_hidden_size) { + size_t to_k_idx = get_k_entry_offset( + req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim), + to_v_idx = get_v_entry_offset( + req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim); + // 
key and value cache should be stored interleaved + int const stride = num_q_heads / num_kv_heads; + int const kv_offset = + offset / head_dim * stride * head_dim + offset % head_dim; + kvCache_ptr[to_k_idx + offset] = + static_cast(qkv_proj_array[from_idx + q_hidden_size + kv_offset]); + kvCache_ptr[to_v_idx + offset] = + static_cast(qkv_proj_array[from_idx + q_hidden_size + + temp_kv_hidden_size + kv_offset]); + } +} + +template +void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream) { + int num_new_tokens = bc->num_active_tokens(); + if (num_new_tokens == 0) { + return; + } + int parallelism = m->local_hidden_size * num_new_tokens; + int const max_num_pages = + round_up_pages(BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); + update_qkv_in_batch_kernel<<>>(static_cast
(m->devQKVProjArray), + static_cast(m->queryTmp), + static_cast(m->kvCache), + m->token_infos, + max_num_pages, + m->num_q_heads, + m->num_kv_heads, + m->qk_dim, + num_new_tokens); +} + +template +__global__ void update_qkv_in_batch_paged_kernel( + DT *qkv_proj_array, + half *qTmp_ptr, + half *kvCache_ptr, + int32_t *kv_indptr, + int32_t *kv_page_indices, + bool const *request_available, + BatchConfig::PerTokenInfo const *tokenInfos, + int num_q_heads, + int num_kv_heads, + int head_dim, + int num_new_tokens) { + int const q_hidden_size = num_q_heads * head_dim; + int const temp_kv_hidden_size = num_q_heads * head_dim; // temporary hard code + int const kv_hidden_size = num_kv_heads * head_dim; + int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + int const token_idx = thread_idx / q_hidden_size; + int const offset = thread_idx % q_hidden_size; + + if (token_idx >= num_new_tokens) { + return; + } + + int const req_idx = tokenInfos[token_idx].request_index; + int token_abs_idx = tokenInfos[token_idx].abs_index_in_request; + + // calculate the compact request index in the easiest way + // TODO: recheck + int req_idx_compact = -1; + int cnt = 0; + while (cnt < req_idx + 1) { + if (request_available[cnt]) { + req_idx_compact++; + } + cnt++; + } + + assert(req_idx_compact >= 0 && "Invalid request index"); + + size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2); + qTmp_ptr[token_idx * q_hidden_size + offset] = + static_cast(qkv_proj_array[from_idx + offset]); + + if (offset < kv_hidden_size) { + int start = kv_indptr[req_idx_compact]; + int end = kv_indptr[req_idx_compact + 1] - 1; + assert(start <= end && "Invalid kv_indptr"); + assert(start + (token_abs_idx / kPagesize) <= end && "Invalid page index"); + int page_idx = kv_page_indices[start + (token_abs_idx / kPagesize)]; + size_t to_k_idx = get_k_entry_offset_verify( + token_abs_idx, page_idx, num_kv_heads, head_dim), + to_v_idx = get_v_entry_offset_verify( + token_abs_idx, page_idx, num_kv_heads, head_dim); + // key and value cache should be stored interleaved + int const stride = num_q_heads / num_kv_heads; + int const kv_offset = + offset / head_dim * stride * head_dim + offset % head_dim; + kvCache_ptr[to_k_idx + offset] = + static_cast(qkv_proj_array[from_idx + q_hidden_size + kv_offset]); + kvCache_ptr[to_v_idx + offset] = + static_cast(qkv_proj_array[from_idx + q_hidden_size + + temp_kv_hidden_size + kv_offset]); + } +} + +template +void update_qkv_in_batch_paged(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream, + bool is_spec) { + // printf("entered update_qkv_in_batch_verify\n"); + int num_new_tokens = bc->num_active_tokens(); + if (num_new_tokens == 0) { + return; + } + int parallelism = m->local_hidden_size * num_new_tokens; + int32_t *kv_indptr = is_spec + ? m->handle.tree_verify_attention_metadata->kv_indptr + : m->handle.incr_attention_metadata->kv_indptr; + int32_t *kv_indices = + is_spec ? m->handle.tree_verify_attention_metadata->kv_indices + : m->handle.incr_attention_metadata->kv_indices; + update_qkv_in_batch_paged_kernel<<>>( + static_cast
(m->devQKVProjArray), + static_cast(m->queryTmp), + static_cast(m->kvCache), + kv_indptr, + kv_indices, + m->request_available, + m->token_infos, + m->num_q_heads, + m->num_kv_heads, + m->qk_dim, + num_new_tokens); +} + +__global__ void update_kv_in_streaming_cache_kernel( + half *pre_pos_enc_buf, + half *kv_cache, + BatchConfig::PerRequestInfo const *requestInfos, + bool const *request_available, + int const max_num_pages_pre_pos_enc_buf, + int const max_num_pages_kv_cache, + int num_kv_heads, + int head_dim, + StreamingCacheInfo const *streaming_cache_infos, + uint32_t const max_num_requests) { + int const kv_hidden_size = num_kv_heads * head_dim; + int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + int token_idx = thread_idx / kv_hidden_size; + int const offset = thread_idx % kv_hidden_size; + int request_idx = 0; + while (token_idx >= 0 && request_idx < max_num_requests) { + if (request_available[request_idx]) { + token_idx -= streaming_cache_infos[request_idx].commit_len; + } + request_idx++; + } + if (token_idx >= 0) { + return; + } + request_idx--; + token_idx += streaming_cache_infos[request_idx].commit_len; + + size_t from_k_idx = get_k_entry_offset(request_idx, + token_idx, + max_num_pages_pre_pos_enc_buf, + num_kv_heads, + head_dim), + from_v_idx = get_v_entry_offset(request_idx, + token_idx, + max_num_pages_pre_pos_enc_buf, + num_kv_heads, + head_dim); + + // to_idx should consider the rolling property of the window cache + int to_idx = token_idx; + StreamingCacheInfo const &info = streaming_cache_infos[request_idx]; + if (info.commit_len >= info.sink_cache_size + info.window_cache_size && + to_idx >= info.sink_cache_size) { + to_idx -= info.sink_cache_size; + to_idx = (to_idx + info.window_cache_size - info.window_back) % + info.window_cache_size; + to_idx += info.sink_cache_size; + } + + size_t to_k_idx = get_k_entry_offset(request_idx, + to_idx, + max_num_pages_kv_cache, + num_kv_heads, + head_dim), + to_v_idx = get_v_entry_offset(request_idx, + to_idx, + max_num_pages_kv_cache, + num_kv_heads, + head_dim); + + kv_cache[to_k_idx + offset] = pre_pos_enc_buf[from_k_idx + offset]; + kv_cache[to_v_idx + offset] = pre_pos_enc_buf[from_v_idx + offset]; +} + +template +void update_kv_in_streaming_cache(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream) { + assert(m->streaming_cache); + int const kv_hidden_size = m->num_kv_heads * m->qk_dim; + int num_tokens = 0; + for (int req_idx = 0; req_idx < BatchConfig::max_requests_per_batch(); + req_idx++) { + if (!bc->request_available[req_idx]) { + continue; + } + num_tokens += bc->streamingCacheInfo[req_idx].commit_len; + } + if (num_tokens == 0) { + return; + } + int parallelism = kv_hidden_size * num_tokens; + int const max_num_pages_pre_pos_enc_buf = round_up_pages( + BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth()); + int const max_num_pages_kv_cache = round_up_pages( + BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth() + + BatchConfig::max_spec_tree_token_num()); + + update_kv_in_streaming_cache_kernel<<>>( + static_cast(m->streamingPrePosEncBuf), + static_cast(m->kvCache), + m->request_infos, + m->request_available, + max_num_pages_pre_pos_enc_buf, + max_num_pages_kv_cache, + m->num_kv_heads, + m->qk_dim, + m->streaming_cache_infos, + bc->max_requests_per_batch()); +} + +template +__global__ void + commit_kv_kernel(DT const *qkv_proj_array, + half *pre_pos_enc_buf, + BatchConfig::PerTokenInfo const *tokenInfos, + BatchConfig::PerRequestInfo const 
*requestInfos, + int const max_num_pages, + int num_q_heads, + int num_kv_heads, + int head_dim, + StreamingCacheInfo const *streaming_cache_infos, + int num_new_tokens) { + int const q_hidden_size = num_q_heads * head_dim; + int const temp_kv_hidden_size = num_q_heads * head_dim; // temporary hard code + int const kv_hidden_size = num_kv_heads * head_dim; + int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + int const token_idx = thread_idx / kv_hidden_size; + int const offset = thread_idx % kv_hidden_size; + if (token_idx >= num_new_tokens) { + return; + } + int const request_idx = tokenInfos[token_idx].request_index; + + StreamingCacheInfo const &info = streaming_cache_infos[request_idx]; + int to_idx = tokenInfos[token_idx].abs_index_in_request; + // cases that get over the boundary: + // 1. commit_len < sink_cache_size: commit to sink, window, window_back is + // after commit_len. + // 2. sink_cache_size <= commit_len < sink_cache_size + window_cache_size: + // commit to window, window_back + sink_cache_size = commit_len, pointing to + // the same position. + // 3. commit_len >= sink_cache_size + window_cache_size: commit to window, + // window is full before this commit, window_back is pointing to the real + // position. + if (to_idx >= info.sink_cache_size + info.window_cache_size) { + to_idx = to_idx - info.commit_len + info.window_back; + if (info.commit_len < info.sink_cache_size) { + // For case 1, compensating for sink offset, because window_back is + // someway back from commit_len. + to_idx -= info.sink_cache_size - info.commit_len; + } + to_idx = info.sink_cache_size + to_idx % info.window_cache_size; + } + // TODO: For now don't consider the case that the commit tokens roll over the + // for more than once. In this case, we should only count the last tokens in + // the same window position. + + size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2); + size_t to_k_idx = get_k_entry_offset( + request_idx, to_idx, max_num_pages, num_kv_heads, head_dim), + to_v_idx = get_v_entry_offset( + request_idx, to_idx, max_num_pages, num_kv_heads, head_dim); + + int const stride = num_q_heads / num_kv_heads; + int const kv_offset = + offset / head_dim * stride * head_dim + offset % head_dim; + + pre_pos_enc_buf[to_k_idx + offset] = + static_cast(qkv_proj_array[from_idx + q_hidden_size + kv_offset]); + pre_pos_enc_buf[to_v_idx + offset] = + static_cast(qkv_proj_array[from_idx + q_hidden_size + + temp_kv_hidden_size + kv_offset]); +} + +template +void commit_kv(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream) { + assert(m->streaming_cache); + int const kv_hidden_size = m->num_kv_heads * m->qk_dim; + int const num_new_tokens = bc->num_active_tokens(); + if (num_new_tokens == 0) { + return; + } + int parallelism = kv_hidden_size * num_new_tokens; + int const max_num_pages = round_up_pages(BatchConfig::MAX_STREAMING_POS - + BatchConfig::get_max_tree_depth()); + + commit_kv_kernel<<>>(static_cast
(m->devQKVProjArray), + static_cast<half *>(m->streamingPrePosEncBuf), + m->token_infos, + m->request_infos, + max_num_pages, + m->num_q_heads, + m->num_kv_heads, + m->qk_dim, + m->streaming_cache_infos, + num_new_tokens); +} + +template <typename DT> +__global__ void produce_output_kernel(half const *input_ptr, + DT *output_ptr, + int parallelism) { + CUDA_KERNEL_LOOP(idx, parallelism) { + output_ptr[idx] = static_cast<DT>
(input_ptr[idx]); + } +} + +template +void produce_output(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + int const num_tokens = bc->num_active_tokens(); + if (num_tokens == 0) { + return; + } + int parallelism = m->v_dim * m->num_q_heads * num_tokens; + produce_output_kernel<<>>(m->outputTmp, output_ptr, parallelism); +} + +template +void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *weight_ptr, + DT const *bias_ptr, + int num_tokens, + cudaStream_t stream) { + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); + // Project to output, save result directly on output tensor + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->o_dim; + int k = m->v_dim * m->num_q_heads; + int n = num_tokens; + // before transpositions + int lda = k, ldb = k, ldc = m_; + // matrix A: output projection weight + // matrix A's layout: [v_dim * num_heads, o_dim] + DT const *A = weight_ptr + m->hidden_size * (m->qk_dim * m->num_q_heads + + m->qk_dim * m->num_q_heads + + m->v_dim * m->num_q_heads); + // matrix B: attn heads + // matrix B's layout: [v_dim * num_heads, num_new_tokens] + DT const *B = static_cast
(m->attn_heads); + // matrix C: output + // matrix C's layout: [o_dim, num_new_tokens] + DT *C = static_cast<DT *>
(output_ptr); + + m->handle.gemm_engine->gemm_internal(CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc, + stream); + } + // Add final output bias + if (*m->final_bias && shard_id == 0) { + int parallelism = m->o_dim * num_tokens; + int qkv_weight_size = m->qk_dim * m->global_num_q_heads + + m->qk_dim * m->global_num_q_heads + + m->v_dim * m->global_num_q_heads; + apply_proj_bias_w<<>>( + output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->o_dim); + } +} + +template +void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + cudaStream_t stream) { + // additional processing for weight uploading + // Note that we update weight_ptr and bias_ptr when uploading weight and + // bias + if (m->quantization_type != DT_NONE) { + // copy weight_ptr to quantized_weight_ptr, do compression and store in + // m->weight_ptr + cudaMemcpyAsync(m->quantized_weight_ptr, + weight.get_byte_ptr(), + m->quantized_weightSize, + cudaMemcpyHostToDevice, + stream); + + if (m->quantization_type == DT_INT4) { + int parallelism = m->qk_dim * m->hidden_size * m->num_q_heads / 2; + decompress_int4_attention_weights<<>>( + m->quantized_weight_ptr, + static_cast
(m->weight_ptr), + m->qk_dim, + m->hidden_size, + m->num_q_heads); + } else { + assert(m->quantization_type == DT_INT8); + int parallelism = m->qk_dim * m->hidden_size * m->num_q_heads; + decompress_int8_attention_weights<<>>( + m->quantized_weight_ptr, + static_cast<DT *>
(m->weight_ptr), + m->qk_dim, + m->hidden_size, + m->num_q_heads); + } + } else { + if (data_type == DT_FLOAT) { + cudaMemcpyAsync(m->weight_ptr, + weight.get_float_ptr(), + m->weightSize, + cudaMemcpyHostToDevice, + stream); + } else if (data_type == DT_HALF) { + cudaMemcpyAsync(m->weight_ptr, + weight.get_half_ptr(), + m->weightSize, + cudaMemcpyHostToDevice, + stream); + } else { + assert(false); + } + } +} + +} // namespace IncMultiHeadAttention +} // namespace Kernels + +using namespace Kernels::IncMultiHeadAttention; + +template void Kernels::IncMultiHeadAttention::pre_build_weight( + IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::pre_build_weight( + IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_qkv( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr, + float const *bias_ptr, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_qkv( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + half const *input_ptr, + half const *weight_ptr, + half *output_ptr, + half const *bias_ptr, + cudaStream_t stream); + +template void + Kernels::IncMultiHeadAttention::apply_pos_encoding_to_tokens_in_batch< + float>(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float *output_ptr, + cudaStream_t stream); + +template void + Kernels::IncMultiHeadAttention::apply_pos_encoding_to_tokens_in_batch( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + half *output_ptr, + cudaStream_t stream); + +template void + Kernels::IncMultiHeadAttention::apply_pos_encoding_to_streaming_proj( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +template void + Kernels::IncMultiHeadAttention::apply_pos_encoding_to_streaming_proj( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::update_qkv_in_batch( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::update_qkv_in_batch( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::update_qkv_in_batch_paged( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream, + bool is_spec); + +template void Kernels::IncMultiHeadAttention::update_qkv_in_batch_paged( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream, + bool is_spec); + +template void + Kernels::IncMultiHeadAttention::update_kv_in_streaming_cache( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +template void + Kernels::IncMultiHeadAttention::update_kv_in_streaming_cache( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::commit_kv( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::commit_kv( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + 
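The llama3-style frequency scaling implemented in apply_pos_encoding_to_tokens_in_batch_kernel and apply_pos_encoding_to_streaming_proj_kernel above uses the same wavelength-based branching in both kernels. The sketch below restates that branch as a standalone host-side helper for reference; the helper name is hypothetical, and the parameter names mirror the rotary_embedding_meta fields used by the kernels.

// Minimal host-side sketch of the llama3 RoPE frequency scaling used by the
// two kernels above (illustrative only; not part of this patch).
inline float llama3_scale_freq(float freq,
                               float factor,
                               float low_freq_factor,
                               float high_freq_factor,
                               int original_max_position_embeddings) {
  float const pi = 3.14159265358979323846f;
  float const wavelen = 2.0f * pi / freq;
  float const low_freq_wavelen =
      original_max_position_embeddings / low_freq_factor;
  float const high_freq_wavelen =
      original_max_position_embeddings / high_freq_factor;
  if (wavelen < high_freq_wavelen) {
    return freq; // high-frequency band: keep the original frequency
  } else if (wavelen > low_freq_wavelen) {
    return freq / factor; // low-frequency band: fully rescale
  }
  // transition band: interpolate between the rescaled and original frequency
  float const smooth =
      (original_max_position_embeddings / wavelen - low_freq_factor) /
      (high_freq_factor - low_freq_factor);
  return (1.0f - smooth) * freq / factor + smooth * freq;
}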
+template void Kernels::IncMultiHeadAttention::produce_output( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float *output_ptr, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::produce_output( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + half *output_ptr, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, + int num_tokens, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + half *output_ptr, + half const *weight_ptr, + half const *bias_ptr, + int num_tokens, + cudaStream_t stream); +}; // namespace FlexFlow diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index c30c9f71c..2c049be68 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -40,7 +40,7 @@ LinearMeta::LinearMeta(FFHandler handler, } // Allocate an all-one's vector gpu_mem_allocator.create_legion_instance( - reserveInst, data_type_size(data_type) * batch_size); + reserveInst, data_type_size(data_type) * batch_size, "LinearMeta"); one_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * batch_size); int parallelism = batch_size; @@ -323,6 +323,10 @@ void forward_kernel(LinearMeta const *m, : ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == weight_type && weight_type == output_type); + DT const *input_p = static_cast
(input_ptr), + *weight_p = + static_cast<DT const *>
(m->offload ? m->weight_ptr : weight_ptr); + DT *output_p = static_cast<DT *>
(output_ptr); #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; #else @@ -334,25 +338,20 @@ void forward_kernel(LinearMeta const *m, compute_type = CUBLAS_COMPUTE_32F_FAST_16F; } #endif - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - out_dim, - batch_size, - in_dim, - &alpha, - m->offload ? m->weight_ptr : weight_ptr, - weight_type, - in_dim, - input_ptr, - input_type, - in_dim, - &beta, - output_ptr, - output_type, - out_dim, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + m->handle.gemm_engine->gemm_internal(CUBLAS_OP_T, + CUBLAS_OP_N, + out_dim, + batch_size, + in_dim, + alpha, + weight_p, + in_dim, + input_p, + in_dim, + beta, + output_p, + out_dim, + stream); // use_bias = True if (bias_ptr != NULL) { // fuse bias and relu diff --git a/src/ops/kernels/residual_rms_norm_kernels.cpp b/src/ops/kernels/residual_rms_norm_kernels.cpp index 690655645..ed0b0f9a5 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cpp +++ b/src/ops/kernels/residual_rms_norm_kernels.cpp @@ -42,7 +42,8 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, size_t rms_ptr_size = batch_size; size_t norm_ptr_size = num_elements; size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "ResidualRMSNormMeta"); rms_ptr = gpu_mem_allocator.allocate_instance_untyped( rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 17ac14449..7530c179e 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -13,6 +13,11 @@ * limitations under the License. 
*/ +#include "flashinfer/utils.cuh" +#include + +#include "flashinfer/math.cuh" +#include "flashinfer/vec_dtypes.cuh" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/residual_rms_norm_kernels.h" #include "flexflow/ops/residual_rms_norm.h" @@ -43,7 +48,8 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, size_t rms_ptr_size = batch_size; size_t norm_ptr_size = num_elements; size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "ResidualRMSNormMeta"); rms_ptr = gpu_mem_allocator.allocate_instance_untyped( rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( @@ -55,89 +61,133 @@ ResidualRMSNormMeta::~ResidualRMSNormMeta(void) { } } -namespace Kernels { -namespace ResidualRMSNorm { +// Adopted from flashinfer +// (https://github.com/flashinfer-ai/flashinfer/blob/main/include/flashinfer/norm.cuh) +// Main modification is for non-inplace computation +template +__global__ void FusedAddRMSNormKernel(T const *__restrict__ input, + T const *__restrict__ residual, + T const *__restrict__ weight, + T *__restrict__ output, + T *__restrict__ residual_output, + const uint32_t d, + float eps) { + const uint32_t bx = blockIdx.x; + const uint32_t tx = threadIdx.x, ty = threadIdx.y; + constexpr uint32_t warp_size = 32; + const uint32_t num_warps = blockDim.y; + const uint32_t thread_id = tx + ty * warp_size; + const uint32_t num_threads = num_warps * warp_size; + const uint32_t rounds = flashinfer::ceil_div(d, VEC_SIZE * num_threads); + extern __shared__ float smem[]; -template -__device__ __forceinline__ T WARP_SHFL_DOWN(T value, - unsigned int delta, - int width = warpSize, - unsigned int mask = 0xffffffff) { -#ifndef __HIP_PLATFORM_HCC__ - return __shfl_down_sync(mask, value, delta, width); -#else - return __shfl_down(value, delta, width); -#endif -} + float sum_sq = 0.f; -template -__inline__ __device__ T WarpReduceSum(T val) { + for (uint32_t i = 0; i < rounds; i++) { + flashinfer::vec_t input_vec; + flashinfer::vec_t residual_vec; + flashinfer::vec_t residual_output_vec; + input_vec.fill(0); + residual_vec.fill(0); + residual_output_vec.fill(0); + if ((i * num_threads + thread_id) * VEC_SIZE < d) { + input_vec.load(input + bx * d + i * num_threads * VEC_SIZE + + thread_id * VEC_SIZE); + residual_vec.load(residual + bx * d + i * num_threads * VEC_SIZE + + thread_id * VEC_SIZE); + } #pragma unroll - for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { - val += WARP_SHFL_DOWN(val, offset); + for (uint32_t j = 0; j < VEC_SIZE; j++) { + float x = float(input_vec[j]); + x += float(residual_vec[j]); + sum_sq += x * x; + residual_output_vec[j] = (T)x; + } + if ((i * num_threads + thread_id) * VEC_SIZE < d) { + residual_output_vec.store(residual_output + bx * d + + i * num_threads * VEC_SIZE + + thread_id * VEC_SIZE); + } } - return val; -} -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); + // first, warp reduce sum +#pragma unroll + for (uint32_t offset = warp_size / 2; offset > 0; offset /= 2) { + sum_sq += flashinfer::math::shfl_xor_sync(sum_sq, offset); + } + + smem[ty] = sum_sq; __syncthreads(); - if (lid == 0) { - shared[wid] = val; + // then, cross warp reduce sum using only the first warp + 
if (ty == 0) { + sum_sq = (tx < num_warps) ? smem[tx] : 0.f; +#pragma unroll + for (uint32_t offset = warp_size / 2; offset > 0; offset /= 2) { + sum_sq += flashinfer::math::shfl_xor_sync(sum_sq, offset); + } + smem[0] = sum_sq; } __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); + + float rms_rcp = flashinfer::math::rsqrt(smem[0] / float(d) + eps); + + for (uint32_t i = 0; i < rounds; i++) { + flashinfer::vec_t weight_vec; + flashinfer::vec_t residual_output_vec; + flashinfer::vec_t output_vec; + weight_vec.fill(0); + residual_output_vec.fill(0); + output_vec.fill(0); + if ((i * num_threads + thread_id) * VEC_SIZE < d) { + weight_vec.load(weight + i * num_threads * VEC_SIZE + + thread_id * VEC_SIZE); + residual_output_vec.load(residual_output + bx * d + + i * num_threads * VEC_SIZE + + thread_id * VEC_SIZE); + } +#pragma unroll + for (uint32_t j = 0; j < VEC_SIZE; j++) { + output_vec[j] = + float(residual_output_vec[j]) * rms_rcp * float(weight_vec[j]); + } + if ((i * num_threads + thread_id) * VEC_SIZE < d) { + output_vec.store(output + bx * d + i * num_threads * VEC_SIZE + + thread_id * VEC_SIZE); + } } - return val; } template -__global__ void ResidualRMSNormFusedForwardKernel(int64_t N, - float eps, - T const *X1, - T const *X2, - T *X_out, - T *rms, - T *Y, - T const *weights, - T *output) { - __shared__ float v_shared[C10_WARP_SIZE]; - int64_t const i = blockIdx.x; - float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { - int64_t const index = i * N + j; - X_out[index] = X1[index] + X2[index]; - sum += - (static_cast(X_out[index]) * static_cast(X_out[index])); - } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 - - if (threadIdx.x == 0) { - rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); - } +cudaError_t FusedAddRMSNorm(T const *input, + T const *residual, + T const *weight, + T *output, + T *residual_output, + uint32_t batch_size, + uint32_t d, + float eps = 1e-5, + cudaStream_t stream = 0) { + const uint32_t vec_size = std::gcd(16 / sizeof(T), d); - __syncthreads(); + const uint32_t block_size = std::min(1024, d / vec_size); + const uint32_t num_warps = flashinfer::ceil_div(block_size, 32); + dim3 nblks(batch_size); + dim3 nthrs(32, num_warps); + const uint32_t smem_size = num_warps * sizeof(float); + void *args[] = { + &input, &residual, &weight, &output, &residual_output, &d, &eps}; - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { - const int64_t index = i * N + j; - Y[index] = static_cast(X_out[index]) * static_cast(rms[i]); - output[index] = Y[index] * weights[index % N]; - } + DISPATCH_ALIGNED_VEC_SIZE(vec_size, VEC_SIZE, { + auto kernel = FusedAddRMSNormKernel; + FLASHINFER_CUDA_CALL(cudaLaunchKernel( + (void *)kernel, nblks, nthrs, args, smem_size, stream)); + }); + + return cudaSuccess; } +namespace Kernels { +namespace ResidualRMSNorm { template void forward_kernel(ResidualRMSNormMeta const *m, T const *input1_ptr, @@ -145,28 +195,29 @@ void forward_kernel(ResidualRMSNormMeta const *m, T const *weight_ptr, T *residual_output_ptr, T *output_ptr, + int batch_size, cudaStream_t stream) { - + assert(batch_size <= m->batch_size); + // use active batch size std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); + 
std::make_pair(batch_size, kCUDABlockReduceNumThreads); std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); + std::make_pair(batch_size, kCUDANumThreads); int num_blocks = std::max(kernel1_parallelism.first, kernel2_parallelism.first); int num_threads = std::max(kernel1_parallelism.second, kernel2_parallelism.second); - ResidualRMSNormFusedForwardKernel - <<>>(m->in_dim, - m->eps, - input1_ptr, - input2_ptr, - residual_output_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - weight_ptr, - output_ptr); + checkCUDA(FusedAddRMSNorm(input1_ptr, + input2_ptr, + weight_ptr, + output_ptr, + residual_output_ptr, + batch_size, + m->in_dim, + m->eps, + stream)); } void forward_kernel_wrapper(ResidualRMSNormMeta const *m, @@ -174,7 +225,8 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, GenericTensorAccessorR const &input2, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &residual_output, - GenericTensorAccessorW const &output) { + GenericTensorAccessorW const &output, + int batch_size) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -195,6 +247,7 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, weight.get_half_ptr(), residual_output.get_half_ptr(), output.get_half_ptr(), + batch_size, stream); } else if (output.data_type == DT_FLOAT) { forward_kernel(m, @@ -203,6 +256,7 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, weight.get_float_ptr(), residual_output.get_float_ptr(), output.get_float_ptr(), + batch_size, stream); } else { assert(false && "Unsupported data type"); diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index 24ab7051e..9636929d9 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -42,7 +42,8 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, size_t rms_ptr_size = batch_size; size_t norm_ptr_size = num_elements; size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "RMSNormMeta"); rms_ptr = gpu_mem_allocator.allocate_instance_untyped( rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 7c9f4a9f9..8555e58be 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -43,7 +43,8 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, size_t rms_ptr_size = batch_size; size_t norm_ptr_size = num_elements; size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "RMSNormMeta"); rms_ptr = gpu_mem_allocator.allocate_instance_untyped( rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 44979c48f..4289a9236 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -37,7 +37,8 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, eps = ln->eps; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 6; - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, 
totalSize, "LayerNormMeta"); mean_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index f1b7a537b..046a4bc25 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -38,7 +38,8 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, eps = ln->eps; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "ResidualLayerNormMeta"); mean_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index e5ebdce6e..05e66db02 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -37,7 +37,8 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, eps = ln->eps; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "ResidualLayerNormMeta"); mean_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index f4f5bb72d..713486268 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -447,7 +447,8 @@ void ResidualRMSNorm::inference_task(Task const *task, m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); - forward_kernel_wrapper(m, input1, input2, weight, residual_output, output); + forward_kernel_wrapper( + m, input1, input2, weight, residual_output, output, bc->num_tokens); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index 9fc2316f9..92db9a958 100644 --- a/src/ops/sampling.cc +++ b/src/ops/sampling.cc @@ -47,7 +47,7 @@ using PCG::Node; // For an input tensor, computes the top k entries in each row // (resp. vector along the last dimension). 
Thus, // values.shape = indices.shape = input.shape[:-1] + [k] -Tensor FFModel::sampling(const Tensor input, float top_p, char const *name) { +Tensor FFModel::sampling(Tensor const input, float top_p, char const *name) { Layer *li = new Layer(this, OP_SAMPLING, input->data_type, @@ -103,7 +103,7 @@ bool operator==(SamplingParams const &lhs, SamplingParams const &rhs) { } Sampling::Sampling(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, float _top_p, char const *name) : Op(model, @@ -132,12 +132,12 @@ Sampling::Sampling(FFModel &model, Sampling::Sampling(FFModel &model, Sampling const &other, - const ParallelTensor input) + ParallelTensor const input) : Sampling(model, input, other.top_p, other.name) {} Sampling::Sampling(FFModel &model, SamplingParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) : Sampling(model, input, params.top_p, params.name) {} @@ -316,6 +316,7 @@ InferenceResult } InferenceResult ir; + ir.num_token_ids = batch_size; download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; diff --git a/src/ops/sampling.cpp b/src/ops/sampling.cpp index 3d8f10352..03e37333e 100644 --- a/src/ops/sampling.cpp +++ b/src/ops/sampling.cpp @@ -204,7 +204,8 @@ SamplingMeta::SamplingMeta(FFHandler handler, idx_size + sorted_idx_size) + data_type_size(data_type) * sorted_logits_size + sizeof(hiprandState) * state_size; - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "SamplingMeta"); begin_offset = gpu_mem_allocator.allocate_instance(begin_offset_size); end_offset = gpu_mem_allocator.allocate_instance(end_offset_size); idx = gpu_mem_allocator.allocate_instance(idx_size); @@ -262,7 +263,8 @@ SamplingMeta::SamplingMeta(FFHandler handler, // assert(false && "input type in float and half"); // } - gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes); + gpu_mem_allocator.create_legion_instance( + reserveInst, temp_storage_bytes, "SamplingMeta"); d_temp_storage = gpu_mem_allocator.allocate_instance_untyped(temp_storage_bytes); } diff --git a/src/ops/sampling.cu b/src/ops/sampling.cu index 461d72ec7..686817096 100644 --- a/src/ops/sampling.cu +++ b/src/ops/sampling.cu @@ -45,8 +45,12 @@ __global__ void init_idxs(int batch_size, int *idx, int *begin_offset, int *end_offset) { - CUDA_KERNEL_LOOP(i, total_eles) { - idx[i] = i % vocab_size; + // +1 to include the upper boundary + CUDA_KERNEL_LOOP(i, total_eles + 1) { + if (i < total_eles) { + // Exclude the last element + idx[i] = i % vocab_size; + } if (i % vocab_size == 0) { begin_offset[i / vocab_size] = i; end_offset[i / vocab_size] = i; @@ -55,9 +59,9 @@ __global__ void init_idxs(int batch_size, } __global__ void - init_random_kernel(curandState *state, int batch_size, long rand) { + init_random_kernel(curandState *state, int batch_size, long seed) { CUDA_KERNEL_LOOP(i, batch_size) { - curand_init(rand, i, 0, &state[i]); + curand_init(seed, i, 0, &state[i]); } } @@ -74,11 +78,14 @@ __global__ void sampling_topp_kernel(int batch_size, int const batch_idx = blockIdx.x; __shared__ float random_n; __shared__ long long result_idx; + __shared__ bool is_end; // random num if (threadIdx.x == 0) { // number must < topp random_n = curand_uniform(state + batch_idx) * topp; + is_end = false; + result_idx = vocab_size - 1; // printf("batch idx: %d, random num%f\n", batch_idx, random_n); } @@ -91,14 +98,19 @@ __global__ void sampling_topp_kernel(int batch_size, 
int offset = batch_idx * vocab_size; float prefix_sum = 0.0f; BlockPrefixCallbackOp prefix_op(0); - result_idx = vocab_size - 1; for (long long j = threadIdx.x; j < vocab_size; j += blockDim.x) { float logit = (float)(sorted_logits[offset + j]); BlockScan(temp_storage).InclusiveSum(logit, prefix_sum, prefix_op); - prefix_sum /= topp; + __syncthreads(); if (prefix_sum >= random_n) { atomicMin(&result_idx, j); + is_end = true; + } + // Synchronize to make sure all threads see the updated flag + __syncthreads(); + if (is_end) { + break; } } indices_ptr[batch_idx] = sorted_idx[offset + result_idx]; @@ -216,7 +228,8 @@ SamplingMeta::SamplingMeta(FFHandler handler, idx_size + sorted_idx_size) + data_type_size(data_type) * sorted_logits_size + sizeof(curandState) * state_size; - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "SamplingMeta"); begin_offset = gpu_mem_allocator.allocate_instance(begin_offset_size); end_offset = gpu_mem_allocator.allocate_instance(end_offset_size); idx = gpu_mem_allocator.allocate_instance(idx_size); @@ -274,7 +287,8 @@ SamplingMeta::SamplingMeta(FFHandler handler, assert(false && "input type in float and half"); } - gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes); + gpu_mem_allocator.create_legion_instance( + reserveInst, temp_storage_bytes, "SamplingMeta"); d_temp_storage = gpu_mem_allocator.allocate_instance_untyped(temp_storage_bytes); } diff --git a/src/ops/select_k_impl.cu b/src/ops/select_k_impl.cu new file mode 100644 index 000000000..9fcdbb719 --- /dev/null +++ b/src/ops/select_k_impl.cu @@ -0,0 +1,35 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
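The sampling.cu hunk above reworks sampling_topp_kernel: thread 0 draws random_n = curand_uniform(...) * topp and resets result_idx to vocab_size - 1, then the block scans the sorted logits with an inclusive prefix sum, latches the first position whose running sum reaches the threshold via atomicMin, and uses the shared is_end flag plus __syncthreads so every thread exits the loop together. A host-side sketch of the same selection rule, for illustration only (not FlexFlow code); it assumes the probabilities are already sorted in descending order and sum to one:

#include <cstdio>
#include <random>
#include <vector>

// Nucleus (top-p) sampling over probabilities sorted in descending order:
// draw r in [0, top_p) and return the first index whose inclusive prefix sum
// reaches r -- the same scan-and-latch rule as the kernel above.
int sample_top_p(std::vector<float> const &sorted_probs,
                 float top_p,
                 std::mt19937 &rng) {
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
  float r = dist(rng) * top_p;
  float prefix_sum = 0.0f;
  for (size_t j = 0; j < sorted_probs.size(); ++j) {
    prefix_sum += sorted_probs[j];
    if (prefix_sum >= r) {
      return static_cast<int>(j); // first index crossing the threshold
    }
  }
  // Fallback mirrors the kernel's result_idx = vocab_size - 1 initialization.
  return static_cast<int>(sorted_probs.size()) - 1;
}

int main() {
  std::mt19937 rng(0);
  std::vector<float> probs = {0.5f, 0.2f, 0.1f, 0.1f, 0.05f, 0.05f};
  std::printf("sampled sorted-index: %d\n", sample_top_p(probs, 0.9f, rng));
  return 0;
}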
+ */ + +#include "raft/matrix/detail/select_k-inl.cuh" + +#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ + template void raft::matrix::detail::select_k(raft::resources const &handle, \ + const T *in_val, \ + const IdxT *in_idx, \ + size_t batch_size, \ + size_t len, \ + int k, \ + T *out_val, \ + IdxT *out_idx, \ + bool select_min, \ + bool sorted, \ + raft::matrix::SelectAlgo algo, \ + const IdxT *len_i) + +instantiate_raft_matrix_detail_select_k(half, int); +instantiate_raft_matrix_detail_select_k(float, int); + +#undef instantiate_raft_matrix_detail_select_k diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index 3ddd6b8d6..b39a424c6 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -41,7 +41,9 @@ using Legion::TaskLauncher; bool operator==(SigmoidSiluMultiParams const &lhs, SigmoidSiluMultiParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid; + return lhs.layer_guid == rhs.layer_guid && + lhs.intermediate_size == rhs.intermediate_size && + lhs.tensor_parallelism_degree == rhs.tensor_parallelism_degree; } bool SigmoidSiluMultiParams::is_valid( @@ -52,6 +54,8 @@ bool SigmoidSiluMultiParams::is_valid( SigmoidSiluMultiParams SigmoidSiluMulti::get_params() const { SigmoidSiluMultiParams params; params.layer_guid = this->layer_guid; + params.intermediate_size = this->intermediate_size; + params.tensor_parallelism_degree = this->tensor_parallelism_degree; if (this->name != nullptr) { strcpy(params.name, this->name); } @@ -60,6 +64,7 @@ SigmoidSiluMultiParams SigmoidSiluMulti::get_params() const { Tensor FFModel::sigmoid_silu_multi(const Tensor input1, const Tensor input2, + int intermediate_size, DataType data_type, char const *name) { @@ -94,6 +99,9 @@ Tensor FFModel::sigmoid_silu_multi(const Tensor input1, casted_input2); ssm->outputs[0] = create_tensor_legion_ordering( input1->num_dims, input1->dims, data_type, ssm, 0, false /*create_grad*/); + ssm->add_int_property("intermediate_size", intermediate_size); + ssm->add_int_property("tensor_parallelism_degree", + config.tensor_parallelism_degree); layers.push_back(ssm); return ssm->outputs[0]; } @@ -102,9 +110,18 @@ Op *SigmoidSiluMulti::create_operator_from_layer( FFModel &model, Layer const *layer, std::vector const &inputs) { - - return new SigmoidSiluMulti( - model, layer->layer_guid, inputs[0], inputs[1], layer->name); + long long value; + layer->get_int_property("intermediate_size", value); + int intermediate_size = value; + layer->get_int_property("tensor_parallelism_degree", value); + int tensor_parallelism_degree = value; + return new SigmoidSiluMulti(model, + layer->layer_guid, + inputs[0], + inputs[1], + intermediate_size, + tensor_parallelism_degree, + layer->name); } SigmoidSiluMulti::SigmoidSiluMulti( @@ -112,13 +129,20 @@ SigmoidSiluMulti::SigmoidSiluMulti( SigmoidSiluMultiParams const ¶ms, std::pair const &inputs, char const *name) - : SigmoidSiluMulti( - model, params.layer_guid, inputs.first, inputs.second, params.name) {} + : SigmoidSiluMulti(model, + params.layer_guid, + inputs.first, + inputs.second, + params.intermediate_size, + params.tensor_parallelism_degree, + params.name) {} SigmoidSiluMulti::SigmoidSiluMulti(FFModel &model, LayerID const &_layer_guid, const ParallelTensor _input1, const ParallelTensor _input2, + int _intermediate_size, + int _tensor_parallelism_degree, char const *name) : Op(model, OP_SIGMOID_SILU_MULTI, @@ -128,7 +152,9 @@ SigmoidSiluMulti::SigmoidSiluMulti(FFModel &model, 0 /*weights*/, 1 /*outputs*/, _input1, - 
_input2) { + _input2), + intermediate_size(_intermediate_size), + tensor_parallelism_degree(_tensor_parallelism_degree) { // overwrite layer_guid layer_guid = _layer_guid; outputs[0] = model.create_parallel_tensor_legion_ordering(_input1->num_dims, @@ -242,8 +268,13 @@ OpMeta *SigmoidSiluMulti::init_task(Task const *task, .best_affinity_to(task->target_proc) .first(); MemoryAllocator gpu_mem_allocator(gpu_mem); - SigmoidSiluMultiMeta *meta = - new SigmoidSiluMultiMeta(handle, ssm, gpu_mem_allocator); + int intermediate_size = + ssm->intermediate_size / ssm->tensor_parallelism_degree; + SigmoidSiluMultiMeta *meta = new SigmoidSiluMultiMeta(handle, + ssm, + gpu_mem_allocator, + ssm->intermediate_size, + intermediate_size); meta->input_type[0] = ssm->inputs[0]->data_type; meta->input_type[1] = ssm->inputs[1]->data_type; meta->output_type[0] = ssm->outputs[0]->data_type; @@ -350,7 +381,9 @@ void SigmoidSiluMulti::inference_task( assert(input1_domain == input2_domain); assert(input1_domain == output_domain); - SigmoidSiluMulti::inference_kernel_wrapper(m, input1, input2, output); + // use active number of tokens + SigmoidSiluMulti::inference_kernel_wrapper( + m, input1, input2, output, bc->num_active_tokens()); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -369,6 +402,8 @@ void SigmoidSiluMulti::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(this->intermediate_size); + sez.serialize(this->tensor_parallelism_degree); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -381,9 +416,12 @@ Node SigmoidSiluMulti::deserialize(FFModel &ff, int num_inputs) { assert(num_inputs == 2); size_t id, transformer_layer_id, deserialized_model_id; + int intermediate_size, tensor_parallelism_degree; dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); + dez.deserialize(intermediate_size); + dez.deserialize(tensor_parallelism_degree); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -392,6 +430,8 @@ Node SigmoidSiluMulti::deserialize(FFModel &ff, SigmoidSiluMultiParams params; params.layer_guid = layer_guid; + params.intermediate_size = intermediate_size; + params.tensor_parallelism_degree = tensor_parallelism_degree; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); @@ -406,6 +446,8 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.layer_guid.transformer_layer_id); hash_combine(key, params.layer_guid.model_id); + hash_combine(key, params.intermediate_size); + hash_combine(key, params.tensor_parallelism_degree); return key; } }; // namespace std diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 590b641b5..962777ff3 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -21,10 +21,14 @@ namespace FlexFlow { SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, SigmoidSiluMulti const *ssm, - MemoryAllocator &gpu_mem_allocator) + MemoryAllocator &gpu_mem_allocator, + int _global_intermediate_size, + int _intermediate_size) : OpMeta(handle) { profiling = ssm->profiling; inference_debugging = ssm->inference_debugging; + global_intermediate_size = _global_intermediate_size; + intermediate_size = _intermediate_size; } 
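The SigmoidSiluMulti changes above thread an intermediate_size and tensor_parallelism_degree through the layer, its params, and the serializer, and init_task hands the meta both the global width and the per-shard slice (intermediate_size / tensor_parallelism_degree); the kernel wrapper that follows then sizes its launch by the number of active tokens times that per-shard width rather than by the full tensor volume. A small standalone sketch of that arithmetic, for illustration only (the helper names are invented here, and it assumes the global width divides evenly across shards):

#include <cassert>
#include <cstdio>

// Each tensor-parallel shard owns an equal slice of the FFN intermediate width.
int shard_intermediate_size(int global_intermediate_size, int tp_degree) {
  assert(tp_degree > 0 && global_intermediate_size % tp_degree == 0);
  return global_intermediate_size / tp_degree;
}

// Kernel work scales with the tokens that are actually active in the batch,
// not with the padded tensor volume.
int active_elements(int num_active_tokens, int per_shard_intermediate_size) {
  return num_active_tokens * per_shard_intermediate_size;
}

int main() {
  int global = 11008; // e.g. a LLaMA-7B-sized FFN width (illustrative)
  int tp_degree = 4;
  int local = shard_intermediate_size(global, tp_degree); // 2752 per shard
  std::printf("elements touched: %d\n", active_elements(7, local));
  return 0;
}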
SigmoidSiluMultiMeta::~SigmoidSiluMultiMeta(void) { @@ -50,13 +54,18 @@ void SigmoidSiluMulti::inference_kernel_wrapper( SigmoidSiluMultiMeta const *m, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, - GenericTensorAccessorW const &output) { + GenericTensorAccessorW const &output, + int token_size) { + if (token_size == 0) { + return; + } cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - int num_elements = input1.domain.get_volume(); - assert(input2.domain.get_volume() == num_elements); - assert(output.domain.get_volume() == num_elements); + assert(input2.domain.get_volume() == input1.domain.get_volume()); + assert(output.domain.get_volume() == input1.domain.get_volume()); + + int num_elements = token_size * m->intermediate_size; cudaEvent_t t_start, t_end; if (m->profiling) { @@ -68,7 +77,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( SigmoidSiluMultiKernel<<>>(input1.domain.get_volume(), + stream>>>(num_elements, input1.get_float_ptr(), input2.get_float_ptr(), output.get_float_ptr()); @@ -76,7 +85,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( SigmoidSiluMultiKernel<<>>(input1.domain.get_volume(), + stream>>>(num_elements, input1.get_half_ptr(), input2.get_half_ptr(), output.get_half_ptr()); diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 03618423b..4c94f3e5a 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -92,7 +92,7 @@ SoftmaxParams Softmax::get_params() const { return params; } -Tensor FFModel::softmax(const Tensor _input, +Tensor FFModel::softmax(Tensor const _input, int dim, DataType data_type, char const *name) { @@ -135,7 +135,7 @@ Op *Softmax::create_operator_from_layer( Softmax::Softmax(FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int _dim, char const *name) : Op(model, @@ -160,7 +160,7 @@ Softmax::Softmax(FFModel &model, Softmax::Softmax(FFModel &model, SoftmaxParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) : Softmax(model, params.layer_guid, input, params.dim, params.name) {} diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 9c6ed0e0b..421780dd4 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -52,24 +52,25 @@ bool SpecIncMultiHeadSelfAttentionParams::is_valid( return is_valid; } -Tensor - FFModel::spec_inc_multihead_self_attention(Tensor const input, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { +Tensor FFModel::spec_inc_multihead_self_attention( + Tensor const input, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool qkv_bias, + bool final_bias, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + RotaryEmbeddingMeta rotary_embedding_meta, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + bool streaming_cache, + char const *name) { return spec_inc_multiquery_self_attention(input, embed_dim, num_heads, @@ -82,33 +83,35 @@ Tensor add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, position_bias, + 
streaming_cache, name); } -Tensor - FFModel::spec_inc_multiquery_self_attention(Tensor const input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { +Tensor FFModel::spec_inc_multiquery_self_attention( + Tensor const input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim, + int vdim, + float dropout, + bool qkv_bias, + bool final_bias, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + RotaryEmbeddingMeta rotary_embedding_meta, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + bool streaming_cache, + char const *name) { if (data_type == DT_NONE) { data_type = input->data_type; } @@ -145,13 +148,12 @@ Tensor numdims, dims, data_type, li, 0, true /*create_grad*/); } // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); + int qk_dim = kdim, v_dim = kdim, o_dim = embed_dim; + int hidden_size = input->dims[0]; + int qParas = qk_dim * hidden_size; + int kParas = qk_dim * hidden_size; + int vParas = v_dim * hidden_size; + int oParas = o_dim * (v_dim > 0 ? v_dim : hidden_size); int weight_size = qParas * num_q_heads + kParas * num_q_heads + vParas * num_q_heads + oParas * num_q_heads; { @@ -166,10 +168,8 @@ Tensor } if (qkv_bias || final_bias) { // q, k, v, o - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + - (final_bias ? oProjSize : 0)}; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; + int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
o_dim : 0)}; li->weights[1] = create_weight_legion_ordering(1, dims, data_type, @@ -188,11 +188,24 @@ Tensor li->add_int_property("final_bias", final_bias); li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); - li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + li->add_int_property("apply_rotary_embedding", + rotary_embedding_meta.apply_rotary_embedding); + li->add_float_property("rope_theta", rotary_embedding_meta.rope_theta); + li->add_string_property("rope_type", rotary_embedding_meta.rope_type); + li->add_float_property("factor", rotary_embedding_meta.factor); + li->add_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + li->add_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + li->add_int_property("original_max_position_embeddings", + rotary_embedding_meta.original_max_position_embeddings); li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("qk_prod_scaling", qk_prod_scaling); li->add_int_property("position_bias", position_bias); + li->add_int_property("streaming_cache", streaming_cache); + li->add_int_property("tensor_parallelism_degree", + config.tensor_parallelism_degree); layers.push_back(li); return li->outputs[0]; } @@ -222,8 +235,18 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( bool final_bias = (bool)value; layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; + RotaryEmbeddingMeta rotary_embedding_meta; layer->get_int_property("apply_rotary_embedding", value); - bool apply_rotary_embedding = (bool)value; + rotary_embedding_meta.apply_rotary_embedding = (bool)value; + layer->get_float_property("rope_theta", rotary_embedding_meta.rope_theta); + layer->get_string_property("rope_type", rotary_embedding_meta.rope_type); + layer->get_float_property("factor", rotary_embedding_meta.factor); + layer->get_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + layer->get_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + layer->get_int_property("original_max_position_embeddings", value); + rotary_embedding_meta.original_max_position_embeddings = (int)value; layer->get_int_property("scaling_query", value); bool scaling_query = (bool)value; float scaling_factor; @@ -232,6 +255,10 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( bool qk_prod_scaling = (bool)value; layer->get_int_property("position_bias", value); bool position_bias = (bool)value; + layer->get_int_property("streaming_cache", value); + bool streaming_cache = (bool)value; + layer->get_int_property("tensor_parallelism_degree", value); + int tensor_parallelism_degree = (int)value; return new SpecIncMultiHeadSelfAttention(model, layer->layer_guid, @@ -245,12 +272,14 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( qkv_bias, final_bias, add_zero_attn, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, position_bias, false /*allocate_weights*/, + streaming_cache, + tensor_parallelism_degree, layer->name); } @@ -267,12 +296,14 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, bool allocate_weights, + bool 
_streaming_cache, + int _tensor_parallelism_degree, char const *name) // Initializer* _bias_initializer) : Op(model, @@ -286,13 +317,13 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), - scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) { + rotary_embedding_meta(_rotary_embedding_meta), + hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim), + o_dim(_embed_dim), qoSeqLength(_input->dims[1].size), + kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), + scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), + position_bias(_position_bias), streaming_cache(_streaming_cache), + tensor_parallelism_degree(_tensor_parallelism_degree) { // overwrite layer_guid layer_guid = _layer_guid; @@ -309,11 +340,11 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( // Create weight tensor int num_dims = inputs[0]->num_dims; // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; + int qParas = this->qk_dim * this->hidden_size; + int kParas = this->qk_dim * this->hidden_size; + int vParas = this->v_dim * this->hidden_size; int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); + this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size); ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; @@ -331,10 +362,9 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
o_dim : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -369,12 +399,14 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, bool allocate_weights, + bool _streaming_cache, + int _tensor_parallelism_degree, char const *name) // Initializer* _bias_initializer) : Op(model, @@ -389,13 +421,13 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), - scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) + rotary_embedding_meta(_rotary_embedding_meta), + hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim), + o_dim(_embed_dim), qoSeqLength(_input->dims[1].size), + kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), + scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), + position_bias(_position_bias), streaming_cache(_streaming_cache), + tensor_parallelism_degree(_tensor_parallelism_degree) // bias_initializer(_bias_initializer) { numOutputs = 1; @@ -411,11 +443,11 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( // Create weight tensor int num_dims = inputs[0]->num_dims; // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; + int qParas = this->qk_dim * this->hidden_size; + int kParas = this->qk_dim * this->hidden_size; + int vParas = this->v_dim * this->hidden_size; int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); + this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size); ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; @@ -434,10 +466,9 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
o_dim : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -470,21 +501,23 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( : SpecIncMultiHeadSelfAttention(model, other.layer_guid, input, - other.oProjSize, + other.o_dim, other.num_q_heads, other.num_kv_heads, - other.qProjSize, - other.vProjSize, + other.qk_dim, + other.v_dim, other.dropout, other.qkv_bias, other.final_bias, other.add_zero_attn, - other.apply_rotary_embedding, + other.rotary_embedding_meta, other.scaling_query, other.scaling_factor, other.qk_prod_scaling, other.position_bias, allocate_weights, + other.streaming_cache, + other.tensor_parallelism_degree, other.name) {} SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( @@ -505,12 +538,14 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( params.qkv_bias, params.final_bias, params.add_zero_attn, - params.apply_rotary_embedding, + params.rotary_embedding_meta, params.scaling_query, params.scaling_factor, params.qk_prod_scaling, params.position_bias, allocate_weights, + params.streaming_cache, + params.tensor_parallelism_degree, params.name) {} void SpecIncMultiHeadSelfAttention::init_inference( @@ -636,9 +671,11 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); - int num_q_heads = attn->num_q_heads; - int num_kv_heads = attn->num_kv_heads; - assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); + int num_q_heads = attn->num_q_heads / attn->tensor_parallelism_degree; + int num_kv_heads = + attn->num_kv_heads / attn->tensor_parallelism_degree + + (attn->num_kv_heads % attn->tensor_parallelism_degree != 0); + assert(attn->o_dim == output.domain.hi()[0] - output.domain.lo()[0] + 1); Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::GPU_FB_MEM) @@ -736,9 +773,9 @@ void SpecIncMultiHeadSelfAttention::inference_task( Runtime *runtime) { assert(task->regions.size() == regions.size()); - BeamSearchBatchConfig const &bc = - Future(task->futures[0]).get_result(); - if (bc.num_tokens == 0) { + // BatchConfig const &bc = Future(task->futures[0]).get_result(); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { return; } @@ -778,7 +815,7 @@ void SpecIncMultiHeadSelfAttention::inference_task( assert(task->index_point.get_dim() == 1); SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, &bc, task->index_point.point_data[0], input, weight, output, biases); + m, bc, task->index_point.point_data[0], input, weight, output, biases); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -788,7 +825,7 @@ void SpecIncMultiHeadSelfAttention::inference_task( weights_accessors.push_back(biases); } SpecIncMultiHeadSelfAttention::save_inference_tensors_to_file( - m, shard_id, &bc, {input}, weights_accessors, {output}); + m, shard_id, bc, {input}, weights_accessors, {output}); } } @@ -828,31 +865,46 @@ bool operator==(SpecIncMultiHeadSelfAttentionParams const &lhs, lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && lhs.add_zero_attn == rhs.add_zero_attn && - lhs.apply_rotary_embedding == 
rhs.apply_rotary_embedding && + lhs.rotary_embedding_meta.apply_rotary_embedding == + rhs.rotary_embedding_meta.apply_rotary_embedding && + lhs.rotary_embedding_meta.rope_theta == + rhs.rotary_embedding_meta.rope_theta && + lhs.rotary_embedding_meta.rope_type == + rhs.rotary_embedding_meta.rope_type && + lhs.rotary_embedding_meta.factor == rhs.rotary_embedding_meta.factor && + lhs.rotary_embedding_meta.low_freq_factor == + rhs.rotary_embedding_meta.low_freq_factor && + lhs.rotary_embedding_meta.high_freq_factor == + rhs.rotary_embedding_meta.high_freq_factor && + lhs.rotary_embedding_meta.original_max_position_embeddings == + rhs.rotary_embedding_meta.original_max_position_embeddings && lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && lhs.qk_prod_scaling == rhs.qk_prod_scaling && - lhs.position_bias == rhs.position_bias; + lhs.position_bias == rhs.position_bias && + lhs.streaming_cache == rhs.streaming_cache; } SpecIncMultiHeadSelfAttentionParams SpecIncMultiHeadSelfAttention::get_params() const { SpecIncMultiHeadSelfAttentionParams params; params.layer_guid = this->layer_guid; - params.embed_dim = this->oProjSize; + params.embed_dim = this->o_dim; params.num_q_heads = this->num_q_heads; params.num_kv_heads = this->num_kv_heads; - params.kdim = this->kProjSize; - params.vdim = this->vProjSize; + params.kdim = this->qk_dim; + params.vdim = this->v_dim; params.dropout = this->dropout; params.qkv_bias = this->qkv_bias; params.final_bias = this->final_bias; params.add_zero_attn = this->add_zero_attn; - params.apply_rotary_embedding = this->apply_rotary_embedding; + params.rotary_embedding_meta = this->rotary_embedding_meta; params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; + params.streaming_cache = this->streaming_cache; + params.tensor_parallelism_degree = this->tensor_parallelism_degree; if (this->name != nullptr) { strcpy(params.name, this->name); } @@ -876,11 +928,20 @@ size_t hash::operator()( hash_combine(key, params.qkv_bias); hash_combine(key, params.final_bias); hash_combine(key, params.add_zero_attn); - hash_combine(key, params.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.rope_theta); + hash_combine(key, params.rotary_embedding_meta.rope_type); + hash_combine(key, params.rotary_embedding_meta.factor); + hash_combine(key, params.rotary_embedding_meta.low_freq_factor); + hash_combine(key, params.rotary_embedding_meta.high_freq_factor); + hash_combine(key, + params.rotary_embedding_meta.original_max_position_embeddings); hash_combine(key, params.scaling_query); hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); hash_combine(key, params.position_bias); + hash_combine(key, params.streaming_cache); + hash_combine(key, params.tensor_parallelism_degree); return key; } }; // namespace std diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index b1687d12a..92bcbc546 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -36,10 +36,11 @@ __global__ void spec_store_kv_cache( DT const *devQKVProjArray, DT *kCache_ptr, DT *vCache_ptr, + /* Reserved: BatchConfig Updated, leave HIP code to be updated */ BatchConfig::PerTokenInfo *tokenInfos, BatchConfig::PerRequestInfo *requestInfo, - 
BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, - BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, + TreeSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, + TreeSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, int qProjSize, int kProjSize, int vProjSize, @@ -67,7 +68,7 @@ __global__ void spec_store_kv_cache( // int const beam_width = id_map[token_idx].beam_width; int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int const tok_id = tokenInfos[token_idx].abs_index_in_request; int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; int const parent_id = beamRequestInfos[req_id].parent_id[sub_req_id]; int const beam_depth = beamRequestInfos[req_id].current_depth; @@ -139,14 +140,14 @@ __global__ void spec_store_kv_cache( template void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, + TreeSearchBatchConfig const *bc, hipStream_t stream) { int num_tokens = bc->num_active_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; // printf("curr depth: %d\n", curr_depth); // assert(curr_depth < 3); if (num_tokens > 0) { - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; + int parallelism = m->local_hidden_size * KV_WEIGHT_NUM * num_tokens; hipLaunchKernelGGL(HIP_KERNEL_NAME(spec_store_kv_cache
), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), @@ -164,9 +165,9 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_tokens, BatchConfig::max_sequence_length(), - BeamSearchBatchConfig::MAX_BEAM_WIDTH, + TreeSearchBatchConfig::MAX_BEAM_WIDTH, /*root*/ curr_depth == 0, - m->hidden_size); + m->local_hidden_size); } } @@ -189,7 +190,7 @@ __global__ void spec_fill_entries_above_diagonal(DT *matrix, template void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, + TreeSearchBatchConfig const *bc, int shard_id, DT *output_ptr, DT const *bias_ptr, @@ -223,7 +224,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (bc->request_available[i]) { continue; } for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) { @@ -232,7 +233,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // int total_tokens = bc->token_last_available_idx[i] + 1; int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + int total_tokens = bc->requestsInfo[i].first_token_index_in_request + bc->requestsInfo[i].num_tokens_in_batch; // Compute (QK^T/sqrt(d_k)) int m_ = num_new_tokens; @@ -458,7 +459,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, template void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, + TreeSearchBatchConfig const *bc, int shard_id, DT const *input_ptr, DT const *weight_ptr, @@ -466,7 +467,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, DT const *bias_ptr, hipStream_t stream) { // here because we need postion info in infernece 1 - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + int max_tokens_per_batch = bc->max_tokens_per_ssm_batch(); checkCUDA( hipMemcpyAsync(m->token_infos, &(bc->tokensInfo), @@ -483,25 +484,25 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipMemcpyAsync(m->beam_token_infos, &(bc->beamTokenInfo), max_tokens_per_batch * bc->MAX_BEAM_WIDTH * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo), + sizeof(TreeSearchBatchConfig::BeamSearchPerTokenInfo), hipMemcpyHostToDevice, stream)); checkCUDA(hipMemcpyAsync( m->beam_request_infos, &(bc->beamRequestsInfo), bc->max_requests_per_batch() * - sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo), + sizeof(TreeSearchBatchConfig::BeamSearchPerRequestInfo), hipMemcpyHostToDevice, stream)); // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
<DT *>(m->devQKVProjArray), - bias_ptr, - stream); + compute_qkv(m, + bc, + shard_id, + input_ptr, + weight_ptr, + static_cast<DT *>
(m->devQKVProjArray), + bias_ptr, + stream); // phase 2: Update key/val cache update_kv_cache_kernel<DT>
(m, bc, stream); @@ -517,7 +518,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, /*static*/ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, + TreeSearchBatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, @@ -586,7 +587,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( int _num_q_heads, int _num_kv_heads) : IncMultiHeadSelfAttentionMeta(handler, - BEAM_SEARCH_MODE, + TREE_SEARCH_MODE, attn, attn->qSize, attn->kSize, @@ -595,7 +596,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, + attn->rotary_embedding_meta, attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, @@ -617,27 +618,29 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + int max_tokens_per_batch = BatchConfig::max_tokens_per_ssm_batch(); size_t beam_tokeninfo_size = - max_tokens_per_batch * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - size_t requestinfo_size = BeamSearchBatchConfig::max_requests_per_batch(); + max_tokens_per_batch * TreeSearchBatchConfig::MAX_BEAM_WIDTH; + size_t requestinfo_size = TreeSearchBatchConfig::max_requests_per_batch(); size_t beam_requestinfo_size = - BeamSearchBatchConfig::max_requests_per_batch(); + TreeSearchBatchConfig::max_requests_per_batch(); size_t total_size = requestinfo_size * sizeof(BatchConfig::PerRequestInfo) + beam_tokeninfo_size * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + + sizeof(TreeSearchBatchConfig::BeamSearchPerTokenInfo) + beam_requestinfo_size * - sizeof(BeamSearchBatchConfig:: + sizeof(TreeSearchBatchConfig:: BeamSearchPerRequestInfo); // more components will // be added here later // We always directly allocate memory for small speculative models - gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, - total_size); + gpu_mem_allocator.create_legion_instance( + beam_search_reserve_inst, + total_size, + "SpecIncMultiHeadSelfAttentionMeta"); beam_token_infos = gpu_mem_allocator - .allocate_instance( + .allocate_instance( beam_tokeninfo_size); // offset += beam_tokeninfo_size * // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); @@ -647,7 +650,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // offset += requestinfo_size * sizeof(BatchConfig::PerRequestInfo); beam_request_infos = gpu_mem_allocator - .allocate_instance( + .allocate_instance( beam_requestinfo_size); // offset += beam_requestinfo_size * // sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index a00ea9c95..6d7bf1364 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -15,15 +15,17 @@ #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" #endif +#include "flashinfer/prefill_attention_decl.cuh" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/utils/cuda_helper.h" -namespace FlexFlow { +#include +#include -#define 
WARP_SIZE 32 +namespace FlexFlow { // declare Legion names using Legion::coord_t; @@ -33,711 +35,267 @@ using namespace Kernels::IncMultiHeadAttention; namespace Kernels { namespace SpecIncMultiHeadSelfAttention { -template -__global__ void compute_spec_inc_attention_kernel_generation_kernel( - DT const *query, - DT const *key_cache, - DT const *value_cache, - DT *output_ptr, - float const scale, - int const max_seq_length, - int per_head_size, - int hidden_size, - BatchConfig::PerRequestInfo *request_infos, - BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, - BatchConfig::BitMask *causalMask, - bool *request_completed) { - - // q, k - using Q_vec = typename VEC_K::Type; - using K_vec = typename VEC_K::Type; - using V_vec = typename VEC_V
::Type; - using Out_sum = typename Vec_fp32_::Type; - - constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; - - constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); - constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; - constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; - // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); - - // thread id - int const tidx = threadIdx.x; - // head id - int const head_idx = blockIdx.x; - // nth request idx - int const request_idx = blockIdx.y; - - // request id in batch config - int const batch_config_request_id = - request_infos[request_idx].batch_config_request_id; - - // request_idx = re - - BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; - - int const first_step = 0; - - // int const tlength = - // request_infos[batch_config_request_id].first_token_depth_in_request + - // request_infos[batch_config_request_id].num_tokens_in_batch; - - int const totalCacheSize = - bitmask.non_tree_cache_size + bitmask.tree_size + bitmask.prompt_size - 1; - - int first_token_idx = 0; - for (int r = 0; r < batch_config_request_id; r++) { - first_token_idx += request_completed[r] ? 0 : causalMask[r].this_layer_size; - } - - int const tree_branch_num = - beam_request_infos[batch_config_request_id].sub_request_num; - - // shared memory objects - extern __shared__ char smem_[]; - - float *qk_smem = reinterpret_cast(smem_); - float *out_smem = reinterpret_cast(smem_); - - float qk_max = -FLT_MAX; - - // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum - __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - - const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + - head_idx * per_head_size; - __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; - - // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE - int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; - int ki_o = tidx % THREADS_PER_KEY; - // the first key's offset for this thread - // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... - int ko = tidx / THREADS_PER_KEY; - // load q tensor - Q_vec q_vec[K_VECS_PER_THREAD]; - - constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; - // The number of keys per warp. 
- constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; - - DT const *k_cache_batch = - key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; - - int ti_end = - div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; - - for (int qi = 0; qi < tree_branch_num; qi += 1) { -#pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - q_vecs[ki_o][ii] = *reinterpret_cast( - q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + - ii * THREADS_PER_KEY * K_VEC_SIZE); - } - - int const query_token = - bitmask.prompt_size + bitmask.tree_size - 1 - tree_branch_num + qi; - - __syncthreads(); - for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { - K_vec k[K_VECS_PER_THREAD]; - int const ti_circ = ti % max_seq_length; - - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; - if (ti < totalCacheSize) { - - k[ii] = *reinterpret_cast( - k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + - jj); - } - } - float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); - - if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) { - // todo add alobi here - // bool const mask = ti_circ >= totalCacheSize; - bool const mask = (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << query_token)))); - - // if (head_idx == 0 && ti == 0 && request_idx == 15 && !mask) { - // printf("spec inc attn qkqkqk request id %d, %.10f, %d\n", - // batch_config_request_id, - // ti, - // qk, - // qi); - // } - qk_max = mask ? qk_max : fmaxf(qk_max, qk); - qk_smem[ti - first_step] = mask ? 0.f : qk; - } - } - - __syncthreads(); - -#pragma unroll - for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Decompose the thread index into warp and lane. - int const warp = tidx / WARP_SIZE; - int const lane = tidx % WARP_SIZE; - - // The warp leader writes the max to shared memory. - if (lane == 0) { - red_smem[warp] = qk_max; - } - - // Make sure the products are in shared memory. - __syncthreads(); - - // The warps finalize the reduction. - qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; -#pragma unroll - for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Broadcast to all the threads in the warp. - qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("spec inc attn first token qk_max %.10f\n", qk_max); - // } - - float exp_sum = 0.f; - for (int ti = first_step + tidx; ti < totalCacheSize; - ti += THREADS_PER_BLOCK) { - bool const mask = (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << query_token)))); - float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); - exp_sum += logit; - qk_smem[ti - first_step] = mask ? 0.0f : logit; - } - - // Compute the sum. - exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); - - // softmax - float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); - for (int ti = first_step + tidx; ti < totalCacheSize; - ti += THREADS_PER_BLOCK) { - qk_smem[ti - first_step] *= inv_sum; - } - - __syncthreads(); - - // value projection - constexpr int V_VEC_SIZE = 16 / sizeof(DT); - // A vector of V elements for the current timestep. 
- // using V_vec_k = typename V_vec_k_::Type; - // using V_vec_acum = typename V_vec_acum_fp32_::Type; - - // The value computed by this thread. - int vo = tidx / THREADS_PER_VALUE; - // The hidden dimensions computed by this particular thread. - int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; - constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; - - Out_sum out; - zero(out); - - // The base pointer for the value in the cache buffer. - DT const *v_cache_batch = - value_cache + batch_config_request_id * max_seq_length * hidden_size + - vi; - - if (Dh == Dh_MAX || vi < Dh) { - for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { - // Load the values from the cache. - int const ti_circ = ti % max_seq_length; - V_vec v = *reinterpret_cast( - v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); - - bool const mask = (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << query_token)))); - float logit = mask ? 0.0f : qk_smem[ti - first_step]; - out = FlexFlow::fma(logit, cast_to_float(v), out); - } - } - - // // Make sure we can start writing to shared memory. - __syncthreads(); - - // Run the final reduction amongst the different groups computing different - // partial outputs. - if (Dh == Dh_MAX || vi < Dh) { -#pragma unroll - for (int active_groups = V_PER_ITER; active_groups >= 2; - active_groups /= 2) { - - // The midpoint in the number of active groups. - int midpoint = active_groups / 2; - - // The upper part of active threads store to shared memory. - if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { - *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = - out; - } - __syncthreads(); - - // The bottom warps update their values. - if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { - out = add(*reinterpret_cast(out_smem + vo * Dh + vi), - out); - } - __syncthreads(); - } - } - - // Output the final values. 
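For reference, the generation kernel being removed in this hunk builds a numerically stable softmax by hand: each thread keeps a running qk_max, warps reduce it with XOR (butterfly) shuffles, warp leaders combine their partial maxima through shared memory, and the logits are then exponentiated against that max and normalized by the block-wide sum before the per-thread value vectors are accumulated and reduced into the final output written just below. A stripped-down CUDA example of only the warp-level max step, standalone and illustrative (the kernel name and launch shape are not from FlexFlow):

#include <cstdio>
#include <cuda_runtime.h>

// XOR (butterfly) shuffle reduction: after the loop every lane in the warp
// holds the maximum of all 32 inputs -- the pattern the removed kernel uses
// to reduce qk_max before computing the softmax.
__global__ void warp_max_kernel(float const *in, float *out) {
  float v = in[threadIdx.x];
#pragma unroll
  for (int mask = 16; mask >= 1; mask /= 2) {
    v = fmaxf(v, __shfl_xor_sync(0xffffffffu, v, mask));
  }
  if (threadIdx.x == 0) {
    *out = v;
  }
}

int main() {
  float h_in[32], h_out = 0.f;
  float *d_in = nullptr, *d_out = nullptr;
  for (int i = 0; i < 32; ++i) {
    h_in[i] = static_cast<float>((i * 7) % 32); // a permutation of 0..31
  }
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  warp_max_kernel<<<1, 32>>>(d_in, d_out);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("warp max = %.1f\n", h_out); // expect 31.0
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}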
- if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { - convert_from_float(*reinterpret_cast( - output_ptr + (first_token_idx + qi) * hidden_size + - head_idx * per_head_size + vi), - out); - } - } -} +using flashinfer::BatchPrefillHandler; +using flashinfer::BatchPrefillWithPagedKVCacheWrapperDispatched; +using flashinfer::LogitsPostHook; +using flashinfer::MaskMode; +using flashinfer::paged_kv_t; +using flashinfer::PageStorage; +using flashinfer::PosEncodingMode; +using flashinfer::QKVLayout; template -__global__ void spec_inc_store_kv_cache( - DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - BatchConfig::PerTokenInfo *tokenInfos, - BatchConfig::PerRequestInfo *requestInfo, - BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, - BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, - BatchConfig::BitMask *causalMask, - int qProjSize, - int kProjSize, - int vProjSize, - int num_tokens, - int max_seq_len, - bool is_root, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / (hidden_size); - int offset = i % hidden_size; - - size_t val_idx = - token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; - - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; - - int const req_id = tokenInfos[token_idx].request_index; - // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - - int const request_token_offset = - requestInfo[req_id].first_token_offset_in_batch; - - BatchConfig::BitMask bitmask = causalMask[req_id]; - - // if prompt token -> token id - // if tree token: - - int const cache_idx = bitmask.prompt_size + bitmask.non_tree_cache_size + - bitmask.tree_size - 1 - bitmask.this_layer_size + - token_idx - request_token_offset; - - kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + - offset] = vVal; - } -} - -template -void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); - int curr_depth = bc->beamRequestsInfo[0].current_depth; - if (num_tokens > 0) { - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; - spec_inc_store_kv_cache<<>>( - static_cast
<DT *>(m->devQKVProjArray), - static_cast<DT *>(m->keyCache), - static_cast<DT *>
(m->valueCache), - m->token_infos, - m->request_infos, - m->beam_token_infos, - m->beam_request_infos, - m->causalMask, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens, - BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(), - /*root*/ curr_depth == 0, - m->hidden_size); - } -} - -#define LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( \ - DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ - smem_sz = smem_size_in_bytes
(m->qProjSize, \ - BatchConfig::max_sequence_length() + \ - BatchConfig::max_spec_tree_token_num(), \ - THREADS_PER_VALUE, \ - THDS_PER_BLOCK); \ - compute_spec_inc_attention_kernel_generation_kernel \ - <<>>( \ - static_cast
<DT *>(m->devQKVProjArray), \ - static_cast<DT *>(m->keyCache), \ - static_cast<DT *>
(m->valueCache), \ - output_ptr, \ - scale, \ - BatchConfig::max_sequence_length() + \ - BatchConfig::max_spec_tree_token_num(), \ - m->qProjSize, \ - m->hidden_size, \ - m->request_infos, \ - m->beam_request_infos, \ - m->causalMask, \ - m->request_completed) - -template -void compute_spec_inc_attention_kernel_generation( - SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - DT *output_ptr, - cudaStream_t stream) { - // one block == one head per request - // how many generation requests - dim3 grid(m->num_q_heads, bc->get_speculative_request_num()); - int const per_head_size = m->qProjSize; - float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; - size_t smem_sz; - if (per_head_size == 64) { - constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; - LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( - DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); - } else if (per_head_size == 128) { - constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; - LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( - DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); +void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + // int device; + // checkCUDA(cudaGetDevice(&device)); + // cudaEvent_t t_start, t_end; + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + // global constant parameters + uint32_t const num_q_heads = m->num_q_heads; + uint32_t const num_kv_heads = m->num_kv_heads; + uint32_t const head_dim = m->qk_dim; + uint32_t const batch_size = bc->num_active_requests(); + float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->qk_dim) : 1.0f; + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "Update custom mask time: " << elapsed << " ms\n"; + // } + + half *q = static_cast(m->queryTmp), + *kv = static_cast(m->kvCache), + *o = static_cast(m->outputTmp); + paged_kv_t paged_kv( + num_kv_heads, + kPagesize, + head_dim, + batch_size, + QKVLayout::kNHD, + kv, + m->handle.tree_search_attention_metadata->kv_indices, + m->handle.tree_search_attention_metadata->kv_indptr, + m->handle.tree_search_attention_metadata->kv_last_page_len); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" attn prep time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + BatchPrefillHandler *handler = nullptr; + + if (!bc->prompt_phase) { + assert(m->handle.tree_search_attention_metadata->decode_handler_collections + .count(batch_size) != 0 && + "Handler is not initialized"); + handler = static_cast( + m->handle.tree_search_attention_metadata + ->decode_handler_collections[batch_size]); } else { - assert(false && "a unsupported head size"); - } -} - -template -__global__ void spec_fill_entries_above_diagonal(DT *matrix, - size_t new_tokens, - size_t total_tokens_in_request, - size_t num_q_heads, - DT value) { - CUDA_KERNEL_LOOP(i, 
new_tokens * total_tokens_in_request * num_q_heads) { - // size_t head_idx = i / (new_tokens * total_tokens_in_request); - size_t src_idx = (i / new_tokens) % total_tokens_in_request; - size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens; - // Casual Mask - if (src_idx > dst_idx) { - matrix[i] = value; - } - } -} - -template -void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + assert(m->handle.tree_search_attention_metadata->prompt_handler_collections + .count(batch_size) != 0 && + "Handler is not initialized"); + handler = static_cast( + m->handle.tree_search_attention_metadata + ->prompt_handler_collections[batch_size]); } -#endif - // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); - int tokens_previous_requests = 0; - int tokens_prev_requests_squares = 0; - // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; - int q_block_size = m->qProjSize; - - int kt_block_size = m->kProjSize; - int kt_req_block_size = kt_block_size * m->num_q_heads * - (BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num()); - int vt_block_size = m->vProjSize; - int vt_req_block_size = vt_block_size * m->num_q_heads * - (BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num()); - assert(m->qProjSize == m->kProjSize); - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase) || - (bc->requestsInfo[i].num_tokens_in_batch == 0)) { - continue; - } else if (tokens_previous_requests < bc->num_generation_tokens) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; - continue; - } - - // all requests in prompt phase should only have one sub requests; - assert(bc->sub_requests[i] == 1); - // int num_new_tokens = bc->num_processing_tokens[i]; - // int total_tokens = bc->token_last_available_idx[i] + 1; - - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - bc->requestsInfo[i].num_tokens_in_batch; - if (num_new_tokens <= 0) { - continue; + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" BeginForward time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + DISPATCH_HEADDIM(head_dim, HEAD_DIM, { + cudaError_t result; + if 
(bc->prompt_phase) { + result = + BatchPrefillWithPagedKVCacheWrapperDispatched( + handler, + q, + m->handle.tree_search_attention_metadata->q_indptr, + /*q_offset=*/nullptr, + paged_kv, + /*custom_mask=*/nullptr, + /*qk_indptr=*/nullptr, + o, + /*lse=*/nullptr, + num_q_heads, + /*window_left=*/-1, + /*logits_soft_cap=*/0.f, + sm_scale, + /*rope_scale=*/1.f, + /*rope_theta=*/static_cast(1e4), + stream); + } else { + result = + BatchPrefillWithPagedKVCacheWrapperDispatched( + handler, + q, + m->handle.tree_search_attention_metadata->q_indptr, + /*q_offset=*/nullptr, + paged_kv, + m->handle.tree_search_attention_metadata->custom_mask, + m->handle.tree_search_attention_metadata->qk_indptr, + o, + /*lse=*/nullptr, + num_q_heads, + /*window_left=*/-1, + /*logits_soft_cap=*/0.f, + sm_scale, + /*rope_scale=*/1.f, + /*rope_theta=*/static_cast(1e4), + stream); } - - // Compute (QK^T/sqrt(d_k)) - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - - // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); - } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
(m->devQKVProjArray) + - bc->requestsInfo[i].first_token_offset_in_batch * - m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - - // print_tensor((float*)A, 32, "A"); - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - - // if (i == 0 && sub_req_id == 0 && - // bc->beam_slots.at(0).current_depth == 1) { - // int offset = (float *)B - m->keyCache; - // printf("key cache offset %d\n", kt_req_block_size); - // } - // To get C, skip over QK^T products from previous requests - DT *C = static_cast
(m->qk_prods) + - m->num_q_heads * tokens_prev_requests_squares; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // print_tensor((float*)C, 32, "C"); - // add alibi position bias to qk production - // add alibi position bias to qk production - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. - assert(num_new_tokens <= total_tokens); - if (num_new_tokens > 1) { - size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; - spec_fill_entries_above_diagonal<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - static_cast
(-INFINITY)); + if (result != cudaSuccess) { + throw std::runtime_error("Failed to run " + "TreeSearchAttentionForwardKernel: " + + std::string(cudaGetErrorString(result))); } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax) + - m->num_q_heads * tokens_prev_requests_squares; - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = m->vProjSize; - n = num_new_tokens; - k = total_tokens; - lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - strideA = vt_block_size; - strideB = num_new_tokens * total_tokens; - strideC = m->vProjSize; - // To get A, skip over V^T entries from previous requests (all heads + - // padding) - A = static_cast
(m->valueCache) + i * vt_req_block_size; - // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - B = C_softmax; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - - int token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - - C = static_cast
(m->attn_heads) + - (token_offset)*m->num_q_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - tokens_previous_requests += num_new_tokens; - tokens_prev_requests_squares += num_new_tokens * total_tokens; - } - - if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { - bc->print(); - printf("tokens_previous_requests: %i\n", tokens_previous_requests); - printf("num_tokens: %i\n", num_tokens); - printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); - } - assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); + }); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" actual attn time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + produce_output(m, bc, output_ptr, stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" produce_output_kernel time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); } template -void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, +void inference_kernel(SpecIncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, int shard_id, DT const *input_ptr, DT const *weight_ptr, DT *output_ptr, DT const *bias_ptr, cudaStream_t stream) { - // phase 1: Implement kernel to compute KQV for input tokens - - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
<DT *>(m->devQKVProjArray),
-                     bias_ptr,
-                     stream);
-  // phase 2: Update key/val cache
-  update_kv_cache_kernel<DT>(m, bc, stream);
-  if (bc->num_generation_tokens > 0) {
-    compute_spec_inc_attention_kernel_generation<DT>(
-        m, bc, static_cast<DT *>(m->attn_heads), stream);
+  // phase 1: Compute QKV Projections of the batch
+  compute_qkv(m,
+              bc,
+              shard_id,
+              input_ptr,
+              weight_ptr,
+              static_cast<DT *>(m->devQKVProjArray),
+              bias_ptr,
+              stream);
+
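+  // Note: the streaming cache holds pre-RoPE keys/values, so it has to be
+  // maintained before position encoding is applied to the batch in phase 3.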
+  // phase 2: First maintain the streaming cache, because it needs the
+  // pre-pos-encoding values
+  if (m->streaming_cache) {
+    // Move the pre-pos-encoding cache to where the attention kernel reads it
+    update_kv_in_streaming_cache<DT>(m, bc, stream);
+    // Apply pos-encoding to those k values
+    apply_pos_encoding_to_streaming_proj<DT>(m, bc, stream);
+    // Commit to the streaming cache
+    if (bc->prompt_phase) {
+      commit_kv<DT>(m, bc, stream);
+    }
   }
-  // phase 3: Compute attention score
-  // 3 kernels for pahse 3: matmul1 - softmax - matmal2
-  if (bc->num_tokens > bc->num_generation_tokens) {
-    compute_attention_kernel_prompt(
-        m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream);
+
+  // phase 3: Take care of the batch
+  {
+    // Apply pos-encoding to the batch
+    apply_pos_encoding_to_tokens_in_batch(
+        m, bc, static_cast<DT *>(m->devQKVProjArray), stream);
+    // Move the batch qkv values to where the attention kernel reads them
+    update_qkv_in_batch<DT>(m, bc, stream);
   }
-  // compute output production and bias together for all tokens
-  int num_tokens = bc->num_active_tokens();
+  // phase 4: Attention computation
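+  // tree_search_attention picks the flashinfer BatchPrefillHandler that was
+  // planned for this batch size (prompt or decode collection), runs paged-KV
+  // prefill attention (using the tree custom mask outside the prompt phase),
+  // and writes the result into m->attn_heads via produce_output.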
+  tree_search_attention<DT>(m, bc, static_cast<DT *>
(m->attn_heads), stream); + + // Debug output: + // int size = m->local_hidden_size * BatchConfig::max_tokens_per_batch(); + // float *temp_output = new float[size]; + // cudaDeviceSynchronize(); + // cudaMemcpy( + // temp_output, m->attn_heads, size * sizeof(float), + // cudaMemcpyDeviceToHost); + + // printf("Output: "); + // for (int i = 0; i < bc->num_tokens; ++i) { + // float temp = 0; + // for (int j = 0; j < m->local_hidden_size; ++j) { + // temp += temp_output[i * m->local_hidden_size + j]; + // } + // printf("%.6f ", temp); + // } + // printf("\n"); + + // delete[] temp_output; + + // phase 5: Compute output production and bias together for all tokens + int num_tokens = bc->num_active_tokens(); compute_o_prod_bias( m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } @@ -747,8 +305,8 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, /*static*/ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( - SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, + SpecIncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, @@ -774,7 +332,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( if (input.data_type == DT_HALF) { half const *bias_ptr = use_bias ? bias.get_half_ptr() : static_cast(nullptr); - Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( m, bc, shard_id, @@ -786,7 +344,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( } else if (input.data_type == DT_FLOAT) { float const *bias_ptr = use_bias ? bias.get_float_ptr() : static_cast(nullptr); - Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( m, bc, shard_id, @@ -822,16 +380,13 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( int _num_q_heads, int _num_kv_heads) : IncMultiHeadSelfAttentionMeta(handler, - BEAM_SEARCH_MODE, + TREE_SEARCH_MODE, attn, - attn->qSize, - attn->kSize, - attn->vSize, - attn->qProjSize, - attn->kProjSize, - attn->vProjSize, - attn->oProjSize, - attn->apply_rotary_embedding, + attn->hidden_size, + attn->qk_dim, + attn->v_dim, + attn->o_dim, + attn->rotary_embedding_meta, attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, @@ -846,46 +401,30 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( _num_q_heads, _num_kv_heads, DT_NONE, - false) { + false, + attn->streaming_cache) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); - // allocate memory for the seqArray and reserve space - { - beam_token_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo)); - - beam_request_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo)); - causalMask = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo)); - - request_completed = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + 
sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask)); - } + // set attention constants + handler.tree_search_attention_metadata->set_enabled(true); + handler.tree_search_attention_metadata->set_num_q_heads(num_q_heads); + handler.tree_search_attention_metadata->set_num_kv_heads(num_kv_heads); + handler.tree_search_attention_metadata->set_head_dim(qk_dim); cudaStreamSynchronize(stream); } SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta(void) { - if (beam_search_reserve_inst != Realm::RegionInstance::NO_INST) { - beam_search_reserve_inst.destroy(); - } + // for (auto &decode_handler: decode_handler_collections) { + // delete static_cast(decode_handler.second); + // } + // for (auto &prompt_handler: prompt_handler_collections) { + // delete static_cast(prompt_handler.second); + // } } }; // namespace FlexFlow diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index d0efb01d5..a69bf61b1 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -46,7 +46,7 @@ using Legion::TaskArgument; using Legion::TaskLauncher; using PCG::Node; -LegionRuntime::Logger::Category log_tree_verify("TreeVerifyIncMHA"); +Legion::Logger log_tree_verify("TreeVerifyIncMHA"); bool TreeIncMultiHeadSelfAttentionParams::is_valid( ParallelTensorShape const &input) const { @@ -55,7 +55,7 @@ bool TreeIncMultiHeadSelfAttentionParams::is_valid( } Tensor FFModel::inc_multihead_self_attention_verify( - const Tensor input, + Tensor const input, int embed_dim, int num_heads, int kdim, @@ -66,7 +66,7 @@ Tensor FFModel::inc_multihead_self_attention_verify( bool add_zero_attn, DataType data_type, Initializer *kernel_initializer, - bool apply_rotary_embedding, + RotaryEmbeddingMeta rotary_embedding_meta, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -84,7 +84,7 @@ Tensor FFModel::inc_multihead_self_attention_verify( add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -93,7 +93,7 @@ Tensor FFModel::inc_multihead_self_attention_verify( } Tensor FFModel::inc_multiquery_self_attention_verify( - const Tensor input, + Tensor const input, int embed_dim, int num_q_heads, int num_kv_heads, @@ -105,7 +105,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify( bool add_zero_attn, DataType data_type, Initializer *kernel_initializer, - bool apply_rotary_embedding, + RotaryEmbeddingMeta rotary_embedding_meta, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -149,13 +149,12 @@ Tensor FFModel::inc_multiquery_self_attention_verify( numdims, dims, data_type, li, 0, true /*create_grad*/); } // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); + int qk_dim = kdim, v_dim = kdim, o_dim = embed_dim; + int hidden_size = input->dims[0]; + int qParas = qk_dim * hidden_size; + int kParas = qk_dim * hidden_size; + int vParas = v_dim * hidden_size; + int oParas = o_dim * (v_dim > 0 ? 
v_dim : hidden_size); int one_head_size = qParas + kParas + vParas + oParas; int weight_size = qParas * num_q_heads + kParas * num_q_heads + vParas * num_q_heads + oParas * num_q_heads; @@ -178,10 +177,8 @@ Tensor FFModel::inc_multiquery_self_attention_verify( } if (qkv_bias || final_bias) { // q, k, v, o - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + - (final_bias ? oProjSize : 0)}; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; + int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + (final_bias ? o_dim : 0)}; li->weights[1] = create_weight_legion_ordering(1, dims, data_type, @@ -200,10 +197,20 @@ Tensor FFModel::inc_multiquery_self_attention_verify( li->add_int_property("final_bias", final_bias); li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); - li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + li->add_int_property("apply_rotary_embedding", + rotary_embedding_meta.apply_rotary_embedding); + li->add_float_property("rope_theta", rotary_embedding_meta.rope_theta); + li->add_string_property("rope_type", rotary_embedding_meta.rope_type); + li->add_float_property("factor", rotary_embedding_meta.factor); + li->add_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + li->add_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + li->add_int_property("original_max_position_embeddings", + rotary_embedding_meta.original_max_position_embeddings); li->add_int_property("scaling_query", scaling_query); - li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("qk_prod_scaling", qk_prod_scaling); + li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("position_bias", position_bias); li->add_int_property("quantization_type", quantization_type); li->add_int_property("offload", offload); @@ -236,9 +243,18 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( bool final_bias = (bool)value; layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; + RotaryEmbeddingMeta rotary_embedding_meta; layer->get_int_property("apply_rotary_embedding", value); - bool apply_rotary_embedding = (bool)value; - layer->get_int_property("scaling_query", value); + rotary_embedding_meta.apply_rotary_embedding = (bool)value; + layer->get_float_property("rope_theta", rotary_embedding_meta.rope_theta); + layer->get_string_property("rope_type", rotary_embedding_meta.rope_type); + layer->get_float_property("factor", rotary_embedding_meta.factor); + layer->get_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + layer->get_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + layer->get_int_property("original_max_position_embeddings", value); + rotary_embedding_meta.original_max_position_embeddings = (int)value; bool scaling_query = (bool)value; float scaling_factor; layer->get_float_property("scaling_factor", scaling_factor); @@ -264,7 +280,7 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( qkv_bias, final_bias, add_zero_attn, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -279,7 +295,7 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + 
ParallelTensor const _input, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -289,7 +305,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, @@ -311,15 +327,13 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), - scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), - quantization_type(_quantization_type), offload(_offload), - tensor_parallelism_degree(_tensor_parallelism_degree) { + rotary_embedding_meta(_rotary_embedding_meta), + hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim), + o_dim(_embed_dim), qoSeqLength(_input->dims[1].size), + kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), + scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), + position_bias(_position_bias), quantization_type(_quantization_type), + offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree) { // overwrite layer_guid layer_guid = _layer_guid; @@ -336,11 +350,11 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( // Create weight tensor int num_dims = inputs[0]->num_dims; // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; + int qParas = this->qk_dim * this->hidden_size; + int kParas = this->qk_dim * this->hidden_size; + int vParas = this->v_dim * this->hidden_size; int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); + this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size); ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; @@ -366,10 +380,9 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
o_dim : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -393,8 +406,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( FFModel &model, - const ParallelTensor _input, - const ParallelTensor _weight, + ParallelTensor const _input, + ParallelTensor const _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -404,7 +417,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, @@ -427,15 +440,13 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), - scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), - quantization_type(_quantization_type), offload(_offload), - tensor_parallelism_degree(_tensor_parallelism_degree) + rotary_embedding_meta(_rotary_embedding_meta), + hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim), + o_dim(_embed_dim), qoSeqLength(_input->dims[1].size), + kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), + scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), + position_bias(_position_bias), quantization_type(_quantization_type), + offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree) // bias_initializer(_bias_initializer) { numOutputs = 1; @@ -451,11 +462,11 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( // Create weight tensor int num_dims = inputs[0]->num_dims; // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; + int qParas = this->qk_dim * this->hidden_size; + int kParas = this->qk_dim * this->hidden_size; + int vParas = this->v_dim * this->hidden_size; int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); + this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size); ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; @@ -479,10 +490,9 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
o_dim : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -510,21 +520,21 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( FFModel &model, TreeIncMultiHeadSelfAttention const &other, - const ParallelTensor input, + ParallelTensor const input, bool allocate_weights) : TreeIncMultiHeadSelfAttention(model, other.layer_guid, input, - other.oProjSize, + other.o_dim, other.num_q_heads, other.num_kv_heads, - other.qProjSize, - other.vProjSize, + other.qk_dim, + other.v_dim, other.dropout, other.qkv_bias, other.final_bias, other.add_zero_attn, - other.apply_rotary_embedding, + other.rotary_embedding_meta, other.scaling_query, other.scaling_factor, other.qk_prod_scaling, @@ -553,7 +563,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( params.qkv_bias, params.final_bias, params.add_zero_attn, - params.apply_rotary_embedding, + params.rotary_embedding_meta, params.scaling_query, params.scaling_factor, params.qk_prod_scaling, @@ -695,7 +705,7 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( attn->num_kv_heads / attn->tensor_parallelism_degree + (attn->num_kv_heads % attn->tensor_parallelism_degree != 0); - assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); + assert(attn->o_dim == output.domain.hi()[0] - output.domain.lo()[0] + 1); Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::GPU_FB_MEM) @@ -806,13 +816,12 @@ void TreeIncMultiHeadSelfAttention::inference_task( Runtime *runtime) { assert(task->regions.size() == regions.size()); - TreeVerifyBatchConfig const &bc = - Future(task->futures[0]).get_result(); - log_tree_verify.debug( - "TreeVerifyBatchConfig, num_tokens: %d, num_requests: %d", - bc.num_tokens, - bc.num_active_requests()); - if (bc.num_tokens == 0) { + // BatchConfig const &bc = Future(task->futures[0]).get_result(); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + log_tree_verify.debug("BatchConfig, num_tokens: %d, num_requests: %d", + bc->num_tokens, + bc->num_active_requests()); + if (bc->num_tokens == 0) { return; } @@ -858,7 +867,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( assert(task->index_point.get_dim() == 1); TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, &bc, task->index_point.point_data[0], input, weight, output, biases); + m, bc, task->index_point.point_data[0], input, weight, output, biases); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -869,7 +878,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( weights_accessors.push_back(biases); } TreeIncMultiHeadSelfAttention::save_inference_tensors_to_file( - m, shard_id, &bc, {input}, weights_accessors, {output}); + m, shard_id, bc, {input}, weights_accessors, {output}); } } @@ -901,7 +910,19 @@ bool operator==(TreeIncMultiHeadSelfAttentionParams const &lhs, lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && lhs.add_zero_attn == rhs.add_zero_attn && - lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && + lhs.rotary_embedding_meta.apply_rotary_embedding == + rhs.rotary_embedding_meta.apply_rotary_embedding && + lhs.rotary_embedding_meta.rope_theta == + rhs.rotary_embedding_meta.rope_theta && + lhs.rotary_embedding_meta.rope_type == + rhs.rotary_embedding_meta.rope_type && + lhs.rotary_embedding_meta.factor == 
rhs.rotary_embedding_meta.factor && + lhs.rotary_embedding_meta.low_freq_factor == + rhs.rotary_embedding_meta.low_freq_factor && + lhs.rotary_embedding_meta.high_freq_factor == + rhs.rotary_embedding_meta.high_freq_factor && + lhs.rotary_embedding_meta.original_max_position_embeddings == + rhs.rotary_embedding_meta.original_max_position_embeddings && lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && lhs.qk_prod_scaling == rhs.qk_prod_scaling && @@ -912,16 +933,16 @@ TreeIncMultiHeadSelfAttentionParams TreeIncMultiHeadSelfAttention::get_params() const { TreeIncMultiHeadSelfAttentionParams params; params.layer_guid = this->layer_guid; - params.embed_dim = this->oProjSize; + params.embed_dim = this->o_dim; params.num_q_heads = this->num_q_heads; params.num_kv_heads = this->num_kv_heads; - params.kdim = this->kProjSize; - params.vdim = this->vProjSize; + params.kdim = this->qk_dim; + params.vdim = this->v_dim; params.dropout = this->dropout; params.qkv_bias = this->qkv_bias; params.final_bias = this->final_bias; params.add_zero_attn = this->add_zero_attn; - params.apply_rotary_embedding = this->apply_rotary_embedding; + params.rotary_embedding_meta = this->rotary_embedding_meta; params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; @@ -949,7 +970,14 @@ size_t hash::operator()( hash_combine(key, params.qkv_bias); hash_combine(key, params.final_bias); hash_combine(key, params.add_zero_attn); - hash_combine(key, params.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.rope_theta); + hash_combine(key, params.rotary_embedding_meta.rope_type); + hash_combine(key, params.rotary_embedding_meta.factor); + hash_combine(key, params.rotary_embedding_meta.low_freq_factor); + hash_combine(key, params.rotary_embedding_meta.high_freq_factor); + hash_combine(key, + params.rotary_embedding_meta.original_max_position_embeddings); hash_combine(key, params.scaling_query); hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 26291fb3b..cf3426b3e 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -36,7 +36,8 @@ __global__ void commit_tokens_kernel( DT const *devQKVProjArray, DT *kCache_ptr, DT *vCache_ptr, - TreeVerifyBatchConfig::CommittedTokensInfo const *committedTokenInfos, + /* Reserved: BatchConfig Updated, leave HIP code to be updated */ + BatchConfig::CommittedTokensInfo const *committedTokenInfos, int qProjSize, int kProjSize, int vProjSize, @@ -70,11 +71,12 @@ __global__ void commit_tokens_kernel( template void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, hipStream_t stream) { int num_tokens_to_commit = bc->num_tokens_to_commit; if (num_tokens_to_commit > 0) { - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens_to_commit; + int parallelism = + m->local_hidden_size * KV_WEIGHT_NUM * num_tokens_to_commit; hipLaunchKernelGGL( HIP_KERNEL_NAME(commit_tokens_kernel
), GET_BLOCKS(parallelism), @@ -91,24 +93,24 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, num_tokens_to_commit, m->num_active_tokens, // number of active tokens in previous batch BatchConfig::max_sequence_length(), - m->hidden_size); + m->local_hidden_size); } } template -__global__ void update_tree_branch_kv_cache( - DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int vProjSize, - int num_tokens_in_branch, - int processed_tokens_in_batch, - int total_tokens_in_batch, - int max_seq_len, - int hidden_size) { +__global__ void + update_tree_branch_kv_cache(DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + BatchConfig::PerTokenInfo const *tokenInfos, + int qProjSize, + int kProjSize, + int vProjSize, + int num_tokens_in_branch, + int processed_tokens_in_batch, + int total_tokens_in_batch, + int max_seq_len, + int hidden_size) { CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size * 2) { int token_idx = i / (hidden_size * KV_WEIGHT_NUM); int offset = i % hidden_size; @@ -119,7 +121,7 @@ __global__ void update_tree_branch_kv_cache( DT vVal = devQKVProjArray[val_idx + hidden_size]; int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int const tok_id = tokenInfos[token_idx].abs_index_in_request; kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + offset] = kVal; vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + @@ -146,7 +148,7 @@ __global__ void tree_fill_entries_above_diagonal(DT *matrix, template void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, int shard_id, DT *output_ptr, DT const *bias_ptr, @@ -178,7 +180,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (bc->request_available[i]) { continue; } int last_token_idx_of_the_request = @@ -187,17 +189,17 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int num_new_tokens = 1; int j = processed_tokens_in_batch; while ((j + 1 <= last_token_idx_of_the_request) && - (bc->tokensInfo[j].abs_depth_in_request + 1 == - bc->tokensInfo[j + 1].abs_depth_in_request)) { + (bc->tokensInfo[j].abs_index_in_request + 1 == + bc->tokensInfo[j + 1].abs_index_in_request)) { j++; num_new_tokens++; } - int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1; + int total_tokens_in_request = bc->tokensInfo[j].abs_index_in_request + 1; assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); { // update K-V cache - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_new_tokens; + int parallelism = m->local_hidden_size * KV_WEIGHT_NUM * num_new_tokens; hipLaunchKernelGGL( HIP_KERNEL_NAME(update_tree_branch_kv_cache
), GET_BLOCKS(parallelism), @@ -215,7 +217,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, processed_tokens_in_batch, // num_processed_tokens_in_batch m->num_active_tokens, // total_tokens_in_batch BatchConfig::max_sequence_length(), - m->hidden_size); + m->local_hidden_size); } // bc->token_last_available_idx[i] + 1; @@ -437,7 +439,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, template void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, int shard_id, DT const *input_ptr, DT const *weight_ptr, @@ -464,13 +466,12 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // Note that m->num_active_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache - checkCUDA( - hipMemcpyAsync(m->committed_token_infos, - &(bc->committed_tokens), - bc->num_tokens_to_commit * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), - hipMemcpyHostToDevice, - stream)); + checkCUDA(hipMemcpyAsync(m->committed_token_infos, + &(bc->committed_tokens), + bc->num_tokens_to_commit * + sizeof(BatchConfig::CommittedTokensInfo), + hipMemcpyHostToDevice, + stream)); commit_tokens
(m, bc, stream); // After commit we update m->num_active_tokens to be the number of active @@ -486,18 +487,18 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, checkCUDA(hipMemcpyAsync(m->token_infos, &(bc->tokensInfo), bc->num_active_tokens() * - sizeof(TreeVerifyBatchConfig::PerTokenInfo), + sizeof(BatchConfig::PerTokenInfo), hipMemcpyHostToDevice, stream)); // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
(m->devQKVProjArray), - bias_ptr, - stream); + compute_qkv(m, + bc, + shard_id, + input_ptr, + weight_ptr, + static_cast
(m->devQKVProjArray), + bias_ptr, + stream); // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( @@ -515,7 +516,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, /*static*/ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeIncMultiHeadSelfAttentionMeta *m, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, @@ -540,7 +541,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( if (input.data_type == DT_HALF) { if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); + pre_build_weight(m, weight, input.data_type, stream); } half const *bias_ptr = @@ -556,7 +557,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( stream); } else if (input.data_type == DT_FLOAT) { if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); + pre_build_weight(m, weight, input.data_type, stream); } float const *bias_ptr = use_bias ? bias.get_float_ptr() : static_cast(nullptr); @@ -606,7 +607,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, + attn->rotary_embedding_meta, attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, @@ -631,24 +632,24 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( { int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); size_t committed_tokeninfo_size = max_tokens_per_batch; - size_t total_size = committed_tokeninfo_size * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); + size_t total_size = + committed_tokeninfo_size * sizeof(BatchConfig::CommittedTokensInfo); if (offload) { // assert that we have enough reserved work space left assert(gpu_mem_allocator.reserved_total_size - gpu_mem_allocator.reserved_allocated_size >= total_size); committed_token_infos = - gpu_mem_allocator - .allocate_reserved( - committed_tokeninfo_size); + gpu_mem_allocator.allocate_reserved( + committed_tokeninfo_size); } else { - gpu_mem_allocator.create_legion_instance(committed_token_reserve_inst, - total_size); + gpu_mem_allocator.create_legion_instance( + committed_token_reserve_inst, + total_size, + "TreeIncMultiHeadSelfAttentionMeta"); committed_token_infos = - gpu_mem_allocator - .allocate_instance( - committed_tokeninfo_size); + gpu_mem_allocator.allocate_instance( + committed_tokeninfo_size); } } diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 50c056c81..058e223c4 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -15,12 +15,16 @@ #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" #endif +#include "flashinfer/prefill_attention_decl.cuh" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/utils/cuda_helper.h" +#include +#include + namespace FlexFlow { // declare Legion names @@ -34,850 +38,311 @@ using namespace Kernels::IncMultiHeadAttention; namespace Kernels { namespace TreeIncMultiHeadAttention { -template -__global__ void compute_attention_kernel_fused_kernel( - DT const *query, - DT const *key_cache, - DT const *value_cache, - DT *output_ptr, - float const 
scale, - int const max_seq_length, - int const max_token_per_batch, - int per_head_size, - int hidden_size, - BatchConfig::PerRequestInfo *request_infos, - int num_heads, - int num_requests, - BatchConfig::BitMask *causalMask, - bool *request_completed, - int qk_smem_sz) { - - // q, k - using Q_vec = typename VEC_K::Type; - using K_vec = typename VEC_K::Type; - using V_vec = typename VEC_V
::Type; - using Out_sum = typename Vec_fp32_::Type; - - constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; - - constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); - constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; - constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; - // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); - - // thread id - int const tidx = threadIdx.x; - // head id - int const head_idx = blockIdx.x; - // request idx - int const request_idx = blockIdx.y; - - int const batch_config_request_id = - request_infos[request_idx].batch_config_request_id; - - int const first_step = 0; - - int const tlength = - request_infos[batch_config_request_id].first_token_depth_in_request + - request_infos[batch_config_request_id].num_tokens_in_batch; - int const qlength = - request_infos[batch_config_request_id].num_tokens_in_batch; - - BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; - - int first_token_idx = 0; - for (int r = 0; r < batch_config_request_id; r++) { - first_token_idx += - request_completed[r] ? 0 : request_infos[r].num_tokens_in_batch; - } - - bool prompt_phase = request_infos[batch_config_request_id].prompt_phase; - int q_start = - request_infos[batch_config_request_id].first_token_depth_in_request; - - // shared memory objects - extern __shared__ char smem_[]; - - float *qk_smem = reinterpret_cast(smem_); - float *out_smem = reinterpret_cast(smem_ + qk_smem_sz); - - float qk_max = -FLT_MAX; - - // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum - __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - - const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + - head_idx * per_head_size; - __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; - - // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE - int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; - int ki_o = tidx % THREADS_PER_KEY; - // the first key's offset for this thread - // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... - int ko = tidx / THREADS_PER_KEY; - // load q tensor - Q_vec q_vec[K_VECS_PER_THREAD]; - - constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; - // The number of keys per warp. - constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; - - DT const *k_cache_batch = - key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; - - int ti_end = - div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; - - for (int qi = 0; qi < qlength; qi += 1) { -#pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - q_vecs[ki_o][ii] = *reinterpret_cast( - q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + - ii * THREADS_PER_KEY * K_VEC_SIZE); - - // if (head_idx == 0 && request_idx == 1 && tidx == 0) { - // printf("laod q %d, %d %.10f\n", - // request_idx, - // qi,q_vecs[ki_o][ii].x); - // } - } - - __syncthreads(); - for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { - K_vec k[K_VECS_PER_THREAD]; - int const ti_circ = ti % max_seq_length; - - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; - if (ti < tlength) { - k[ii] = *reinterpret_cast( - k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + - jj); - } - } - float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); - - if (ti < tlength && tidx % THREADS_PER_KEY == 0) { - bool const mask = - prompt_phase ? (qi + q_start < ti) - : (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << qi)))); - - qk_max = mask ? 
qk_max : fmaxf(qk_max, qk); - - // if (head_idx == 0 && !mask) { - // printf("tree attn qkqkqkqk request id %d qi%d, ti %d, %.10f, %.10f, - // %.10f, %d\n", - // request_idx, - // qi, - // ti, - // qk, - // q_vecs[ki_o][0].x, - // k[0].x, - // bitmask.non_tree_cache_size); - // } - qk_smem[ti - first_step] = mask ? 0.0f : qk; - } - } - - __syncthreads(); - -#pragma unroll - for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Decompose the thread index into warp and lane. - int const warp = tidx / WARP_SIZE; - int const lane = tidx % WARP_SIZE; - - // The warp leader writes the max to shared memory. - if (lane == 0) { - red_smem[warp] = qk_max; - } - - // Make sure the products are in shared memory. - __syncthreads(); - - // The warps finalize the reduction. - qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; -#pragma unroll - for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Broadcast to all the threads in the warp. - qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - - // if (head_idx == 0 && qi == 9 && tidx == 0) { - // printf("tree attn first token qk_max %f\n", qk_max); - // } - - float exp_sum = 0.f; - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - bool const mask = - prompt_phase ? (q_start + qi < ti) - : (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << qi)))); - float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); - exp_sum += logit; - qk_smem[ti - first_step] = mask ? 0.0f : logit; - } - - // Compute the sum. - exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); - - // softmax - float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - qk_smem[ti - first_step] *= inv_sum; - } +using flashinfer::BatchPrefillHandler; +using flashinfer::BatchPrefillWithPagedKVCacheWrapperDispatched; +using flashinfer::LogitsPostHook; +using flashinfer::MaskMode; +using flashinfer::paged_kv_t; +using flashinfer::PageStorage; +using flashinfer::PosEncodingMode; +using flashinfer::QKVLayout; - __syncthreads(); - - // value projection - constexpr int V_VEC_SIZE = 16 / sizeof(DT); - // A vector of V elements for the current timestep. - // using V_vec_k = typename V_vec_k_::Type; - // using V_vec_acum = typename V_vec_acum_fp32_::Type; - - // The value computed by this thread. - int vo = tidx / THREADS_PER_VALUE; - // The hidden dimensions computed by this particular thread. - int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; - constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; - - Out_sum out; - zero(out); - - // The base pointer for the value in the cache buffer. - DT const *v_cache_batch = - value_cache + batch_config_request_id * max_seq_length * hidden_size + - vi; - - if (Dh == Dh_MAX || vi < Dh) { - for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { - // Load the values from the cache. - int const ti_circ = ti % max_seq_length; - // int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; - V_vec v = *reinterpret_cast( - v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); - - if (ti < tlength) { - bool const mask = - prompt_phase - ? 
(q_start + qi < ti) - : (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << qi)))); - float logit = mask ? 0.0f : qk_smem[ti - first_step]; - out = FlexFlow::fma(logit, cast_to_float(v), out); - } - } +__global__ void commit_tokens_kernel( + half *kCache_ptr, + BatchConfig::CommittedTokensInfo const *committedTokenInfos, + bool const *request_available, + int num_requests, + int num_kv_heads, + int head_dim, + int const *num_committed_tokens, + int const max_num_pages) { + int const kv_hidden_size = num_kv_heads * head_dim; + int const idx = blockIdx.x * blockDim.x + threadIdx.x; + int const request_compact_idx = idx / kv_hidden_size; + int const offset = idx % kv_hidden_size; + // request id in batch config + int requext_idx_in_batch = -1; + int cnt_1 = 0; + while (cnt_1 < request_compact_idx + 1) { + requext_idx_in_batch++; + if (request_available[requext_idx_in_batch]) { + cnt_1++; } + } - // // Make sure we can start writing to shared memory. - __syncthreads(); - - // Run the final reduction amongst the different groups computing different - // partial outputs. - if (Dh == Dh_MAX || vi < Dh) { -#pragma unroll - for (int active_groups = V_PER_ITER; active_groups >= 2; - active_groups /= 2) { - - // The midpoint in the number of active groups. - int midpoint = active_groups / 2; - - // The upper part of active threads store to shared memory. - if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { - *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = - out; - } - __syncthreads(); - - // The bottom warps update their values. - if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { - out = add(*reinterpret_cast(out_smem + vo * Dh + vi), - out); - } - __syncthreads(); + for (int i = 0; i < *num_committed_tokens; i++) { + if (committedTokenInfos[i].request_index == requext_idx_in_batch) { + int const index_in_kv_cache = committedTokenInfos[i].index_in_kv_cache; + if (index_in_kv_cache == -1) { + continue; } - } - // Output the final values. 
- if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { - convert_from_float(*reinterpret_cast( - output_ptr + (first_token_idx + qi) * hidden_size + - head_idx * per_head_size + vi), - out); - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { - // printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", - // out.x, - // out.y, - // out.z, - // out.w, - // vi, - // (first_token_idx + qi) * hidden_size + head_idx * - // per_head_size + - // vi); - // } + int const page_to_idx = committedTokenInfos[i].token_depth / kPagesize; + int const page_from_idx = + committedTokenInfos[i].index_in_kv_cache / kPagesize; + + size_t from_k_idx = get_k_entry_offset_verify( + committedTokenInfos[i].index_in_kv_cache, + page_from_idx, + num_kv_heads, + head_dim), + from_v_idx = get_v_entry_offset_verify( + committedTokenInfos[i].index_in_kv_cache, + page_from_idx, + num_kv_heads, + head_dim); + size_t to_k_idx = + get_k_entry_offset_verify(committedTokenInfos[i].token_depth, + page_to_idx, + num_kv_heads, + head_dim), + to_v_idx = + get_v_entry_offset_verify(committedTokenInfos[i].token_depth, + page_to_idx, + num_kv_heads, + head_dim); + + kCache_ptr[to_k_idx + offset] = kCache_ptr[from_k_idx + offset]; + kCache_ptr[to_v_idx + offset] = kCache_ptr[from_v_idx + offset]; } } } -template -__global__ void commit_tokens_kernel( - DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - TreeVerifyBatchConfig::CommittedTokensInfo const *committedTokenInfos, - int qProjSize, - int kProjSize, - int vProjSize, - int num_tokens_to_commit, - int num_active_tokens_in_last_batch, - int max_seq_len, - int hidden_size) { - - CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size) { - - int token_pos = i / (hidden_size); - int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; - int offset = i % hidden_size; - assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); - - size_t val_idx = token_idx_in_last_batch * QKV_WEIGHT_NUM * hidden_size + - hidden_size + offset; - - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; - - int const req_id = committedTokenInfos[token_pos].request_index; - int const tok_id = committedTokenInfos[token_pos].token_depth; - - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; - } -} - -template void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, cudaStream_t stream) { - int num_tokens_to_commit = bc->num_tokens_to_commit; - if (num_tokens_to_commit > 0) { - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens_to_commit; - commit_tokens_kernel<<>>( - static_cast
(m->devQKVProjArray), - static_cast
(m->keyCache), - static_cast
(m->valueCache), - m->committed_token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens_to_commit, - m->num_active_tokens, // number of active tokens in previous batch - BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(), - m->hidden_size); - } -} - -template -__global__ void update_tree_branch_kv_cache( - DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int vProjSize, - int num_tokens_in_branch, - int processed_tokens_in_batch, - int total_tokens_in_batch, - int max_seq_len, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size) { - - int token_idx = i / (hidden_size); - int offset = i % hidden_size; - - token_idx += processed_tokens_in_batch; // get index in the whole batch - size_t val_idx = - token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; - - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; - - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; - } + // cudaEvent_t t_start, t_end; + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + int const max_num_pages = + round_up_pages(BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); + int const num_requests = bc->num_active_requests(); + int parallelism = m->num_kv_heads * m->qk_dim * num_requests; + commit_tokens_kernel<<>>(static_cast(m->kvCache), + m->committed_token_infos, + m->request_available, + num_requests, + m->num_kv_heads, + m->qk_dim, + m->num_tokens_to_commit, + max_num_pages); + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // printf("Commit token time: %.2f ms\n", elapsed); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); } template -__global__ void update_tree_branch_kv_cache_fused( - DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, - BatchConfig::PerRequestInfo *request_infos, - int qProjSize, - int kProjSize, - int vProjSize, - int num_new_tokens, - int max_seq_len, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_new_tokens * hidden_size) { - - int token_idx = i / hidden_size; - int offset = i % hidden_size; - size_t val_idx = - token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; - - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; - - int const req_id = tokenInfos[token_idx].request_index; - // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - - int const request_token_offset = - request_infos[req_id].first_token_offset_in_batch; - int const first_token_depth = - request_infos[req_id].first_token_depth_in_request; - - // if(i % hidden_size == 0){ - // printf("update token request id: %d, %d, %d real id %d, value%.10f\n", - // req_id, token_idx, request_token_offset,(token_idx + first_token_depth - // - request_token_offset), kVal); - // } - kCache_ptr[req_id * (hidden_size * max_seq_len) + - (token_idx + first_token_depth - request_token_offset) * - hidden_size + - offset] = kVal; - 
vCache_ptr[req_id * (hidden_size * max_seq_len) + - (token_idx + first_token_depth - request_token_offset) * - hidden_size + - offset] = vVal; - } -} - -template -__global__ void tree_fill_entries_above_diagonal(DT *matrix, - size_t new_tokens, - size_t total_tokens_in_request, - size_t num_q_heads, - DT value) { - CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_q_heads) { - // size_t head_idx = i / (new_tokens * total_tokens_in_request); - size_t src_idx = (i / new_tokens) % total_tokens_in_request; - size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens; - // Casual Mask - if (src_idx > dst_idx) { - matrix[i] = value; - } +void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + // int device; + // checkCUDA(cudaGetDevice(&device)); + // cudaEvent_t t_start, t_end; + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + // global constant parameters + uint32_t const num_q_heads = m->num_q_heads; + uint32_t const num_kv_heads = m->num_kv_heads; + uint32_t const head_dim = m->qk_dim; + uint32_t const batch_size = bc->num_active_requests(); + float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->qk_dim) : 1.0f; + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "Update custom mask time: " << elapsed << " ms\n"; + // } + + half *q = static_cast(m->queryTmp), + *kv = static_cast(m->kvCache), + *o = static_cast(m->outputTmp); + paged_kv_t paged_kv( + num_kv_heads, + kPagesize, + head_dim, + batch_size, + QKVLayout::kNHD, + kv, + m->handle.tree_verify_attention_metadata->kv_indices, + m->handle.tree_verify_attention_metadata->kv_indptr, + m->handle.tree_verify_attention_metadata->kv_last_page_len); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" attn prep time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + BatchPrefillHandler *handler = nullptr; + + if (!bc->prompt_phase) { + assert(m->handle.tree_verify_attention_metadata->decode_handler_collections + .count(batch_size) != 0 && + "Handler is not initialized"); + handler = static_cast( + m->handle.tree_verify_attention_metadata + ->decode_handler_collections[batch_size]); + } else { + assert(m->handle.tree_verify_attention_metadata->prompt_handler_collections + .count(batch_size) != 0 && + "Handler is not initialized"); + handler = static_cast( + m->handle.tree_verify_attention_metadata + ->prompt_handler_collections[batch_size]); } -} -template -void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, - TreeVerifyBatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudnnDataType_t 
cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif - // int num_requests = bc->num_active_requests(); - int processed_tokens_in_batch = 0; - // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); - int q_block_size = m->qProjSize; - int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(); - int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(); - assert(m->qProjSize == m->kProjSize); - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" BeginForward time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + DISPATCH_HEADDIM(head_dim, HEAD_DIM, { + cudaError_t result; + if (bc->prompt_phase) { + result = + BatchPrefillWithPagedKVCacheWrapperDispatched( + handler, + q, + m->handle.tree_verify_attention_metadata->q_indptr, + /*q_offset=*/nullptr, + paged_kv, + /*custom_mask=*/nullptr, + /*qk_indptr=*/nullptr, + o, + /*lse=*/nullptr, + num_q_heads, + /*window_left=*/-1, + /*logits_soft_cap=*/0.f, + sm_scale, + /*rope_scale=*/1.f, + /*rope_theta=*/static_cast(1e4), + stream); + } else { + result = + BatchPrefillWithPagedKVCacheWrapperDispatched( + handler, + q, + m->handle.tree_verify_attention_metadata->q_indptr, + /*q_offset=*/nullptr, + paged_kv, + m->handle.tree_verify_attention_metadata->custom_mask, + m->handle.tree_verify_attention_metadata->qk_indptr, + o, + /*lse=*/nullptr, + num_q_heads, + /*window_left=*/-1, + /*logits_soft_cap=*/0.f, + sm_scale, + /*rope_scale=*/1.f, + /*rope_theta=*/static_cast(1e4), + stream); } - assert(processed_tokens_in_batch == - bc->requestsInfo[i].first_token_offset_in_batch); - int last_token_idx_of_the_request = - processed_tokens_in_batch + bc->requestsInfo[i].num_tokens_in_batch - 1; - while (processed_tokens_in_batch <= last_token_idx_of_the_request) { - int num_new_tokens = 1; - int j = processed_tokens_in_batch; - while ((j + 1 <= last_token_idx_of_the_request) && - (bc->tokensInfo[j].abs_depth_in_request + 1 == - bc->tokensInfo[j + 1].abs_depth_in_request)) { - j++; - num_new_tokens++; - } - - int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1; - assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); - { - // update K-V cache - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_new_tokens; - update_tree_branch_kv_cache<<>>( - static_cast
(m->devQKVProjArray), - static_cast
(m->keyCache), - static_cast
(m->valueCache), - m->token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_new_tokens, // num_tokens_in_branch - processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_tokens, // total_tokens_in_batch - BatchConfig::max_sequence_length(), - m->hidden_size); - } - - // bc->token_last_available_idx[i] + 1; - // Compute (QK^T/sqrt(d_k)) - int m_ = num_new_tokens; - int n = total_tokens_in_request; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens_in_request; - - // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); - } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
(m->devQKVProjArray) + - processed_tokens_in_batch * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - // To get C, skip over QK^T products from previous requests - DT *C = static_cast
(m->qk_prods); - - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // add alibi position bias to qk production - // add alibi position bias to qk production - if (*m->position_bias) { - size_t parallelism = - m->num_q_heads * total_tokens_in_request * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens_in_request, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. - assert(num_new_tokens <= total_tokens_in_request); - if (num_new_tokens > 1) { - size_t parallelism = - m->num_q_heads * num_new_tokens * total_tokens_in_request; - tree_fill_entries_above_diagonal<<>>( - C, - num_new_tokens, - total_tokens_in_request, - m->num_q_heads, - static_cast
(-INFINITY)); - } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens_in_request; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = m->vProjSize; - n = num_new_tokens; - k = total_tokens_in_request; - lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - strideA = vt_block_size; - strideB = num_new_tokens * total_tokens_in_request; - strideC = m->vProjSize; - // To get A, skip over V^T entries from previous requests (all heads + - // padding) - A = static_cast
(m->valueCache) + i * vt_req_block_size; - // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - B = C_softmax; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - C = static_cast
(m->attn_heads) + - processed_tokens_in_batch * m->num_q_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - processed_tokens_in_batch += num_new_tokens; + if (result != cudaSuccess) { + throw std::runtime_error("Failed to run " + "TreeVerifyAttentionKernel: " + + std::string(cudaGetErrorString(result))); } - // Before moving to the next request - // check that we have finished all tokens of the request - assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch); - } - // Project to output, save result directly on output tensor - DT alpha = 1.0f, beta = 0.0f; - int m_ = m->oProjSize; - int k = m->vProjSize * m->num_q_heads; - int n = processed_tokens_in_batch; - int lda = k, ldb = k, ldc = m_; - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - DT const *B = static_cast
(m->attn_heads); - DT *C = static_cast
(output_ptr); - - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * processed_tokens_in_batch; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - apply_proj_bias_w<<>>(output_ptr, - bias_ptr, - processed_tokens_in_batch, - qkv_weight_size, - m->oProjSize); - } - - assert(processed_tokens_in_batch == bc->num_active_tokens()); -} - -#define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ - DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ - smem_size_in_bytes_tree
(m->qProjSize, \ - BatchConfig::max_sequence_length() + \ - BatchConfig::max_spec_tree_token_num(), \ - THDS_PER_VALUE, \ - THDS_PER_BLOCK, \ - bc, \ - smem_sz); \ - compute_attention_kernel_fused_kernel \ - <<>>( \ - static_cast
(m->devQKVProjArray), \ - static_cast
(m->keyCache), \ - static_cast
(m->valueCache), \ - output_ptr, \ - scale, \ - BatchConfig::max_sequence_length() + \ - BatchConfig::BatchConfig::max_spec_tree_token_num(), \ - BatchConfig::max_tokens_per_batch(), \ - m->qProjSize, \ - m->hidden_size, \ - m->request_infos, \ - m->num_q_heads, \ - bc->num_active_requests(), \ - m->causalMask, \ - m->request_completed, \ - smem_sz[0]) - -template -void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, - TreeVerifyBatchConfig const *bc, - DT *output_ptr, - cudaStream_t stream) { - - // update the kv cache - // update K-V cache - int num_new_tokens = bc->num_active_tokens(); - int parallelism = m->hidden_size * num_new_tokens; - update_tree_branch_kv_cache_fused<<>>( - static_cast
(m->devQKVProjArray), - static_cast
(m->keyCache), - static_cast
(m->valueCache), - m->token_infos, - m->request_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_new_tokens, - BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(), - m->hidden_size); - - dim3 grid(m->num_q_heads, bc->num_active_requests()); - int const per_head_size = m->qProjSize; - float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; - // 0->qk production size, 1->total shared size - int smem_sz[2]; - if (per_head_size == 64) { - constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; - LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( - DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); - } else if (per_head_size == 128) { - constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; - LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( - DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); - } else { - assert(false && "a unsupported head size"); - } + }); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" actual attn time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + produce_output(m, bc, output_ptr, stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" produce_output_kernel time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); } template void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, int shard_id, DT const *input_ptr, DT const *weight_ptr, DT *output_ptr, DT const *bias_ptr, cudaStream_t stream) { + // int device; + // checkCUDA(cudaGetDevice(&device)); + // cudaEvent_t t_start, t_end; + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + // additional processing for weight uploading if (m->handle.offload_reserve_space != nullptr) { // Note that we update weight_ptr and bias_ptr when uploading weight and @@ -902,7 +367,23 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << // "\n"; - commit_tokens
(m, bc, stream); + if (!bc->prompt_phase) { + commit_tokens(m, bc, stream); + } + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "Commit tokens time: " << elapsed << " ms\n"; + // } + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); // After commit we update m->num_active_tokens to be the number of active // tokens for the current batch @@ -914,22 +395,86 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); bias_ptr = static_cast
(m->bias_ptr); } - // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
(m->devQKVProjArray), - bias_ptr, - stream); - - // phase 2: No need to update key/val cache - // IncMultiHeadSelfAttention::update_kv_cache_kernel( - // m, bc, stream); - // use the new kernel - compute_attention_kernel_fused
( - m, bc, static_cast
(m->attn_heads), stream); + // Implement kernel to compute KQV for input tokens + compute_qkv(m, + bc, + shard_id, + input_ptr, + weight_ptr, + static_cast
<DT *>(m->devQKVProjArray), + bias_ptr, + stream); + + apply_pos_encoding_to_tokens_in_batch( + m, bc, static_cast<DT *>
(m->devQKVProjArray), stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "Compute qkv time: " << elapsed << " ms\n"; + // } + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + // Update key-val cache, compact q array + update_qkv_in_batch_paged
(m, bc, stream, true); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "Update qkv time: " << elapsed << " ms\n"; + // } + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + // Compute attention + tree_verify_attention
<DT>(m, bc, static_cast<DT *>
(m->attn_heads), stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "Attn time: " << elapsed << " ms\n"; + // } + + // Debug output: + // { + // int size = m->local_hidden_size * bc->num_active_tokens(); + // float *temp_output = new float[size]; + // cudaDeviceSynchronize(); + // cudaMemcpy( + // temp_output, m->attn_heads, size * sizeof(float), + // cudaMemcpyDeviceToHost); + // printf("Output (flashinfer attention) :"); + // for (int i = 0; i < 1; ++i) { + // float temp = 0; + // for (int j = 0; j < m->local_hidden_size; ++j) { + // temp += temp_output[i * m->local_hidden_size + j]; + // } + // printf("%.6f ", temp); + // } + // printf("\n"); + + // delete[] temp_output; + // } + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); int processed_tokens_in_batch = bc->num_active_tokens(); @@ -941,6 +486,31 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, bias_ptr, processed_tokens_in_batch, stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "Compute output proj time: " << elapsed << " ms\n"; + // } + // { + // int size = m->o_dim; + // DT *temp_output = new DT[size]; + // cudaDeviceSynchronize(); + // cudaMemcpy( + // temp_output, output_ptr + m->o_dim * (bc->num_active_tokens() - + // 1), size * sizeof(DT), cudaMemcpyDeviceToHost); + // printf("Output :"); + // for (int i = 0; i < size; ++i) { + // printf("%.6f ", static_cast(temp_output[i])); + // } + // printf("\n"); + + // delete[] temp_output; + // } } } // namespace TreeIncMultiHeadAttention @@ -949,7 +519,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, /*static*/ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeIncMultiHeadSelfAttentionMeta *m, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, @@ -959,12 +529,12 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(get_legion_stream(&stream)); bool use_bias = *m->qkv_bias || *m->final_bias; - cudaEvent_t t_start, t_end; - if (m->profiling) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start, stream); - } + // int device; + // checkCUDA(cudaGetDevice(&device)); + // cudaEvent_t t_start, t_end; + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); @@ -974,12 +544,12 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( if (input.data_type == DT_HALF) { if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); + pre_build_weight(m, weight, input.data_type, stream); } half const *bias_ptr = use_bias ? 
bias.get_half_ptr() : static_cast(nullptr); - Kernels::TreeIncMultiHeadAttention::inference_kernel( + Kernels::TreeIncMultiHeadAttention::inference_kernel( m, bc, shard_id, @@ -990,11 +560,11 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( stream); } else if (input.data_type == DT_FLOAT) { if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); + pre_build_weight(m, weight, input.data_type, stream); } float const *bias_ptr = use_bias ? bias.get_float_ptr() : static_cast(nullptr); - Kernels::TreeIncMultiHeadAttention::inference_kernel( + Kernels::TreeIncMultiHeadAttention::inference_kernel( m, bc, shard_id, @@ -1008,14 +578,16 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( assert(false && "Unspported data type"); } - if (m->profiling) { - cudaEventRecord(t_end, stream); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - } + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "TreeIncMultiHeadSelfAttention time: " << elapsed << " + // ms\n"; + // } } TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( @@ -1029,14 +601,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( : IncMultiHeadSelfAttentionMeta(handler, TREE_VERIFY_MODE, attn, - attn->qSize, - attn->kSize, - attn->vSize, - attn->qProjSize, - attn->kProjSize, - attn->vProjSize, - attn->oProjSize, - attn->apply_rotary_embedding, + attn->hidden_size, + attn->qk_dim, + attn->v_dim, + attn->o_dim, + attn->rotary_embedding_meta, attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, @@ -1051,39 +620,40 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( _num_q_heads, _num_kv_heads, attn->quantization_type, - attn->offload), + attn->offload, + false), num_active_tokens(0) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); + // set attention constants + handler.tree_verify_attention_metadata->set_enabled(true); + handler.tree_verify_attention_metadata->set_num_q_heads(num_q_heads); + handler.tree_verify_attention_metadata->set_num_kv_heads(num_kv_heads); + handler.tree_verify_attention_metadata->set_head_dim(qk_dim); + // allocate memory for the seqArray and reserve space { - - causalMask = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo)); committed_token_infos = - reinterpret_cast( + reinterpret_cast( reinterpret_cast(handler.batch_config_metadata) + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BatchConfig::causalMask)); - - request_completed = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens)); + sizeof(BatchConfig::request_available) + + sizeof(BatchConfig::causalMask) + + sizeof(BatchConfig::streamingCacheInfo)); + num_tokens_to_commit = reinterpret_cast( + reinterpret_cast(committed_token_infos) + + sizeof(BatchConfig::committed_tokens)); } cudaStreamSynchronize(stream); } 
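Editorial note: the new tree_verify_attention path above describes each request's KV cache to flashinfer through a paged_kv_t built from kv_indices, kv_indptr and kv_last_page_len. The following is a minimal host-side sketch of how such CSR-style page tables are conventionally laid out; the page size, sequence lengths and helper names are illustrative assumptions, not code from this patch.

#include <cstdint>
#include <vector>

// Illustrative only: CSR-style page-table layout for a paged KV cache.
// kv_indptr[i]..kv_indptr[i+1] delimits request i's entries in kv_indices,
// and kv_last_page_len[i] is how many tokens occupy its final page.
struct PagedKVLayout {
  std::vector<int32_t> kv_indices;       // physical page ids, all requests
  std::vector<int32_t> kv_indptr;        // size = num_requests + 1
  std::vector<int32_t> kv_last_page_len; // size = num_requests
};

PagedKVLayout build_layout(std::vector<int> const &seq_lens, int page_size) {
  PagedKVLayout layout;
  layout.kv_indptr.push_back(0);
  int32_t next_page = 0; // assume pages are handed out consecutively
  for (int len : seq_lens) {
    // same rounding as a round_up_pages-style helper: ceil(len / page_size)
    int num_pages = (len + page_size - 1) / page_size;
    for (int p = 0; p < num_pages; p++) {
      layout.kv_indices.push_back(next_page++);
    }
    layout.kv_indptr.push_back(static_cast<int32_t>(layout.kv_indices.size()));
    int last = len % page_size;
    layout.kv_last_page_len.push_back(last == 0 ? page_size : last);
  }
  return layout;
}

For example, with page_size = 16 and sequence lengths {5, 17}, the first request gets one page with last_page_len 5 and the second gets two pages with last_page_len 1, giving kv_indptr = {0, 1, 3}.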
TreeIncMultiHeadSelfAttentionMeta::~TreeIncMultiHeadSelfAttentionMeta(void) { - if (committed_token_reserve_inst != Realm::RegionInstance::NO_INST) { - committed_token_reserve_inst.destroy(); - } + // delete static_cast(batch_prefill_handler); } }; // namespace FlexFlow diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 5d38e2890..7f38e2714 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -106,7 +106,12 @@ OpMeta *AllReduce::init_task(Task const *task, Runtime *runtime) { AllReduce *ar = (AllReduce *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - AllReduceMeta *meta = new AllReduceMeta(handle, ar); + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + AllReduceMeta *meta = new AllReduceMeta(handle, ar, gpu_mem_allocator); meta->input_type[0] = ar->inputs[0]->data_type; meta->output_type[0] = ar->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); @@ -129,6 +134,7 @@ void AllReduce::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + // launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -167,6 +173,7 @@ void AllReduce::init_inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + // launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -208,6 +215,7 @@ FutureMap AllReduce::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, @@ -240,6 +248,7 @@ void AllReduce::forward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -269,6 +278,7 @@ void AllReduce::backward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, inputs[0]->machine_view.hash()); + // launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, @@ -326,7 +336,7 @@ void AllReduce::inference_task(Task const *task, assert(regions.size() == 2); assert(task->regions.size() == 2); - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + AllReduceMeta *m = *((AllReduceMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( @@ -335,7 +345,7 @@ void AllReduce::inference_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(input.data_type == output.data_type); - inference_kernel_wrapper(m, bc, input, output); + inference_kernel_wrapper(ctx, runtime, m, bc, input, output); } /*static*/ @@ -354,7 +364,7 @@ void AllReduce::forward_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(input.data_type == output.data_type); - forward_kernel_wrapper(m, input, output); + forward_kernel_wrapper(ctx, runtime, m, input, output); } void AllReduce::backward_task(Task const *task, diff --git a/src/parallel_ops/kernels/allreduce_kernels.cpp 
b/src/parallel_ops/kernels/allreduce_kernels.cpp index 8d7e20e39..1e60728fa 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cpp +++ b/src/parallel_ops/kernels/allreduce_kernels.cpp @@ -25,7 +25,9 @@ AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, +void inference_kernel_wrapper(Legion::Context ctx, + Legion::Runtime *runtime, + AllReduceMeta const *m, BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { @@ -37,6 +39,7 @@ void inference_kernel_wrapper(AllReduceMeta const *m, size_t num_elements = bc->num_tokens * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(input.ptr, output.ptr, num_elements, @@ -44,12 +47,15 @@ void inference_kernel_wrapper(AllReduceMeta const *m, ncclSum, m->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); #else assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); #endif } -void forward_kernel_wrapper(AllReduceMeta const *m, +void forward_kernel_wrapper(Legion::Context ctx, + Legion::Runtime *runtime, + AllReduceMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { hipStream_t stream; @@ -59,6 +65,7 @@ void forward_kernel_wrapper(AllReduceMeta const *m, size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(input.ptr, output.ptr, input.domain.get_volume(), @@ -66,6 +73,7 @@ void forward_kernel_wrapper(AllReduceMeta const *m, ncclSum, m->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); #else assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); #endif diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu index 2c000137a..879be72b8 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cu +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -15,40 +15,199 @@ #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/utils/cuda_helper.h" +#include "tensorrt_llm/custom_allreduce_kernels.h" +#include namespace FlexFlow { -AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) - : OpMeta(handle) {} +AllReduceMeta::AllReduceMeta(FFHandler handle, + AllReduce const *reduct, + MemoryAllocator &gpu_mem_allocator) + : OpMeta(handle) { + barrier_ptr_size = sizeof(uint32_t) * + (tensorrt_llm::MAX_ALL_REDUCE_BLOCKS + 2) * + tensorrt_llm::MAX_RANKS_PER_NODE; + gpu_mem_allocator.create_legion_instance( + reserveInst, + sizeof(void *) * (handle.num_devices + 1) + barrier_ptr_size * 2, + "AllReduceMeta"); + allgather_src = gpu_mem_allocator.allocate_instance_untyped(sizeof(void *)); + allgather_dst = gpu_mem_allocator.allocate_instance_untyped( + sizeof(void *) * handle.num_devices); + // Create barrier helpers for all-reduce. + barrier_in_ptr = + gpu_mem_allocator.allocate_instance_untyped(barrier_ptr_size); + barrier_out_ptr = + gpu_mem_allocator.allocate_instance_untyped(barrier_ptr_size); + checkCUDA(cudaMemset(barrier_in_ptr, 0, barrier_ptr_size)); + checkCUDA(cudaMemset(barrier_out_ptr, 0, barrier_ptr_size)); + // Reset allocated memory to zero. 
+ // We explicitly synchronize after memset, to make sure memset finishes + // before using all-gather to exchange peer pointers. + // This is important to ensure the memory reset get ordered + // before any other peers read the memory. + checkCUDA(cudaDeviceSynchronize()); +} + +AllReduceMeta::~AllReduceMeta() { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, +CommunicationBuffer *get_or_create_comm_buffer(Context ctx, + Runtime *runtime, + AllReduceMeta *m, + int num_devices, + int device_id, + ncclComm_t ncclComm, + void *local_ptr, + cudaStream_t stream) { + auto iter = m->comm_bufs.find(local_ptr); + if (iter != m->comm_bufs.end()) { + return iter->second; + } else { + CommunicationBuffer *comm_buffer = + create_comm_buf_with_local_ptr(ctx, + runtime, + num_devices, + device_id, + ncclComm, + m->allgather_src, + m->allgather_dst, + local_ptr, + m->barrier_in_ptr, + m->barrier_out_ptr, + &(m->barrier_flag), + stream); + m->comm_bufs[local_ptr] = comm_buffer; + return comm_buffer; + } +} + +// Get the number of bits for a given data type. +inline int get_bits(DataType dtype) { + switch (dtype) { + case DataType::DT_INT64: + case DataType::DT_DOUBLE: + return 64; + case DataType::DT_INT32: + case DataType::DT_FLOAT: + return 32; + case DataType::DT_HALF: + return 16; + case DataType::DT_INT8: + return 8; + case DataType::DT_INT4: + return 4; + default: + assert(false && "Unsupported data type"); + } +} + +// Check if customized all-reduce kernels can be applied. +inline bool CanApplyCustomAllReduce(int64_t num_elements, DataType dtype) { + // The customized all-reduce kernel has the following requirement(s). + return num_elements % (16 / ((get_bits(dtype) + 7) / 8)) == 0; +} + +// Check if the two-shot customized all-reduce kernel can be applied. +inline bool CanApplyTwoShotAllReduce(int64_t num_elements, + DataType dtype, + int num_workers) { + // The two-shot customized all-reduce kernel has the following requirement(s). + return (num_elements / num_workers) % (16 / ((get_bits(dtype) + 7) / 8)) == 0; +} + +// Customized all-reduce kernel backed by CUDA Peer memory. 
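Editorial note: CanApplyCustomAllReduce and CanApplyTwoShotAllReduce above both reduce to a 16-byte alignment rule on the flattened tensor. Below is a small standalone sketch of that arithmetic for half precision; the token and hidden sizes are made-up illustrative numbers, not defaults from this patch.

#include <cassert>
#include <cstdint>

// For DT_HALF, get_bits() is 16, so 16 / ((16 + 7) / 8) = 16 / 2 = 8:
// the element count must be a multiple of 8 halves (one 16-byte vector).
inline bool one_shot_ok_half(int64_t num_elements) {
  return num_elements % 8 == 0;
}
// The two-shot variant splits the tensor across ranks, so each rank's
// slice must satisfy the same 16-byte rule.
inline bool two_shot_ok_half(int64_t num_elements, int num_workers) {
  return (num_elements / num_workers) % 8 == 0;
}

int main() {
  // e.g. 7 tokens * 4096 hidden = 28672 halves (illustrative sizes):
  assert(one_shot_ok_half(28672));    // 28672 % 8 == 0
  assert(two_shot_ok_half(28672, 8)); // 3584 halves per rank, also % 8 == 0
  return 0;
}

When either check fails, the wrapper falls back to ncclAllReduce (or, for the two-shot case only, to the one-shot strategy), as the dispatch logic above shows.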
+void inference_kernel_wrapper(Context ctx, + Runtime *runtime, + AllReduceMeta *m, BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { +#ifndef FF_USE_NCCL + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; size_t num_elements = bc->num_tokens * hidden_dim_size; -#ifdef FF_USE_NCCL - ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); - checkNCCL(ncclAllReduce(input.ptr, - output.ptr, - num_elements, - nccl_data_type, - ncclSum, - m->handle.ncclComm, - stream)); -#else - assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); -#endif + int num_devices = m->handle.num_devices; + int device_id = m->handle.device_id; + ncclComm_t ncclComm = m->handle.ncclComm; + DataType dtype = input.data_type; + if (num_elements == 0) { + return; + } + + tensorrt_llm::AllReduceStrategyType strategy = + tensorrt_llm::SelectImplementation( + num_elements * ((get_bits(dtype) + 7) / 8), num_devices); + + if (strategy == tensorrt_llm::AllReduceStrategyType::RING || + !CanApplyCustomAllReduce(num_elements, dtype)) { + // Dispatch to nccl AllReduce if the customized all-reduce cannot apply. + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(dtype); + runtime->concurrent_task_barrier(ctx); + checkNCCL(ncclAllReduce(input.ptr, + output.ptr, + num_elements, + nccl_data_type, + ncclSum, + ncclComm, + stream)); + runtime->concurrent_task_barrier(ctx); + return; + } + + // Initialize the all-reduce kernel arguments. + tensorrt_llm::AllReduceParams params; + params.ranks_per_node = num_devices; + params.rank = device_id; + params.local_rank = device_id; + CommunicationBuffer *comm_buffer = + get_or_create_comm_buffer(ctx, + runtime, + m, + num_devices, + device_id, + ncclComm, + const_cast(input.ptr), + stream); + params.barrier_flag = ++(*comm_buffer->barrier_flag); + for (int i = 0; i < num_devices; ++i) { + params.peer_comm_buffer_ptrs[i] = comm_buffer->comm_ptrs[i]; + } + for (int i = 0; i < num_devices; ++i) { + params.peer_barrier_ptrs_in[i] = + reinterpret_cast(comm_buffer->barrier_in[i]); + } + for (int i = 0; i < num_devices; ++i) { + params.peer_barrier_ptrs_out[i] = + reinterpret_cast(comm_buffer->barrier_out[i]); + } + + if (!CanApplyTwoShotAllReduce(num_elements, dtype, num_devices)) { + // Two-shot all-reduce does not support this case. + // So we fallback to the one-shot strategy. 
+ strategy = tensorrt_llm::AllReduceStrategyType::ONESHOT; + } + + // runtime->concurrent_task_barrier(ctx); + tensorrt_llm::customAllReduce( + params, output.ptr, num_elements, dtype, strategy, stream); + // runtime->concurrent_task_barrier(ctx); } -void forward_kernel_wrapper(AllReduceMeta const *m, +void forward_kernel_wrapper(Context ctx, + Runtime *runtime, + AllReduceMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { cudaStream_t stream; @@ -57,6 +216,7 @@ void forward_kernel_wrapper(AllReduceMeta const *m, assert(input.domain == output.domain); #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(input.ptr, output.ptr, input.domain.get_volume(), @@ -64,6 +224,7 @@ void forward_kernel_wrapper(AllReduceMeta const *m, ncclSum, m->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); #else assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); #endif diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index bd96dbb14..0073093d8 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -16,58 +16,62 @@ #include "flexflow/batch_config.h" #include "flexflow/request_manager.h" #include "legion.h" +#include #include #include namespace FlexFlow { -LegionRuntime::Logger::Category log_bc("BatchConfig"); +Legion::Logger log_bc("BatchConfig"); using Legion::Future; using Legion::Memory; -BatchConfig::BatchConfig() : num_tokens(0) { - for (int i = 0; i < MAX_NUM_REQUESTS; i++) { - requestsInfo[i].first_token_depth_in_request = 0; - requestsInfo[i].first_token_offset_in_batch = 0; - requestsInfo[i].num_tokens_in_batch = 0; - request_completed[i] = true; +BatchConfig::BatchConfig(InferenceMode inference_mode_, int model_id_) + : model_id(model_id_), inference_mode(inference_mode_) { + std::fill(std::begin(request_available), std::end(request_available), 0); + // Don't need to initialize requestInfo ,tokensInfo, causalMask and + // committed_tokens here because they initialize themselves. + // Other fields are already initialized to proper value. 
+} + +BatchConfig::BatchConfig(BatchConfig const &rhs) { + model_id = rhs.model_id; + inference_mode = rhs.inference_mode; + num_available_requests = rhs.num_available_requests; + num_tokens = rhs.num_tokens; + prompt_phase = rhs.prompt_phase; + num_tokens_to_commit = rhs.num_tokens_to_commit; + for (int token_idx = 0; token_idx < num_tokens; token_idx++) { + tokensInfo[token_idx] = rhs.tokensInfo[token_idx]; } - for (int i = 0; i < MAX_NUM_TOKENS; i++) { - tokensInfo[i].abs_depth_in_request = 0; - tokensInfo[i].request_index = 0; - tokensInfo[i].token_id = 0; + for (int request_idx = 0; request_idx < max_requests_per_batch(); + request_idx++) { + if (rhs.request_available[request_idx]) { + request_available[request_idx] = true; + requestsInfo[request_idx] = rhs.requestsInfo[request_idx]; + streamingCacheInfo[request_idx] = rhs.streamingCacheInfo[request_idx]; + causalMask[request_idx] = rhs.causalMask[request_idx]; + } + } + for (int committed_token_idx = 0; committed_token_idx < num_tokens_to_commit; + committed_token_idx++) { + committed_tokens[committed_token_idx] = + rhs.committed_tokens[committed_token_idx]; } } /*static*/ BatchConfig const *BatchConfig::from_future(BatchConfigFuture const &future) { - BatchConfig const *bc = static_cast( + return static_cast( Future(future).get_buffer(Memory::SYSTEM_MEM)); - // Check future size - if (bc->get_mode() == INC_DECODING_MODE) { - assert(Future(future).get_untyped_size() == sizeof(BatchConfig)); - } else if (bc->get_mode() == BEAM_SEARCH_MODE) { - assert(Future(future).get_untyped_size() == sizeof(BeamSearchBatchConfig)); - } else if (bc->get_mode() == TREE_VERIFY_MODE) { - assert(Future(future).get_untyped_size() == sizeof(TreeVerifyBatchConfig)); - } else { - assert(false && "Unsupported inference mode"); - } - return bc; } InferenceMode BatchConfig::get_mode() const { - return INC_DECODING_MODE; + return inference_mode; } int BatchConfig::num_active_requests() const { - int num_requests = 0; - for (int i = 0; i < max_requests_per_batch(); i++) { - if (!request_completed[i]) { - num_requests++; - } - } - return num_requests; + return num_available_requests; } int BatchConfig::num_active_tokens() const { @@ -85,9 +89,14 @@ int BatchConfig::max_tokens_per_batch() { } /*static*/ -int BatchConfig::max_verify_tokens_per_batch() { +int BatchConfig::max_tokens_per_ssm_batch() { + return RequestManager::get_request_manager()->get_max_tokens_per_ssm_batch(); +} + +/*static*/ +int BatchConfig::max_tokens_per_prefilling_batch() { return RequestManager::get_request_manager() - ->get_max_verify_tokens_per_batch(); + ->get_max_tokens_per_prefilling_batch(); } /*static*/ @@ -95,37 +104,102 @@ int BatchConfig::max_sequence_length() { return RequestManager::get_request_manager()->get_max_sequence_length(); } +int BatchConfig::max_output_length() { + return RequestManager::get_request_manager()->get_max_output_length(); +} + +size_t BatchConfig::max_kv_cache_size() { + return RequestManager::get_request_manager()->get_max_kv_cache_size(); +} +bool BatchConfig::streaming_cache() { + return RequestManager::get_request_manager()->get_streaming_cache(); +} + int BatchConfig::max_spec_tree_token_num() { return RequestManager::get_request_manager()->get_max_spec_tree_token_num(); } +int BatchConfig::get_max_tree_depth() { + return RequestManager::get_request_manager()->get_max_tree_depth(); +} + +// Overloading the << operator for the Bitset class +std::ostream &operator<<(std::ostream &os, + BatchConfig::BitMask::Bitset const &bitset) { + for (size_t i = 0; i 
< BatchConfig::max_spec_tree_token_num(); i++) { + os << (bitset.test_bit(i) ? '1' : '0'); + } + return os; +} + +std::ostream &operator<<(std::ostream &os, BatchConfig::BitMask const &bm) { + os << "BitMask {\n" + << " non_tree_cache_size: " << bm.non_tree_cache_size << "\n" + << " tree_or_prompt_size: " << bm.tree_or_prompt_size << "\n" + << " current_layer_size: " << bm.current_layer_size << "\n" + << " bit_mask: [" << bm.bit_mask << "]\n"; + os << "}"; + return os; +} + std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { os << "@@@@@@@@@@@@@@ Batch Config (mode " << bc.get_mode() << ") @@@@@@@@@@@@@@" << std::endl; - // Max values - os << "Max number of requests: " << bc.max_requests_per_batch() << std::endl; - os << "Max number of tokens: " << bc.max_tokens_per_batch() << std::endl; - os << "Max sequence length: " << bc.max_sequence_length() << std::endl; // Current values os << "Number of tokens: " << bc.num_active_tokens() << std::endl; os << "Number of requests: " << bc.num_active_requests() << std::endl; + os << "Prompt phase: " << bc.prompt_phase << std::endl; + os << "Inference mode: "; + switch (bc.inference_mode) { + case INC_DECODING_MODE: + os << "Incremental decoding"; + break; + case TREE_SEARCH_MODE: + os << "Tree search"; + break; + case TREE_VERIFY_MODE: + os << "Tree verify"; + break; + default: + os << "Unknown"; + } + os << std::endl; + if (bc.inference_mode == TREE_VERIFY_MODE) { + os << "Number of tokens to commit: " << bc.num_tokens_to_commit + << std::endl; + } + if (bc.inference_mode == TREE_SEARCH_MODE) { + os << "Model id: " << bc.model_id << std::endl; + } // Per-request info os << "Per-request info:\n"; for (int i = 0; i < bc.max_requests_per_batch(); i++) { - if (!bc.request_completed[i]) { + if (bc.request_available[i]) { os << " Request " << i << ":\n"; os << " First token depth in request: " - << bc.requestsInfo[i].first_token_depth_in_request << std::endl; + << bc.requestsInfo[i].first_token_index_in_request << std::endl; os << " First token offset in batch: " << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; - os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; - os << " Request completed: " << bc.request_completed[i] << std::endl; - os << " Request running: " << bc.request_running[i] << std::endl; + os << " Request available: " << bc.request_available[i] << std::endl; + } + } + + // Streaming cache info + os << "Streaming cache info:\n"; + for (int i = 0; i < bc.max_requests_per_batch(); i++) { + if (bc.request_available[i]) { + os << " Request " << i << ":\n"; + os << " Sink cache size: " << bc.streamingCacheInfo[i].sink_cache_size + << std::endl; + os << " Window cache size: " + << bc.streamingCacheInfo[i].window_cache_size << std::endl; + os << " Window back: " << bc.streamingCacheInfo[i].window_back + << std::endl; + os << " Commit len: " << bc.streamingCacheInfo[i].commit_len + << std::endl; } } @@ -133,15 +207,85 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { os << "Per-token info:\n"; for (int i = 0; i < bc.num_tokens; i++) { os << " Token " << i << ":\n"; + os << " Absolute index in request: " + << bc.tokensInfo[i].abs_index_in_request << std::endl; os << " Absolute depth in request: " << bc.tokensInfo[i].abs_depth_in_request << std::endl; os << " Request index: " << bc.tokensInfo[i].request_index 
<< std::endl; os << " Token id: " << bc.tokensInfo[i].token_id << std::endl; } + + if (bc.inference_mode == TREE_VERIFY_MODE) { + os << "Committed tokens info:\n"; + for (int i = 0; i < bc.num_tokens_to_commit; i++) { + os << " Token " << i << ":\n"; + os << " Index in kv cache: " + << bc.committed_tokens[i].index_in_kv_cache << std::endl; + os << " Request index: " << bc.committed_tokens[i].request_index + << std::endl; + os << " Token depth: " << bc.committed_tokens[i].token_depth + << std::endl; + } + } + + if (bc.inference_mode == TREE_SEARCH_MODE || + bc.inference_mode == TREE_VERIFY_MODE) { + os << "Causal mask:\n"; + for (int i = 0; i < bc.max_requests_per_batch(); i++) { + if (bc.request_available[i]) { + os << " Request " << i << ":\n"; + os << " Non tree cache size: " + << bc.causalMask[i].non_tree_cache_size << std::endl; + os << " Tree or prompt size: " + << bc.causalMask[i].tree_or_prompt_size + + << std::endl; + os << " Current layer size: " << bc.causalMask[i].current_layer_size + << std::endl; + os << " Bit mask: " << std::endl; + for (int j = 0; j < BatchConfig::max_spec_tree_token_num(); j++) { + os << " " << bc.causalMask[i].bit_mask[j] << std::endl; + } + } + } + } + os << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" << std::endl; return os; } +std::ostream &operator<<(std::ostream &os, InferenceResult const &ir) { + os << "InferenceResult {\n" + << " num_token_ids: " << ir.num_token_ids << "\n" + << " num_gumbel_logits: " << ir.num_gumbel_logits << "\n" + << " token_ids: ["; + for (int i = 0; i < ir.num_token_ids; i++) { + os << ir.token_ids[i]; + if (i < ir.num_token_ids - 1) { + os << ", "; + } + } + os << "]\n" + << " probs: ["; + for (int i = 0; i < ir.num_token_ids; i++) { + os << ir.probs[i]; + if (i < ir.num_token_ids - 1) { + os << ", "; + } + } + os << "]\n" + << " gumbel_logits: ["; + for (int i = 0; i < ir.num_gumbel_logits; i++) { + os << ir.gumbel_logits[i]; + if (i < ir.num_gumbel_logits - 1) { + os << ", "; + } + } + os << "]\n" + << "}"; + return os; +} + void BatchConfig::print() const { std::cout << *this << std::endl; } @@ -158,4 +302,75 @@ void BatchConfig::save_to_file(std::string const &filename) const { } } +InferenceResult::InferenceResult(InferenceResult const &other) { + num_token_ids = other.num_token_ids; + num_gumbel_logits = other.num_gumbel_logits; + std::copy(other.token_ids, other.token_ids + num_token_ids, token_ids); + std::copy(other.probs, other.probs + num_token_ids, probs); + std::copy(other.gumbel_logits, + other.gumbel_logits + num_gumbel_logits, + gumbel_logits); +} + +StreamingCacheInfo::StreamingCacheInfo() : StreamingCacheInfo(0, 0) {} + +StreamingCacheInfo::StreamingCacheInfo(int sink_cache_size, + int window_cache_size) + : sink_cache_size(sink_cache_size), window_cache_size(window_cache_size), + window_back(0), commit_len(0) {} + +StreamingCacheInfo::StreamingCacheInfo(StreamingCacheInfo const &other) + : sink_cache_size(other.sink_cache_size), + window_cache_size(other.window_cache_size), + window_back(other.window_back), commit_len(other.commit_len) {} + +StreamingCacheInfo & + StreamingCacheInfo::operator=(StreamingCacheInfo const &other) { + sink_cache_size = other.sink_cache_size; + window_cache_size = other.window_cache_size; + window_back = other.window_back; + commit_len = other.commit_len; + return *this; +} + +// For draft model, we only update the cache when prefill or +// commit the verified result from target model; +// For incremental decoding, we update the cache both in prefill and decoding 
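Editorial note: the StreamingCacheInfo methods defined a little further below keep a fixed number of attention-sink tokens and treat the rest of the cache as a ring buffer. A worked sketch of the index mapping implemented by global_2_cache_index follows; the sink and window sizes are illustrative assumptions, not the request manager's defaults.

#include <cassert>

// Editorial sketch of the sink + sliding-window mapping.
int global_to_cache(int global_index, int sink_cache_size, int window_cache_size) {
  if (global_index < sink_cache_size) {
    return global_index; // sink tokens are never evicted
  }
  // everything after the sink reuses the window slots modulo the window size
  return (global_index - sink_cache_size) % window_cache_size + sink_cache_size;
}

int main() {
  int const sink = 4, window = 8; // made-up sizes
  assert(global_to_cache(2, sink, window) == 2);  // still in the sink region
  assert(global_to_cache(9, sink, window) == 9);  // fits inside the window
  assert(global_to_cache(15, sink, window) == 7); // wraps, reusing slot 7
  return 0;
}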
+void StreamingCacheInfo::commit_cache(int len) { + total_len += len; + commit_len += len; + if (commit_len <= sink_cache_size + window_cache_size) { + window_back = std::max(0, commit_len - sink_cache_size); + } else { + commit_len = sink_cache_size + window_cache_size; + window_back = (window_back + len - 1) % window_cache_size + 1; + } +} + +void StreamingCacheInfo::reset_cache() { + window_back = 0; + commit_len = 0; + total_len = 0; +} + +// page attention: TODO: I think we just need to change the index + +int StreamingCacheInfo::global_2_cache_index(int global_index) { + if (global_index < sink_cache_size) { + return global_index; + } + return (global_index - sink_cache_size) % window_cache_size + sink_cache_size; +} + +int StreamingCacheInfo::cache_2_global_index(int cache_index) { + if (cache_index < sink_cache_size) { + return cache_index; + } + // cache = (global-sink) % window + sink + cache_index -= sink_cache_size; + int num_window = (total_len - sink_cache_size) / window_cache_size - + (window_back <= cache_index); + return sink_cache_size + cache_index + num_window * window_cache_size; +} + }; // namespace FlexFlow diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc deleted file mode 100644 index ff7bf1a81..000000000 --- a/src/runtime/beam_search_batch_config.cc +++ /dev/null @@ -1,200 +0,0 @@ -/* Copyright 2023 CMU, Stanford, Facebook, LANL - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "flexflow/batch_config.h" -#include "flexflow/request_manager.h" -#include "legion.h" -#include -#include - -#define DEFAULT_BEAM_WIDTH 1 -#define DEFAULT_TARGET_ITERATIONS 3 - -namespace FlexFlow { - -LegionRuntime::Logger::Category log_beam_bc("BeamSearchBatchConfig"); - -BeamSearchBatchConfig::BeamSearchBatchConfig() : BatchConfig() { - this->beam_width = DEFAULT_BEAM_WIDTH; - this->target_iterations = DEFAULT_TARGET_ITERATIONS; - current_iteration = 0; -} - -BeamSearchBatchConfig::BeamSearchBatchConfig(int model_id) : BatchConfig() { - this->model_id = model_id; - std::cout << "==================\n" - << "Register Batch Config with Model " << this->model_id - << std::endl; - current_iteration = 0; -} - -BeamSearchBatchConfig::BeamSearchBatchConfig(size_t beam_width, - size_t target_iterations) - : BatchConfig() { - this->beam_width = beam_width; - this->target_iterations = target_iterations; - current_iteration = 0; -} - -BeamSearchBatchConfig::BeamSearchBatchConfig(BeamSearchBatchConfig const &other, - int model_id) - : BatchConfig() { - this->beam_width = other.beam_width; - this->target_iterations = other.target_iterations; - this->model_id = model_id; - current_iteration = 0; -} - -BeamSearchBatchConfig::~BeamSearchBatchConfig() {} - -InferenceMode BeamSearchBatchConfig::get_mode() const { - return BEAM_SEARCH_MODE; -} - -bool BeamSearchBatchConfig::done() const { - assert(current_iteration <= target_iterations); - return current_iteration == target_iterations; -} - -int BeamSearchBatchConfig::max_beam_depth_all_requests() const { - int max_depth_all_requests = 0; - for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { - if (!request_completed[i] && - beamRequestsInfo[i].max_depth > max_depth_all_requests) { - /* printf("\treq %i has max_depth=%i. Increasing max_depth_all_requests " - "from %i\n", - i, - beamRequestsInfo[i].max_depth, - max_depth_all_requests); */ - max_depth_all_requests = beamRequestsInfo[i].max_depth; - } - } - assert(max_depth_all_requests <= BeamSearchBatchConfig::MAX_BEAM_DEPTH); - return max_depth_all_requests; -} - -int BeamSearchBatchConfig::get_speculative_request_num() const { - return speculative_request_num; -} - -int BeamSearchBatchConfig::current_depth_all_requests() const { - int current_depth = 0; - for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { - if (!request_completed[i] && - beamRequestsInfo[i].current_depth > current_depth) { - /* printf("\treq %i has current_depth=%i. 
Increasing " - "current_depth_all_requests from %i\n", - i, - beamRequestsInfo[i].current_depth, - current_depth); */ - current_depth = beamRequestsInfo[i].current_depth; - } - } - assert(current_depth <= BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1); - return current_depth; -} - -std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { - os << "@@@@@@@@@@@@@@ BeamSearchBatchConfig (mode " << bc.get_mode() - << ") @@@@@@@@@@@@@@" << std::endl; - // Max values - os << "Max number of requests: " << bc.max_requests_per_batch() << std::endl; - os << "Max number of tokens: " << bc.max_tokens_per_batch() << std::endl; - os << "Max sequence length: " << bc.max_sequence_length() << std::endl; - // Current values - os << "Number of tokens: " << bc.num_active_tokens() << std::endl; - os << "Number of requests: " << bc.num_active_requests() << std::endl; - // BeamSearch-specific - os << "Model ID: " << bc.model_id << std::endl; - os << "Max Beam Depth (all requests): " << bc.max_beam_depth_all_requests() - << std::endl; - os << "Current depth (all requests): " << bc.current_depth_all_requests() - << std::endl; - os << "Beam width: " << bc.beam_width << std::endl; - os << "Target Iterations: " << bc.target_iterations << std::endl; - os << "Current Iterations: " << bc.current_iteration << std::endl; - - os << "Per-request info:\n"; - for (int i = 0; i < bc.max_requests_per_batch(); i++) { - if (!bc.request_completed[i]) { - os << " Request " << i << ":\n"; - os << " First token depth in request: " - << bc.requestsInfo[i].first_token_depth_in_request << std::endl; - os << " First token offset in batch: " - << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; - os << " Number of tokens in batch: " - << bc.requestsInfo[i].num_tokens_in_batch << std::endl; - os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; - os << " Request completed: " << bc.request_completed[i] << std::endl; - os << " Request running: " << bc.request_running[i] << std::endl; - os << " Beam Search Specific: " << std::endl; - os << " beam_size: " << bc.beamRequestsInfo[i].beam_size - << std::endl; - os << " current_depth: " << bc.beamRequestsInfo[i].current_depth - << std::endl; - os << " max_depth: " << bc.beamRequestsInfo[i].max_depth - << std::endl; - os << " tokens: "; - for (int j = 0; j < bc.MAX_BEAM_WIDTH; j++) { - os << bc.beamRequestsInfo[i].tokens[j] << ", "; - } - os << std::endl; - os << " probs: "; - for (int j = 0; j < bc.MAX_BEAM_WIDTH; j++) { - os << bc.beamRequestsInfo[i].probs[j] << ", "; - } - os << std::endl; - os << " parent_id: "; - for (int j = 0; j < bc.MAX_BEAM_WIDTH; j++) { - os << bc.beamRequestsInfo[i].parent_id[j] << ", "; - } - os << std::endl; - } - } - - os << "Per-token info:\n"; - for (int i = 0; i < bc.num_tokens; i++) { - os << " Token " << i << ":\n"; - os << " Absolute depth in request: " - << bc.tokensInfo[i].abs_depth_in_request << std::endl; - os << " Request index: " << bc.tokensInfo[i].request_index << std::endl; - os << " Token id: " << bc.tokensInfo[i].token_id << std::endl; - os << " Beam Search Specific: " << std::endl; - os << " beam_size: " << bc.beamTokenInfo[i].sub_request_index - << std::endl; - } - os << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" << std::endl; - return os; -} - -void BeamSearchBatchConfig::print() const { - std::cout << *this << std::endl; -} - -void BeamSearchBatchConfig::save_to_file(std::string const &filename) const { 
- std::ofstream outputFile(filename); - if (outputFile.is_open()) { - outputFile << *this << std::endl; - outputFile.close(); - } else { - std::cerr << "Error: Unable to open the batch config output file: " - << filename << std::endl; - assert(false); - } -} - -}; // namespace FlexFlow diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index c7b6e1257..36c68c836 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -114,10 +114,12 @@ std::string get_operator_type_name(OperatorType type) { return "Size"; case OP_TOPK: return "TopK"; + case OP_GUMBEL_TOPK: + return "GumbelTopK"; case OP_ARG_TOPK: return "ArgTopK"; - case OP_BEAM_TOPK: - return "BeamTopK"; + // case OP_BEAM_TOPK: + // return "BeamTopK"; case OP_WHERE: return "Where"; case OP_CEIL: diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 43ce9d700..14e806d49 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -16,6 +16,7 @@ #include "flexflow/utils/file_loader.h" #include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" +#include "flexflow/model.h" #include using namespace std; @@ -27,12 +28,12 @@ FileDataLoader::FileDataLoader(std::string _prompts_filepath, int _num_heads, int _num_kv_heads, size_t _hidden_dim, - size_t _qkv_inner_dim, + size_t _head_dim, int _tensor_parallelism_degree, bool _use_full_precision) : prompts_filepath(_prompts_filepath), weights_folder(_weights_folder), num_heads(_num_heads), num_kv_heads(_num_kv_heads), - hidden_dim(_hidden_dim), qkv_inner_dim(_qkv_inner_dim), + hidden_dim(_hidden_dim), head_dim(_head_dim), tensor_parallelism_degree(_tensor_parallelism_degree), use_full_precision(_use_full_precision){}; @@ -124,6 +125,7 @@ void load_attention_weights_multi_query(DT *ptr, ptr[data_index++] = host_array.at(i); } file_index++; + in.close(); } } @@ -132,16 +134,16 @@ void load_attention_bias_v2(DT *ptr, int num_heads, int num_kv_heads, size_t hidden_dim, - size_t qkv_inner_dim, + size_t head_dim, bool final_bias, std::string layer_name, std::string weights_folder) { - std::string q_file = layer_name + "_wq_bias"; - std::string k_file = layer_name + "_wk_bias"; - std::string v_file = layer_name + "_wv_bias"; + std::string q_file = layer_name + ".q_proj.bias"; + std::string k_file = layer_name + ".k_proj.bias"; + std::string v_file = layer_name + ".v_proj.bias"; std::vector bias_files = {q_file, k_file, v_file}; if (final_bias) { - std::string o_file = layer_name + "_wo_bias"; + std::string o_file = layer_name + ".o_proj.bias"; bias_files.push_back(o_file); } @@ -159,8 +161,8 @@ void load_attention_bias_v2(DT *ptr, int replicate_num = num_heads / num_kv_heads; - size_t qkv_partial_size = qkv_inner_dim * n_heads; - size_t qkv_replicate_size = qkv_inner_dim * num_heads; + size_t qkv_partial_size = head_dim * n_heads; + size_t qkv_replicate_size = head_dim * num_heads; size_t out_partial_size = hidden_dim; size_t partial_size = (file_index < 3) ? 
qkv_partial_size : out_partial_size; @@ -212,24 +214,22 @@ void load_attention_weights_v2(DT *ptr, int num_heads, int num_kv_heads, size_t hidden_dim, - size_t qkv_inner_dim, + size_t head_dim, std::string layer_name, std::string weights_folder, size_t volume, int tensor_parallelism_degree) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file}; int file_index = 0; int base_index = 0; size_t single_proj_size = hidden_dim * - qkv_inner_dim; // size of each of Q,K,V,O weights for a single head + head_dim; // size of each of Q,K,V,O weights for a single head size_t one_weight_file_size = num_heads * single_proj_size; // size of each of Q/K/V/O for all heads @@ -323,8 +323,7 @@ void load_attention_weights_v2(DT *ptr, assert(one_weight_file_size == host_array.size()); int data_index = 0; - int one_partition_size = - qkv_inner_dim * (num_heads / tensor_parallelism_degree); + int one_partition_size = head_dim * (num_heads / tensor_parallelism_degree); for (int i = 0; i < one_weight_file_size; i++) { int part_idx = (i / one_partition_size) % tensor_parallelism_degree; int block_num = (i / one_partition_size); @@ -392,6 +391,7 @@ void FileDataLoader::load_positions(FFModel *ff, // ff->get_parallel_tensor_from_tensor(pt, position_pt); position_pt->set_tensor(ff, dims_vec, data); + free(data); } //--------------------- quantization functions ---------------------- @@ -402,24 +402,22 @@ void FileDataLoader::load_positions(FFModel *ff, void load_attention_weights_quantized(char *ptr, int num_heads, size_t hidden_dim, - size_t qkv_inner_dim, + size_t head_dim, std::string layer_name, std::string weights_folder, DataType data_type, bool use_full_precision) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file, o_file}; int file_index = 0; size_t single_proj_size = hidden_dim * - qkv_inner_dim; // size of each of Q,K,V,O weights for a single head + head_dim; // size of each of Q,K,V,O weights for a single head size_t one_weight_file_size = num_heads * single_proj_size; // size of each of Q/K/V/O for all heads @@ -652,14 +650,21 @@ void load_from_quantized_file(char *ptr, void FileDataLoader::load_quantization_weight(FFModel *ff, Layer *l, - int weight_idx) { - Tensor weight = l->weights[weight_idx]; - size_t volume = 1; + int weight_idx, + size_t volume, + size_t num_replicas, + char *weight, + DataType data_type, + Domain weight_domain) { + // Tensor weight = l->weights[weight_idx]; + size_t volume_ = 1; std::vector dims_vec; - for (int i = 0; i < weight->num_dims; i++) { - dims_vec.push_back(weight->dims[i]); - volume *= weight->dims[i]; + 
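Note on the file_loader.cc hunks above: qkv_inner_dim is renamed to head_dim throughout, weight files move to HuggingFace-style names (layer.q_proj.weight and friends), and each Q/K/V/O file is scattered across tensor-parallel shards in contiguous blocks of head_dim * (num_heads / tensor_parallelism_degree) elements. A minimal standalone sketch of that shard mapping (illustration only, not the loader itself):

#include <cstddef>

// Which tensor-parallel shard element i of one Q/K/V weight file lands in,
// mirroring part_idx = (i / one_partition_size) % tp_degree in
// load_attention_weights_v2. Assumes num_heads is divisible by tp_degree.
int shard_of_element(std::size_t i, std::size_t head_dim, int num_heads,
                     int tp_degree) {
  std::size_t one_partition_size =
      head_dim * static_cast<std::size_t>(num_heads / tp_degree);
  return static_cast<int>((i / one_partition_size) % tp_degree);
}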
for (int i = 0; i < weight_domain.get_dim(); i++) { + int dim_i = weight_domain.hi()[i] - weight_domain.lo()[i] + 1; + dims_vec.push_back(dim_i); + volume_ *= dim_i; } + assert(volume_ == volume * num_replicas); char *data = (char *)malloc(sizeof(char) * volume); std::string weight_filename = removeGuidOperatorName(std::string(l->name)); @@ -671,17 +676,17 @@ void FileDataLoader::load_quantization_weight(FFModel *ff, load_attention_weights_quantized(data, num_heads, hidden_dim, - qkv_inner_dim, + head_dim, weight_filename, weights_folder, - weight->data_type, + data_type, use_full_precision); } // else { // load_attention_bias_quantized(data, // num_heads, // hidden_dim, - // qkv_inner_dim, + // head_dim, // weight_filename, // weights_folder); // } @@ -690,37 +695,47 @@ void FileDataLoader::load_quantization_weight(FFModel *ff, if (weight_idx > 0) { assert(weight_idx == 0 || weight_idx == 1); if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } } load_from_quantized_file(data, volume, join_path({weights_folder, weight_filename}), - weight->data_type, + data_type, use_full_precision); } - ParallelTensor weight_pt; - ff->get_parallel_tensor_from_tensor(weight, weight_pt); - weight_pt->set_tensor(ff, dims_vec, data); + // ParallelTensor weight_pt; + // ff->get_parallel_tensor_from_tensor(weight, weight_pt); + // weight_pt->set_tensor(ff, dims_vec, data); + char *ptr = weight; + for (size_t i = 0; i < num_replicas; i++) { + memcpy(ptr, data, volume * sizeof(char)); + ptr += volume; + } - delete data; + free(data); } template void FileDataLoader::load_single_weight_tensor(FFModel *ff, Layer *l, - int weight_idx) { - Tensor weight = l->weights[weight_idx]; + int weight_idx, + size_t volume, + size_t num_replicas, + DT *weight, + Domain weight_domain) { // Create a buffer to store weight data from the file - size_t volume = 1; + size_t volume_ = 1; std::vector dims_vec; - for (int i = 0; i < weight->num_dims; i++) { - dims_vec.push_back(weight->dims[i]); - volume *= weight->dims[i]; + for (int i = 0; i < weight_domain.get_dim(); i++) { + int dim_i = weight_domain.hi()[i] - weight_domain.lo()[i] + 1; + dims_vec.push_back(dim_i); + volume_ *= dim_i; } - assert(data_type_size(weight->data_type) == sizeof(DT)); + assert(volume_ == volume * num_replicas); + // assert(data_type_size(weight->data_type) == sizeof(DT)); DT *data = (DT *)malloc(sizeof(DT) * volume); std::string weight_filename = removeGuidOperatorName(std::string(l->name)); @@ -734,44 +749,34 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { - if (weight_filename.find("self_attention") != std::string::npos) { - load_attention_weights_multi_query( - data, weight_filename, weights_folder, hidden_dim, num_heads); - } else if (weight_filename.find("attention") != std::string::npos && - weight_filename.rfind("attention") == - weight_filename.length() - strlen("attention")) { - if (weight_idx == 0) { - load_attention_weights_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - weight_filename, - weights_folder, - volume, - tensor_parallelism_degree); - } else { - long long value; - l->get_int_property("final_bias", value); - bool final_bias = (bool)value; - load_attention_bias_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - 
qkv_inner_dim, - final_bias, - weight_filename, - weights_folder); - } - + if (weight_idx == 0) { + load_attention_weights_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + head_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree); } else { - assert(false); + long long value; + l->get_int_property("final_bias", value); + bool final_bias = (bool)value; + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + head_dim, + final_bias, + weight_filename, + weights_folder); } } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { assert(weight_idx >= 0 || weight_idx <= 2); weight_filename += (weight_idx == 0) - ? "_attn_bias" - : ((weight_idx == 1) ? "_weight" : "_bias"); + ? ".attn_bias" + : ((weight_idx == 1) ? ".weight" : ".bias"); std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = join_path({weights_folder, weight_filename}); @@ -781,7 +786,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, assert(weight_idx == 0 || weight_idx == 1); // handle exception if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = @@ -790,40 +795,123 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, } } - // Copy the weight data from the buffer to the weight's ParallelTensor - ParallelTensor weight_pt; - ff->get_parallel_tensor_from_tensor(weight, weight_pt); - weight_pt->set_tensor
(ff, dims_vec, data); + // Copy the weight data from the buffer to the weight + DT *ptr = weight; + for (size_t i = 0; i < num_replicas; i++) { + memcpy(ptr, data, volume * sizeof(DT)); + ptr += volume; + } // Free buffer memory - delete data; + free(data); } -void FileDataLoader::load_weights(FFModel *ff) { +void FileDataLoader::load_weight_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime) { + WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; + + assert(task->regions.size() == regions.size()); + assert(regions.size() == 1); // one weight only + GenericTensorAccessorW weight = helperGetGenericTensorAccessorWO( + args->data_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + Domain weight_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + + switch (args->data_type) { + case DT_HALF: { + args->loader->load_single_weight_tensor(args->ff, + args->layer, + args->weight_idx, + args->volume, + args->num_replicas, + weight.get_half_ptr(), + weight_domain); + break; + } + case DT_FLOAT: { + args->loader->load_single_weight_tensor(args->ff, + args->layer, + args->weight_idx, + args->volume, + args->num_replicas, + weight.get_float_ptr(), + weight_domain); + break; + } + case DT_INT4: + case DT_INT8: { + args->loader->load_quantization_weight(args->ff, + args->layer, + args->weight_idx, + args->volume, + args->num_replicas, + weight.get_byte_ptr(), + args->data_type, + weight_domain); + break; + } + default: + assert(false && "Unsupported data type"); + } +} + +void FileDataLoader::load_weights_parallel(FFModel *ff, + Context ctx, + Runtime *runtime) { + std::vector futures; + for (Layer *l : ff->layers) { if (l->numWeights < 1 || l->name == NULL || strlen(l->name) < 1) { continue; } + for (int i = 0; i < l->numWeights; i++) { Tensor weight = l->weights[i]; if (weight == NULL) { continue; } - switch (weight->data_type) { - case DT_HALF: - load_single_weight_tensor(ff, l, i); - break; - case DT_FLOAT: - load_single_weight_tensor(ff, l, i); - break; - case DT_INT4: - case DT_INT8: - // load weights in quantization - load_quantization_weight(ff, l, i); - break; - default: - assert(false && "Unsupported data type"); + + if (weight->data_type != DT_FLOAT && weight->data_type != DT_HALF && + weight->data_type != DT_INT4 && weight->data_type != DT_INT8) { + assert(false && "Unsupported data type"); + } + + ParallelTensor weight_pt; + ff->get_parallel_tensor_from_tensor(weight, weight_pt); + + // Create task arguments + size_t volume = 1, num_replicas = 1; + if (weight_pt->sync_type == ParameterSyncType::NCCL) { + for (int i = 0; i < weight_pt->num_dims; i++) { + if (weight_pt->dims[i].is_replica_dim) { + num_replicas *= weight_pt->dims[i].size; + } + } + } else if (weight_pt->sync_type == ParameterSyncType::PS) { + num_replicas = 1; + } else { + num_replicas = 1; + } + for (int i = 0; i < weight->num_dims; i++) { + volume *= weight->dims[i]; } + WeightLoadTaskArgs args( + ff, this, l, i, volume, num_replicas, weight->data_type); + // launch task asynchronously + TaskLauncher launcher(LOAD_WEIGHT_TASK_ID, + TaskArgument(&args, sizeof(WeightLoadTaskArgs))); + launcher.add_region_requirement(RegionRequirement( + weight_pt->region, WRITE_ONLY, EXCLUSIVE, weight_pt->region)); + launcher.add_field(0, FID_DATA); + futures.push_back(runtime->execute_task(ctx, launcher)); } } + + // Wait for all tasks to complete + for (Future &f : futures) { + f.get_void_result(); + } } diff 
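The load_weights_parallel rework above replaces the old serial load_weights loop: each weight gets its own LOAD_WEIGHT_TASK_ID task with a WRITE_ONLY region requirement, the task fills volume * num_replicas elements directly into the mapped region, and all launches are synchronized at the end through their futures. A reduced sketch of the launch-then-wait pattern (placeholder task id, task arguments, and field id standing in for LOAD_WEIGHT_TASK_ID, WeightLoadTaskArgs, and FID_DATA):

#include <vector>
#include "legion.h"

using namespace Legion;

// Launch one loader task per weight region, then block until every task
// has finished so all weights are in place before serving starts.
void launch_weight_loaders(Context ctx, Runtime *runtime, TaskID task_id,
                           std::vector<LogicalRegion> const &weight_regions) {
  std::vector<Future> futures;
  for (LogicalRegion const &region : weight_regions) {
    TaskLauncher launcher(task_id, TaskArgument(nullptr, 0));
    launcher.add_region_requirement(
        RegionRequirement(region, WRITE_ONLY, EXCLUSIVE, region));
    launcher.add_field(0, /*FieldID*/ 0);
    futures.push_back(runtime->execute_task(ctx, launcher));
  }
  for (Future &f : futures) {
    f.get_void_result();
  }
}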
--git a/src/runtime/graph.cc b/src/runtime/graph.cc index f8e8240cc..30f42327f 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -21,7 +21,7 @@ #include "flexflow/ops/argmax.h" #include "flexflow/ops/attention.h" #include "flexflow/ops/batch_matmul.h" -#include "flexflow/ops/beam_topk.h" +// #include "flexflow/ops/beam_topk.h" #include "flexflow/ops/cast.h" #include "flexflow/ops/concat.h" #include "flexflow/ops/conv_2d.h" @@ -33,6 +33,7 @@ #include "flexflow/ops/flat.h" #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" +#include "flexflow/ops/gumbel_topk.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" @@ -66,10 +67,10 @@ namespace FlexFlow::PCG { using namespace Legion; using FlexFlow::MachineView; -LegionRuntime::Logger::Category log_graph("graph"); -LegionRuntime::Logger::Category log_simplify("graph_simplify"); +Legion::Logger log_graph("graph"); +Legion::Logger log_simplify("graph_simplify"); -const Node Node::INVALID_NODE = Node(); +Node const Node::INVALID_NODE = Node(); Node::Node(void) : guid(0), ptr(NULL) {} @@ -2326,21 +2327,31 @@ GraphOptimalViewSerialized sez.serialize(attn->layer_guid.id); sez.serialize(attn->layer_guid.transformer_layer_id); sez.serialize(attn->layer_guid.model_id); - sez.serialize(attn->oProjSize); + sez.serialize(attn->o_dim); sez.serialize(attn->num_q_heads); - sez.serialize(attn->qProjSize); - sez.serialize(attn->vProjSize); + sez.serialize(attn->qk_dim); + sez.serialize(attn->v_dim); sez.serialize(attn->dropout); sez.serialize(attn->qkv_bias); sez.serialize(attn->final_bias); sez.serialize(attn->add_zero_attn); - sez.serialize(attn->apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.rope_theta); + sez.serialize(attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.rope_type.c_str(), + attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.factor); + sez.serialize(attn->rotary_embedding_meta.low_freq_factor); + sez.serialize(attn->rotary_embedding_meta.high_freq_factor); + sez.serialize( + attn->rotary_embedding_meta.original_max_position_embeddings); sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); sez.serialize(attn->position_bias); sez.serialize(attn->quantization_type); sez.serialize(attn->offload); + sez.serialize(attn->streaming_cache); sez.serialize(attn->num_kv_heads); sez.serialize(attn->tensor_parallelism_degree); sez.serialize(strlen(attn->name)); @@ -2353,20 +2364,31 @@ GraphOptimalViewSerialized sez.serialize(attn->layer_guid.id); sez.serialize(attn->layer_guid.transformer_layer_id); sez.serialize(attn->layer_guid.model_id); - sez.serialize(attn->oProjSize); + sez.serialize(attn->o_dim); sez.serialize(attn->num_q_heads); - sez.serialize(attn->qProjSize); - sez.serialize(attn->vProjSize); + sez.serialize(attn->qk_dim); + sez.serialize(attn->v_dim); sez.serialize(attn->dropout); sez.serialize(attn->qkv_bias); sez.serialize(attn->final_bias); sez.serialize(attn->add_zero_attn); - sez.serialize(attn->apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.rope_theta); + sez.serialize(attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.rope_type.c_str(), + 
attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.factor); + sez.serialize(attn->rotary_embedding_meta.low_freq_factor); + sez.serialize(attn->rotary_embedding_meta.high_freq_factor); + sez.serialize( + attn->rotary_embedding_meta.original_max_position_embeddings); sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); sez.serialize(attn->position_bias); + sez.serialize(attn->streaming_cache); sez.serialize(attn->num_kv_heads); + sez.serialize(attn->tensor_parallelism_degree); sez.serialize(strlen(attn->name)); sez.serialize(attn->name, strlen(attn->name)); break; @@ -2377,15 +2399,24 @@ GraphOptimalViewSerialized sez.serialize(attn->layer_guid.id); sez.serialize(attn->layer_guid.transformer_layer_id); sez.serialize(attn->layer_guid.model_id); - sez.serialize(attn->oProjSize); + sez.serialize(attn->o_dim); sez.serialize(attn->num_q_heads); - sez.serialize(attn->qProjSize); - sez.serialize(attn->vProjSize); + sez.serialize(attn->qk_dim); + sez.serialize(attn->v_dim); sez.serialize(attn->dropout); sez.serialize(attn->qkv_bias); sez.serialize(attn->final_bias); sez.serialize(attn->add_zero_attn); - sez.serialize(attn->apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.rope_theta); + sez.serialize(attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.rope_type.c_str(), + attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.factor); + sez.serialize(attn->rotary_embedding_meta.low_freq_factor); + sez.serialize(attn->rotary_embedding_meta.high_freq_factor); + sez.serialize( + attn->rotary_embedding_meta.original_max_position_embeddings); sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); @@ -2805,8 +2836,9 @@ void FFModel::deserialize_graph_optimal_view( int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, offload, position_bias; + bool qkv_bias, final_bias, add_zero_attn, scaling_query, + qk_prod_scaling, offload, streaming_cache, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; DataType quantization_type; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); @@ -2821,13 +2853,24 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(qkv_bias); dez.deserialize(final_bias); dez.deserialize(add_zero_attn); - dez.deserialize(apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.rope_theta); + size_t rope_type_len; + char rope_type[1024] = {0}; + dez.deserialize(rope_type_len); + dez.deserialize(rope_type, rope_type_len); + rotary_embedding_meta.rope_type = std::string(rope_type); + dez.deserialize(rotary_embedding_meta.factor); + dez.deserialize(rotary_embedding_meta.low_freq_factor); + dez.deserialize(rotary_embedding_meta.high_freq_factor); + dez.deserialize(rotary_embedding_meta.original_max_position_embeddings); dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); dez.deserialize(position_bias); dez.deserialize(quantization_type); dez.deserialize(offload); + dez.deserialize(streaming_cache); dez.deserialize(num_kv_heads); 
dez.deserialize(tensor_parallelism_degree); size_t name_len; @@ -2845,13 +2888,14 @@ void FFModel::deserialize_graph_optimal_view( params.final_bias = final_bias; params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; - params.apply_rotary_embedding = apply_rotary_embedding; + params.rotary_embedding_meta = rotary_embedding_meta; params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; params.position_bias = position_bias; params.quantization_type = quantization_type; params.offload = offload; + params.streaming_cache = streaming_cache; params.num_kv_heads = num_kv_heads; params.tensor_parallelism_degree = tensor_parallelism_degree; strcpy(params.name, name); @@ -2860,10 +2904,12 @@ void FFModel::deserialize_graph_optimal_view( } case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(num_inputs == 1); - int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads; + int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads, + tensor_parallelism_degree; float dropout, scaling_factor; bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; + scaling_query, qk_prod_scaling, position_bias, streaming_cache; + RotaryEmbeddingMeta rotary_embedding_meta; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); @@ -2877,12 +2923,24 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(qkv_bias); dez.deserialize(final_bias); dez.deserialize(add_zero_attn); - dez.deserialize(apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.rope_theta); + size_t rope_type_len; + char rope_type[1024] = {0}; + dez.deserialize(rope_type_len); + dez.deserialize(rope_type, rope_type_len); + rotary_embedding_meta.rope_type = std::string(rope_type); + dez.deserialize(rotary_embedding_meta.factor); + dez.deserialize(rotary_embedding_meta.low_freq_factor); + dez.deserialize(rotary_embedding_meta.high_freq_factor); + dez.deserialize(rotary_embedding_meta.original_max_position_embeddings); dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); dez.deserialize(position_bias); + dez.deserialize(streaming_cache); dez.deserialize(num_kv_heads); + dez.deserialize(tensor_parallelism_degree); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -2898,12 +2956,14 @@ void FFModel::deserialize_graph_optimal_view( params.final_bias = final_bias; params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; - params.apply_rotary_embedding = apply_rotary_embedding; + params.rotary_embedding_meta = rotary_embedding_meta; params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; params.position_bias = position_bias; + params.streaming_cache = streaming_cache; params.num_kv_heads = num_kv_heads; + params.tensor_parallelism_degree = tensor_parallelism_degree; strcpy(params.name, name); node = get_or_create_node(inputs[0], params); @@ -2914,8 +2974,9 @@ void FFModel::deserialize_graph_optimal_view( int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, offload, position_bias; + bool qkv_bias, final_bias, add_zero_attn, scaling_query, + qk_prod_scaling, offload, position_bias; + 
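A note on the graph.cc changes around here: the single apply_rotary_embedding flag is replaced by the full RotaryEmbeddingMeta, and its rope_type string is serialized as a length followed by the raw bytes, so the deserializer must read the same two fields in the same order into a bounded buffer (1024 bytes in this patch). A compact sketch of that symmetric pattern, assuming a serializer object with the serialize(size_t) / serialize(void const *, size_t) style interface used by sez and dez:

#include <cstddef>
#include <string>

// Write a std::string as (length, bytes) and read it back the same way.
// SEZ/DEZ are stand-ins for the serializer/deserializer types in graph.cc.
template <typename SEZ>
void serialize_string(SEZ &sez, std::string const &s) {
  sez.serialize(s.size());
  sez.serialize(s.c_str(), s.size());
}

template <typename DEZ>
std::string deserialize_string(DEZ &dez) {
  std::size_t len = 0;
  dez.deserialize(len);
  char buf[1024] = {0}; // bounded, matching the rope_type buffer above
  dez.deserialize(buf, len);
  return std::string(buf, len);
}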
RotaryEmbeddingMeta rotary_embedding_meta; DataType quantization_type; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); @@ -2930,7 +2991,17 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(qkv_bias); dez.deserialize(final_bias); dez.deserialize(add_zero_attn); - dez.deserialize(apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.rope_theta); + size_t rope_type_len; + char rope_type[1024] = {0}; + dez.deserialize(rope_type_len); + dez.deserialize(rope_type, rope_type_len); + rotary_embedding_meta.rope_type = std::string(rope_type); + dez.deserialize(rotary_embedding_meta.factor); + dez.deserialize(rotary_embedding_meta.low_freq_factor); + dez.deserialize(rotary_embedding_meta.high_freq_factor); + dez.deserialize(rotary_embedding_meta.original_max_position_embeddings); dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); @@ -2954,7 +3025,7 @@ void FFModel::deserialize_graph_optimal_view( params.final_bias = final_bias; params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; - params.apply_rotary_embedding = apply_rotary_embedding; + params.rotary_embedding_meta = rotary_embedding_meta; params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; @@ -2972,14 +3043,18 @@ void FFModel::deserialize_graph_optimal_view( node = TopK::deserialize(*this, dez, inputs, num_inputs); break; } - case OP_ARG_TOPK: { - node = ArgTopK::deserialize(*this, dez, inputs, num_inputs); + case OP_GUMBEL_TOPK: { + node = GumbelTopK::deserialize(*this, dez, inputs, num_inputs); break; } - case OP_BEAM_TOPK: { - node = BeamTopK::deserialize(*this, dez, inputs, num_inputs); + case OP_ARG_TOPK: { + node = ArgTopK::deserialize(*this, dez, inputs, num_inputs); break; } + // case OP_BEAM_TOPK: { + // node = BeamTopK::deserialize(*this, dez, inputs, num_inputs); + // break; + // } case OP_SAMPLING: { node = Sampling::deserialize(*this, dez, inputs, num_inputs); break; @@ -3152,21 +3227,21 @@ void FFModel::deserialize_graph_optimal_view( optimal_views[guid_to_nodes[guid]] = view; } assert(dez.get_remaining_bytes() == 0); - printf("Deserialized Views...\n"); - for (auto const &it : optimal_views) { - printf("node[%zu]: type(%s) view(%d %d %d) ", - it.first.guid, - it.first.to_string().c_str(), - it.second.ndims, - it.second.dim[0], - it.second.start_device_id); - auto const &list = graph->inEdges.at(it.first); - for (auto const &it2 : list) { - Edge e = it2; - printf(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx); - } - printf("\n"); - } + // printf("Deserialized Views...\n"); + // for (auto const &it : optimal_views) { + // printf("node[%zu]: type(%s) view(%d %d %d) ", + // it.first.guid, + // it.first.to_string().c_str(), + // it.second.ndims, + // it.second.dim[0], + // it.second.start_device_id); + // auto const &list = graph->inEdges.at(it.first); + // for (auto const &it2 : list) { + // Edge e = it2; + // printf(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx); + // } + // printf("\n"); + // } } }; // namespace FlexFlow diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 2a94df8b4..ed0c2ed69 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -13,20 +13,24 @@ * limitations under the License. 
*/ +#include "flexflow/batch_config.h" #include "flexflow/ffconst_utils.h" #include "flexflow/graph.h" +#include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/ops/fused.h" #include "flexflow/ops/noop.h" #include "flexflow/parallel_ops/parallel_op.h" #include "flexflow/request_manager.h" +#include +#include namespace FlexFlow { using namespace Legion; -LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); -LegionRuntime::Logger::Category log_offload("Offloading"); +Legion::Logger log_inf_mgr("InferenceManager"); +Legion::Logger log_offload("Offloading"); InferenceManager::InferenceManager() {} @@ -53,11 +57,15 @@ bool parallel_tensor_list_overlaps(std::vector const &list1, return false; } -void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { +void InferenceManager::compile_model_and_allocate_buffer(FFModel *model, + bool is_llm) { // TODO: currently assume there is a single data-parallel pipeline // (i.e., data-parallel-degree == 1) assert(model->config.data_parallelism_degree == 1); - model->config.batchSize = BatchConfig::max_tokens_per_batch(); + model->config.batchSize = + std::max(is_llm ? BatchConfig::max_tokens_per_batch() + : BatchConfig::max_tokens_per_ssm_batch(), + BatchConfig::max_tokens_per_prefilling_batch()); model->compile_inference(); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; @@ -230,41 +238,41 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { } // print optimized graph - for (size_t i = 0; i < model->operators.size(); i++) { - Op *op = model->operators[i]; - if (op->op_type == OP_INPUT || op->op_type == OP_WEIGHT) { - continue; - } - printf("operator[%zu]: type(%s) guid(%lu)\n", - i, - get_operator_type_name(model->operators[i]->op_type).c_str(), - model->operators[i]->op_guid); - for (int j = 0; j < op->numInputs; j++) { - assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); - LogicalRegion handle = tensor_buffer[op->inputs[j]][0]->region; - printf("\tinputs[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); - } - for (int j = 0; j < op->numOutputs; j++) { - LogicalRegion handle = tensor_buffer[op->outputs[j]][0]->region; - printf("\toutputs[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); - } - for (int j = 0; j < op->numWeights; j++) { - LogicalRegion handle = op->weights[j]->region; - printf("\tweights[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); - } - } + // for (size_t i = 0; i < model->operators.size(); i++) { + // Op *op = model->operators[i]; + // if (op->op_type == OP_INPUT || op->op_type == OP_WEIGHT) { + // continue; + // } + // printf("operator[%zu]: type(%s) guid(%lu)\n", + // i, + // get_operator_type_name(model->operators[i]->op_type).c_str(), + // model->operators[i]->op_guid); + // for (int j = 0; j < op->numInputs; j++) { + // assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); + // LogicalRegion handle = tensor_buffer[op->inputs[j]][0]->region; + // printf("\tinputs[%d] mapped_region(%d,%d,%d)\n", + // j, + // handle.get_index_space().get_id(), + // handle.get_field_space().get_id(), + // handle.get_tree_id()); + // } + // for (int j = 0; j < op->numOutputs; j++) { + // LogicalRegion handle = tensor_buffer[op->outputs[j]][0]->region; + // 
printf("\toutputs[%d] mapped_region(%d,%d,%d)\n", + // j, + // handle.get_index_space().get_id(), + // handle.get_field_space().get_id(), + // handle.get_tree_id()); + // } + // for (int j = 0; j < op->numWeights; j++) { + // LogicalRegion handle = op->weights[j]->region; + // printf("\tweights[%d] mapped_region(%d,%d,%d)\n", + // j, + // handle.get_index_space().get_id(), + // handle.get_field_space().get_id(), + // handle.get_tree_id()); + // } + // } } void InferenceManager::init_operators_inference(FFModel *model) { @@ -307,26 +315,8 @@ void InferenceManager::init_operators_inference(FFModel *model) { FutureMap InferenceManager::inference(FFModel *model, int index, BatchConfig const &bc) { - if (bc.get_mode() == INC_DECODING_MODE) { - BatchConfigFuture bcf = Future::from_value(bc); - return inference(model, index, bcf); - } else if (bc.get_mode() == BEAM_SEARCH_MODE) { - BatchConfig const *bc_ptr = &bc; - BeamSearchBatchConfig const *bsbc_ptr = - static_cast(bc_ptr); - BeamSearchBatchConfigFuture bcf = - Future::from_value(*bsbc_ptr); - return inference(model, index, bcf); - } else if (bc.get_mode() == TREE_VERIFY_MODE) { - BatchConfig const *bc_ptr = &bc; - TreeVerifyBatchConfig const *tvbc_ptr = - static_cast(bc_ptr); - TreeVerifyBatchConfigFuture bcf = - Future::from_value(*tvbc_ptr); - return inference(model, index, bcf); - } else { - assert(false && "Unsupported inference mode"); - } + BatchConfigFuture bcf = Future::from_value(bc); + return inference(model, index, bcf); } FutureMap InferenceManager::inference(FFModel *model, @@ -503,6 +493,23 @@ void FFModel::set_transformer_layer_id(int id) { assert(id < MAX_NUM_TRANSFORMER_LAYERS); } +void FFModel::set_num_transformer_layers(int num_layers) { + num_transformer_layers = num_layers; +} + +void FFModel::set_num_kv_heads(int num_heads) { + num_kv_heads = num_heads; +} + +void FFModel::set_qkv_dim(int qkv) { + qkv_dim = qkv; +} + +void FFModel::set_size_dt(int dt) { + printf("Setting size_dt to %d\n", dt); + size_dt = dt; +} + void FFModel::set_position_offset(int offset) { assert(offset == 0 || offset == 2); position_offset = offset; @@ -535,7 +542,7 @@ void FFModel::compile_inference() { deserialize_graph_optimal_view(dez, best_graph, optimal_views); operators.clear(); convert_graph_to_operators(best_graph, optimal_views); - best_graph->print_dot(); + // best_graph->print_dot(); delete best_graph; for (auto const &layer : layers) { // map inputs to parallel tensor @@ -656,6 +663,7 @@ void FFModel::compile_inference() { false /*must*/, 0 /*mapper_id*/, view.hash() /*MappingTagID*/); + index_launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, index_launcher); fm.wait_all_results(); int idx = 0; @@ -691,4 +699,87 @@ std::string join_path(std::vector const &paths) { return joined; } +void EmissionMachine::wait_until_next_request() { + // use last_request_time to determine the next request time + // and sleep until then + if (last_request_time_ms == 0) { + last_request_time_ms = Realm::Clock::current_time_in_microseconds() * 1e-3; + } + double current_time = Realm::Clock::current_time_in_microseconds() * 1e-3; + double time_to_sleep = + get_next_interval_ms() - (current_time - last_request_time_ms); + if (time_to_sleep > 0) { + usleep(static_cast(time_to_sleep * 1e3)); + elapsed_time_ms += time_to_sleep; + } + last_request_time_ms = Realm::Clock::current_time_in_microseconds() * 1e-3; +} + +double EmissionMachine::get_elapsed_time_ms() { + return elapsed_time_ms; +} + +EmissionTrace::EmissionTrace(json const 
&json_obj) { + prompt = json_obj["prompt"].get(); + input_length = json_obj["input_length"].get(); + output_length = json_obj["output_length"].get(); + slo_ratio = json_obj["slo_ratio"].get(); + emission_time_ms = json_obj["emission_time_ms"].get(); +} + +json EmissionTrace::to_json() const { + json json_obj; + json_obj["prompt"] = prompt; + json_obj["input_length"] = input_length; + json_obj["output_length"] = output_length; + json_obj["slo_ratio"] = slo_ratio; + json_obj["emission_time_ms"] = emission_time_ms; + return json_obj; +} + +double ConstantEmissionMachine::get_next_interval_ms() { + return interval_ms; +} + +double PoissonEmissionMachine::get_next_interval_ms() { + // Note that these are static so multiple instances will share the same + // generator and distribution. + static std::default_random_engine generator( + std::chrono::system_clock::now().time_since_epoch().count()); + static std::exponential_distribution distribution(lambda); + return distribution(generator) * 1e3; +} + +double TraceEmissionMachine::get_next_interval_ms() { + if (timestamps.empty()) { + return 0; + } + double next_interval = timestamps[idx] - elapsed_time_ms; + idx++; + return next_interval; +} + +double EmissionMachine::sample_slo_ratio() { + assert(!slo_ratios.empty()); + static std::default_random_engine generator( + std::chrono::system_clock::now().time_since_epoch().count()); + static std::uniform_real_distribution distribution(0.0, 1.0); + double r = distribution(generator); + + for (auto const &pair : slo_ratios) { + if (r < pair.second) { + return pair.first; + } + } + return slo_ratios.back().first; +} + +double TraceEmissionMachine::sample_slo_ratio() { + // NOTE: Should be called before wait_until_next_request. + if (ratios.empty()) { + return 1.0; + } + double next_slo_ratio = ratios[idx]; + return next_slo_ratio; +} }; // namespace FlexFlow diff --git a/src/runtime/layer.cc b/src/runtime/layer.cc index 8f33f6db8..72e71688c 100644 --- a/src/runtime/layer.cc +++ b/src/runtime/layer.cc @@ -87,6 +87,11 @@ void Layer::add_int_vector_property(std::string const &key, int_vector_properties[key] = value; } +void Layer::add_string_property(std::string const &key, + std::string const &value) { + string_properties[key] = value; +} + void Layer::add_initializer(std::string const &key, Initializer *initializer) { initializers[key] = initializer; } @@ -125,6 +130,18 @@ bool Layer::get_int_vector_property(std::string const &key, } } +bool Layer::get_string_property(std::string const &key, + std::string &value) const { + auto const &it = string_properties.find(key); + if (it == string_properties.end()) { + assert(false); + return false; + } else { + value = it->second; + return true; + } +} + bool Layer::get_initializer(std::string const &key, Initializer *&initializer) const { auto const &it = initializers.find(key); diff --git a/src/runtime/memory_allocator.cc b/src/runtime/memory_allocator.cc index 06a7c468a..46bef18c8 100644 --- a/src/runtime/memory_allocator.cc +++ b/src/runtime/memory_allocator.cc @@ -14,6 +14,7 @@ */ #include "flexflow/utils/memory_allocator.h" +#include "flexflow/mapper.h" namespace FlexFlow { @@ -21,14 +22,30 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; using Realm::RegionInstance; +using namespace Legion; +using namespace Mapping; + +Legion::Logger log_ff_mem_allocator("MemoryAllocator"); MemoryAllocator::MemoryAllocator(Memory _memory) : memory(_memory), reserved_ptr(nullptr), instance_ptr(nullptr), reserved_total_size(0), reserved_allocated_size(0), - 
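The EmissionMachine classes above drive request arrival for the trace generator: ConstantEmissionMachine waits a fixed interval, PoissonEmissionMachine draws exponentially distributed inter-arrival gaps (in seconds, scaled to milliseconds), TraceEmissionMachine replays recorded timestamps, and SLO ratios are drawn from a table of (ratio, cumulative threshold) pairs. A small self-contained sketch of the two sampling ideas (illustration only):

#include <random>
#include <utility>
#include <vector>

// Exponential inter-arrival gaps: with rate lambda (requests per second),
// the next gap is Exp(lambda) seconds; scale by 1e3 for milliseconds, as in
// PoissonEmissionMachine::get_next_interval_ms.
double next_interval_ms(double lambda, std::mt19937 &rng) {
  std::exponential_distribution<double> gap(lambda);
  return gap(rng) * 1e3;
}

// SLO ratio sampling: draw u uniformly in [0, 1) and return the first ratio
// whose threshold exceeds u, falling back to the last entry.
double sample_slo_ratio(std::vector<std::pair<double, double>> const &slo_ratios,
                        std::mt19937 &rng) {
  std::uniform_real_distribution<double> unif(0.0, 1.0);
  double u = unif(rng);
  for (auto const &p : slo_ratios) {
    if (u < p.second) {
      return p.first;
    }
  }
  return slo_ratios.back().first;
}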
instance_total_size(0), instance_allocated_size(0) {} + instance_total_size(0), instance_allocated_size(0), + log_instance_creation(false) { + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + for (int i = 1; i < argc; i++) { + if (!strcmp(argv[i], "--log-instance-creation")) { + log_instance_creation = true; + break; + } + } +} void MemoryAllocator::create_legion_instance(RegionInstance &inst, - size_t size) { + size_t size, + char const *task_name) { // Assert that we have used up previously created region instance assert(instance_total_size == instance_allocated_size); Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), @@ -38,6 +55,16 @@ void MemoryAllocator::create_legion_instance(RegionInstance &inst, Realm::RegionInstance::create_instance( inst, memory, bounds, field_sizes, 0, Realm::ProfilingRequestSet()) .wait(); + if (log_instance_creation) { + log_ff_mem_allocator.print( + "Created instance in memory_kind: %s memory_id: %llx size: %zu " + "(capacity %lu) task_name: %s", + Legion::Mapping::Utilities::to_string(memory.kind()), + memory.id, + size, + memory.capacity(), + ((task_name != NULL) ? task_name : "unknown")); + } instance_ptr = inst.pointer_untyped(0, 0); instance_total_size = size; instance_allocated_size = 0; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 1fa281777..2a72029c5 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -30,7 +30,7 @@ #include "flexflow/ops/attention.h" #include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/batch_norm.h" -#include "flexflow/ops/beam_topk.h" +// #include "flexflow/ops/beam_topk.h" #include "flexflow/ops/cache.h" #include "flexflow/ops/cast.h" #include "flexflow/ops/concat.h" @@ -44,6 +44,7 @@ #include "flexflow/ops/fused.h" #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" +#include "flexflow/ops/gumbel_topk.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" @@ -82,8 +83,8 @@ namespace FlexFlow { using namespace Legion; -LegionRuntime::Logger::Category log_model("Model"); -LegionRuntime::Logger::Category log_measure("measure"); +Legion::Logger log_model("Model"); +Legion::Logger log_measure("measure"); Op::Op(FFModel &model, OperatorType otype, @@ -93,10 +94,10 @@ Op::Op(FFModel &model, int numWeights, bool allocate_weights, int numOutputs, - const ParallelTensor input1, - const ParallelTensor input2, - const ParallelTensor input3, - const ParallelTensor input4) + ParallelTensor const input1, + ParallelTensor const input2, + ParallelTensor const input3, + ParallelTensor const input4) : Op(model, otype, dtype, @@ -116,10 +117,10 @@ Op::Op(FFModel &model, int _numInputs, int _numWeights, int _numOutputs, - const ParallelTensor _input1, - const ParallelTensor _input2, - const ParallelTensor _input3, - const ParallelTensor _input4) + ParallelTensor const _input1, + ParallelTensor const _input2, + ParallelTensor const _input3, + ParallelTensor const _input4) : op_type(_otype), data_type(_dtype), op_guid(model.op_global_guid++), numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs), profiling(model.config.profiling), @@ -604,6 +605,29 @@ ncclComm_t Op::init_nccl_comms_task(Task const *task, checkNCCL(ncclCommInitRank(&ncclComm, allRanks, ncclId, myRank)); // fprintf(stderr, "ncclComm(%p) allRanks(%d) myRank(%d) ncclId(%p)\n", // ncclComm, allRanks, myRank, ncclId); + + // Double check that we already 
enabled P2P access between all GPUs + for (int i = 0; i < allRanks; i++) { + if (i == myRank) { + continue; + } + cudaError_t err = cudaDeviceEnablePeerAccess(i, 0); + if (err == cudaSuccess) { + printf("P2P access successfully enabled between GPU %d and GPU %d\n", + myRank, + i); + } else if (err == cudaErrorPeerAccessAlreadyEnabled) { + printf("P2P access is already enabled between GPU %d and GPU %d\n", + myRank, + i); + } else { + printf("Failed to enable P2P access between GPU %d and GPU %d: %s\n", + myRank, + i, + cudaGetErrorString(err)); + assert(false && "Failed to enable P2P access"); + } + } return ncclComm; } @@ -1035,9 +1059,9 @@ void Op::register_output_parallel_dims( operation); } -int Op::get_output_to_input_dim_mapping(const ParallelTensor output, +int Op::get_output_to_input_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor input) { + ParallelTensor const input) { int output_idx = -1, input_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -1070,9 +1094,9 @@ int Op::get_output_to_input_dim_mapping(const ParallelTensor output, return -1; } -int Op::get_output_to_weight_dim_mapping(const ParallelTensor output, +int Op::get_output_to_weight_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor weight) { + ParallelTensor const weight) { int output_idx = -1, weight_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -1242,12 +1266,15 @@ void Op::set_argumentmap_for_init_inference(FFModel const &ff, #define DIMFUNC(DIM) \ case DIM: { \ Rect rect = domain; \ - int idx = 0; \ + int idx = 0, num_devices = rect.volume(); \ for (PointInRectIterator it(rect); it(); it++) { \ FFHandler handle = ff.handlers[view.get_device_id(*it)]; \ if (op_type == OP_ALLREDUCE) { \ ncclComm_t *nccl_comms = ff.find_nccl_comms(view); \ - handle.ncclComm = nccl_comms[idx++]; \ + handle.ncclComm = nccl_comms[idx]; \ + handle.num_devices = num_devices; \ + handle.device_id = idx; \ + idx++; \ } \ argmap.set_point(*it, TaskArgument(&handle, sizeof(FFHandler))); \ } \ @@ -1589,41 +1616,47 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) model_id = model_counter++; } +#ifdef FF_USE_NCCL +void FFModel::finish_nccl_comms() { + Context ctx = config.lg_ctx; + Runtime *runtime = config.lg_hlr; + for (auto const &comm : view_hash_to_nccl_comms) { + // Find the machine view that has the hash + MachineView view; + for (size_t l = 0; l < operators.size(); l++) { + view = operators[l]->outputs[0]->machine_view; + if (view.hash() == comm.first) { + break; + } + } + assert(view.hash() == comm.first && "Cannot find the machine view"); + IndexSpace task_is = get_or_create_task_is(view); + Domain domain = runtime->get_index_space_domain(ctx, task_is); + ArgumentMap argmap; + int idx = 0; + for (Domain::DomainPointIterator it(domain); it; it++, idx++) { + argmap.set_point(*it, + TaskArgument(&comm.second[idx], sizeof(ncclComm_t))); + } + IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, + task_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + comm.first); + FutureMap fm = runtime->execute_index_space(ctx, index_launcher); + fm.wait_all_results(); + } +} +#endif + FFModel::~FFModel() { // Destroy nccl communication groups #ifdef FF_USE_NCCL if (config.computationMode == COMP_MODE_TRAINING) { - Context ctx = config.lg_ctx; - Runtime *runtime = config.lg_hlr; - for (auto const &comm : view_hash_to_nccl_comms) { - // Find the machine view that has 
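The NCCL initialization above now force-enables peer-to-peer access between every GPU pair and asserts if enabling fails. A defensive variant of the same idea (a sketch, not the patch's code) first asks the driver whether P2P is possible so unsupported pairs are reported instead of aborting:

#include <cassert>
#include <cstdio>
#include <cuda_runtime.h>

// Enable P2P from the current device to every other visible device,
// tolerating the already-enabled case and reporting pairs that cannot peer.
void enable_peer_access(int my_device, int num_devices) {
  for (int peer = 0; peer < num_devices; peer++) {
    if (peer == my_device) {
      continue;
    }
    int can_access = 0;
    cudaDeviceCanAccessPeer(&can_access, my_device, peer);
    if (!can_access) {
      printf("GPU %d cannot access GPU %d peer-to-peer\n", my_device, peer);
      continue;
    }
    cudaError_t err = cudaDeviceEnablePeerAccess(peer, 0);
    assert(err == cudaSuccess || err == cudaErrorPeerAccessAlreadyEnabled);
  }
}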
the hash - MachineView view; - for (size_t l = 0; l < operators.size(); l++) { - view = operators[l]->outputs[0]->machine_view; - if (view.hash() == comm.first) { - break; - } - } - assert(view.hash() == comm.first && "Cannot find the machine view"); - IndexSpace task_is = get_or_create_task_is(view); - Domain domain = runtime->get_index_space_domain(ctx, task_is); - ArgumentMap argmap; - int idx = 0; - for (Domain::DomainPointIterator it(domain); it; it++, idx++) { - argmap.set_point(*it, - TaskArgument(&comm.second[idx], sizeof(ncclComm_t))); - } - IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, - task_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - comm.first); - FutureMap fm = runtime->execute_index_space(ctx, index_launcher); - fm.wait_all_results(); - } + finish_nccl_comms(); } #endif } @@ -1706,7 +1739,7 @@ Tensor FFModel::create_tensor(int numdim, } ParallelTensor FFModel::create_parallel_tensor(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1739,7 +1772,7 @@ Tensor FFModel::create_tensor_legion_ordering(int numdim, ParallelTensor FFModel::create_parallel_tensor_legion_ordering(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1789,7 +1822,7 @@ Tensor FFModel::create_tensor(int const dims[], } template -ParallelTensor FFModel::create_parallel_tensor(const ParallelDim dims[], +ParallelTensor FFModel::create_parallel_tensor(ParallelDim const dims[], DataType data_type, Op const *owner_op, int owner_idx, @@ -1870,7 +1903,7 @@ Parameter FFModel::create_weight(int numdim, } template -ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], +ParallelParameter FFModel::create_parallel_weight(ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1901,7 +1934,7 @@ ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], } ParallelParameter FFModel::create_parallel_weight(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1921,7 +1954,7 @@ ParallelParameter FFModel::create_parallel_weight(int numdim, ParallelParameter FFModel::create_parallel_weight_legion_ordering( int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -2135,7 +2168,7 @@ void FFModel::map_weight_with_dim(ParallelTensor weight, } bool FFModel::get_parallel_tensor_from_tensor( - const Tensor tensor, ParallelTensor ¶llel_tensor) const { + Tensor const tensor, ParallelTensor ¶llel_tensor) const { // check if tensor->parallel_tensor is already set if (tensor->parallel_tensor != nullptr) { parallel_tensor = tensor->parallel_tensor; @@ -2172,7 +2205,7 @@ bool FFModel::get_parallel_tensor_from_tensor( } void FFModel::create_disjoint_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], IndexSpace const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -2195,7 +2228,7 @@ void FFModel::create_disjoint_partition(int num_dims, template void FFModel::create_disjoint_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], IndexSpaceT const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -2228,7 +2261,7 @@ void FFModel::create_disjoint_partition_with_dim2( } void FFModel::create_aliased_partition(int num_dims, - const ParallelDim dims[], + 
ParallelDim const dims[], int aliased_dim, IndexSpace const &part_is, LogicalRegion const ®ion, @@ -2252,7 +2285,7 @@ void FFModel::create_aliased_partition(int num_dims, template void FFModel::create_aliased_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, IndexSpaceT const &part_is, LogicalRegion const ®ion, @@ -2289,7 +2322,7 @@ void FFModel::create_aliased_partition_with_dim2( } template -void FFModel::create_disjoint_partition(const ParallelTensor tensor, +void FFModel::create_disjoint_partition(ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2337,7 +2370,7 @@ void FFModel::create_disjoint_partition(const ParallelTensor tensor, template void FFModel::create_data_parallel_partition_with_diff_dims( - const ParallelTensor tensor, + ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2719,7 +2752,7 @@ IndexSpace FFModel::get_task_is(ParallelConfig const &pc) const { return get_task_is(view); } -IndexSpace FFModel::get_or_create_task_is(const ParallelTensor tensor) { +IndexSpace FFModel::get_or_create_task_is(ParallelTensor const tensor) { MachineView view; view.ndims = 0; for (int i = 0; i < tensor->num_dims; i++) { @@ -3241,16 +3274,21 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } - case OP_ARG_TOPK: { - Op *op = ArgTopK::create_operator_from_layer(*this, layer, inputs); + case OP_GUMBEL_TOPK: { + Op *op = GumbelTopK::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); return op; } - case OP_BEAM_TOPK: { - Op *op = BeamTopK::create_operator_from_layer(*this, layer, inputs); + case OP_ARG_TOPK: { + Op *op = ArgTopK::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); return op; } + // case OP_BEAM_TOPK: { + // Op *op = BeamTopK::create_operator_from_layer(*this, layer, inputs); + // operators.push_back(op); + // return op; + // } case OP_SAMPLING: { Op *op = Sampling::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -3308,7 +3346,7 @@ bool FFModel::is_mlp_block(int layer_idx) const { } void FFModel::create_operators_from_layers() { - std::map tensors_to_parallel_tensors; + std::map tensors_to_parallel_tensors; // for (auto const &l : layers) { for (int layer_idx = 0; layer_idx < layers.size(); layer_idx++) { auto const &l = layers[layer_idx]; @@ -3321,10 +3359,14 @@ void FFModel::create_operators_from_layers() { } Op *op = nullptr; // add a combine before arg_topk + // if (config.computationMode == COMP_MODE_INFERENCE && + // config.tensor_parallelism_degree > 1 && + // (l->op_type == OP_ARG_TOPK || l->op_type == OP_SOFTMAX || + // l->op_type == OP_ARGMAX || l->op_type == OP_GUMBEL_TOPK)) { if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && - (l->op_type == OP_ARG_TOPK || l->op_type == OP_SOFTMAX || - l->op_type == OP_ARGMAX)) { + (l->op_type == OP_SOFTMAX || l->op_type == OP_ARGMAX || + l->op_type == OP_GUMBEL_TOPK)) { std::vector partitioned_inputs; assert(inputs.size() == 1); Combine *comb = new Combine(*this, @@ -3351,6 +3393,7 @@ void FFModel::create_operators_from_layers() { config.tensor_parallelism_degree > 1 && (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || // mlp layer is_mlp_block(layer_idx) || // llama mlp layer @@ -4061,39 +4104,40 @@ 
void FFIterationConfig::reset() { // Default Config Parameters struct DefaultConfig { - const static int epochs = 1; + static int const epochs = 1; // const static int iterations = 1; - const static int batchSize = 64; - const static bool profiling = false; - const static bool benchmarking = false; - const static bool inference_debugging = false; + static int const batchSize = 64; + static bool const log_instance_creation = false; + static bool const profiling = false; + static bool const benchmarking = false; + static bool const inference_debugging = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; - const static size_t workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB - const static int numNodes = 1; - const static int workersPerNode = 0; - const static int cpusPerNode = 0; - const static size_t searchBudget = -1; - const static size_t simulatorWorkSpaceSize = + static size_t const workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB + static int const numNodes = 1; + static int const workersPerNode = 0; + static int const cpusPerNode = 0; + static size_t const searchBudget = -1; + static size_t const simulatorWorkSpaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2 GB constexpr static float searchAlpha = 1.2f; - const static bool searchOverlapBackwardUpdate = false; - const static size_t offloadReserveSpaceSize = + static bool const searchOverlapBackwardUpdate = false; + static size_t const offloadReserveSpaceSize = (size_t)8 * 1024 * 1024 * 1024; // 8 GB - const static bool cpuOffload = false; - const static bool onlyDataParallel = true; - const static bool enableSampleParallel = true; - const static bool enableParameterParallel = false; - const static bool enableAttributeParallel = false; - const static bool enableInplaceOptimizations = false; - const static bool allowTensorOpMathConversion = false; - const static int machine_model_version = 0; - const static int simulator_segment_size = 16777216; // 16 MB - const static int simulator_max_num_segments = 1; - const static int base_optimize_threshold = 10; - const static bool enable_control_replication = true; + static bool const cpuOffload = false; + static bool const onlyDataParallel = true; + static bool const enableSampleParallel = true; + static bool const enableParameterParallel = false; + static bool const enableAttributeParallel = false; + static bool const enableInplaceOptimizations = false; + static bool const allowTensorOpMathConversion = false; + static int const machine_model_version = 0; + static int const simulator_segment_size = 16777216; // 16 MB + static int const simulator_max_num_segments = 1; + static int const base_optimize_threshold = 10; + static bool const enable_control_replication = true; // The default python data loader type is 2 to enable control replication - const static int python_data_loader_type = 2; + static int const python_data_loader_type = 2; }; FFConfig::FFConfig() { @@ -4101,6 +4145,7 @@ FFConfig::FFConfig() { // iterations = DefaultConfig::iterations; batchSize = DefaultConfig::batchSize; profiling = DefaultConfig::profiling; + log_instance_creation = DefaultConfig::log_instance_creation; benchmarking = DefaultConfig::benchmarking; inference_debugging = DefaultConfig::inference_debugging; learningRate = DefaultConfig::learningRate; @@ -4288,6 +4333,10 @@ void FFConfig::parse_args(char **argv, int argc) { cpusPerNode = atoi(argv[++i]); continue; } + if ((!strcmp(argv[i], "--log-instance-creation"))) { + log_instance_creation = true; + continue; + } 
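The FFConfig changes here add the --log-instance-creation switch; further below, register_flexflow_internal_tasks() is reworked so that the old prepare_next_batch / beam / init / verify registrations collapse into a single get_next_batch_config task, a LOAD_WEIGHT task is added, and GumbelTopK replaces the ArgTopK/BeamTopK speculative variants. Every registration repeats the same boilerplate; a condensed sketch of that pattern for a void CPU task (hypothetical helper, assuming only the Legion registration API already used in these hunks):

#include <vector>
#include "legion.h"

using namespace Legion;

// Register a leaf CPU task either at preregistration time (static API) or
// dynamically; with control replication enabled, dynamic registration must
// stay local to each shard (global_registration = false).
template <void (*TASK_FN)(Task const *,
                          std::vector<PhysicalRegion> const &,
                          Context,
                          Runtime *)>
void register_cpu_task(Runtime *runtime, TaskID task_id, char const *name,
                       bool pre_register, bool enable_control_replication) {
  TaskVariantRegistrar registrar(task_id, name);
  registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
  registrar.set_leaf();
  if (pre_register) {
    Runtime::preregister_task_variant<TASK_FN>(registrar, name);
  } else {
    if (enable_control_replication) {
      registrar.global_registration = false;
    }
    runtime->register_task_variant<TASK_FN>(registrar);
  }
}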
if (!strcmp(argv[i], "--profiling")) { profiling = true; continue; @@ -4452,105 +4501,56 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } - // RequestManager prepare_next_batch + // RequestMang get_next_batch_config { - TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_TASK_ID, - "RequestManager Prepare Next Batch"); + TaskVariantRegistrar registrar(RM_GET_NEXT_BATCH_CONFIG_TASK_ID, + "RequestManager Get Next Batch Config"); registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); registrar.set_leaf(); if (pre_register) { Runtime::preregister_task_variant< BatchConfig, - RequestManager::prepare_next_batch_task>( - registrar, "RequestManager Prepare Next Batch Task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime->register_task_variant( - registrar); - } - } - // RequestManager prepare_next_batch_beam - { - TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, - "RequestManager Prepare Next Batch (Beam)"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - if (pre_register) { - Runtime::preregister_task_variant< - BeamSearchBatchConfig, - RequestManager::prepare_next_batch_beam_task>( - registrar, "RequestManager Prepare Next Batch (Beam) Task"); + RequestManager::get_next_batch_config_task>( + registrar, "RequestManager Get Next Batch Config Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } runtime - ->register_task_variant( + ->register_task_variant( registrar); } } - // RequestManager prepare_next_batch_init - { - TaskVariantRegistrar registrar( - RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, - "RequestManager Prepare Next Batch (Init Beam)"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - if (pre_register) { - Runtime::preregister_task_variant< - BeamSearchBatchConfig, - RequestManager::prepare_next_batch_init_task>( - registrar, "RequestManager Prepare Next Batch (Init Beam) Task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime - ->register_task_variant( - registrar); - } - } - // RequestManager prepare_next_batch_verify + // RequestManager background serving task { - TaskVariantRegistrar registrar( - RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, - "RequestManager Prepare Next Batch (Verify)"); + TaskVariantRegistrar registrar(RM_BACKGROUND_SERVING_TASK_ID, + "RequestManager Background Serving Task"); registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); + // registrar.set_leaf(); if (pre_register) { Runtime::preregister_task_variant< - TreeVerifyBatchConfig, - RequestManager::prepare_next_batch_verify_task>( - registrar, "RequestManager Prepare Next Batch (Verify) Task"); + RequestManager::background_serving_task>( + registrar, "RequestManager Background Serving Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant< - TreeVerifyBatchConfig, - RequestManager::prepare_next_batch_verify_task>(registrar); + runtime->register_task_variant( + registrar); } } - // RequestManager background serving task { - TaskVariantRegistrar registrar(RM_BACKGROUND_SERVING_TASK_ID, - "RequestManager Background Serving Task"); + TaskVariantRegistrar registrar(LOAD_WEIGHT_TASK_ID, "load_weight_task"); registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - // registrar.set_leaf(); if (pre_register) { - 
Runtime::preregister_task_variant< - RequestManager::background_serving_task>( - registrar, "RequestManager Background Serving Task"); + Runtime::preregister_task_variant( + registrar, "load_weight_task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant( + runtime->register_task_variant( registrar); } } @@ -6007,86 +6007,143 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } - // ArgTopk task + // GumbelTopk task { - TaskVariantRegistrar registrar(ARG_TOPK_INIT_TASK_ID, "ArgTopK Init"); + TaskVariantRegistrar registrar(GUMBEL_TOPK_INIT_TASK_ID, "GumbelTopK Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "ArgTopK Init Task"); + Runtime::preregister_task_variant( + registrar, "GumbelTopK Init Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant( + registrar); } } { - TaskVariantRegistrar registrar(ARG_TOPK_INF_TASK_ID, "ArgTopK Inference"); + TaskVariantRegistrar registrar(GUMBEL_TOPK_INF_TASK_ID, + "GumbelTopK Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { Runtime::preregister_task_variant( - registrar, "ArgTopK Inference Task"); + GumbelTopK::inference_task>( + registrar, "GumbelTopK Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant( - registrar); + runtime + ->register_task_variant( + registrar); } } { - TaskVariantRegistrar registrar(ARG_TOPK_INF_SPECULATIVE_TASK_ID, - "ArgTopK Speculative Inference"); + TaskVariantRegistrar registrar(GUMBEL_TOPK_INF_SPECULATIVE_TASK_ID, + "GumbelTopK Speculative Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "ArgTopK Speculative Inference Task"); + Runtime::preregister_task_variant( + registrar, "GumbelTopK Speculative Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant( + runtime->register_task_variant( registrar); } } - // BeamTopk task + // ArgTopk task { - TaskVariantRegistrar registrar(BEAM_TOPK_INIT_TASK_ID, "BeamTopK Init"); + TaskVariantRegistrar registrar(ARG_TOPK_INIT_TASK_ID, "ArgTopK Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "BeamTopK Init Task"); + Runtime::preregister_task_variant( + registrar, "ArgTopK Init Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); } } { - TaskVariantRegistrar registrar(BEAM_TOPK_INF_TASK_ID, "BeamTopK Inference"); + TaskVariantRegistrar registrar(ARG_TOPK_INF_TASK_ID, "ArgTopK Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "BeamTopK Inference Task"); + Runtime::preregister_task_variant( + registrar, "ArgTopK Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - 
runtime->register_task_variant(registrar); + runtime->register_task_variant( + registrar); } } + { + TaskVariantRegistrar registrar(ARG_TOPK_INF_SPECULATIVE_TASK_ID, + "ArgTopK Speculative Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ArgTopK Speculative Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + // BeamTopk task + // { + // TaskVariantRegistrar registrar(BEAM_TOPK_INIT_TASK_ID, "BeamTopK + // Init"); + // registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + // registrar.set_leaf(); + // if (pre_register) { + // Runtime::preregister_task_variant( + // registrar, "BeamTopK Init Task"); + // } else { + // if (enable_control_replication) { + // registrar.global_registration = false; + // } + // runtime->register_task_variant(registrar); + // } + // } + // { + // TaskVariantRegistrar registrar(BEAM_TOPK_INF_TASK_ID, "BeamTopK + // Inference"); + // registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + // registrar.set_leaf(); + // if (pre_register) { + // Runtime::preregister_task_variant( + // registrar, "BeamTopK Inference Task"); + // } else { + // if (enable_control_replication) { + // registrar.global_registration = false; + // } + // runtime + // ->register_task_variant( + // registrar); + // } + // } // Sampling task { TaskVariantRegistrar registrar(SAMPLING_INIT_TASK_ID, "Sampling Init"); @@ -6139,15 +6196,16 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( registrar, "ArgMax Inference Task Beam"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime + ->register_task_variant( + registrar); } } { @@ -6408,6 +6466,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Forward Task"); @@ -6422,6 +6481,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Inference Task"); @@ -6436,6 +6497,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_BWD_TASK_ID, "FusedOp Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Backward Task"); @@ -6643,6 +6705,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, "AllReduce Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Inference Task"); @@ -6657,6 
+6721,10 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrentluy since they + // use ncclAllReduce internally + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Forward Task"); @@ -6671,6 +6739,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrentluy since they + // use ncclAllReduce internally if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Backward Task"); @@ -6748,6 +6818,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(SGD_UPD_NCCL_TASK_ID, "SGD NCCL Update"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "SGD NCCL Update Task"); @@ -6762,6 +6834,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ADAM_UPD_NCCL_TASK_ID, "Adam NCCL Update"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "Adam NCCL Update Task"); @@ -6898,6 +6972,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, "NCCL Init Communicators"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "NCCL Init Communicators Task"); @@ -6914,6 +6990,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, "NCCL Finish Communicators"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "NCCL Finish Communicators Task"); diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index ad2b78156..2f8631b24 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -155,7 +155,9 @@ FFHandler } else { handle.offload_reserve_space = nullptr; } - if (handle.batch_config_metadata_size > 0) { + if (handle.batch_config_metadata_size + + handle.attention_metadata->mem_size() > + 0) { // allocate memory for offload reserve space Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::GPU_FB_MEM) @@ -163,7 +165,8 @@ FFHandler .first(); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); + Realm::Point<1, coord_t>(handle.batch_config_metadata_size + + handle.attention_metadata->mem_size() - 1)); std::vector field_sizes; field_sizes.push_back(sizeof(char)); Realm::RegionInstance workspaceInst; @@ -176,12 +179,19 @@ FFHandler .wait(); handle.batch_config_metadata = workspaceInst.pointer_untyped(0, sizeof(char)); + handle.attention_metadata->assign_address( + 
static_cast(static_cast(handle.batch_config_metadata) + + handle.batch_config_metadata_size), + handle.attention_metadata->mem_size()); } else { handle.batch_config_metadata = nullptr; + handle.attention_metadata->assign_address(nullptr, 0); } // checkCUDA(hipMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; + handle.num_devices = 0; + handle.device_id = 0; #endif return handle; } diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 23b7f0efb..962d2c345 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -14,6 +14,7 @@ */ #include "flexflow/model.h" #include "flexflow/utils/cuda_helper.h" +#include namespace FlexFlow { // declare Legion names @@ -89,11 +90,30 @@ FFHandler handle.offload_reserve_space_size = info->offload_reserve_space_size; handle.quantization_type = info->quantization_type; handle.allowTensorOpMathConversion = info->allowTensorOpMathConversion; + handle.incr_attention_metadata = new AttentionMetaData(); + handle.tree_search_attention_metadata = new AttentionMetaData(); + handle.tree_verify_attention_metadata = new AttentionMetaData(); + assert(handle.incr_attention_metadata != nullptr && + "Attention metadata must be allocated"); + assert(handle.tree_search_attention_metadata != nullptr && + "Attention metadata must be allocated"); + assert(handle.tree_verify_attention_metadata != nullptr && + "Attention metadata must be allocated"); checkCUDA(cublasCreate(&handle.blas)); + checkCUDA(cublasLtCreate(&handle.blasLt)); if (handle.allowTensorOpMathConversion) { checkCUDA(cublasSetMathMode(handle.blas, CUBLAS_TENSOR_OP_MATH)); } checkCUDNN(cudnnCreate(&handle.dnn)); + handle.num_devices = 0; + handle.device_id = 0; + handle.gemm_engine = new Internal::GemmEngine(handle.blas, handle.blasLt); + // We may not use all devices, physical_device may not be successive, so we + // explicitly get the physical device id + int physical_device; + checkCUDA(cudaGetDevice(&physical_device)); + checkCUDA(cudaGetDeviceProperties(handle.gemm_engine->device_prop, + physical_device)); // #ifdef FF_USE_NCCL // checkNCCL(ncclCommInitRank(&handle.nccl, info->allRanks, info->ncclId, // info->myRank)); fprintf(stderr, "handle.nccl(%p)\n", handle.nccl); @@ -151,7 +171,12 @@ FFHandler } else { handle.offload_reserve_space = nullptr; } - if (handle.batch_config_metadata_size > 0) { + if (handle.batch_config_metadata_size + + handle.incr_attention_metadata->mem_size() + + handle.tree_search_attention_metadata->mem_size() + + handle.tree_verify_attention_metadata->mem_size() + + handle.gemm_engine->workspace_size > + 0) { // allocate memory for offload reserve space Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::GPU_FB_MEM) @@ -159,7 +184,12 @@ FFHandler .first(); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); + Realm::Point<1, coord_t>( + handle.batch_config_metadata_size + + handle.incr_attention_metadata->mem_size() + + handle.tree_search_attention_metadata->mem_size() + + handle.tree_verify_attention_metadata->mem_size() + + handle.gemm_engine->workspace_size - 1)); std::vector field_sizes; field_sizes.push_back(sizeof(char)); Realm::RegionInstance workspaceInst; @@ -172,8 +202,34 @@ FFHandler .wait(); handle.batch_config_metadata = workspaceInst.pointer_untyped(0, sizeof(char)); + handle.incr_attention_metadata->assign_address( + static_cast(static_cast(handle.batch_config_metadata) + + handle.batch_config_metadata_size), + 
handle.incr_attention_metadata->mem_size()); + handle.tree_search_attention_metadata->assign_address( + static_cast(static_cast(handle.batch_config_metadata) + + handle.batch_config_metadata_size + + handle.incr_attention_metadata->mem_size()), + handle.tree_search_attention_metadata->mem_size()); + handle.tree_verify_attention_metadata->assign_address( + static_cast(static_cast(handle.batch_config_metadata) + + handle.batch_config_metadata_size + + handle.incr_attention_metadata->mem_size() + + handle.tree_search_attention_metadata->mem_size()), + handle.tree_verify_attention_metadata->mem_size()); + handle.gemm_engine->assign_workspace( + static_cast(static_cast(handle.batch_config_metadata) + + handle.batch_config_metadata_size + + handle.incr_attention_metadata->mem_size() + + handle.tree_search_attention_metadata->mem_size() + + handle.tree_verify_attention_metadata->mem_size()), + handle.gemm_engine->workspace_size); } else { handle.batch_config_metadata = nullptr; + handle.incr_attention_metadata->assign_address(nullptr, 0); + handle.tree_search_attention_metadata->assign_address(nullptr, 0); + handle.tree_verify_attention_metadata->assign_address(nullptr, 0); + handle.gemm_engine->assign_workspace(nullptr, 0); } // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 6b2d223f5..33e945774 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -7,7 +7,7 @@ #include "flexflow/ops/attention.h" #include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/batch_norm.h" -#include "flexflow/ops/beam_topk.h" +// #include "flexflow/ops/beam_topk.h" #include "flexflow/ops/cache.h" #include "flexflow/ops/cast.h" #include "flexflow/ops/concat.h" @@ -19,6 +19,7 @@ #include "flexflow/ops/flat.h" #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" +#include "flexflow/ops/gumbel_topk.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" @@ -129,6 +130,8 @@ tl::optional get_op_parameters(Op const *op) { return ((Split *)op)->get_params(); case OP_TOPK: return ((TopK *)op)->get_params(); + case OP_GUMBEL_TOPK: + return ((GumbelTopK *)op)->get_params(); case OP_GROUP_BY: return ((Group_by *)op)->get_params(); case OP_AGGREGATE: @@ -141,8 +144,8 @@ tl::optional get_op_parameters(Op const *op) { return ((ResidualRMSNorm *)op)->get_params(); case OP_ARG_TOPK: return ((ArgTopK *)op)->get_params(); - case OP_BEAM_TOPK: - return ((BeamTopK *)op)->get_params(); + // case OP_BEAM_TOPK: + // return ((BeamTopK *)op)->get_params(); case OP_SAMPLING: return ((Sampling *)op)->get_params(); case OP_ARGMAX: diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index c42a0c9aa..96b735803 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -311,7 +311,7 @@ void SGDOptimizer::nccl_update_task(Task const *task, } } - nccl_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr); + nccl_update_task_gpu(ctx, runtime, op, meta, w_grad_ptr, size, w_ptr, v_ptr); } #endif @@ -603,7 +603,8 @@ void AdamOptimizer::nccl_update_task(Task const *task, } } - nccl_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); + nccl_update_task_gpu( + ctx, runtime, op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); } #endif diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index e71adc87a..a33ee35de 100644 --- a/src/runtime/optimizer_kernel.cpp +++ 
b/src/runtime/optimizer_kernel.cpp @@ -21,7 +21,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_optimizer("optimizer"); +Legion::Logger log_optimizer("optimizer"); __global__ void sgd_update(size_t count, float lr, @@ -86,7 +86,9 @@ __host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, } #ifdef FF_USE_NCCL -__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, +__host__ void SGDOptimizer::nccl_update_task_gpu(Legion::Context ctx, + Legion::Runtime *runtime, + SGDOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -96,6 +98,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(w_grad_ptr, (float *)w_grad_ptr, size, @@ -103,6 +106,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, ncclSum, meta->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // Step 2: SGD update @@ -208,7 +212,9 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, } #ifdef FF_USE_NCCL -__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, +__host__ void AdamOptimizer::nccl_update_task_gpu(Legion::Context ctx, + Legion::Runtime *runtime, + AdamOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -218,6 +224,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // Use NCCL to sync gradients hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(w_grad_ptr, (float *)w_grad_ptr, size, @@ -225,6 +232,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, ncclSum, meta->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam update @@ -247,4 +255,4 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, } #endif -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index 5f654fbb5..6bc3d52b2 100644 --- a/src/runtime/optimizer_kernel.cu +++ b/src/runtime/optimizer_kernel.cu @@ -20,7 +20,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_optimizer("optimizer"); +Legion::Logger log_optimizer("optimizer"); __global__ void sgd_update(size_t count, float lr, @@ -75,7 +75,9 @@ __host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, } #ifdef FF_USE_NCCL -__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, +__host__ void SGDOptimizer::nccl_update_task_gpu(Legion::Context ctx, + Legion::Runtime *runtime, + SGDOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -85,6 +87,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(w_grad_ptr, (float *)w_grad_ptr, size, @@ -92,6 +95,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, ncclSum, meta->handle.ncclComm, 
stream)); + runtime->concurrent_task_barrier(ctx); // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); @@ -183,7 +187,9 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, } #ifdef FF_USE_NCCL -__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, +__host__ void AdamOptimizer::nccl_update_task_gpu(Legion::Context ctx, + Legion::Runtime *runtime, + AdamOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -193,6 +199,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // Use NCCL to sync gradients cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(w_grad_ptr, (float *)w_grad_ptr, size, @@ -200,6 +207,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, ncclSum, meta->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam update diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc new file mode 100644 index 000000000..7fbb16bcd --- /dev/null +++ b/src/runtime/page_manager.cc @@ -0,0 +1,246 @@ +/* Copyright 2023 CMU, Stanford, Facebook, LANL + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/page_manager.h" + +namespace FlexFlow { + +// For all runtime functions, they share a single page manager for pages +// information +PageManager *page_manager_singleton = nullptr; + +// the interface of logicaltokenblock +LogicalTokenBlock::LogicalTokenBlock(int block_number, uint32_t block_size) + : block_number(block_number), block_size(block_size), num_tokens(0), + num_commit_tokens(0), num_spec_tokens(0) {} + +bool LogicalTokenBlock::is_empty() const { + assert(num_spec_tokens == 0 && num_commit_tokens == 0); + assert(num_tokens <= block_size); + return num_tokens == 0; +} + +bool LogicalTokenBlock::is_full() const { + assert(num_spec_tokens + num_commit_tokens == num_tokens); + assert(num_tokens <= block_size); + return num_tokens == block_size; +} + +int LogicalTokenBlock::get_num_empty_slots() const { + assert(num_spec_tokens + num_commit_tokens == num_tokens); + assert(num_tokens <= block_size); + return block_size - num_tokens; +} + +int LogicalTokenBlock::get_num_alloc_slots() const { + assert(num_spec_tokens + num_commit_tokens == num_tokens); + assert(num_tokens <= block_size); + return num_tokens; +} + +void LogicalTokenBlock::reset_num_spec_tokens() { + assert(num_spec_tokens + num_commit_tokens == num_tokens); + assert(num_tokens <= block_size); + + num_tokens -= num_spec_tokens; + num_spec_tokens = 0; + + assert(num_spec_tokens + num_commit_tokens == num_tokens); + assert(num_tokens <= block_size); +} + +void LogicalTokenBlock::append_tokens( + std::vector const &token_ids_to_append, bool committed) { + assert(num_spec_tokens + num_commit_tokens == num_tokens); + assert(num_tokens <= block_size); + if (num_tokens + token_ids_to_append.size() > block_size) { + printf("block is full! Cannot append more tokens\n"); + throw std::runtime_error("Block is full! Cannot append more tokens."); + } + token_ids.insert( + token_ids.end(), token_ids_to_append.begin(), token_ids_to_append.end()); + num_tokens += token_ids_to_append.size(); + if (committed) { + num_commit_tokens += token_ids_to_append.size(); + } else { + num_spec_tokens += token_ids_to_append.size(); + } + assert(num_spec_tokens + num_commit_tokens == num_tokens); + assert(num_tokens <= block_size); +} + +std::vector LogicalTokenBlock::get_token_ids() const { + return token_ids; +} + +PhysicalTokenBlock::PhysicalTokenBlock(int block_number, int block_size) + : block_number(block_number), block_size(block_size), ref_count(0) {} + +BlockAllocator::BlockAllocator(int block_size, int num_total_blocks) { + for (int block_number = 0; block_number < num_total_blocks; ++block_number) { + free_blocks.push_back(PhysicalTokenBlock(block_number, block_size)); + } + num_total_blocks = num_total_blocks; +} + +// Allocate a block +PhysicalTokenBlock BlockAllocator::allocate() { + if (free_blocks.empty()) { + printf("no free blocks are available\n"); + throw std::runtime_error("Out of memory! No free blocks are available."); + } + PhysicalTokenBlock block = free_blocks.front(); + free_blocks.pop_front(); + block.incr_ref_count(); + return block; +} + +// Free a block +void BlockAllocator::free(PhysicalTokenBlock &block) { + if (block.ref_count == 0) { + printf("block is already freed\n"); + throw std::runtime_error("Double free! Block is already freed."); + } + block.decr_ref_count(); + if (block.ref_count == 0) { + free_blocks.push_back(block); + } else { + // in current implementation this should not be the case + printf("block is not freed. 
Ref count: %d\n", block.ref_count); + throw std::runtime_error("Block is not freed. Ref count: " + + std::to_string(block.ref_count)); + } +} + +int BlockAllocator::get_num_free_blocks() const { + return free_blocks.size(); +} + +PageManager::PageManager(int block_size, size_t num_total_blocks) + : block_size(block_size), num_total_blocks(num_total_blocks), + block_allocator(block_size, num_total_blocks) {} + +// return the physical number of this block +int PageManager::allocate_one_block(RequestGuid const &request_guid) { + BlockTable &block_table = block_tables[request_guid]; + + PhysicalTokenBlock block = block_allocator.allocate(); + block_table.push_back(block); + block_tables[request_guid] = block_table; + return block.get_block_number(); +} + +void PageManager::free_block_table(BlockTable &block_table) { + // make it reverse order to free the last allocated block first + BlockTable::reverse_iterator rit = block_table.rbegin(); + for (; rit != block_table.rend(); ++rit) { + block_allocator.free(*rit); + } + return; +} + +void PageManager::free_request(RequestGuid const &request_guid) { + // we only free the blocks that are already used + BlockTable block_table = block_tables[request_guid]; + free_block_table(block_table); + block_tables.erase(request_guid); + return; +} + +// delete the last num_blocks in the request_guid +void PageManager::free_multiple_blocks(RequestGuid const &request_guid, + int num_blocks) { + // assert(block_tables.find(request_guid) != block_tables.end()); + auto &block_table = block_tables[request_guid]; + // assert(num_blocks <= block_table.size()); + int num_blocks_allocated = block_table.size(); + for (int i = 0; i < num_blocks; i++) { + block_allocator.free(block_table[num_blocks_allocated - i - 1]); + } + // only keep the first num_blocks_allocated - num_blocks blocks + block_table.erase(block_table.begin() + num_blocks_allocated - num_blocks, + block_table.end()); + block_tables[request_guid] = block_table; + return; +} + +std::vector PageManager::get_block_table_indices( + RequestGuid const &request_guid) const { + std::vector indices; + auto const &it = block_tables.find(request_guid); + if (it == block_tables.end()) { + return indices; + } + auto const &block_table = it->second; + for (auto const &block : block_table) { + indices.push_back(block.get_block_number()); + } + return indices; +} + +int PageManager::get_num_total_free_blocks() const { + return block_allocator.get_num_free_blocks(); +} + +int PageManager::get_num_allocated_blocks( + RequestGuid const &request_guid) const { + auto it = block_tables.find(request_guid); + if (it == block_tables.end()) { + return 0; + } else { + return it->second.size(); + } +} + +PageManager *PageManager::get_page_manager(FFModel *ff, + size_t total_kv_cache_size) { + int num_kv_heads = ff->num_kv_heads; + int size_dt = ff->size_dt; + int qkv_dim = ff->qkv_dim; + int num_transformer_layers = ff->num_transformer_layers; + int pipeline_parallelism_degree = ff->config.pipeline_parallelism_degree; + assert(num_kv_heads > 0 && size_dt > 0 && qkv_dim > 0 && + num_transformer_layers > 0 && + pipeline_parallelism_degree > + 0); // needs to make sure that the model is initialized + if (page_manager_singleton == nullptr) { + size_t num_total_blocks = 0; + if (total_kv_cache_size == 0) { + num_total_blocks = (BatchConfig::max_spec_tree_token_num() + + BatchConfig::max_sequence_length() + kPagesize - 1) / + kPagesize * BatchConfig::max_requests_per_batch(); + } else { + num_total_blocks = total_kv_cache_size * 1024 * 
1024 / size_dt / qkv_dim / + num_kv_heads / num_transformer_layers / kPagesize; + } + printf("page manager singleton is initialized with %d blocks\n", + num_total_blocks); + page_manager_singleton = new PageManager(kPagesize, num_total_blocks); + page_manager_singleton->kv_cache_size_per_layer = + total_kv_cache_size * 1024 * 1024 / num_transformer_layers; + } + return page_manager_singleton; +} + +size_t PageManager::get_kv_cache_size_per_layer() { + return kv_cache_size_per_layer; +} + +PageManager *PageManager::get_page_manager() { + assert(page_manager_singleton != nullptr); + return page_manager_singleton; +} + +}; // namespace FlexFlow diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 8f1be15fd..202983e8f 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -1,4 +1,5 @@ #include "flexflow/ffconst_utils.h" +#include "flexflow/mapper.h" #include "flexflow/model.h" #include "flexflow/ops/attention.h" #include "flexflow/ops/concat.h" @@ -19,6 +20,9 @@ namespace FlexFlow { using namespace Legion; +using namespace Legion; +using namespace Mapping; +Legion::Logger pt_logger("ParallelTensor"); TensorBase::TensorBase(TensorBase const &rhs) { tensor_guid = rhs.tensor_guid; @@ -647,11 +651,41 @@ bool ParallelTensorBase::is_valid_machine_view(MachineView const &view) const { return true; } +size_t get_physical_region_size(PhysicalRegion const &pr, + Context ctx, + Runtime *runtime) { + // Get the logical region + LogicalRegion lr = pr.get_logical_region(); + + // Get the index space domain + Domain domain = runtime->get_index_space_domain(ctx, lr.get_index_space()); + + // Get number of elements in the domain + size_t num_elements = domain.get_volume(); + + // Get the field space + FieldSpace fs = lr.get_field_space(); + + // Get all fields in the field space + std::vector fields; + runtime->get_field_space_fields(ctx, fs, fields); + + // Sum up the size of all fields + size_t total_field_size = 0; + for (FieldID fid : fields) { + size_t field_size = runtime->get_field_size(ctx, fs, fid); + total_field_size += field_size; + } + + // Total size is number of elements times size of each element + return num_elements * total_field_size; +} + template bool ParallelTensorBase::set_tensor(FFModel const *ff, std::vector const &dim_sizes, T const *data) { - Context ctx = ff->config.lg_ctx; + Context ctx = Legion::Runtime::get_context(); Runtime *runtime = ff->config.lg_hlr; // TODO: check data type matches // TODO: Currently we use a task launch, change to index launch for NCCL @@ -678,6 +712,28 @@ bool ParallelTensorBase::set_tensor(FFModel const *ff, InlineLauncher launcher(req); PhysicalRegion pr = runtime->map_region(ctx, launcher); pr.wait_until_valid(); + + if (ff->config.log_instance_creation) { + size_t pr_size = get_physical_region_size(pr, ctx, runtime); + if (pr_size != volume * num_replicas * sizeof(T)) { + std::cout << "Physical region size: " << pr_size << std::endl; + std::cout << "Volume: " << volume << std::endl; + std::cout << "Num replicas: " << num_replicas << std::endl; + std::cout << "Size of T: " << sizeof(T) << std::endl; + } + assert(pr_size == volume * num_replicas * sizeof(T)); + std::set memories; + pr.get_memories(memories); + assert(memories.size() == 1); + Memory memory = *(memories.begin()); + pt_logger.print("Created instance in memory_kind: %s memory_id: %llx size: " + "%zu (capacity %lu) task_name: set_tensor", + Legion::Mapping::Utilities::to_string(memory.kind()), + memory.id, + pr_size, + 
memory.capacity()); + } + switch (num_dims) { #define DIMFUNC(DIM) \ case DIM: { \ @@ -704,7 +760,7 @@ template bool ParallelTensorBase::get_tensor(FFModel const *ff, T *data, bool get_gradients) { - Context ctx = ff->config.lg_ctx; + Context ctx = Legion::Runtime::get_context(); Runtime *runtime = ff->config.lg_hlr; LogicalRegion weight_lr = LogicalRegion::NO_REGION; if (sync_type == ParameterSyncType::PS) { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc old mode 100644 new mode 100755 index 16513e918..47c394f7e --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -14,39 +14,102 @@ */ #include "flexflow/request_manager.h" +#include "flexflow/inference.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" #include +#include +#include #include #include #include #include +#include #include #include +#include +#include +#include namespace FlexFlow { using namespace Legion; using tokenizers::Tokenizer; +using RequestGuid = BatchConfig::RequestGuid; -LegionRuntime::Logger::Category log_req_mgr("RequestManager"); +Legion::Logger log_req_mgr("RequestManager"); + +bool operator<(std::shared_ptr const &lhs, + std::shared_ptr const &rhs) { + if (lhs->gumbel) { + assert(rhs->gumbel); + return lhs->gumbel_logit < rhs->gumbel_logit; + } + return lhs->log_accumulated_prob < rhs->log_accumulated_prob; +} + +bool operator<=(std::shared_ptr const &lhs, + std::shared_ptr const &rhs) { + if (lhs->gumbel) { + assert(rhs->gumbel); + return lhs->gumbel_logit <= rhs->gumbel_logit; + } + return lhs->log_accumulated_prob <= rhs->log_accumulated_prob; +} + +void write_to_output_file(std::string const &output_filepath, + std::string const &str) { + std::ostream *os = &std::cout; + std::ofstream output_file; + if (!output_filepath.empty()) { + output_file.open(output_filepath, std::ios::app); + if (output_file.is_open()) { + os = &output_file; + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); + } + } + *os << str << std::endl; + if (!output_filepath.empty()) { + output_file.close(); + } +} std::string LoadBytesFromFile(std::string const &path) { std::ifstream fs(path, std::ios::in | std::ios::binary); - assert(!fs.fail() && "no such file"); - std::string data; + assert(fs.is_open() && "Failed to open file for reading."); fs.seekg(0, std::ios::end); - size_t size = static_cast(fs.tellg()); + size_t size = fs.tellg(); fs.seekg(0, std::ios::beg); - data.resize(size); - fs.read(data.data(), size); + std::string data(size, '\0'); + fs.read(&data[0], size); + assert(!fs.fail() && "Failed to read data from file."); return data; } +double Request::get_length_weight() { + double coeff_alpha = 128; + return log((double(tokens.size()) + coeff_alpha) / coeff_alpha); +} + +void Request::set_slo_ratio(double slo_ratio_) { + slo_ratio = slo_ratio_; +} +double Request::get_slo_ratio() { + return slo_ratio; +} + +int Request::decode_length() const { + return tokens.size() - llm_prefill_len; +} + RequestManager::RequestManager() - : request_manager_status(INITIALIZED), verbose(false), + : background_server_status(INITIALIZED), verbose(false), next_available_guid(1000000), num_processed_requests(0), - total_request_run_time(0.0f) { + total_request_run_time(0.0f), request_manager_status(PREFILLING), + decoding_mode(INCREMENTAL_DECODING), prefill_model(SSM) { // The following config parameters are set // during ffmodel.compile() // Initialize them to -1 to make sure no one @@ -54,8 +117,18 
@@ RequestManager::RequestManager() // ffmodel.compile() max_requests_per_batch = -1; max_tokens_per_batch = -1; + max_tokens_per_ssm_batch = -1; + max_tokens_per_prefilling_batch = -1; max_spec_tree_token_num = -1; max_sequence_length = -1; + max_output_length = -1; + max_kv_cache_size = 0; + max_tree_depth = -1; + max_tree_width = -1; + k = -1; + std::fill(std::begin(request_available), std::end(request_available), false); + std::fill( + std::begin(guid_of_requests), std::end(guid_of_requests), INVALID_GUID); } void RequestManager::set_max_requests_per_batch(int max_num_requests) { @@ -76,11 +149,19 @@ void RequestManager::set_max_tokens_per_batch(int max_num_tokens) { assert(max_tokens_per_batch <= BatchConfig::MAX_NUM_TOKENS); } -void RequestManager::set_max_spec_tree_token_num(int max_num_tokens) { - assert(max_spec_tree_token_num == -1 || - max_spec_tree_token_num == max_num_tokens); - max_spec_tree_token_num = max_num_tokens; - assert(max_spec_tree_token_num <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); +void RequestManager::set_max_tokens_per_ssm_batch(int max_num_ssm_tokens) { + assert(max_tokens_per_ssm_batch == -1 || + max_tokens_per_ssm_batch == max_num_ssm_tokens); + max_tokens_per_ssm_batch = max_num_ssm_tokens; + assert(max_tokens_per_ssm_batch <= BatchConfig::MAX_NUM_TOKENS); +} + +void RequestManager::set_max_tokens_per_prefilling_batch( + int max_num_prefilling_tokens) { + assert(max_tokens_per_prefilling_batch == -1 || + max_tokens_per_prefilling_batch == max_num_prefilling_tokens); + max_tokens_per_prefilling_batch = max_num_prefilling_tokens; + assert(max_tokens_per_prefilling_batch <= BatchConfig::MAX_NUM_TOKENS); } int RequestManager::get_max_tokens_per_batch() { @@ -88,17 +169,21 @@ int RequestManager::get_max_tokens_per_batch() { return max_tokens_per_batch; } +int RequestManager::get_max_tokens_per_ssm_batch() { + assert(max_tokens_per_ssm_batch > 0); + return max_tokens_per_ssm_batch; +} + +int RequestManager::get_max_tokens_per_prefilling_batch() { + assert(max_tokens_per_prefilling_batch > 0); + return max_tokens_per_prefilling_batch; +} + int RequestManager::get_max_spec_tree_token_num() { assert(max_spec_tree_token_num > 0); return max_spec_tree_token_num; } -int RequestManager::get_max_verify_tokens_per_batch() { - assert(max_tokens_per_batch > 0); - return max_tokens_per_batch + - max_spec_tree_token_num * max_requests_per_batch; -} - void RequestManager::set_max_sequence_length(int max_seq_length) { assert(max_sequence_length == -1 || max_sequence_length == max_seq_length); max_sequence_length = max_seq_length; @@ -109,45 +194,306 @@ int RequestManager::get_max_sequence_length() { return max_sequence_length; } -void RequestManager::push_spec_infer_tree_width(int tree_width) { - assert(tree_width <= BeamSearchBatchConfig::MAX_BEAM_WIDTH); - spec_infer_tree_width.emplace_back(tree_width); +void RequestManager::set_max_output_length(int max_output_length) { + assert(max_output_length > 0); + this->max_output_length = max_output_length; +} + +int RequestManager::get_max_output_length() { + assert(max_output_length > 0); + return max_output_length; +} + +void RequestManager::set_max_kv_cache_size(size_t max_kv_cache_size) { + this->max_kv_cache_size = max_kv_cache_size; +} + +size_t RequestManager::get_max_kv_cache_size() { + return max_kv_cache_size; +} + +void RequestManager::set_decoding_mode(DecodingMode mode) { + assert(mode == INCREMENTAL_DECODING || mode == SPECULATIVE_DECODING); + decoding_mode = mode; +} + +void RequestManager::set_verbose(bool verbose_) { 
+ verbose = verbose_; +} + +int RequestManager::get_k() { + assert(k > 0 and k <= BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES and + "Invalid k"); + return k; +} + +void RequestManager::set_k(int _k) { + assert(_k > 0 and _k <= BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES and + "Invalid k"); + k = _k; +} + +int RequestManager::get_max_tree_depth() { + assert(max_tree_depth > 0 and + max_tree_depth <= BatchConfig::MAX_TREE_DEPTH and + "Invalid max_tree_depth"); + return max_tree_depth; +} + +void RequestManager::set_max_tree_depth(int max_tree_depth) { + assert(max_tree_depth > 0 and + max_tree_depth <= BatchConfig::MAX_TREE_DEPTH and + "Invalid max_tree_depth"); + this->max_tree_depth = max_tree_depth; + if (max_tree_width > 0) { + // 8 is k of topk, if max_tree_width <= k, we will fill the second level + max_spec_tree_token_num = + max_tree_depth * max_tree_width + (max_tree_width <= 8); + assert(max_spec_tree_token_num <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); + } +} + +int RequestManager::get_max_tree_width() { + assert(max_tree_width > 0 and + max_tree_width <= BatchConfig::MAX_TREE_WIDTH and + "Invalid max_tree_width"); + return max_tree_width; +} + +void RequestManager::set_max_tree_width(int max_tree_width) { + assert(max_tree_width > 0 and + max_tree_width <= BatchConfig::MAX_TREE_WIDTH and + "Invalid max_tree_width"); + this->max_tree_width = max_tree_width; + if (max_tree_depth > 0) { + // 8 is k of topk, if max_tree_width <= k, we will fill the second level + max_spec_tree_token_num = + max_tree_depth * max_tree_width + (max_tree_width <= 8); + assert(max_spec_tree_token_num <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); + } +} + +int RequestManager::get_expansion_degree() { + assert(expansion_degree > 0 and + expansion_degree <= BatchConfig::MAX_TREE_WIDTH and + "Invalid expansion_degree"); + return expansion_degree; +} +void RequestManager::set_expansion_degree(int expansion_degree_) { + assert(expansion_degree > 0 and + expansion_degree <= BatchConfig::MAX_TREE_WIDTH and + "Invalid expansion_degree"); + this->expansion_degree = expansion_degree_; +} + +void RequestManager::set_speculative_sampling(bool speculative_sampling_) { + speculative_sampling = speculative_sampling_; +} + +void RequestManager::set_baseline_latency(double baseline_latency_ms_) { + baseline_latency_ms = baseline_latency_ms_; +} + +double RequestManager::get_baseline_latency() { + return baseline_latency_ms; +} + +void RequestManager::set_ssm_spec_latency(double ssm_spec_latency_ms_) { + ssm_spec_latency_ms = ssm_spec_latency_ms_; +} + +double RequestManager::get_ssm_spec_latency() { + return ssm_spec_latency_ms; +} + +void RequestManager::set_llm_verify_latency(double llm_verify_latency_ms_) { + llm_verify_latency_ms = llm_verify_latency_ms_; +} + +double RequestManager::get_llm_verify_latency() { + return llm_verify_latency_ms; +} + +void RequestManager::set_correction_factor(double correction_factor_) { + correction_factor = correction_factor_; +} + +double RequestManager::get_correction_factor() { + return correction_factor; +} + +void RequestManager::set_streaming_cache(bool streaming_cache_) { + streaming_cache = streaming_cache_; +} + +bool RequestManager::get_streaming_cache() { + return streaming_cache; +} + +bool RequestManager::get_memory_occupancy() { + return memory_occupancy; +} + +void RequestManager::set_memory_occupancy(bool memory_occupancy_) { + memory_occupancy = memory_occupancy_; +} + +void RequestManager::set_slo_violation_early_termination( + bool slo_violation_early_termination_) { + 
slo_violation_early_termination = slo_violation_early_termination_; +} + +void RequestManager::set_spec_infer_old_version(bool spec_infer_old_version_) { + spec_infer_old_version = spec_infer_old_version_; +} + +void RequestManager::set_greedy_schedule(bool greedy_schedule_) { + greedy_schedule = greedy_schedule_; +} + +void RequestManager::set_equal_schedule(bool equal_schedule_) { + equal_schedule = equal_schedule_; +} + +void RequestManager::set_fcfs_slo(bool fcfs_slo_) { + fcfs_slo = fcfs_slo_; +} + +void RequestManager::set_stta(bool stta_) { + stta = stta_; +} + +bool RequestManager::get_spec_infer_old_version() { + return spec_infer_old_version; +} + +bool RequestManager::get_greedy_schedule() { + return greedy_schedule; +} + +bool RequestManager::get_equal_schedule() { + return equal_schedule; +} + +bool RequestManager::get_fcfs_slo() { + return fcfs_slo; +} + +bool RequestManager::get_stta() { + return stta; +} + +void RequestManager::set_eval_overhead_breakdown( + bool eval_overhead_breakdown_) { + eval_overhead_breakdown = eval_overhead_breakdown_; +} + +bool RequestManager::get_eval_overhead_breakdown() { + return eval_overhead_breakdown; +} + +inline double RequestManager::get_slo_constraint(Request &request) { + if (request.get_slo_ratio() < 0) { + // we use negative number to specify the absolute slo constraint (ms) + return -request.get_slo_ratio(); + } else { + // relative slo constraint upon the baseline latency + return request.get_slo_ratio() * baseline_latency_ms; + } +} + +double RequestManager::get_request_expected_latency(Request &request) { + return get_slo_constraint(request) * request.decode_length(); +} + +Request &RequestManager::get_request_with_guid(RequestGuid guid) { + assert(all_requests.find(guid) != all_requests.end() && + "Request with the given GUID does not exist."); + return all_requests[guid]; +} + +bool RequestManager::SharedTokenTreeNodePtrRequestGuidWeightedLess::operator()( + std::pair, RequestGuid> const &lhs, + std::pair, RequestGuid> const &rhs) const { + if (lhs.first->gumbel) { + assert(rhs.first->gumbel); + return lhs.first->gumbel_logit * get_request_manager() + ->get_request_with_guid(lhs.second) + .get_length_weight() < + rhs.first->gumbel_logit * get_request_manager() + ->get_request_with_guid(rhs.second) + .get_length_weight(); + } + return lhs.first->log_accumulated_prob * + get_request_manager() + ->get_request_with_guid(lhs.second) + .get_length_weight() < + rhs.first->log_accumulated_prob * + get_request_manager() + ->get_request_with_guid(rhs.second) + .get_length_weight(); +} + +bool RequestManager::SharedTokenTreeNodePtrDoubleRequestGuidLess ::operator()( + std::tuple, double, RequestGuid> const &lhs, + std::tuple, double, RequestGuid> const &rhs) + const { + return std::get<1>(lhs) < std::get<1>(rhs); } void RequestManager::register_tokenizer(ModelType type, int bos_token_id, - int eos_token_id, + std::vector eos_token_ids, std::string const &path) { this->model_type = type; this->bos_token_id = bos_token_id; - this->eos_token_id = eos_token_id; - std::string tokenizer_folder = - (!path.empty() && path.back() != '/') ? path + '/' : path; + this->eos_token_ids = eos_token_ids; + std::filesystem::path tokenizer_folder(path); + if (model_type == ModelType::LLAMA) { - bool path_to_file = !path.empty() && - (path.size() >= strlen("tokenizer.model")) && - path.find("tokenizer.model") == - (path.size() - strlen("tokenizer.model")); - std::string tokenizer_filepath = - path_to_file ? 
path : tokenizer_folder + "tokenizer.model"; - this->tokenizer_ = - Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(tokenizer_filepath)); + // try with tokenizer.json first + std::filesystem::path tokenizer_json_path; + if (std::filesystem::is_directory(tokenizer_folder)) { + tokenizer_json_path = + std::filesystem::path(tokenizer_folder) / "tokenizer.json"; + } else { + tokenizer_json_path = tokenizer_folder; + } + if (std::filesystem::exists(tokenizer_json_path)) { + // load from tokenizer.json + this->tokenizer_ = Tokenizer::FromBlobJSON( + LoadBytesFromFile(tokenizer_json_path.string())); + } else { + // load from tokenizer.model + std::filesystem::path tokenizer_model_path; + if (std::filesystem::is_directory(tokenizer_folder)) { + tokenizer_model_path = + std::filesystem::path(tokenizer_folder) / "tokenizer.model"; + } else { + tokenizer_model_path = tokenizer_folder; + } + if (!std::filesystem::exists(tokenizer_model_path)) { + std::cerr << "Failed to open file: " << tokenizer_model_path + << std::endl; + assert(false); + } + old_llama_tokenizer = true; + this->tokenizer_ = Tokenizer::FromBlobSentencePiece( + LoadBytesFromFile(tokenizer_model_path.string())); + } } else if (model_type == ModelType::OPT) { - std::string vocab_file = tokenizer_folder + "vocab.json"; - std::string merges_file = tokenizer_folder + "merges.txt"; - std::string added_tokens_file = - tokenizer_folder + "special_tokens_map.json"; - std::filesystem::path path1(vocab_file); - std::filesystem::path path2(merges_file); - std::filesystem::path path3(added_tokens_file); - assert(std::filesystem::exists(path1) && + std::filesystem::path vocab_file = tokenizer_folder / "vocab.json"; + std::filesystem::path merges_file = tokenizer_folder / "merges.txt"; + std::filesystem::path added_tokens_file = + tokenizer_folder / "special_tokens_map.json"; + assert(std::filesystem::exists(vocab_file) && "Vocab file vocab.json does not exist at the specified path"); - assert(std::filesystem::exists(path2) && + assert(std::filesystem::exists(merges_file) && "Merge file merges.txt does not exist at the specified path"); // opt_tokenizer = new OptTokenizer(vocab_file, merges_file); - std::string vocab = LoadBytesFromFile(path1.string()); - std::string merges = LoadBytesFromFile(path2.string()); - std::string added_tokens = LoadBytesFromFile(path3.string()); + std::string vocab = LoadBytesFromFile(vocab_file.string()); + std::string merges = LoadBytesFromFile(merges_file.string()); + std::string added_tokens = LoadBytesFromFile(added_tokens_file.string()); this->tokenizer_ = Tokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens); @@ -160,6 +506,10 @@ void RequestManager::register_tokenizer(ModelType type, } } +std::vector RequestManager::tokenize(std::string const &text) { + return tokenizer_->Encode(text); +} + void RequestManager::register_output_filepath( std::string const &_output_filepath) { this->output_filepath = _output_filepath; @@ -173,7 +523,7 @@ int RequestManager::register_ssm_model(FFModel *model) { } FFModel *RequestManager::get_ssm_model(int model_id) { - assert(model_id < ssm_models.size()); + assert(model_id >= 0 && model_id < ssm_models.size()); return ssm_models[model_id]; } @@ -182,27 +532,26 @@ size_t RequestManager::get_num_ssms() { } RequestManager::RequestGuid - RequestManager::register_new_request(std::vector const &prompt, - int max_sequence_length) { - const std::lock_guard lock(request_queue_mutex); - + RequestManager::register_new_request(GenerationRequest const &req) { // Add a new request 
Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - - if (prompt.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << prompt.size() << ".\n"; - - printf("tokens size: %zu\n", request.tokens.size()); - return INVALID_GUID; - } else { - request.initial_len = prompt.size(); - request.tokens = prompt; + request.add_special_tokens = req.add_special_tokens; + if (bos_token_id >= 0 && request.add_special_tokens && + model_type != ModelType::FALCON) { + request.tokens.push_back(bos_token_id); } + std::vector tokens = this->tokenizer_->Encode(req.prompt); + // for (int i = 0; i < tokens.size(); i++) { + // std::cout << "[" << i << "]" << tokens.at(i) << "\n"; + // } + // std::cout << "[slo ratio] " << req.slo_ratio << std::endl; + request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); + request.set_slo_ratio(req.slo_ratio); + printf("Registered as request[%ld] with slo %.3f ms\n", + request.guid, + get_slo_constraint(request)); if (get_num_ssms() == 0) { std::cout << "No small speculative model registered, using incremental " @@ -210,116 +559,67 @@ RequestManager::RequestGuid << std::endl; } else { std::cout << "Num of SSMs: " << get_num_ssms() << std::endl; - for (int i = 0; i < get_num_ssms(); i++) { - BeamTree beam_tree = BeamTree{}; - request.beam_trees.push_back(beam_tree); - } - } - - pending_request_queue.push(request); - all_requests[request.guid] = request; - { - const std::lock_guard lock(request_to_promise_mutex); - request_to_promise[request.guid] = new std::promise(); + assert(get_num_ssms() == 1 && "Only one SSM is supported now."); + init_token_tree(request.guid); } - if (verbose) { - std::cout << "new req: " << request.tokens.size() << std::endl; - for (int i = 0; i < request.tokens.size(); i++) { - std::cout << i << " : " << request.tokens[i] << std::endl; - } - } + request.streaming_cache_info = StreamingCacheInfo( + BatchConfig::SINK_SIZE, + BatchConfig::MAX_STREAMING_POS - BatchConfig::SINK_SIZE - + BatchConfig::get_max_tree_depth()); GenerationResult gr; gr.guid = request.guid; - gr.input_text = ""; - gr.input_tokens = prompt; - gr.output_text = ""; - gr.output_tokens = prompt; - request_generation_results[request.guid] = gr; - - return request.guid; -} - -RequestManager::RequestGuid - RequestManager::register_new_request(std::string const &prompt, - int max_sequence_length) { - const std::lock_guard lock(request_queue_mutex); - // Add a new request - Request request; - request.status = Request::PENDING; - request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - if (bos_token_id >= 0 && model_type != ModelType::FALCON) { - request.tokens.push_back(bos_token_id); - } - std::vector tokens = this->tokenizer_->Encode(prompt); - if (tokens.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << tokens.size() << ".\n"; - - printf("tokens size: %zu\n", tokens.size()); - return INVALID_GUID; - } - for (int i = 0; i < tokens.size(); i++) { - std::cout << "[" << i << "]" << tokens.at(i) << "\n"; - } - request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); - request.initial_len = request.tokens.size(); + gr.input_text = req.prompt; + gr.input_tokens = request.tokens; + gr.output_text = req.prompt; + 
gr.output_tokens = request.tokens; + gr.slo_ratio = req.slo_ratio; + gr.emission_time_ms = req.emission_time_ms; + + // Record time when request was enqueued + // Step idx -2: enqueueing; step idx -1: prefilling begins, step idx 0: + // prefilling finished + NewProfileInfo new_profile_info; + new_profile_info.timestamp = Realm::Clock::current_time_in_microseconds(); + new_profile_info.request_guid = request.guid; + new_profile_info.request_step_idx = -2; + new_profiling_info.push_back(new_profile_info); - if (get_num_ssms() == 0) { - std::cout << "No small speculative model registered, using incremental " - "decoding." - << std::endl; - } else { - std::cout << "Num of SSMs: " << get_num_ssms() << std::endl; - for (int i = 0; i < get_num_ssms(); i++) { - BeamTree beam_tree = BeamTree{}; - request.beam_trees.push_back(beam_tree); - } + { + std::lock_guard const lock(request_queue_mutex); + pending_request_queue.push(request); + all_requests[request.guid] = request; } - - pending_request_queue.push(request); - all_requests[request.guid] = request; + request_queue_cv.notify_all(); { - const std::lock_guard lock(request_to_promise_mutex); + std::lock_guard const lock(request_to_promise_mutex); request_to_promise[request.guid] = new std::promise(); } + { + std::lock_guard const lock(request_result_mutex); + request_generation_results[request.guid] = gr; + } { - std::string output = "New request tokens:"; - output = "[" + std::to_string(request.guid) + "]" + output; - for (int i = 0; i < request.tokens.size(); i++) { - output = output + " " + std::to_string(request.tokens[i]); - } - log_req_mgr.print("%s", output.c_str()); + // std::string output = "New request tokens:"; + // output = "[" + std::to_string(request.guid) + "] " + output; + // for (int i = 0; i < request.tokens.size(); i++) { + // output = output + " " + std::to_string(request.tokens[i]); + // } + // log_req_mgr.print("%s", output.c_str()); + // write_to_output_file("", output); } - GenerationResult gr; - gr.guid = request.guid; - gr.input_text = prompt; - gr.input_tokens = request.tokens; - gr.output_text = prompt; - gr.output_tokens = request.tokens; - request_generation_results[request.guid] = gr; return request.guid; } -bool RequestManager::is_request_completed(RequestGuid const &guid) { - const std::lock_guard lock(request_queue_mutex); - assert(all_requests.find(guid) != all_requests.end()); - Request const &request = all_requests[guid]; - // return request.tokens.size() >= request.max_sequence_length; - return request.status == Request::COMPLETED; -} - GenerationResult RequestManager::get_generation_result(RequestGuid const &guid) { // First get the future of the request std::future future; { - const std::lock_guard lock(request_to_promise_mutex); + std::lock_guard const lock(request_to_promise_mutex); assert(request_to_promise.find(guid) != request_to_promise.end()); future = request_to_promise[guid]->get_future(); } @@ -327,7 +627,7 @@ GenerationResult future.get(); // Get the generation result { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_result_mutex); assert(request_generation_results.find(guid) != request_generation_results.end()); return request_generation_results[guid]; @@ -338,1974 +638,2261 @@ size_t RequestManager::get_num_processed_requests() { return num_processed_requests; } -BatchConfigFuture - RequestManager::prepare_next_batch(BatchConfigFuture const &old_bc, - InferenceResultFuture const &result, - Context ctx, - Runtime *runtime) { +int 
RequestManager::get_num_active_requests() { + return num_available_requests; +} + +int RequestManager::get_empty_request_index() { + for (int i = 0; i < get_max_requests_per_batch(); i++) { + if (guid_of_requests[i] == INVALID_GUID) { + return i; + } + } + return -1; +} + +std::unordered_map + RequestManager::get_requests_profiling() { + return profiling_requests; +} + +std::unordered_map + RequestManager::get_request_generation_results() { + return request_generation_results; +} + +ProfileInfo RequestManager::get_profiling_info() { + return profiling; +} + +std::vector RequestManager::get_new_profiling_info() { + return new_profiling_info; +} + +BatchConfigFuture RequestManager::get_next_batch_config( + InferenceResultFuture const &result, Context ctx, Runtime *runtime) { RequestManager *rm = this; - TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_TASK_ID, + TaskLauncher launcher(RM_GET_NEXT_BATCH_CONFIG_TASK_ID, TaskArgument(&rm, sizeof(RequestManager *))); - launcher.add_future(old_bc); launcher.add_future(result); return runtime->execute_task(ctx, launcher); } -BatchConfig RequestManager::prepare_next_batch_task( +BatchConfig RequestManager::get_next_batch_config_task( Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { RequestManager *rm = *((RequestManager **)task->args); - BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - InferenceResult const &result = - Future(task->futures[1]).get_result(); - return rm->prepare_next_batch(*bc, result); -} - -BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, - InferenceResult const &result) { - const std::lock_guard lock(request_queue_mutex); - - // Step 1: append result from previous iteration to request's tokens - for (int i = 0; i < old_bc.num_tokens; i++) { - size_t guid = - old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid; - Request &request = all_requests[guid]; - if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < request.tokens.size()) { - // This is a prompt token - continue; - } else { - assert(old_bc.tokensInfo[i].abs_depth_in_request + 1 == - request.tokens.size()); - // This is a decoding token - log_req_mgr.print("Output token is: %d", result.token_ids[i]); - request.tokens.push_back(result.token_ids[i]); - // std::string output = this->tokenizer_->Decode(request.tokens); - // log_req_mgr.print("Output: %s", output.c_str()); - } + if (rm->request_manager_status == PREFILLING and rm->prefill_model == SSM and + rm->current_ssm_step != 0) { + // Return an empty batch config, because we only need on step for SSM + // prefilling, and the rest is placeholder for scheduling + return rm->get_next_batch_config(InferenceResult()); + } else if (rm->request_manager_status == SSM_SPEC and rm->ssm_completed) { + return rm->get_next_batch_config(InferenceResult()); } - int num_generation_tokens = 0; - int num_active_req = -1; - // Step 2: prepare the next batch for existing requests - BatchConfig new_bc; - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { - if (old_bc.request_completed[i]) { // add new requests to the next batch - continue; - } else { - assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); - Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - int processed_tokens = - old_bc.requestsInfo[i].first_token_depth_in_request + - old_bc.requestsInfo[i].num_tokens_in_batch; - assert(processed_tokens < request.tokens.size()); - bool request_completed = false; - // printf("model_type = %d\n", this->model_type); - if 
(request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { - request_completed = true; - } else if (request.tokens.back() == eos_token_id) { - // Encounter EOS token id - request_completed = true; - } - if (request_completed) { - std::string output = this->tokenizer_->Decode(request.tokens); - // Unlike Huggingface, the sentencepiece C++ library automatically - // removes the BOS token - if (model_type == ModelType::LLAMA && - request.tokens.at(0) == bos_token_id) { - output = " " + output; - } - { - // update generation result - GenerationResult &gr = request_generation_results[request.guid]; - assert(gr.guid == request.guid); - gr.output_tokens = request.tokens; - gr.output_text = output; - } - request.status = Request::COMPLETED; - trigger_request_completion_future(request.guid); - log_req_mgr.print("[Done] guid(%zu) final_length(%zu)", - old_bc.requestsInfo[i].request_guid, - request.tokens.size()); - log_req_mgr.print("Final output: %s", output.c_str()); - num_processed_requests++; - ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); - total_request_run_time += - profile_info.finish_time - profile_info.start_time; - profiling_requests[request.guid] = profile_info; - log_req_mgr.print( - "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.llm_decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); - // Write output to file if needed: - if (!output_filepath.empty()) { - std::ofstream outputFile(output_filepath, std::ios::app); - if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; - outputFile << "token IDs: "; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; - } - } - outputFile << std::endl; - outputFile << output; - outputFile.close(); - } else { - std::cout << "Unable to open the output file: " << output_filepath - << std::endl; - assert(false); - } - } + InferenceResult const &result = + Future(task->futures[0]).get_result(); + return rm->get_next_batch_config(result); +} - } else { - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = - old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - num_active_req++; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == - request.tokens.size()) { - // Incremental phase - new_bc.requestsInfo[i].num_tokens_in_batch = 1; - num_generation_tokens++; - new_bc.requestsInfo[i].prompt_phase = false; +BatchConfig + RequestManager::get_next_batch_config(InferenceResult const &result) { + static double process_this_start_us = 0.0, process_last_end_us = 0.0; + if (get_eval_overhead_breakdown()) { + process_this_start_us = Realm::Clock::current_time_in_microseconds(); + if (process_last_end_us != 0) { + if (request_manager_status == PREFILLING) { + if (prefill_model == SSM) { + eval_ssm_prefill_latency_us += + 
process_this_start_us - process_last_end_us; } else { - // Prompt phase - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, - (int)request.tokens.size() - - new_bc.requestsInfo[i].first_token_depth_in_request); - new_bc.requestsInfo[i].prompt_phase = true; - } - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - assert(depth < request.tokens.size()); - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[depth]; - new_bc.num_tokens++; + eval_llm_prefill_latency_us += + process_this_start_us - process_last_end_us; } - // Update profiling - profiling_requests[new_bc.requestsInfo[i].request_guid] - .llm_decoding_steps++; + } else if (request_manager_status == SSM_SPEC) { + eval_ssm_spec_latency_us += process_this_start_us - process_last_end_us; + } else if (request_manager_status == LLM_VERIFY) { + eval_llm_verify_latency_us += + process_this_start_us - process_last_end_us; } } } - new_bc.num_generation_tokens = num_generation_tokens; - - // Step 3: add new requests to the next batch - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { - if (new_bc.request_completed[i]) { - if (!pending_request_queue.empty() && - new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); - // all_requests[new_request.guid] = new_request; - - new_bc.requestsInfo[i].first_token_depth_in_request = 0; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = new_request.guid; - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, - (int)new_request.tokens.size()); - new_bc.requestsInfo[i].max_sequence_length = - new_request.max_sequence_length; - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].prompt_phase = true; - num_active_req++; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - // add profile_info for the new request - ProfileInfo profile_info; - profile_info.llm_decoding_steps = 1; - profile_info.start_time = Realm::Clock::current_time_in_microseconds(); - profiling_requests[new_request.guid] = profile_info; - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - assert(depth < new_request.tokens.size()); - new_bc.tokensInfo[new_bc.num_tokens].token_id = - new_request.tokens[depth]; - new_bc.num_tokens++; - } - if (new_bc.num_tokens == get_max_tokens_per_batch()) { - break; - } + update_inference_results(result); + BatchConfig bc = prepare_next_batch(); + if (get_eval_overhead_breakdown()) { + process_last_end_us = Realm::Clock::current_time_in_microseconds(); + double process_time_us = process_last_end_us - process_this_start_us; + // printf("Process time: %.3f us\n", process_time_us); + eval_process_latency_us += process_time_us; + } + return bc; +} + +// Return value: true if load a pending request to the batch +bool RequestManager::load_pending_request_to_batch() { + static double load_request_start = 0.0; + if (get_eval_overhead_breakdown()) { + load_request_start = 
Realm::Clock::current_time_in_microseconds(); + } + if (num_running_requests >= get_max_requests_per_batch()) { + if (get_eval_overhead_breakdown()) { + eval_other_latency_us += + Realm::Clock::current_time_in_microseconds() - load_request_start; + } + return false; + } + std::unique_lock lock(request_queue_mutex); + if (pending_request_queue.empty()) { + if (num_running_requests > 0) { + // No pending request to process, but there are running requests in the + // batch. Do nothing and return + if (get_eval_overhead_breakdown()) { + eval_other_latency_us += + Realm::Clock::current_time_in_microseconds() - load_request_start; + } + return false; + } + // Wait until there is a pending request or the background server is + // terminated + request_queue_cv.wait(lock, [&] { + return !pending_request_queue.empty() || + is_background_server_terminated(); + }); + // If the background server has been terminated, exit + if (is_background_server_terminated()) { + if (get_eval_overhead_breakdown()) { + eval_other_latency_us += + Realm::Clock::current_time_in_microseconds() - load_request_start; } + return false; } } + assert(!pending_request_queue.empty() && "No pending request to process."); + if (profiling.server_start_time == 0) { + reset_profiling_statistics(); + } + while (num_running_requests < get_max_requests_per_batch() && + !pending_request_queue.empty()) { + RequestGuid guid = pending_request_queue.front().guid; + pending_request_queue.pop(); + Request *request = &all_requests[guid]; + if (request->tokens.size() > get_max_sequence_length()) { + std::cerr << "Request " << guid + << " exceeds the maximum sequence length: " + << request->tokens.size() << " > " << get_max_sequence_length() + << std::endl; + continue; + } - return new_bc; + request->status = Request::RUNNING; + // Find an empty slot + int request_index = get_empty_request_index(); + assert(request_index != -1 && "No empty request slot to load the request."); + // Load request into batch + request->batch_index = request_index; + guid_of_requests[request_index] = guid; + num_running_requests++; + request_available[request_index] = true; + num_available_requests++; + // Initialize the bitmask for the new request with its prompt length + init_bitmask_prompt(guid, request->tokens.size()); + + prefilling_requests.push_back(request); + + profiling_requests[guid] = RequestProfileInfo(); + profiling_requests[guid].start_time = + Realm::Clock::current_time_in_microseconds(); + } + if (get_eval_overhead_breakdown()) { + eval_other_latency_us += + Realm::Clock::current_time_in_microseconds() - load_request_start; + } + return true; } -/* ----- Speculative Inference Specific functions ----- */ +void RequestManager::request_update_attainment(int batch_index, bool attained) { + Request &request = all_requests[guid_of_requests[batch_index]]; + request.attained &= attained; +} -/***** Request Init Phase *****/ -BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_init( - TreeVerifyBatchConfigFuture const &old_bc, - InferenceResultFuture const &result, - int model_id, - Context ctx, - Runtime *runtime) { +bool isPrefixAndRemove(std::vector const &prefix, std::vector &vec) { + if (prefix.size() > vec.size()) { + return false; + } - RequestManager *rm = this; - TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, - TaskArgument(&rm, sizeof(RequestManager *))); - launcher.add_future(old_bc); - launcher.add_future(result); - launcher.add_future(Future::from_value(model_id)); - return runtime->execute_task(ctx, launcher); -} + if 
(std::equal(prefix.begin(), prefix.end(), vec.begin())) { + vec.erase(vec.begin(), vec.begin() + prefix.size()); + return true; + } -BeamSearchBatchConfig RequestManager::prepare_next_batch_init_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - RequestManager *rm = *((RequestManager **)task->args); - TreeVerifyBatchConfig const &bc = - Future(task->futures[0]).get_result(); - InferenceResult const &result = - Future(task->futures[1]).get_result(); - int model_id = Future(task->futures[2]).get_result(); - return rm->prepare_next_batch_init(bc, result, model_id); + return false; } -BeamSearchBatchConfig - RequestManager::prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, - InferenceResult const &result, - int model_id) { - const std::lock_guard lock(request_queue_mutex); - if (verbose) { - std::cout << "\n############### prepare_next_batch_init ###############\n"; +void RequestManager::request_complete_clean_up(int batch_index) { + static double request_complete_start = 0.0; + if (get_eval_overhead_breakdown()) { + request_complete_start = Realm::Clock::current_time_in_microseconds(); } + RequestGuid guid = guid_of_requests[batch_index]; - // Step 1: use result to update requests - BeamSearchBatchConfig new_bc; - new_bc.num_tokens = 0; - new_bc.model_id = model_id; - int result_index = 0; + profiling_requests[guid].finish_time = + Realm::Clock::current_time_in_microseconds(); + Request &request = all_requests[guid]; + guid_of_requests[batch_index] = INVALID_GUID; + num_running_requests--; + request_available[batch_index] = false; + num_available_requests--; + request.status = Request::COMPLETED; + + // page attention: free the pages + PageManager *page_manager = PageManager::get_page_manager(); + page_manager->free_request(guid); + + // Find the sos and eos in the sequence + // auto bos_it = std::find( + // request.tokens.begin(), request.tokens.end(), this->bos_token_id); + // auto eos_rit = std::find( + // request.tokens.rbegin(), request.tokens.rend(), this->eos_token_id); + // std::vector::iterator eos_it; + // if (eos_rit != request.tokens.rend()) { + // eos_it = eos_rit.base(); + // } else { + // eos_it = request.tokens.end(); + // } + // std::string output = + // this->tokenizer_->Decode(std::vector(bos_it, eos_it)); + { + std::lock_guard const lock(request_result_mutex); + request_generation_results[guid].output_tokens = request.tokens; + assert(isPrefixAndRemove(request_generation_results[guid].input_tokens, + request_generation_results[guid].output_tokens)); + if (request_generation_results[guid].output_tokens.size() > 0 && + is_eos_token( + request_generation_results[guid].output_tokens + [request_generation_results[guid].output_tokens.size() - 1]) && + !request.add_special_tokens) { + request_generation_results[guid].output_tokens.pop_back(); + } + request_generation_results[guid].output_text = this->tokenizer_->Decode( + request_generation_results[guid].output_tokens); + request_generation_results[guid].decoding_steps = + profiling_requests[guid].llm_decoding_steps; + // request_generation_results[guid].output_tokens = + // std::vector(bos_it, eos_it); + } - int num_generation_tokens = 0; - int num_active_req = -1; + trigger_request_completion_future(guid); - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { - if (old_bc.request_completed[i]) { - continue; + std::string output = this->tokenizer_->Decode(request.tokens); + std::cout << "Request " << guid << " completed: " << std::endl; + std::cout << "" << 
output; + if (is_eos_token(request.tokens.back())) { + std::cout << ""; + } + std::cout << std::endl << std::endl; + { + RequestProfileInfo profile_info = profiling_requests[guid]; + + std::ostream *os = &std::cout; + std::ofstream output_file; + if (!output_filepath.empty()) { + output_file.open(output_filepath, std::ios::app); + if (output_file.is_open()) { + os = &output_file; + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); + } } - size_t guid = old_bc.requestsInfo[i].request_guid; - Request &request = all_requests[guid]; - - std::cout << "[ " << guid << " ]" << std::endl; + *os << "Request " << guid << " profiling: " << std::endl; + if (profile_info.start_decoding_time != 0) { + *os << "Decoding time: " + << (profile_info.finish_time - profile_info.start_decoding_time) * + 1e-3 + << " ms" << std::endl; + } else { + *os << "Decoding time: 0 ms" << std::endl; + } + *os << "Total time: " + << (profile_info.finish_time - profile_info.start_time) * 1e-3 << " ms" + << std::endl; + *os << "LLM decoding steps: " << profile_info.llm_decoding_steps + << std::endl; + if (decoding_mode == SPECULATIVE_DECODING) { + *os << "SSM decoding steps: " << profile_info.ssm_decoding_steps + << std::endl; + } + *os << std::endl; + // *os << output << std::endl << std::endl; - // Verify this: get verified tokens from result - std::vector> tree_outputs = - std::vector>(); + if (!output_filepath.empty()) { + output_file.close(); + } + } + // RequestProfileInfo profile_info = profiling_requests[guid]; + // std::string str = + // "[" + std::to_string(guid) + + // "] Request completed:" + " decoding_time_ms(" + + // std::to_string( + // (profile_info.finish_time - profile_info.start_decoding_time) * + // 1e-3) + + // ")" + " total_time_ms(" + + // std::to_string((profile_info.finish_time - profile_info.start_time) * + // 1e-3) + + // ")" + " LLM_decoding_steps(" + + // std::to_string(profile_info.llm_decoding_steps) + ")"; + // if (decoding_mode == SPECULATIVE_DECODING) { + // str = str + " SSM_decoding_steps(" + + // std::to_string(profile_info.ssm_decoding_steps) + ")"; + // } + // write_to_output_file("", str); + if (get_eval_overhead_breakdown()) { + eval_other_latency_us += + Realm::Clock::current_time_in_microseconds() - request_complete_start; + } +} - assert(old_bc.num_tokens > 0); +void RequestManager::request_offload_from_batch(int batch_index) { + RequestGuid guid = guid_of_requests[batch_index]; + Request &request = all_requests[guid]; + // Still keep the request in `guid_of_requests` where can be retrieved later + request_available[batch_index] = false; + num_available_requests--; +} - // reset committed_tokens - if (committed_tokens.count(guid) == 0) { - committed_tokens[guid] = {}; - } else { - committed_tokens[guid].clear(); - } - - // iterate through all the tokens that belong to request i - int root_abs_depth = request.tokens.size() - 1; - - while (result_index < old_bc.num_tokens && - old_bc.tokensInfo[result_index].request_index == i) { - int abs_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; - int token_id = result.token_ids[result_index]; - - if (request.status == Request::PENDING) { - committed_tokens[guid].emplace_back(abs_depth, result_index); - } else if (abs_depth >= root_abs_depth) { - tree_outputs.emplace_back(token_id, abs_depth + 1); - // std::cout << "committred tokens push: " << abs_depth - // << " ,result index: " << result_index << "\n"; - committed_tokens[guid].emplace_back(abs_depth, result_index); - 
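// Illustrative, self-contained sketch of the output assembly performed in
// request_complete_clean_up() above: the prompt tokens are stripped from the
// front of the full token list (mirroring the isPrefixAndRemove() helper),
// and a trailing EOS is dropped when special tokens were not requested.
// The token ids and EOS id below are made up for the example.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

// Erase `prefix` from the front of `vec` if and only if it really is a prefix.
static bool strip_prefix(std::vector<int> const &prefix, std::vector<int> &vec) {
  if (prefix.size() > vec.size() ||
      !std::equal(prefix.begin(), prefix.end(), vec.begin())) {
    return false;
  }
  vec.erase(vec.begin(), vec.begin() + prefix.size());
  return true;
}

int main() {
  int const eos_id = 2;
  bool const add_special_tokens = false;
  std::vector<int> prompt = {1, 15, 7, 42};
  std::vector<int> all_tokens = {1, 15, 7, 42, 9, 11, 2}; // prompt + generation
  assert(strip_prefix(prompt, all_tokens));               // keep only generated tokens
  if (!add_special_tokens && !all_tokens.empty() && all_tokens.back() == eos_id) {
    all_tokens.pop_back();                                // drop trailing EOS
  }
  for (int t : all_tokens) {
    std::printf("%d ", t); // prints: 9 11
  }
  std::printf("\n");
  return 0;
}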
- if (verbose) { - std::cout << "Index within old batch: " << result_index << std::endl; - printf(" Input: [%d] %d ---> [%d] %d \n", - abs_depth, - old_bc.tokensInfo[result_index].token_id, - tree_outputs.back().second, - token_id); - } - // std::cout << "Index within old batch: " << result_index << std::endl; - // printf(" Input: [%d] %d ---> [%d] %d \n", - // abs_depth, - // old_bc.tokensInfo[result_index].token_id, - // tree_outputs.back().second, - // token_id); - } - result_index++; - } +void RequestManager::request_load_onto_batch(int batch_index) { + RequestGuid guid = guid_of_requests[batch_index]; + Request &request = all_requests[guid]; + request_available[batch_index] = true; + num_available_requests++; +} - if (request.status == Request::RUNNING) { +void RequestManager::update_token_tree_depth() { + ssm_tree_depth = min(int(std::ceil((double)get_max_tokens_per_batch() / + get_num_active_requests())), + get_max_tree_depth()); +} - std::vector> verified_tokens = - traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); +void RequestManager::update_inference_results(InferenceResult const &result) { + // Update the inference results + if (num_running_requests == 0) { + // Update nothing + // Load the pending request to the batch + load_pending_request_to_batch(); + request_manager_status = PREFILLING; + if (decoding_mode == SPECULATIVE_DECODING) { + prefill_model = SSM; + current_ssm_step = 0; + } + return; + } - log_req_mgr.print("Number of Verified Tokens = %zu", - verified_tokens.size()); - // check if the request is finished - if (verified_tokens.size() + request.tokens.size() >= - request.max_sequence_length) { - // Append all verified tokens to the request - for (auto const &token_pair : verified_tokens) { - if (token_pair.second < request.max_sequence_length) { - request.tokens.push_back(token_pair.first); + switch (request_manager_status) { + case PREFILLING: + if (decoding_mode == INCREMENTAL_DECODING) { + // This indicates that the prefilling of the requests finishes + bool all_prefilled = update_llm_prefill_results(result); + // Check if there are more empty slots + if (load_pending_request_to_batch() or !all_prefilled) { + // Load the pending request to the batch + request_manager_status = PREFILLING; + } else { + // No more empty slots, start the decoding + while (!prefilled_requests.empty()) { + Request *request = prefilled_requests.front(); + request_load_onto_batch(request->batch_index); + prefilled_requests.pop(); } + request_manager_status = DECODING; } - log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", - request.guid, - request.tokens.size()); - std::string output = this->tokenizer_->Decode(request.tokens); - // Unlike Huggingface, the sentencepiece C++ library automatically - // removes the BOS token - if (model_type == ModelType::LLAMA && - request.tokens.at(0) == bos_token_id) { - output = " " + output; - } - { - // update generation result - GenerationResult &gr = request_generation_results[request.guid]; - assert(gr.guid == request.guid); - gr.output_tokens = request.tokens; - gr.output_text = output; - } - request.status = Request::COMPLETED; - trigger_request_completion_future(request.guid); - log_req_mgr.print("Final output: %s", output.c_str()); - - new_bc.request_completed[i] = true; - new_bc.request_running[i] = false; - num_processed_requests++; - - // Log profiling info - ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); - 
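// Illustrative only: the per-step speculation depth computed in
// update_token_tree_depth() above can be read as splitting the batch token
// budget evenly across active requests, clamped by the configured maximum
// depth. All names below are local to this sketch, not the RequestManager API.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>

static int compute_tree_depth(int max_tokens_per_batch,
                              int num_active_requests,
                              int max_tree_depth) {
  assert(num_active_requests > 0);
  int depth = static_cast<int>(std::ceil(
      static_cast<double>(max_tokens_per_batch) / num_active_requests));
  return std::min(depth, max_tree_depth);
}

int main() {
  // A 128-token budget shared by 3 requests with a depth cap of 8:
  // ceil(128 / 3) = 43, clamped to 8.
  std::printf("%d\n", compute_tree_depth(128, 3, 8));  // prints 8
  // With 64 active requests the budget is the limiting factor: ceil(128/64) = 2.
  std::printf("%d\n", compute_tree_depth(128, 64, 8)); // prints 2
  return 0;
}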
profile_info.ssm_decoding_steps = 0; - total_request_run_time += - profile_info.finish_time - profile_info.start_time; - profiling_requests[request.guid] = profile_info; - log_req_mgr.print( - "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.llm_decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); - - // Write output to file if needed: - if (!output_filepath.empty()) { - std::ofstream outputFile(output_filepath, std::ios::app); - if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; - outputFile << "token IDs: "; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; - } - } - outputFile << std::endl; - outputFile << output; + // Not completed, continue prefilling + } else if (decoding_mode == SPECULATIVE_DECODING) { + if (prefill_model == SSM) { + // A single iteration contains max_tree_depth SSM steps and a single + // LLM step. To align with this structure, we have to create + // max_tree_depth - 1 empty SSM steps during the prefilling phase. + if (current_ssm_step == 0) { + update_ssm_prefill_results(result); + } + // Except for the first step, we do nothing. + current_ssm_step++; - outputFile.close(); + if (current_ssm_step == get_max_tree_depth()) { + prefill_model = LLM; + } + } else if (prefill_model == LLM) { + // This indicates that the prefilling of the requests finishes + bool all_prefilled = update_llm_prefill_results(result); + if (load_pending_request_to_batch() or !all_prefilled) { + request_manager_status = PREFILLING; + prefill_model = SSM; + current_ssm_step = 0; } else { - std::cout << "Unable to open the output file: " << output_filepath - << std::endl; - assert(false); + // No more empty slots, start the speculation + while (!prefilled_requests.empty()) { + Request *request = prefilled_requests.front(); + request_load_onto_batch(request->batch_index); + prefilled_requests.pop(); + } + request_manager_status = SSM_SPEC; + // Reset the prefill_request + current_ssm_step = 0; + ssm_completed = false; } + } else { + assert(false && "Invalid prefill model."); } + } else { + assert(false && "Invalid inference mode."); + } + break; + case DECODING: { + bool request_completed = update_llm_decode_results(result); + if (load_pending_request_to_batch()) { + request_manager_status = PREFILLING; + } else { + request_manager_status = DECODING; + } + } break; + case LLM_VERIFY: { + bool request_completed = update_llm_verify_results(result); + if (load_pending_request_to_batch()) { + request_manager_status = PREFILLING; + prefill_model = SSM; + current_ssm_step = 0; + } else { + request_manager_status = SSM_SPEC; + current_ssm_step = 0; + ssm_completed = false; + } + } break; + case SSM_SPEC: + // Update current_ssm_step first because when we first call + // update_ssm_inference_results, there's already a step of small model + // inference + current_ssm_step++; + if (!ssm_completed) { + ssm_completed = update_ssm_inference_results(result); + } + // If the ssm speculation is completed, we do nothing - // delete the old input tree from cache - dfs_tree_inputs.erase(request.guid); - - } else { // Request not finished, pass verified_tokens to next iteration - - 
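// Schematic sketch of the step pattern described in the comment above: one
// speculative iteration issues get_max_tree_depth() SSM steps followed by a
// single LLM verify step. The types here are hypothetical and self-contained;
// this is not the RequestManager state machine itself.
#include <cstdio>

enum class Phase { SsmSpec, LlmVerify };

struct IterationDriver {
  int max_tree_depth;
  int current_ssm_step = 0;
  Phase phase = Phase::SsmSpec;

  // Advance by one scheduled step and return the phase that was just run.
  Phase step() {
    Phase ran = phase;
    if (phase == Phase::SsmSpec) {
      ++current_ssm_step;
      if (current_ssm_step == max_tree_depth) {
        phase = Phase::LlmVerify; // tree is full, hand off to the LLM
      }
    } else {
      current_ssm_step = 0;       // verification done, start a new tree
      phase = Phase::SsmSpec;
    }
    return ran;
  }
};

int main() {
  IterationDriver driver{/*max_tree_depth=*/4};
  // Two full iterations print: SSSSVSSSSV
  for (int i = 0; i < 10; ++i) {
    std::printf("%c", driver.step() == Phase::SsmSpec ? 'S' : 'V');
  }
  std::printf("\n");
  return 0;
}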
new_bc.request_completed[i] = false; - new_bc.request_running[i] = true; - num_active_req++; - - // Normal Request Info - new_bc.requestsInfo[i].first_token_depth_in_request = - verified_tokens.front().second; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = - old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - - // TODO: Beam Request Info, missing from VerifyTreeBatchConfig - int new_max_depth = - new_bc.requestsInfo[i].max_sequence_length - - new_bc.requestsInfo[i].first_token_depth_in_request - - verified_tokens.size(); - new_bc.beamRequestsInfo[i].current_depth = 1; - - profiling_requests[request.guid].ssm_decoding_steps = 0; - new_bc.requestsInfo[i].prompt_phase = true; - - int ssm_decoding_steps = 0; - new_bc.beamRequestsInfo[i].beam_size = - spec_infer_tree_width.size() > ssm_decoding_steps - ? spec_infer_tree_width[ssm_decoding_steps] - : 1; - new_bc.beamRequestsInfo[i].max_depth = - std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); - for (int j = 0; - j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; - j++) { - new_bc.beamRequestsInfo[i].parent_id[j] = 0; - new_bc.beamRequestsInfo[i].probs[j] = 1; - } - - new_bc.beamRequestsInfo[i].sub_request_num = 1; - - new_bc.sub_requests[i] = 1; - - updateBitMask(new_bc.causalMask[i], - verified_tokens.size(), - request.tokens.size()); - - // Token Info - for (int j = 0; j < verified_tokens.size(); j++) { - auto token = verified_tokens.at(j); - - // Normal Token Info - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = - token.second; + if (current_ssm_step == get_max_tree_depth()) { + request_manager_status = LLM_VERIFY; + } + break; + default: + assert(false && "Invalid request manager status."); + } +} - // Beam Token Info - new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; - new_bc.num_tokens++; +bool RequestManager::update_llm_prefill_results(InferenceResult const &result) { + int num_tokens = 0; + std::vector incomplete_requests; + incomplete_requests.reserve(prefilling_requests.size()); + for (Request *request : prefilling_requests) { + if (request->num_tokens_in_batch > 0) { + if (decoding_mode == INCREMENTAL_DECODING && streaming_cache) { + request->streaming_cache_info.commit_cache( + request->num_tokens_in_batch); + request->llm_cache_size = request->streaming_cache_info.commit_len; + } else { + request->llm_cache_size += request->num_tokens_in_batch; + } + request->llm_prefill_len += request->num_tokens_in_batch; - // Add verified token to request's token list - request.tokens.push_back(token.first); + if (request->llm_prefill_len == request->tokens.size()) { + // Indicates that this request's prefilling phase finishes + request->tokens.push_back( + result.token_ids[num_tokens + request->num_tokens_in_batch - 1]); - if (new_bc.num_tokens == get_max_tokens_per_batch()) { - break; + if (is_eos_token(request->tokens.back())) { + request_complete_clean_up(request->batch_index); + } else { + // Temporarily offload request from the batch + request_offload_from_batch(request->batch_index); + prefilled_requests.push(request); + + if (decoding_mode == SPECULATIVE_DECODING) { + // Add the last token 
to the token tree + assert(request->committed_tokens.empty() && + "The committed tokens should be empty."); + request->committed_tokens.push_back(Request::CommittedToken{ + -1, (int)request->tokens.size() - 1, request->tokens.back()}); + init_token_tree(request->guid); + add_root_to_spec_token_tree(request->guid, request->tokens.back()); + update_bitmask_prompt(request->guid, 1); } } - - std::string output = this->tokenizer_->Decode(request.tokens); - // Unlike Huggingface, the sentencepiece C++ library automatically - // removes the BOS token - if (model_type == ModelType::LLAMA && - request.tokens.at(0) == bos_token_id) { - output = " " + output; - } - log_req_mgr.print("Output: %s", output.c_str()); + } else { + // Next phase will still be prefilling + incomplete_requests.push_back(request); } + profiling_requests[request->guid].llm_prefilling_steps++; + num_tokens += request->num_tokens_in_batch; + } else if (request->llm_prefill_len < request->tokens.size()) { + // The request is not completed, continue prefilling + incomplete_requests.push_back(request); + } + } - } else if (request.status == Request::PENDING) { - new_bc.request_completed[i] = false; - new_bc.request_running[i] = false; - num_active_req++; - - std::cout << "ssm_cache_size: " << request.ssm_cache_size << ", " - << "initial_len: " << request.initial_len << std::endl; - assert(request.ssm_cache_size == request.initial_len); - - // Normal Request Info - new_bc.requestsInfo[i].first_token_depth_in_request = - request.ssm_cache_size; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[i].num_tokens_in_batch = 0; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - - // TODO: Beam Request Info, missing from VerifyTreeBatchConfig - new_bc.beamRequestsInfo[i].current_depth = 1; - int ssm_decoding_steps = - profiling_requests[request.guid].ssm_decoding_steps; - new_bc.beamRequestsInfo[i].beam_size = - spec_infer_tree_width.size() > ssm_decoding_steps - ? 
spec_infer_tree_width[ssm_decoding_steps] - : 1; - new_bc.beamRequestsInfo[i].max_depth = 0; - for (int j = 0; j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; - j++) { - new_bc.beamRequestsInfo[i].parent_id[j] = 0; - new_bc.beamRequestsInfo[i].probs[j] = 1; - } + prefilling_requests.swap(incomplete_requests); + return prefilling_requests.empty(); +} - new_bc.beamRequestsInfo[i].sub_request_num = 1; +bool RequestManager::update_llm_decode_results(InferenceResult const &result) { + bool request_completed = false; + int nb_requests_decoded = 0; + long long int current_time = Realm::Clock::current_time_in_microseconds(); - new_bc.sub_requests[i] = 1; + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + // Request in this slot is unavailable + continue; + } + int guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + if (streaming_cache) { + request.streaming_cache_info.commit_cache(1); + request.llm_cache_size = request.streaming_cache_info.commit_len; + } else { + request.llm_cache_size++; + } + request.tokens.push_back( + result.token_ids[request.first_token_offset_in_batch]); + + request.decode_latency_ms = + (current_time - profiling_requests[guid].start_decoding_time) * 1e-3; + bool attained = + request.decode_latency_ms <= get_request_expected_latency(request); + profiling_requests[guid].llm_decoding_steps++; + nb_requests_decoded++; + + NewProfileInfo new_profile_info; + new_profile_info.timestamp = Realm::Clock::current_time_in_microseconds(); + new_profile_info.request_guid = guid; + new_profile_info.request_step_idx = + profiling_requests[guid].llm_decoding_steps - 1; + new_profile_info.num_generated_tokens = 1; + new_profiling_info.push_back(new_profile_info); + + if (is_eos_token(request.tokens.back()) or + request.decode_length() >= get_max_output_length() or + request.tokens.size() >= get_max_sequence_length()) { + request_update_attainment(request_index, attained); + request_completed = true; + request_complete_clean_up(request_index); + } else if (!attained and slo_violation_early_termination) { + // Early drop that request + request_update_attainment(request_index, attained); + request_completed = true; + request_complete_clean_up(request_index); + } - // Token Info + if (verbose) { std::string output = this->tokenizer_->Decode(request.tokens); - // Unlike Huggingface, the sentencepiece C++ library automatically removes - // the BOS token - if (model_type == ModelType::LLAMA && - request.tokens.at(0) == bos_token_id) { - output = " " + output; - } - log_req_mgr.print("Output: %s", output.c_str()); - } else { - assert(false); + std::cout << "Request " << guid << " tokens: " << std::endl + << output << std::endl; } } + profiling.llm_step_times.push_back((current_time - profiling.llm_step_start) * + 1e-3); + profiling.requests_per_step.push_back(nb_requests_decoded); + profiling.generated_tokens_per_step.push_back(nb_requests_decoded); + return request_completed; +} - // Step 2: Initialize new request - for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { - if (new_bc.request_completed[i]) { - if (!pending_request_queue.empty() && - new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); - // all_requests[new_request.guid] = new_request; - num_active_req++; - 
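// Minimal sketch of the SLO attainment bookkeeping used in
// update_llm_decode_results() above: a request stays "attained" only while its
// measured decode latency is within its expected latency. The expected-latency
// formula here (slo_ratio times an assumed per-token baseline) is an
// illustration only; the patch obtains it from get_request_expected_latency(),
// whose definition is not shown in this hunk.
#include <cstdio>

struct SloTracker {
  double slo_ratio;             // relative slack granted to this request
  double baseline_ms_per_token; // assumed baseline decode speed
  bool attained = true;

  // Called once per decoded token with the elapsed decode time so far.
  bool observe(int tokens_decoded, double decode_latency_ms) {
    double expected_ms = slo_ratio * baseline_ms_per_token * tokens_decoded;
    bool ok = decode_latency_ms <= expected_ms;
    attained &= ok; // a single violation marks the whole request as missed
    return ok;      // a false return could trigger early termination
  }
};

int main() {
  SloTracker t{/*slo_ratio=*/1.5, /*baseline_ms_per_token=*/20.0};
  std::printf("%d\n", t.observe(10, 250.0)); // 250 <= 300 -> 1
  std::printf("%d\n", t.observe(20, 700.0)); // 700 >  600 -> 0
  std::printf("%d\n", t.attained);           // sticky: 0
  return 0;
}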
new_bc.requestsInfo[i].first_token_depth_in_request = 0; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = new_request.guid; - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, - (int)new_request.tokens.size()); - new_bc.requestsInfo[i].max_sequence_length = - new_request.max_sequence_length; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - - // add profile_info for the new request - ProfileInfo profile_info; - profile_info.llm_decoding_steps = 0; - profile_info.ssm_decoding_steps = 0; - profile_info.start_time = Realm::Clock::current_time_in_microseconds(); - profiling_requests[new_request.guid] = profile_info; - // init the beam search metadata per request - int ssm_decoding_steps = profile_info.ssm_decoding_steps; - - new_bc.beamRequestsInfo[i].beam_size = - spec_infer_tree_width.size() > ssm_decoding_steps - ? spec_infer_tree_width[ssm_decoding_steps] - : 1; - new_bc.beamRequestsInfo[i].current_depth = 1; - new_bc.beamRequestsInfo[i].max_depth = - std::min(BeamSearchBatchConfig::MAX_BEAM_DEPTH, - get_max_tokens_per_batch() - - new_bc.requestsInfo[i].num_tokens_in_batch - 1); - for (int j = 0; - j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; - j++) { - new_bc.beamRequestsInfo[i].parent_id[j] = 0; - new_bc.beamRequestsInfo[i].probs[j] = 1; - } - - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].prompt_phase = true; - - new_bc.beamRequestsInfo[i].sub_request_num = 1; - printf("sub request num == 1, %d \n", - new_bc.beamRequestsInfo[i].beam_size); - - new_bc.sub_requests[i] = 1; - - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - assert(depth < new_request.tokens.size()); - new_bc.tokensInfo[new_bc.num_tokens].token_id = - new_request.tokens[depth]; - - // beam search meta data, indicate which sub request this token - // belongs to, init to 0; - new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; - new_bc.num_tokens++; - } +void RequestManager::update_ssm_prefill_results( + InferenceResult const &ssm_prefill_result) { + // This function is called by update_inference_results when the + // request_manager_status is PREFILLING and the prefill_model is SSM. + // There's no results to update, but we should update ssm_cache_size. 
+ for (Request *request : prefilling_requests) { + if (request->num_tokens_in_batch > 0) { + if (streaming_cache) { + request->streaming_cache_info.commit_cache( + request->num_tokens_in_batch); + request->ssm_cache_size = request->streaming_cache_info.commit_len; + } else { + request->ssm_cache_size += request->num_tokens_in_batch; + } + request->ssm_prefill_len += request->num_tokens_in_batch; - initBitMask(new_bc.causalMask[i], - new_bc.requestsInfo[i].num_tokens_in_batch); - - // if (new_bc.requestsInfo[i].num_tokens_in_batch < - // new_request.initial_len) { - // all_requests[new_request.guid].status = Request::PENDING; - // new_bc.request_running[i] = false; - // std::cout << "Request " << new_request.guid << " is pending" - // << std::endl; - // } else { - // all_requests[new_request.guid].status = Request::RUNNING; - // new_bc.request_running[i] = true; - // std::cout << "Request " << new_request.guid << " is running" - // << std::endl; - // } - all_requests[new_request.guid].status = Request::PENDING; - all_requests[new_request.guid].ssm_cache_size = - new_bc.requestsInfo[i].num_tokens_in_batch; - new_bc.request_running[i] = false; - std::cout << "SSM KV Cache Size init: " - << all_requests[new_request.guid].ssm_cache_size << std::endl; - std::cout << "LLM KV Cache Size init: " - << all_requests[new_request.guid].llm_cache_size << std::endl; - - std::cout << "load " << new_bc.requestsInfo[i].num_tokens_in_batch - << " tokens for request " << new_request.guid << std::endl; - std::cout << "total prompt in request: " << new_request.initial_len - << std::endl; + profiling_requests[request->guid].ssm_prefilling_steps++; + } + } +} - if (new_bc.num_tokens == get_max_tokens_per_batch()) { - break; +BatchConfig RequestManager::prepare_next_batch() { + if (is_background_server_terminated()) { + return BatchConfig(); + } + switch (request_manager_status) { + case PREFILLING: + if (decoding_mode == INCREMENTAL_DECODING) { + return prepare_llm_prefilling_batch(); + } else if (decoding_mode == SPECULATIVE_DECODING) { + if (prefill_model == SSM) { + if (current_ssm_step == 0) { + return prepare_ssm_prefilling_batch(); + } else { + // Return an empty batch config + return BatchConfig(); + } + } else if (prefill_model == LLM) { + return prepare_llm_prefilling_batch(); + } else { + assert(false && "Invalid prefill model."); } + } else { + assert(false && "Invalid inference mode."); } + break; + case DECODING: + if (get_fcfs_slo()) { + return prepare_decoding_batch_fcfs_slo(); + } else if (get_stta()) { + return prepare_decoding_batch_stta(); + } else { + return prepare_decoding_batch(); + } + case SSM_SPEC: + if (current_ssm_step == 0) { + return prepare_first_spec_batch_config(); + } else if (!ssm_completed) { + return prepare_next_spec_batch_config(); + } else { + // Return an empty batch config + return BatchConfig(); + } + case LLM_VERIFY: + return prepare_verify_batch_config(); + default: + std::cout << "Invalid request manager status: " << request_manager_status + << std::endl; + assert(false); + } +} + +BatchConfig RequestManager::prepare_llm_prefilling_batch() { + // This function is called when the request_manager_status is PREFILLING, + // which means that there is a request in the prefilling phase. + // This function load its prefilling tokens, constructing a BatchConfig with + // only one request. 
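// Rough sketch of the token-budget split used by the prefilling batch
// construction below: each prefilling request receives
// min(remaining budget, remaining prompt) tokens, so a long prompt may need
// several prefilling batches. Standalone, hypothetical code for illustration.
#include <algorithm>
#include <cstdio>
#include <vector>

struct PrefillState {
  int prompt_len; // total prompt tokens of the request
  int prefilled;  // tokens already prefilled in earlier batches
};

// Returns how many tokens each request contributes to the next batch.
static std::vector<int> plan_prefill_batch(std::vector<PrefillState> const &reqs,
                                           int max_tokens_per_batch) {
  std::vector<int> plan(reqs.size(), 0);
  int used = 0;
  for (size_t i = 0; i < reqs.size(); ++i) {
    int remaining_prompt = reqs[i].prompt_len - reqs[i].prefilled;
    int n = std::min(max_tokens_per_batch - used, remaining_prompt);
    plan[i] = std::max(n, 0);
    used += plan[i];
  }
  return plan;
}

int main() {
  // Budget of 8 tokens shared by prompts with 5 and 6 tokens left to prefill:
  // the first request gets 5, the second gets 3 and finishes in a later batch.
  std::vector<PrefillState> reqs = {{5, 0}, {10, 4}};
  for (int n : plan_prefill_batch(reqs, 8)) {
    std::printf("%d ", n); // prints: 5 3
  }
  std::printf("\n");
  return 0;
}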
+ if (verbose) { + std::cout << "\n############### prepare_llm_prefilling_batch " + "##############\n"; + } + assert(prefilling_requests.size() > 0 && + "No prefilling request to process in the prefilling phase."); + + // get page manager + PageManager *page_manager = PageManager::get_page_manager(); + + BatchConfig bc; + if (decoding_mode == INCREMENTAL_DECODING) { + bc.inference_mode = InferenceMode::INC_DECODING_MODE; + } else if (decoding_mode == SPECULATIVE_DECODING) { + bc.inference_mode = InferenceMode::TREE_VERIFY_MODE; + } + bc.prompt_phase = true; + bc.num_available_requests = 0; + int num_tokens = 0; + for (Request *request : prefilling_requests) { + int request_index = request->batch_index; + + assert(request->status == Request::RUNNING); + + // Request Info + bc.requestsInfo[request_index].first_token_offset_in_batch = num_tokens; + bc.requestsInfo[request_index].first_token_index_in_request = + request->llm_cache_size; + int num_tokens_in_batch = + std::min(get_max_tokens_per_prefilling_batch() - num_tokens, + (int)request->tokens.size() - request->llm_prefill_len); + num_tokens_in_batch = std::max(num_tokens_in_batch, 0); + bc.requestsInfo[request_index].num_tokens_in_batch = num_tokens_in_batch; + + // Copy the streaming cache info + bc.streamingCacheInfo[request_index] = request->streaming_cache_info; + + request->first_token_offset_in_batch = num_tokens; + request->num_tokens_in_batch = num_tokens_in_batch; + + // Token Info + for (int idx = 0; idx < num_tokens_in_batch; idx++) { + int token_idx = num_tokens + idx; + int abs_idx = request->llm_cache_size + idx; + + bc.tokensInfo[token_idx].request_index = request_index; + bc.tokensInfo[token_idx].abs_index_in_request = abs_idx; + bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx; + assert(request->llm_prefill_len + idx < request->tokens.size()); + bc.tokensInfo[token_idx].token_id = + request->tokens[request->llm_prefill_len + idx]; + + append_token_to_block( + *request, request->tokens[request->llm_prefill_len + idx], true); + } + num_tokens += num_tokens_in_batch; + if (num_tokens_in_batch > 0) { + bc.num_available_requests++; + bc.request_available[request_index] = true; + } + bc.requestsInfo[request_index].request_guid = request->guid; + + // Record prefilling start time. 
We don't do this for speculative decoding, + // because in that case we start the timer in the ssm prefilling Step idx + // -2: enqueueing; step idx -1: prefilling begins, step idx 0: prefilling + // finished + if (decoding_mode == INCREMENTAL_DECODING) { + NewProfileInfo new_profile_info; + new_profile_info.timestamp = Realm::Clock::current_time_in_microseconds(); + new_profile_info.request_guid = request->guid; + new_profile_info.request_step_idx = -1; + new_profiling_info.push_back(new_profile_info); } } - new_bc.num_generation_tokens = num_generation_tokens; + bc.num_tokens = num_tokens; if (verbose) { - std::cout << "prepare_next_batch_init OLD vs NEW batchconfigs below:" - << std::endl; - old_bc.print(); - new_bc.print(); + std::cout << "prepare_llm_prefilling_batch NEW batchconfig:" << std::endl; + bc.print(); } - return new_bc; + return bc; } -/***** Beam Search Phase *****/ -BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_beam( - BeamSearchBatchConfigFuture const &old_bc, - BeamInferenceResultFuture const &result, - Context ctx, - Runtime *runtime) { +BatchConfig RequestManager::prepare_ssm_prefilling_batch() { + // This function is called when the request_manager_status is PREFILLING, + // which means that there is a request in the prefilling phase. + // This function load its prefilling tokens, constructing a BatchConfig with + // only one request. + if (verbose) { + std::cout << "\n############### prepare_ssm_prefilling_batch " + "##############\n"; + } + assert(prefilling_requests.size() > 0 && + "No prefilling request to process in the prefilling phase."); + + BatchConfig bc; + bc.inference_mode = InferenceMode::TREE_SEARCH_MODE; + bc.prompt_phase = true; + bc.num_available_requests = 0; + int num_tokens = 0; + for (Request *request : prefilling_requests) { + int request_index = request->batch_index; + + // Request Info + bc.requestsInfo[request_index].first_token_offset_in_batch = num_tokens; + bc.requestsInfo[request_index].first_token_index_in_request = + request->ssm_cache_size; + int num_tokens_in_batch = + std::min(get_max_tokens_per_prefilling_batch() - num_tokens, + (int)request->tokens.size() - request->ssm_prefill_len); + num_tokens_in_batch = std::max(num_tokens_in_batch, 0); + bc.requestsInfo[request_index].num_tokens_in_batch = num_tokens_in_batch; + + // Copy the streaming cache info + bc.streamingCacheInfo[request_index] = request->streaming_cache_info; + + request->first_token_offset_in_batch = num_tokens; + request->num_tokens_in_batch = num_tokens_in_batch; + + // Token Info + for (int idx = 0; idx < num_tokens_in_batch; idx++) { + int token_idx = num_tokens + idx; + int abs_idx = request->ssm_cache_size + idx; + + bc.tokensInfo[token_idx].request_index = request_index; + bc.tokensInfo[token_idx].abs_index_in_request = abs_idx; + bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx; + assert(request->ssm_prefill_len + idx < request->tokens.size()); + bc.tokensInfo[token_idx].token_id = + request->tokens[request->ssm_prefill_len + idx]; + } + num_tokens += num_tokens_in_batch; + if (num_tokens_in_batch > 0) { + bc.num_available_requests++; + // Only set the prefilling request to be available + bc.request_available[request_index] = true; + } - RequestManager *rm = this; - TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, - TaskArgument(&rm, sizeof(RequestManager *))); - launcher.add_future(old_bc); - launcher.add_future(result); - return runtime->execute_task(ctx, launcher); -} + // Record prefilling start time + // Step idx -2: 
enqueueing; step idx -1: prefilling begins, step idx 0: + // prefilling finished + NewProfileInfo new_profile_info; + new_profile_info.timestamp = Realm::Clock::current_time_in_microseconds(); + new_profile_info.request_guid = request->guid; + new_profile_info.request_step_idx = -1; + new_profiling_info.push_back(new_profile_info); + } + bc.num_tokens = num_tokens; -BeamSearchBatchConfig RequestManager::prepare_next_batch_beam_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - RequestManager *rm = *((RequestManager **)task->args); - BeamSearchBatchConfig const &bc = - Future(task->futures[0]).get_result(); - BeamInferenceResult const &result = - Future(task->futures[1]).get_result(); - return rm->prepare_next_batch_beam(bc, result); -} - -// update beam search metadata -BeamSearchBatchConfig - RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, - BeamInferenceResult const &result) { - const std::lock_guard lock(request_queue_mutex); if (verbose) { - std::cout << "\n############### prepare_next_batch_beam ###############\n"; + std::cout << "prepare_ssm_prefilling_batch NEW batchconfig:" << std::endl; + bc.print(); } + return bc; +} + +BatchConfig RequestManager::prepare_decoding_batch() { + // This function is called when the request_manager_status is DECODING. It + // fills the last token of each request in the current batch to the + // BatchConfig for the LLM to decode. if (verbose) { - std::cout << "print all results" - << "\n"; - for (int i = 0; i < 40; i++) { - std::cout << result.token_ids[i] << ", "; - } - std::cout << "Current Beam Depth: " - << old_bc.beamRequestsInfo[0].current_depth << "\n"; - std::cout << "Current sub request num: " - << old_bc.beamRequestsInfo[0].sub_request_num << "\n"; - } - // Step 1: Store result to the beam tree struct - store_beam_metadata(old_bc, result); - - // Step 2: preparing the next batch for existing requests - BeamSearchBatchConfig new_bc; - new_bc.model_id = old_bc.model_id; - // std::cout << "old_bc.model_id: " << old_bc.model_id << "\n"; - int num_generation_tokens = 0; - - // Add incremental tokens to the batch - int num_active_req = -1; - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { - if (old_bc.request_completed[i] || !old_bc.request_running[i]) { + std::cout << "\n############### prepare_decoding_batch " + "##############\n"; + } + + BatchConfig bc; + bc.inference_mode = InferenceMode::INC_DECODING_MODE; + bc.prompt_phase = false; + std::copy(std::begin(request_available), + std::end(request_available), + std::begin(bc.request_available)); + bc.num_available_requests = num_available_requests; + + for (int request_index = 0; request_index < get_max_requests_per_batch(); + request_index++) { + if (!request_available[request_index]) { continue; } - num_active_req++; - // Comment out this assertion since num_tokens_in_batch can be - // zero when beam search has reached required sequence length - // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); - Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - int processed_tokens = old_bc.requestsInfo[i].first_token_depth_in_request + - old_bc.requestsInfo[i].num_tokens_in_batch; - - // assert(processed_tokens < request.tokens.size()); - log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; - { - log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " - << new_bc.num_tokens; - new_bc.request_completed[i] = false; - 
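// Sketch of how the step-index convention recorded above (-2 = enqueued,
// -1 = prefilling begins, 0 = prefilling finished) can be turned into queueing
// and prefill latencies. The struct loosely mirrors NewProfileInfo; the
// post-processing itself is an illustrative use of the recorded timestamps,
// not something this patch performs.
#include <cstdio>
#include <map>
#include <vector>

struct StepEvent {
  long long timestamp_us;
  long long request_guid;
  int step_idx; // -2 enqueued, -1 prefill start, 0 prefill done, >0 decode steps
};

int main() {
  std::vector<StepEvent> events = {
      {1'000'000, 7, -2}, {1'250'000, 7, -1}, {1'400'000, 7, 0}};
  std::map<int, long long> by_step;
  for (auto const &e : events) {
    if (e.request_guid == 7) {
      by_step[e.step_idx] = e.timestamp_us;
    }
  }
  std::printf("queueing delay: %.3f ms\n",
              (by_step[-1] - by_step[-2]) * 1e-3); // 250.000 ms
  std::printf("prefill latency: %.3f ms\n",
              (by_step[0] - by_step[-1]) * 1e-3);  // 150.000 ms
  return 0;
}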
new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - profiling_requests[request.guid].ssm_decoding_steps += 1; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - // update the beam search metadata - // how many sub request in current request - // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH - // entries? - // update the parentid, accumalated_probs, depth, and token_ids - int ssm_decoding_steps = - profiling_requests[request.guid].ssm_decoding_steps; - - new_bc.beamRequestsInfo[i].beam_size = - spec_infer_tree_width.size() > ssm_decoding_steps - ? spec_infer_tree_width[ssm_decoding_steps] - : 1; - - new_bc.beamRequestsInfo[i].max_depth = - old_bc.beamRequestsInfo[i].max_depth; - - new_bc.sub_requests[i] = - old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; - new_bc.beamRequestsInfo[i].sub_request_num = - old_bc.beamRequestsInfo[i].sub_request_num * - old_bc.beamRequestsInfo[i].beam_size; - - assert(new_bc.beamRequestsInfo[i].sub_request_num <= - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES && - "exceed maximum nodes per layer"); - - if (request.status == Request::RUNNING) { - new_bc.beamRequestsInfo[i].current_depth = - old_bc.beamRequestsInfo[i].current_depth + 1; - new_bc.request_running[i] = true; - // do the slot exchange to minimize the cache exchange in kernel. - update_beam_metadata( - new_bc, old_bc, request.beam_trees.at(old_bc.model_id), i); + Request &request = all_requests[guid_of_requests[request_index]]; + assert(request.status == Request::RUNNING); - } else { - assert(false && "Request should not be pending in beam search phase"); - } + // Per Request Info + bc.requestsInfo[request_index].first_token_index_in_request = + request.llm_cache_size; + bc.requestsInfo[request_index].first_token_offset_in_batch = bc.num_tokens; + bc.requestsInfo[request_index].num_tokens_in_batch = 1; - // do the slot exchange to minimize the cache exchange in kernel. 
- // update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), - // i); - if (new_bc.requestsInfo[i].first_token_depth_in_request >= - request.tokens.size()) { - // Incremental phase - if (request.status == Request::RUNNING) { - // todo this is replaced by this_layer_size, but should check it - new_bc.requestsInfo[i].num_tokens_in_batch = 1; - } else { - assert(false && "Request should be done"); - // new_bc.requestsInfo[i].num_tokens_in_batch = 0; - } + // Copy the streaming cache info + bc.streamingCacheInfo[request_index] = request.streaming_cache_info; - if (verbose) { - std::cout << "[ Beam Spec] " << request.guid << std::endl; - std::cout << "Incremental phase: " << request.tokens.size() - << ", num_tokens_in_batch: " - << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; - } - } + request.first_token_offset_in_batch = bc.num_tokens; + request.num_tokens_in_batch = 1; - if (verbose) { - std::cout << "SSM KV Cache Size beam: " << request.ssm_cache_size - << std::endl; - std::cout << "LLM KV Cache Size beam: " << request.llm_cache_size - << std::endl; - } + // Per Token Info + bc.tokensInfo[bc.num_tokens].request_index = request_index; + bc.tokensInfo[bc.num_tokens].abs_index_in_request = request.llm_cache_size; + bc.tokensInfo[bc.num_tokens].abs_depth_in_request = request.llm_cache_size; + bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back(); + bc.requestsInfo[request_index].request_guid = request.guid; - // register more tokens due to the beam width - - // copy metadata - memcpy(&new_bc.causalMask[i], - &old_bc.causalMask[i], - sizeof(BatchConfig::BitMask)); - BeamTree tree = request.beam_trees[old_bc.model_id]; - appendBitMask(new_bc.causalMask[i], - new_bc.beamRequestsInfo[i].sub_request_num, - old_bc.beamRequestsInfo[i].beam_size, - old_bc.beamRequestsInfo[i].sub_request_num, - tree, - old_bc.beamRequestsInfo[i].current_depth); - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - - // get value from requestinfo - new_bc.tokensInfo[new_bc.num_tokens].token_id = - new_bc.beamRequestsInfo[i].tokens[k]; - - new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; - new_bc.num_tokens++; + bc.num_tokens++; - num_generation_tokens++; - } - } + if (profiling_requests[request.guid].llm_decoding_steps == 0) { + profiling_requests[request.guid].start_decoding_time = + Realm::Clock::current_time_in_microseconds(); } } - // how many requests is in speculative phase - new_bc.speculative_request_num = num_active_req + 1; + if (verbose) { + std::cout << "prepare_decoding_batch NEW batchconfig:" << std::endl; + bc.print(); + } + profiling.llm_step_start = Realm::Clock::current_time_in_microseconds(); + return bc; +} - // Add prompt tokens to the batch - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { - if (old_bc.request_completed[i] || old_bc.request_running[i]) { +BatchConfig RequestManager::prepare_decoding_batch_fcfs_slo() { + // This function is called when the request_manager_status is DECODING. It + // fills the last token of each request in the current batch to the + // BatchConfig for the LLM to decode. 
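As an aside on the decoding-batch construction used by prepare_decoding_batch() and its SLO-aware variants: in DECODING mode every running request contributes exactly one token, placed at the position equal to that request's current LLM KV-cache length. A minimal standalone sketch of that layout, using simplified stand-in types rather than FlexFlow's actual BatchConfig and Request classes:

// Illustrative sketch only: simplified stand-ins for BatchConfig/Request,
// not FlexFlow's real types.
#include <cassert>
#include <vector>

struct DecodeRequest {        // hypothetical, mirrors the fields used above
  int llm_cache_size;         // tokens already in the LLM KV cache
  int last_token_id;          // request.tokens.back()
};

struct DecodeSlot {           // one entry of a simplified tokensInfo array
  int request_index;
  int abs_index_in_request;   // == llm_cache_size for a decode step
  int token_id;
};

// One token per running request: the new token's position equals the current
// KV-cache length, and the token fed in is the last generated one.
std::vector<DecodeSlot> build_decode_batch(
    std::vector<DecodeRequest> const &running) {
  std::vector<DecodeSlot> batch;
  for (int i = 0; i < (int)running.size(); i++) {
    DecodeSlot slot;
    slot.request_index = i;
    slot.abs_index_in_request = running[i].llm_cache_size;
    slot.token_id = running[i].last_token_id;
    batch.push_back(slot);
  }
  assert(batch.size() == running.size()); // exactly one token per request
  return batch;
}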
+ if (verbose) { + std::cout << "\n############### prepare_decoding_batch_fcfs_slo " + "##############\n"; + } + + BatchConfig bc; + bc.inference_mode = InferenceMode::INC_DECODING_MODE; + bc.prompt_phase = false; + + // Check if there are any requests whose SLO is in the fastest category + std::fill(request_available, + request_available + get_max_requests_per_batch(), + false); + num_available_requests = 0; + std::vector fcfs_request_queue; + for (int request_index = 0; request_index < get_max_requests_per_batch(); + request_index++) { + if (guid_of_requests[request_index] == INVALID_GUID) { continue; } - num_active_req++; - // Comment out this assertion since num_tokens_in_batch can be - // zero when beam search has reached required sequence length - // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); - Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - int processed_tokens = old_bc.requestsInfo[i].first_token_depth_in_request + - old_bc.requestsInfo[i].num_tokens_in_batch; - - // assert(processed_tokens < request.tokens.size()); - log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; - - { - log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " - << new_bc.num_tokens; - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - - // update the beam search metadata - // how many sub request in current request - // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH - // entries? - int ssm_decoding_steps = - profiling_requests[request.guid].ssm_decoding_steps; - - new_bc.beamRequestsInfo[i].beam_size = 1; - // printf("beam size: %d, %d\n", - // new_bc.beamRequestsInfo[i].beam_size, - // ssm_decoding_steps); - new_bc.beamRequestsInfo[i].max_depth = - old_bc.beamRequestsInfo[i].max_depth; - // new_bc.sub_requests[i] = - // old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; - new_bc.sub_requests[i] = 1; - new_bc.beamRequestsInfo[i].sub_request_num = - old_bc.beamRequestsInfo[i].sub_request_num; - - assert(new_bc.beamRequestsInfo[i].sub_request_num <= - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES && - "exceed maximum nodes per layer"); - - // update the parentid, accumalated_probs, depth, and token_ids - - if (request.status == Request::PENDING) { - // if the request is pending, we need to update the beam search - // metadata based on the initial length - new_bc.beamRequestsInfo[i].current_depth = - old_bc.beamRequestsInfo[i].current_depth; - new_bc.request_running[i] = false; - } else { - assert(false && "Request should be pending"); + Request &request = all_requests[guid_of_requests[request_index]]; + assert(request.status == Request::RUNNING); + fcfs_request_queue.push_back(request); + } + + // Sort the requests in the FCFS queue based on the decoding time in + // descending order + std::sort(fcfs_request_queue.begin(), + fcfs_request_queue.end(), + [](Request const &a, Request const &b) { + return a.decode_latency_ms < b.decode_latency_ms; + }); + + // Include the requests one by one until: + // 1. If the batch includes a request whose SLO is in the fastest category, + // limit the number of requests in the batch to 8. + // 2. 
If the batch does not include a request whose SLO is in the fastest + // category, keep adding requests until a request whose SLO is in the fastest + // category is met (do not include it). + bool has_fastest_slo = false; + for (Request &request : fcfs_request_queue) { + if (has_fastest_slo and num_available_requests >= 8) { + break; + } + if (request.get_slo_ratio() <= 1.0) { + has_fastest_slo = true; + if (num_available_requests >= 8) { + break; } + } + request_load_onto_batch(request.batch_index); + } - memcpy(&new_bc.causalMask[i], - &old_bc.causalMask[i], - sizeof(BatchConfig::BitMask)); - - new_bc.requestsInfo[i].prompt_phase = true; - if (new_bc.requestsInfo[i].first_token_depth_in_request >= - request.tokens.size()) { - // request is done - new_bc.requestsInfo[i].num_tokens_in_batch = 0; - new_bc.causalMask[i].this_layer_size = 0; - new_bc.beamRequestsInfo[i].sub_request_num = 0; - new_bc.beamRequestsInfo[i].beam_size = 1; - } else { - // Prompt phase - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens - - BatchConfig::max_requests_per_batch() + i, - (int)request.tokens.size() - - new_bc.requestsInfo[i].first_token_depth_in_request); - request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; - BeamTree tree = request.beam_trees[old_bc.model_id]; - appendPendingRequest(new_bc.causalMask[i], - new_bc.requestsInfo[i].num_tokens_in_batch); - } + std::copy(std::begin(request_available), + std::end(request_available), + std::begin(bc.request_available)); + bc.num_available_requests = num_available_requests; - if (verbose) { - std::cout << "[ Beam Spec] " << request.guid << std::endl; - std::cout << "Prompt phase: " << request.tokens.size() - << ", num_tokens_in_batch:" - << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; - std::cout << "Update ssm cache size: " << request.ssm_cache_size - << std::endl; + for (int request_index = 0; request_index < get_max_requests_per_batch(); + request_index++) { + if (!request_available[request_index]) { + continue; + } + Request &request = all_requests[guid_of_requests[request_index]]; + assert(request.status == Request::RUNNING); - std::cout << "SSM KV Cache Size beam: " << request.ssm_cache_size - << std::endl; - std::cout << "LLM KV Cache Size beam: " << request.llm_cache_size - << std::endl; - } + // Per Request Info + bc.requestsInfo[request_index].first_token_index_in_request = + request.llm_cache_size; + bc.requestsInfo[request_index].first_token_offset_in_batch = bc.num_tokens; + bc.requestsInfo[request_index].num_tokens_in_batch = 1; - // register more tokens due to the beam width - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; + // Copy the streaming cache info + bc.streamingCacheInfo[request_index] = request.streaming_cache_info; - // get value from requestinfo - new_bc.tokensInfo[new_bc.num_tokens].token_id = - request.tokens[request.tokens.size() - - new_bc.requestsInfo[i].num_tokens_in_batch + j]; + request.first_token_offset_in_batch = bc.num_tokens; + request.num_tokens_in_batch = 1; - new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; - new_bc.num_tokens++; - } - } + // Per Token Info + bc.tokensInfo[bc.num_tokens].request_index = request_index; + 
bc.tokensInfo[bc.num_tokens].abs_index_in_request = request.llm_cache_size; + bc.tokensInfo[bc.num_tokens].abs_depth_in_request = request.llm_cache_size; + bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back(); + + bc.num_tokens++; + + if (profiling_requests[request.guid].llm_decoding_steps == 0) { + profiling_requests[request.guid].start_decoding_time = + Realm::Clock::current_time_in_microseconds(); } } - new_bc.num_generation_tokens = num_generation_tokens; if (verbose) { - std::cout << "prepare_next_batch_beam OLD vs NEW batchconfigs:" + std::cout << "prepare_decoding_batch_fcfs_slo NEW batchconfig:" << std::endl; - old_bc.print(); - new_bc.print(); + bc.print(); } - return new_bc; + profiling.llm_step_start = Realm::Clock::current_time_in_microseconds(); + return bc; } -/***** Verify Phase *****/ +BatchConfig RequestManager::prepare_decoding_batch_stta() { + // This function is called when the request_manager_status is DECODING. It + // fills the last token of each request in the current batch to the + // BatchConfig for the LLM to decode. + if (verbose) { + std::cout << "\n############### prepare_decoding_batch_stta " + "##############\n"; + } + + BatchConfig bc; + bc.inference_mode = InferenceMode::INC_DECODING_MODE; + bc.prompt_phase = false; + + // Check if there are any requests whose SLO is in the fastest category + std::fill(request_available, + request_available + get_max_requests_per_batch(), + false); + num_available_requests = 0; + std::vector> tta_2_batch_index; + for (int request_index = 0; request_index < get_max_requests_per_batch(); + request_index++) { + if (guid_of_requests[request_index] == INVALID_GUID) { + continue; + } + Request &request = all_requests[guid_of_requests[request_index]]; + assert(request.status == Request::RUNNING); + tta_2_batch_index.push_back(std::make_pair( + get_request_expected_latency(request) - request.decode_latency_ms, + request_index)); + } + + // Sort the requests in the queue based on the time to attain SLO in ascending + // order + std::sort(tta_2_batch_index.begin(), + tta_2_batch_index.end(), + [](std::pair const &a, + std::pair const &b) { return a.first < b.first; }); + + // Include the requests one by one until: + // 1. If the batch includes a request whose SLO is in the fastest category, + // limit the number of requests in the batch to 8. + // 2. If the batch does not include a request whose SLO is in the fastest + // category, keep adding requests until a request whose SLO is in the fastest + // category is met (do not include it). 
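The admission rule described in the comment above is shared by prepare_decoding_batch_fcfs_slo() and prepare_decoding_batch_stta(); only the sort key differs (decode latency vs. time-to-attain). A hedged standalone sketch of that rule follows; the cap of 8 requests and the slo_ratio <= 1.0 threshold come from the surrounding code, while the helper and its types are illustrative, not FlexFlow API.

// Illustrative sketch of the shared SLO-aware admission rule; candidate
// ordering (FCFS by decode latency vs. shortest time-to-attain) is assumed
// to have been applied by the caller.
#include <vector>

struct Candidate {            // hypothetical stand-in for a runnable request
  int batch_index;
  double slo_ratio;           // <= 1.0 means "fastest" SLO category
};

// Returns the batch indices to load, given candidates already sorted by the
// scheduling policy's priority key.
std::vector<int> admit_requests(std::vector<Candidate> const &sorted,
                                int fastest_slo_cap = 8) {
  std::vector<int> admitted;
  bool has_fastest_slo = false;
  for (Candidate const &c : sorted) {
    // Once a fastest-SLO request is in the batch, cap the batch size.
    if (has_fastest_slo && (int)admitted.size() >= fastest_slo_cap) {
      break;
    }
    if (c.slo_ratio <= 1.0) {
      has_fastest_slo = true;
      if ((int)admitted.size() >= fastest_slo_cap) {
        break; // do not include the fastest-SLO request itself
      }
    }
    admitted.push_back(c.batch_index);
  }
  return admitted;
}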
+ bool has_fastest_slo = false; + for (auto const &[tta, request_index] : tta_2_batch_index) { + Request &request = all_requests[guid_of_requests[request_index]]; + if (has_fastest_slo and num_available_requests >= 8) { + break; + } + if (request.get_slo_ratio() <= 1.0) { + has_fastest_slo = true; + if (num_available_requests >= 8) { + break; + } + } + request_load_onto_batch(request_index); + } -TreeVerifyBatchConfigFuture RequestManager::prepare_next_batch_verify( - std::vector const &old_batches, - Context ctx, - Runtime *runtime) { + std::copy(std::begin(request_available), + std::end(request_available), + std::begin(bc.request_available)); + bc.num_available_requests = num_available_requests; - RequestManager *rm = this; - TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, - TaskArgument(&rm, sizeof(RequestManager *))); - for (auto const &bcf : old_batches) { - launcher.add_future(bcf); - } - return runtime->execute_task(ctx, launcher); -} + for (int request_index = 0; request_index < get_max_requests_per_batch(); + request_index++) { + if (!request_available[request_index]) { + continue; + } + Request &request = all_requests[guid_of_requests[request_index]]; + assert(request.status == Request::RUNNING); -TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - RequestManager *rm = *((RequestManager **)task->args); - std::vector old_batches; - for (auto const &bcf : task->futures) { - old_batches.push_back(Future(bcf).get_result()); - } - return rm->prepare_next_batch_verify(old_batches); -} + // Per Request Info + bc.requestsInfo[request_index].first_token_index_in_request = + request.llm_cache_size; + bc.requestsInfo[request_index].first_token_offset_in_batch = bc.num_tokens; + bc.requestsInfo[request_index].num_tokens_in_batch = 1; -TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( - std::vector const &old_batches) { - const std::lock_guard lock(request_queue_mutex); + // Copy the streaming cache info + bc.streamingCacheInfo[request_index] = request.streaming_cache_info; - if (verbose) { - std::cout - << "\n############### prepare_next_batch_verify ###############\n"; - } + request.first_token_offset_in_batch = bc.num_tokens; + request.num_tokens_in_batch = 1; - assert(old_batches.size() > 0); + // Per Token Info + bc.tokensInfo[bc.num_tokens].request_index = request_index; + bc.tokensInfo[bc.num_tokens].abs_index_in_request = request.llm_cache_size; + bc.tokensInfo[bc.num_tokens].abs_depth_in_request = request.llm_cache_size; + bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back(); - TreeVerifyBatchConfig new_bc; - new_bc.num_tokens_to_commit = 0; - new_bc.num_tokens = 0; + bc.num_tokens++; - int max_prompt_load_size = get_max_verify_tokens_per_batch(); - for (int i = 0; i < TreeVerifyBatchConfig::max_requests_per_batch(); i++) { - if (old_batches.at(0).request_completed[i]) { - continue; - } else if (old_batches.at(0).request_running[i]) { - max_prompt_load_size -= (BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1); - } else { - max_prompt_load_size -= 1; + if (profiling_requests[request.guid].llm_decoding_steps == 0) { + profiling_requests[request.guid].start_decoding_time = + Realm::Clock::current_time_in_microseconds(); } } - int num_active_req = -1; - for (int i = 0; i < TreeVerifyBatchConfig::max_requests_per_batch(); i++) { - if (old_batches.at(0).request_completed[i]) { - continue; - } - num_active_req++; - size_t guid = 
old_batches.at(0).requestsInfo[i].request_guid; - Request &request = all_requests[guid]; - - // Profiling - profiling_requests[request.guid].llm_decoding_steps += 1; - if (request.status == Request::RUNNING) { - new_bc.request_running[i] = true; + if (verbose) { + std::cout << "prepare_decoding_batch_fcfs_slo NEW batchconfig:" + << std::endl; + bc.print(); + } + profiling.llm_step_start = Realm::Clock::current_time_in_microseconds(); + return bc; +} +/* ----- Speculative Inference Specific functions ----- */ - // Get the dfs tree - std::vector>> - all_dfs_trees; +/***** Request Init Phase *****/ +BatchConfig RequestManager::prepare_first_spec_batch_config() { + if (verbose) { + std::cout << "\n############### prepare_first_spec_batch_config " + "##############\n"; + } + // This method does the following: + // 1. Commit the verified tokens through BatchConfig. The infomation + // of the committed tokens are stored in request.committed_tokens. Put the + // information of the committed tokens into BatchConfig.TokensInfo. + // 2. Maintain BatchConfig::RequestsInfo and all other fields of + // BatchConfig. + assert(current_ssm_step == 0); - for (int j = 0; j < old_batches.size(); j++) { - std::vector> new_tree = - traverse_beam_tree(old_batches.at(j), i, request.tokens.size() - 1); - all_dfs_trees.push_back(new_tree); + BatchConfig new_bc; + new_bc.inference_mode = InferenceMode::TREE_SEARCH_MODE; + // Assume that only one small model is in use now + new_bc.prompt_phase = true; + std::copy(std::begin(request_available), + std::end(request_available), + std::begin(new_bc.request_available)); + new_bc.num_available_requests = num_available_requests; + + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + + std::vector &committed_tokens = + request.committed_tokens; + + // Maintain requestsInfo + new_bc.requestsInfo[request_index].first_token_offset_in_batch = + new_bc.num_tokens; + new_bc.requestsInfo[request_index].first_token_index_in_request = + request.ssm_cache_size; + + // Store committed tokens to tokensInfo + int num_committed_tokens = committed_tokens.size(); + if (num_committed_tokens == 1) { + new_bc.requestsInfo[request_index].num_tokens_in_batch = 1; + // The case where the prefilling is just finished. Although the last + // token's kv cache is already there, the we need to decode the last + // token because it's the root of the token tree. 
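Taken together, this case and the multi-token case handled next implement a simple rule: the SSM processes exactly one token when prefilling has just finished (that token becomes the root of the new speculative token tree), and otherwise all committed tokens except the first one, whose KV entry the SSM already holds. A small illustrative helper (hypothetical, not part of RequestManager) capturing that rule:

// Which committed-token indices must be fed to the SSM in the first spec
// batch. Illustrative only; mirrors the two branches in the surrounding code.
#include <vector>

std::vector<int> ssm_tokens_to_feed(int num_committed_tokens) {
  std::vector<int> indices;
  if (num_committed_tokens == 1) {
    // Prefilling just finished: re-feed the single committed token, since it
    // is the root of the new speculative token tree.
    indices.push_back(0);
  } else {
    // Skip index 0 (its KV cache is already present); feed the rest.
    for (int i = 1; i < num_committed_tokens; i++) {
      indices.push_back(i);
    }
  }
  return indices; // size is 1 or num_committed_tokens - 1, matching
                  // requestsInfo[...].num_tokens_in_batch above
}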
+ new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index; + if (streaming_cache) { + new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = + request.ssm_cache_size; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + request.ssm_cache_size; + } else { + new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = + committed_tokens[0].to_index; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + committed_tokens[0].to_index; } - assert(all_dfs_trees.size() == old_batches.size()); - std::vector> dfs_tree_inputs = - merge_dfs_trees(all_dfs_trees, request.tokens.size() - 1, guid); - - if (verbose) { - std::cout << "Request Tokens Size: " << request.tokens.size() - << std::endl; - for (int k = 0; k < request.tokens.size(); k++) { - std::cout << k << ": " << request.tokens[k] << std::endl; + new_bc.tokensInfo[new_bc.num_tokens].token_id = + committed_tokens[0].token_id; + new_bc.num_tokens++; + } else { + for (int committed_token_index = 1; + committed_token_index < committed_tokens.size(); + committed_token_index++) { + new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index; + if (streaming_cache) { + new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = + request.ssm_cache_size + committed_token_index - 1; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + request.ssm_cache_size + committed_token_index - 1; + } else { + new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = + committed_tokens[committed_token_index].to_index; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + committed_tokens[committed_token_index].to_index; } + new_bc.tokensInfo[new_bc.num_tokens].token_id = + committed_tokens[committed_token_index].token_id; + new_bc.num_tokens++; } + new_bc.requestsInfo[request_index].num_tokens_in_batch = + num_committed_tokens - 1; + } - // Normal Request Info - new_bc.requestsInfo[i].first_token_depth_in_request = - dfs_tree_inputs.front().second; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = - old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_batches.at(0).requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - - // copy bitmask to verify batchconfig - memcpy(&(new_bc.causalMask[i]), - &(old_batches.at(0).causalMask[i]), - sizeof(BatchConfig::BitMask)); - // TODO: Check this - new_bc.requestsInfo[i].num_tokens_in_batch = 0; - new_bc.request_completed[i] = false; - - // std::cout << "dfs_tree_inputs: " << dfs_tree_inputs.size() << ", " - // << new_bc.causalMask[i].tree_size << ", " - // << new_bc.causalMask[i].non_tree_cache_size << "\n"; - // std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[0]) - // << "\n"; - - // Committed Tokens - if (committed_tokens.find(guid) != committed_tokens.end()) { - for (int j = 0; j < committed_tokens.at(guid).size(); j++) { - // if (j < committed_tokens.at(guid).size()) { - - auto committed_token = committed_tokens.at(guid).at(j); - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = - committed_token.second; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = - i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = - committed_token.first; - if (verbose) { - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " - << committed_token.first - << ", token_index: " << committed_token.second - << std::endl; - } - 
new_bc.num_tokens_to_commit++; - request.llm_cache_size++; - // } - } - } - if (verbose) { - std::cout << "new_bc.num_tokens_to_commit: " - << new_bc.num_tokens_to_commit << std::endl; - } + request.first_token_offset_in_batch = + new_bc.requestsInfo[request_index].first_token_offset_in_batch; + request.num_tokens_in_batch = + new_bc.requestsInfo[request_index].num_tokens_in_batch; + + // Copy the causal mask, it should already been updated in + // update_llm_verify_results + new_bc.causalMask[request_index] = request.causal_mask; + if (streaming_cache) { + new_bc.causalMask[request_index].non_tree_cache_size = + request.ssm_cache_size - 1; + } - // Incremental phase: only add the last committed token - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = - request.tokens.size() - 1; + // Copy the streaming cache info + new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info; - new_bc.num_tokens++; - new_bc.requestsInfo[i].num_tokens_in_batch++; + if (profiling_requests[guid].ssm_decoding_steps == 0) { + profiling_requests[guid].start_decoding_time = + Realm::Clock::current_time_in_microseconds(); + } + profiling.ssm_step_start = Realm::Clock::current_time_in_microseconds(); + } - if (new_bc.num_tokens > get_max_verify_tokens_per_batch()) { - assert(false && - "Exceeding the space available in the TreeVerify batch"); - break; - } + if (!spec_infer_old_version) { + // Only dynamically update the tree depth in the new version + update_token_tree_depth(); + } + if (verbose) { + std::cout << "prepare_first_spec_batch_config NEW batchconfig:" + << std::endl; + new_bc.print(); + } + return new_bc; +} - new_bc.requestsInfo[i].first_token_depth_in_request = - request.tokens.size() - 1; +/***** Speculative Decoding Phase *****/ +BatchConfig RequestManager::prepare_next_spec_batch_config() { + if (verbose) { + std::cout << "\n############### prepare_next_spec_batch_config " + "###############\n"; + std::cout << "Current tree depth: " << current_ssm_step + 1 << "\n"; + } - bool cutLayer = false; - // Add Tokens from the DFS Tree to the next batch - for (int j = 1; j < dfs_tree_inputs.size(); j++) { - auto token = dfs_tree_inputs.at(j); - if (verbose) { - std::cout << "[" << j << "] Token: " << token.first - << ", Depth:" << token.second << std::endl; - } - // Normal Token Info - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; + // Prepare the next batch for existing requests + BatchConfig new_bc; + new_bc.inference_mode = InferenceMode::TREE_SEARCH_MODE; + // We assume that only one small model is in use now + new_bc.model_id = 0; + std::copy(std::begin(request_available), + std::end(request_available), + std::begin(new_bc.request_available)); + new_bc.num_available_requests = num_available_requests; + + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + int guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + new_bc.requestsInfo[request_index].first_token_offset_in_batch = + new_bc.num_tokens; + + // Fill in the tokens + TokenTree &token_tree = request.speculative_token_trees.at(new_bc.model_id); + if (token_tree.tree_layers.size() <= current_ssm_step) { + // This request has no token to decode in 
this and the following small + // model inference steps + new_bc.requestsInfo[request_index].num_tokens_in_batch = 0; + // non_tree_cache_size = ssm_cache_size - 1 + new_bc.requestsInfo[request_index].first_token_index_in_request = + request.ssm_cache_size - 1 + request.causal_mask.tree_or_prompt_size - + request.causal_mask.current_layer_size; + request.num_tokens_in_batch = 0; + request.first_token_offset_in_batch = new_bc.num_tokens; + continue; + } else { + std::vector> ¤t_layer = + token_tree.tree_layers.back(); + // Exclude the current layer from the token tree, because we want the + // start index + // non_tree_cache_size = ssm_cache_size - 1 + new_bc.requestsInfo[request_index].first_token_index_in_request = + request.ssm_cache_size - 1 + request.causal_mask.tree_or_prompt_size - + request.causal_mask.current_layer_size; + new_bc.requestsInfo[request_index].num_tokens_in_batch = + request.causal_mask.current_layer_size; + + request.num_tokens_in_batch = + new_bc.requestsInfo[request_index].num_tokens_in_batch; + request.first_token_offset_in_batch = new_bc.num_tokens; + + int child_index = 0; + for (auto const &node_ptr : current_layer) { + new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index; + new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = + new_bc.requestsInfo[request_index].first_token_index_in_request + + child_index; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = - token.second; + request.ssm_cache_size - 1 + current_ssm_step; + new_bc.tokensInfo[new_bc.num_tokens].token_id = node_ptr->id; new_bc.num_tokens++; - new_bc.requestsInfo[i].num_tokens_in_batch++; - - if (new_bc.num_tokens == get_max_verify_tokens_per_batch() && - (j != dfs_tree_inputs.size() - 1)) { - cutLayer = true; - break; - } + child_index++; } + } - // delete the last incomplete layer - if (cutLayer) { - int total_tokens = new_bc.num_tokens; - for (int j = total_tokens - 1; j >= 1; j--) { - new_bc.num_tokens--; - new_bc.requestsInfo[i].num_tokens_in_batch--; - // std::cout << "cut: " << j << "\n"; - if (new_bc.tokensInfo[j].abs_depth_in_request != - new_bc.tokensInfo[j - 1].abs_depth_in_request) { - break; - } - } - } + // Copy the causal mask, it should already been updated by + // update_ssm_inference_results + new_bc.causalMask[request_index] = request.causal_mask; + if (streaming_cache) { + new_bc.causalMask[request_index].non_tree_cache_size = + request.ssm_cache_size - 1; + } - } else if (request.status == Request::PENDING) { - new_bc.request_running[i] = false; - if (verbose) { - std::cout << "[Verify] Request " << request.guid - << " is pending in loading prompt phase" << std::endl; - std::cout << "SSM KV Cache Size verify: " << request.ssm_cache_size - << std::endl; - std::cout << "LLM KV Cache Size verify: " << request.llm_cache_size - << std::endl; - } + // Copy the streaming cache info + new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info; + } - // Commit all tokens from the last loading batch - if (committed_tokens.find(guid) != committed_tokens.end()) { - for (int j = 0; j < committed_tokens.at(guid).size(); j++) { - auto token = committed_tokens.at(guid).at(j); - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = - token.second; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = - i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = - token.first; - - new_bc.num_tokens_to_commit++; - request.llm_cache_size++; - } - std::cout << "[Verify] Committed Tokens from last loading batch: " 
- << new_bc.num_tokens_to_commit << std::endl; - } + if (verbose) { + std::cout << "prepare_next_spec_batch_config NEW batchconfig:" << std::endl; + new_bc.print(); + } + return new_bc; +} - memcpy(&(new_bc.causalMask[i]), - &(old_batches.at(0).causalMask[i]), - sizeof(BatchConfig::BitMask)); - - // Normal Request Info - new_bc.requestsInfo[i].first_token_depth_in_request = - request.llm_cache_size; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = - old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_batches.at(0).requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - - new_bc.request_completed[i] = false; - - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(max_prompt_load_size, - (int)request.initial_len - - new_bc.requestsInfo[i].first_token_depth_in_request); - max_prompt_load_size -= new_bc.requestsInfo[i].num_tokens_in_batch; - - std::cout << "max_prompt_load_size: " << max_prompt_load_size - << std::endl; +/***** Verify Phase *****/ +BatchConfig RequestManager::prepare_verify_batch_config() { + if (verbose) { + std::cout + << "\n############### prepare_verify_batch_config ###############\n"; + } + // This method does the following: + // 1. Commit the verified tokens in the last iteration through the + // BatchConfig. We can do this request by request. + // The information of the committed tokens is stored in + // Request.llm_committed_tokens. Put the information of the committed tokens + // into BatchConfig.committed_tokens. + // 2. Load the tokens on the token tree that are not yet pruned to + // BatchConfig.tokensInfo. Be careful with the abs_depth etc. + // (skip the pruned tokens). + // 3. Create the causal mask for the large model based on the small model + // causal mask (call create_llm_bitmask()). + // 4. Maintain BatchConfig::RequestsInfo and all other fields of + // BatchConfig. + // Please refer to the implementation of prepare_next_spec_batch_config() + // for more details. + BatchConfig new_bc; + new_bc.inference_mode = InferenceMode::TREE_VERIFY_MODE; + std::copy(std::begin(request_available), + std::end(request_available), + std::begin(new_bc.request_available)); + new_bc.num_available_requests = num_available_requests; + + // get page manager + PageManager *page_manager = PageManager::get_page_manager(); + + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + int guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + + // before commit token, reset the pages assigned by cleaning all the tokens + std::vector block_table_before_commit = + page_manager->get_block_table_indices(guid); + // also need to reset the pages + reset_block_table(request); + + // 1. Maintain requestsInfo + new_bc.requestsInfo[request_index].first_token_index_in_request = + request.tokens.size() - 1; // Exclude the last token + new_bc.requestsInfo[request_index].first_token_offset_in_batch = + new_bc.num_tokens; + new_bc.requestsInfo[request_index].num_tokens_in_batch = 0; + + // Put the information of the committed tokens into + // BatchConfig.committed_tokens. + // Note here, we shouldn't put the last token in request.committed_tokens + // into new_bc. Because the LLM don't have that token's KV cache. 
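The commit loop that follows translates each committed token's logical position in the request into a physical slot of the paged KV cache: the logical index is split into a page number and an in-page offset, the page number is looked up in the request's block table, and the offset is reused. A hedged sketch of that address translation; kPageSize and the block-table vector here are stand-ins for the PageManager state (the real code uses kPagesize and get_block_table_indices()).

// Illustrative sketch of the logical-to-physical KV-cache index translation
// used when committing verified tokens.
#include <vector>

constexpr int kPageSize = 64; // assumed page size; the real value comes from
                              // the page manager configuration

int logical_to_physical(int logical_index,
                        std::vector<int> const &block_table) {
  int page = logical_index / kPageSize;          // which logical page
  int offset = logical_index % kPageSize;        // position inside the page
  return block_table[page] * kPageSize + offset; // physical slot in KV cache
}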
+ std::vector &committed_tokens = + request.committed_tokens; + for (int committed_token_index = 0; + committed_token_index < committed_tokens.size() - 1; + committed_token_index++) { + Request::CommittedToken &committed_token = + committed_tokens.at(committed_token_index); + + int idx_to_physical = + append_token_to_block(request, committed_token.token_id, true); + int idx_from_logical = committed_token.from_index; + int idx_from_physical = + block_table_before_commit[idx_from_logical / kPagesize] * kPagesize + + idx_from_logical % kPagesize; + + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + request_index; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].index_in_kv_cache = + idx_from_physical; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = + idx_to_physical; + new_bc.num_tokens_to_commit++; + } - if (request.llm_cache_size < request.initial_len) { - // std::cout << "Initialization (prompt) phase: " - // << new_bc.requestsInfo[i].num_tokens_in_batch << ", " - // << old_batches.at(0).beamRequestsInfo[i].beam_size << "\n"; - // Initialization (prompt) phase - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = - request.tokens[request.llm_cache_size + j]; + // Load the tokens on the token tree that are not yet pruned to + // BatchConfig.tokensInfo. + TokenTree &token_tree = request.speculative_token_trees[0]; + int token_tree_index = 0; + int layer_index = 0; + for (auto const &tree_layer : token_tree.tree_layers) { + for (auto const &tree_node : tree_layer) { + if (tree_node->included == true) { + new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index; + new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = + request.tokens.size() - 1 + token_tree_index; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = - request.llm_cache_size + j; + request.tokens.size() - 1 + layer_index; + new_bc.tokensInfo[new_bc.num_tokens].token_id = tree_node->id; new_bc.num_tokens++; - } + token_tree_index++; - if (new_bc.num_tokens > get_max_verify_tokens_per_batch()) { - printf("Exceeding (%i) the space available (%i) in the TreeVerify " - "batch\n", - new_bc.num_tokens, - get_max_verify_tokens_per_batch()); - assert(false); + // Append the token to the block + append_token_to_block(request, tree_node->id, false); } + } + layer_index++; + } + if (verbose) { + // print token tree + std::cout << "Token tree for request " << request_index << ": " + << std::endl; + std::cout << token_tree << std::endl; + } + new_bc.requestsInfo[request_index].num_tokens_in_batch = token_tree_index; - if (new_bc.requestsInfo[i].num_tokens_in_batch + - request.llm_cache_size >= - request.initial_len) { - // launch the request into running phase after loading all prompt - request.status = Request::RUNNING; - new_bc.request_running[i] = true; - - // std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " - // << new_bc.requestsInfo[i].num_tokens_in_batch << - // std::endl; - new_bc.requestsInfo[i].prompt_phase = true; - - dfs_tree_inputs[guid] = - std::vector>{std::make_pair( - request.tokens.back(), request.tokens.size() - 1)}; - } - } else { // launch the request into running phase after loading all prompt - if (get_max_verify_tokens_per_batch() - new_bc.num_tokens > 0) { - // std::cout << "Initialization running phase: " - // << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; - request.status = Request::RUNNING; - 
new_bc.request_running[i] = true; - - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = - request.tokens.size() - 1; + request.first_token_offset_in_batch = new_bc.num_tokens - token_tree_index; + request.num_tokens_in_batch = token_tree_index; - new_bc.num_tokens++; - new_bc.requestsInfo[i].num_tokens_in_batch++; - // std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch2: " - // << new_bc.requestsInfo[i].num_tokens_in_batch << - // std::endl; - - new_bc.requestsInfo[i].prompt_phase = true; - dfs_tree_inputs[guid] = - std::vector>{std::make_pair( - request.tokens.back(), request.tokens.size() - 1)}; - } - } + // Create the causal mask for the large model based on the small model + // causal mask. + new_bc.causalMask[request_index] = create_llm_bitmask(guid); - } else { - assert(false && "Request status is not RUNNING or PENDING"); - } + // Copy the streaming cache info + new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info; + new_bc.requestsInfo[request_index].request_guid = request.guid; } + if (verbose) { + std::cout << "prepare_verify_batch_config NEW batchconfig:" << std::endl; + new_bc.print(); + } + profiling.llm_step_start = Realm::Clock::current_time_in_microseconds(); return new_bc; } -void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, - BeamInferenceResult const &result) { - // step1 store the outputs - if (old_bc.num_tokens <= 0) { - return; +int get_tree_size(Request const &request) { + int size = 0; + for (auto &layer : request.speculative_token_trees[0].tree_layers) { + size += (int)layer.size(); } - auto guid = - old_bc.requestsInfo[old_bc.tokensInfo[0].request_index].request_guid; - auto start_depth = old_bc.tokensInfo[0].abs_depth_in_request; - int result_index = 0; - - if (verbose) { - std::cout << "Store total of " << old_bc.num_tokens - << " tokens in the current batch.\n"; - } - - for (int i = 0; i <= old_bc.num_tokens; i++) { - if (i == old_bc.num_tokens || - old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid != - guid) { - - // std::cout << "i is: " << i << "old guid" << guid << " new guid" - // << old_bc.requestsInfo[old_bc.tokensInfo[i].request_index] - // .request_guid - // << "\n"; - - int index = old_bc.tokensInfo[i - 1].request_index; - int beam_size = old_bc.beamRequestsInfo[index].beam_size; - - // int leaf_node_num = old_bc.sub_requests[index]; - int leaf_node_num = - old_bc.beamRequestsInfo[index].sub_request_num * beam_size; - int depth = old_bc.beamRequestsInfo[index].current_depth; - - // Each token yields (beam_width) results - // int beam_width = old_bc.beamRequestsInfo[index].beam_size; - - // Count tokens sent to model in this request to find the final token's - // index - result_index += - (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * - beam_size; - - if (verbose) { - std::cout << "i = " << i << ", result index = " << result_index - << ", value: " << result.token_ids[result_index] - << ", leaf node num: " << leaf_node_num << ", depth" << depth - << ", beam size: " << beam_size << "\n"; - } - - Request &request = all_requests[old_bc.requestsInfo[index].request_guid]; - - if (old_bc.requestsInfo[index].num_tokens_in_batch == 0) { - continue; - } - - if (depth == 1) { - // store the last input into the tree; - if (verbose) { - std::cout << "try to store the input" - << "\n"; - } + return size; +} - 
request.beam_trees.at(old_bc.model_id).treeLayers[0].tokens[0] = - request.tokens.back(); - request.beam_trees.at(old_bc.model_id).treeLayers[0].probs[0] = 1; - request.beam_trees.at(old_bc.model_id).treeLayers[0].parent_ids[0] = -1; - request.beam_trees.at(old_bc.model_id) - .treeLayers[0] - .nodes_num_this_layer = 1; - - if (verbose) { - std::cout << "Store the previous last token to the tree root: " - << request.tokens.back() << "\n"; - } - } - request.beam_trees.at(old_bc.model_id) - .treeLayers[depth] - .nodes_num_this_layer = leaf_node_num; - for (int beam_id = 0; beam_id < leaf_node_num; beam_id++) { - - request.beam_trees.at(old_bc.model_id) - .treeLayers[depth] - .tokens[beam_id] = result.token_ids[result_index]; - request.beam_trees.at(old_bc.model_id) - .treeLayers[depth] - .probs[beam_id] = result.probs[result_index]; - request.beam_trees.at(old_bc.model_id) - .treeLayers[depth] - .parent_ids[beam_id] = result.parent_id[result_index]; - - if (verbose) { - std::cout << "tree value: " << depth << "token: " - << request.beam_trees.at(old_bc.model_id) - .treeLayers[depth] - .tokens[beam_id] - << "result tokens: " << result.token_ids[result_index]; - } - result_index += 1; - } - // update the guid and start_depth for current request - if (i < old_bc.num_tokens) { - int new_req_idx = old_bc.tokensInfo[i].request_index; - guid = old_bc.requestsInfo[new_req_idx].request_guid; - start_depth = old_bc.tokensInfo[i].abs_depth_in_request; - } +bool RequestManager::is_eos_token(TokenId token_id) { + for (int eos_token : eos_token_ids) { + if (token_id == eos_token) { + return true; } } + return false; } -// for updating the beam search metadata in requests in incremental phase -void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, - BeamSearchBatchConfig const &old_bc, - BeamTree &tree, - int request_index) { +bool RequestManager::update_llm_verify_results( + InferenceResult const &llm_verify_result) { + // We may have two types of InferenceResults, one is the results from + // sampling the large model, the other is the top-p / top-k logits of the + // large model, we can first implement the former one. For the latter one, + // we have to add a CPU based verify function. + + // Compare the results returned from the LLM and compare them with the + // SSM's speculative token tree. For the greedy construction of the + // speculative token tree, we can simply compare LLM's sample result at each + // token, this is implemented in get_verify_results_greedy(). This function + // stores the commmitted tokens into the corresponding fields in the + // Request. For the sampling construction of the speculative token tree, we + // need to implement a CPU based verify function. 
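As a rough mental model for get_verify_results_greedy(): walking down from the root, a speculated token is accepted only if it equals the token the LLM sampled at its parent's position, and verification stops at the first mismatch, where the LLM's own sample is committed as a bonus token. The sketch below is deliberately simplified to a single speculated chain (the real code verifies a token tree) and uses illustrative types only.

// Simplified, chain-shaped illustration of greedy speculative verification;
// the actual implementation walks a token tree, not a single path.
#include <cassert>
#include <vector>

// speculated[i] is the SSM's token at depth i+1 below the root;
// llm_samples[i] is the LLM's sample after position i (i = 0 is the root),
// so llm_samples must contain one more entry than speculated.
std::vector<int> verify_chain_greedy(std::vector<int> const &speculated,
                                     std::vector<int> const &llm_samples) {
  assert(llm_samples.size() >= speculated.size() + 1);
  std::vector<int> committed;
  size_t i = 0;
  while (i < speculated.size() && speculated[i] == llm_samples[i]) {
    committed.push_back(speculated[i]); // speculation verified at this depth
    ++i;
  }
  // The LLM's sample at the first mismatch (or past the end of the chain) is
  // always a valid next token and is committed as well.
  committed.push_back(llm_samples[i]);
  return committed;
}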
+ + // Update llm_cache_size with the last committed_tokens, and clear + // committed_tokens + int nb_requests_decoded = 0; + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + // Request in this slot is unavailable + continue; + } + int guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + request.llm_cache_size += request.committed_tokens.size() - 1; + request.committed_tokens.clear(); - // do the exchange - if (new_bc.request_completed[request_index]) { - assert(false); + profiling_requests[guid].llm_decoding_steps++; + nb_requests_decoded++; } - int depth = new_bc.beamRequestsInfo[request_index].current_depth - 1; - int beam_size = new_bc.beamRequestsInfo[request_index].beam_size; - - // int leaf_node_num = old_bc.sub_requests[request_index]; - int leaf_node_num = new_bc.beamRequestsInfo[request_index].sub_request_num; - if (new_bc.beamRequestsInfo[request_index].current_depth == - 1) { // TODO: check if this is correct - // for (int j = 0; j < beam_size; j++) { - // new_bc.beamRequestsInfo[request_index].parent_id[j] = j; - // new_bc.beamRequestsInfo[request_index].probs[j] = - // tree.treeLayers[depth].probs[j]; // ? - // new_bc.beamRequestsInfo[request_index].tokens[j] = - // tree.treeLayers[depth].tokens[j]; // ? - // } - // Do nothing - // assert(false); + // Process the LLM results greedily + if (speculative_sampling) { + get_verify_results_sample(llm_verify_result); } else { - for (int j = 0; j < leaf_node_num; j++) { - new_bc.beamRequestsInfo[request_index].parent_id[j] = - tree.treeLayers[depth].parent_ids[j]; - new_bc.beamRequestsInfo[request_index].probs[j] = - tree.treeLayers[depth].probs[j]; - new_bc.beamRequestsInfo[request_index].tokens[j] = - tree.treeLayers[depth].tokens[j]; - // std::cout << "token: " << j << ": " - // << new_bc.beamRequestsInfo[request_index].tokens[j] << "\n"; - } - } - if (verbose) { - std::cout << "-----------after parent id exchange-----------" << std::endl; - for (int j = 0; j < beam_size; j++) { - std::cout << "after request id: " << request_index << "beam id = " << j - << "parent: " - << new_bc.beamRequestsInfo[request_index].parent_id[j] - << "token: " << new_bc.beamRequestsInfo[request_index].tokens[j] - << "probs: " << new_bc.beamRequestsInfo[request_index].probs[j] - << std::endl; - } + get_verify_results_greedy(llm_verify_result); } -} -// bit mask related function + long long int current_time = Realm::Clock::current_time_in_microseconds(); + profiling.llm_step_times.push_back((current_time - profiling.llm_step_start) * + 1e-3); + profiling.requests_per_step.push_back(nb_requests_decoded); -// prompt phase, init task -void RequestManager::initBitMask(BatchConfig::BitMask &bitmask, - int initLength) { - assert(initLength > 0); - // eg. 
4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: - // 0000000..1000 - bitmask.non_tree_cache_size = 0; - bitmask.tree_size = 1; - - bitmask.prompt_size = initLength; - bitmask.this_layer_size = initLength; - // std::cout << "see bit mask" << bitmask.prompt_size << "\n"; - // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[0]) << "\n"; - // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[1]) << "\n"; - // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[2]) << "\n"; -} - -// prepare next init -void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, - int initLength, - int non_tree_size) { - // assert(initLength == 1); - // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: - // 0000000..1000 - assert(initLength <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && - "do not support tree size > 64"); - assert(initLength >= 1 && "verified token num should >= 1"); - - // std::cout << "non tree size: " << non_tree_size << ", " - // << bitmask.non_tree_cache_size << "\n"; - - bitmask.non_tree_cache_size = non_tree_size + initLength - 1; - bitmask.tree_size = 1; - bitmask.this_layer_size = initLength; - // std::cout << "non_tree_size: " << non_tree_size << "\n"; - bitmask.prompt_size = 1; - for (int i = 0; i < bitmask.prompt_size; i++) { - for (int j = i; j < bitmask.prompt_size; j++) { - bitmask.mask[i] |= (1 << j); - } - } - - // std::cout << "see bit mask update" << bitmask.prompt_size << "\n"; - // std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[0]) - // << "\n"; -} - -// prompt phase, init task -void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask, - int initLength) { - assert(initLength > 0); - // std::cout << "append pending bit mask: " << initLength << "\n"; - // eg. 
4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: - // 0000000..1000 - bitmask.non_tree_cache_size = 0; - bitmask.tree_size = 1; - bitmask.prompt_size += initLength; - bitmask.this_layer_size = initLength; - - // for (int i = 0; i < bitmask.prompt_size; i++) { - // for (int j = i; j < bitmask.prompt_size; j++) { - // bitmask.mask[i] |= (1 << j); - // } - // } -} + bool request_completed = false; + + // Iterate over the requests + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + // Request in this slot is unavailable + continue; + } + int guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + if (verbose) { + std::cout << "Request " << guid << " token tree: " << std::endl; + std::cout << request.speculative_token_trees[0]; + } -// prepare next beam, append layers to the tree -void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, - int newNodes, - int preBeamSize, - int old_sub_num, - BeamTree const tree, - int currentDepth) { - int pre_tree_size = bitmask.tree_size; - bitmask.tree_size += newNodes; - bitmask.this_layer_size = newNodes; - assert(bitmask.tree_size <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && - "do not support tree size > 64"); - // preBeamSize: replicate num - - // add relationship with input/prompt - for (int i = 0; i < bitmask.prompt_size; i++) { - for (int j = pre_tree_size; j < bitmask.tree_size; j++) { - bitmask.mask[i] |= (1 << j); - // std::cout << "see bit mask append: " << i << ", to" << j - // << std::bitset<64>(bitmask.mask[i]) << "\n"; - } - } - - // std::cout << "bitmask.tree_size: " << bitmask.tree_size << ", " - // << pre_tree_size << ", " << bitmask.prompt_size << ", " - // << preBeamSize << "\n"; - - // int num_groups = newNodes / preBeamSize; - // int group_size = newNodes / num_groups; - // add relations to branch - // requests in same groups share same relations, except the last token. - - // set middle layers - // skip the root prompt/tokens - int token_idx = bitmask.prompt_size; - int new_nodes_start_idx = pre_tree_size; - // std::cout << "new nodes start " << new_nodes_start_idx << "\n"; - for (int i = 1; i < currentDepth; i++) { - new_nodes_start_idx = pre_tree_size; - int nodes_this_layer = tree.treeLayers[i].nodes_num_this_layer; - // std::cout << "tree layer: " << i << " nodes:" << nodes_this_layer - // << "group size: " << newNodes / nodes_this_layer << "\n"; - for (int j = 0; j < nodes_this_layer; j++) { - int group_size = newNodes / nodes_this_layer; - for (int k = 0; k < group_size; k++) { - bitmask.mask[token_idx] |= (1 << new_nodes_start_idx); - new_nodes_start_idx += 1; + request.decode_latency_ms = + (current_time - profiling_requests[guid].start_decoding_time) * 1e-3; + bool attained = + request.decode_latency_ms <= get_request_expected_latency(request); + bool current_attained = + request.decode_latency_ms <= + get_request_expected_latency(request) + get_slo_constraint(request) * 6; + + // Initialize the token tree for the request + init_token_tree(guid); + assert(!request.committed_tokens.empty() && + "The committed tokens should not be empty."); + // Add the last committed token as the root of the speculative token tree + add_root_to_spec_token_tree(guid, request.committed_tokens.back().token_id); + + // Check if the request is completed. If its completed, clean up the + // metainfo stored in the RequestManager. Otherwise, update its bitmask. 
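The completion check implemented next can be read as a single predicate over four conditions: an EOS token was committed, the output-length budget is exhausted, the sequence-length budget is exhausted, or (when early termination is enabled) the SLO can no longer be attained. A hedged standalone restatement of that predicate (hypothetical helper, not a RequestManager method):

// Conditions under which a request leaves the batch after verification.
bool should_finish_request(bool eos_token_found,
                           int decode_length,
                           int max_output_length,
                           int sequence_length,
                           int max_sequence_length,
                           bool slo_still_attainable,
                           bool early_termination_enabled) {
  if (eos_token_found) {
    return true; // an EOS token was committed in this step
  }
  if (decode_length >= max_output_length ||
      sequence_length >= max_sequence_length) {
    return true; // generation or context budget exhausted
  }
  if (!slo_still_attainable && early_termination_enabled) {
    return true; // drop requests that can no longer meet their SLO
  }
  return false;
}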
+ bool eos_token_found = false; + for (auto const &committed_token : request.committed_tokens) { + if (is_eos_token(committed_token.token_id)) { + eos_token_found = true; + break; } - token_idx += 1; + } + if (eos_token_found or request.decode_length() >= get_max_output_length() or + request.tokens.size() >= get_max_sequence_length()) { + // Request is completed + request_update_attainment(request_index, attained); + request_completed = true; + request_complete_clean_up(request_index); + } else if (!current_attained and slo_violation_early_termination) { + // Early drop that request + request_update_attainment(request_index, attained); + request_completed = true; + request_complete_clean_up(request_index); + } else { + update_bitmask_prompt(guid, request.committed_tokens.size() - 1); } } - assert(token_idx == pre_tree_size); - assert(currentDepth <= 1 || new_nodes_start_idx == bitmask.tree_size); + // Some requests may be completed after appending the verified tokens. + // If there is a request completed, return true. + return request_completed; +} - // assert(currentDepth <= 2); - // set last layer, all tokens are only relevant to it self; - for (int i = token_idx; i < bitmask.tree_size; i++) { - bitmask.mask[i] |= (1 << i); - // std::cout << "set rel: " << i << "to: " << i << "\n"; +bool RequestManager::update_ssm_inference_results( + InferenceResult const &ssm_inference_result) { + // This function returns true if no tokens are added to the token tree, + // which indicates that the ssm inference phase is done. + assert(current_ssm_step >= 1 && + "The current speculation step should be no less than 1"); + + // Here we assume that the order of the tokens in the last + // BatchConfig and hence the last InferenceResult is equal to + // the order of the request in the last BatchConfig + if (!spec_infer_old_version) { + static double schedule_start = 0.0; + if (get_eval_overhead_breakdown()) { + schedule_start = Realm::Clock::current_time_in_microseconds(); + } + add_tokens_to_spec_token_tree(ssm_inference_result); + if (get_eval_overhead_breakdown()) { + eval_schedule_latency_us += + Realm::Clock::current_time_in_microseconds() - schedule_start; + } + } else { + add_tokens_to_spec_token_tree_old_version(ssm_inference_result); } - // if(bitmask.non_tree_cache_size == 19 && bitmask.tree_size > 2){ - // assert(false); - // } + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + // Request in this slot is unavailable + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); - // std::cout << "see bit mask append" << bitmask.prompt_size << "\n"; - // std::cout << "see bit mask append" << bitmask.non_tree_cache_size << "\n"; - // std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[0]) - // << "\n"; -} - -bool PreOrder( - BeamTree const &tree, - int max_depth, - int current_depth, - int beam_width, - int id, - std::vector> &serializedTree, - bool verbose) { - // terminate - if (current_depth >= max_depth) { - serializedTree.push_back(std::make_pair( - tree.treeLayers[current_depth].tokens[id], current_depth)); - if (verbose) { - std::cout << "last tokens: " << tree.treeLayers[current_depth].tokens[id] - << "\n"; - std::cout << "return true" - << "\n"; + if (current_ssm_step == 1) { + if (streaming_cache) { + request.streaming_cache_info.commit_cache(request.num_tokens_in_batch); + 
request.ssm_cache_size = request.streaming_cache_info.commit_len; + } else { + request.ssm_cache_size = request.tokens.size(); + } + } + + if (current_ssm_step == 1) { + init_bitmask_spec(guid); + } + append_bitmask(guid); + + profiling_requests[guid].ssm_decoding_steps++; + + if (current_ssm_step == ssm_tree_depth) { + profiling_requests[guid].speculation_start_timestamp = + profiling.ssm_step_start; + profiling_requests[guid].speculation_end_timestamp = + Realm::Clock::current_time_in_microseconds(); } - return true; } - // add to tree; - // std::cout<<"node: " << current_depth << ", id: " << - serializedTree.push_back( - std::make_pair(tree.treeLayers[current_depth].tokens[id], current_depth)); - if (verbose) { - std::cout << "push something: " << tree.treeLayers[current_depth].tokens[id] - << ", " << current_depth << std::endl; - } - int index = serializedTree.size() - 1; - int next_layers = current_depth + 1; - - bool flag = false; - // recursion - for (int i = 0; i < beam_width; i++) { - int child_id = i; - int child_parent = tree.treeLayers[next_layers].parent_ids[i]; - - // for all childs, do preOrder - if (child_parent == id) { - if (verbose) { - std::cout << "current depth: " << current_depth << ", child_parent, " - << child_parent << ", child_id, " << child_id << "\n"; + // Stop conditions + if (current_ssm_step == ssm_tree_depth) { + // Prune the token tree at the last step + if (!spec_infer_old_version) { + static double schedule_start = 0.0; + if (get_eval_overhead_breakdown()) { + schedule_start = Realm::Clock::current_time_in_microseconds(); + } + prune_token_tree(); + if (get_eval_overhead_breakdown()) { + eval_schedule_latency_us += + Realm::Clock::current_time_in_microseconds() - schedule_start; } - bool res = PreOrder(tree, - max_depth, - current_depth + 1, - beam_width, - child_id, - serializedTree, - verbose); - flag = flag || res; - } - } - // if (!flag) { - // // no child for this token, delete it - // std::cout << "delete a node: " << - // tree.treeLayers[current_depth].tokens[id] - // << ", " << current_depth << std::endl; - // serializedTree.erase(serializedTree.begin() + index); - // } - return flag; -} - -std::vector> - RequestManager::traverse_verify_tree( - size_t guid, - std::vector> const - &inputSerializedTree, - std::vector> const - &outputSerializedTree) { - std::vector> verifiedTree; - // verifiedTree.push_back(inputSerializedTree.at(0)); - std::vector> new_committed_tokens = - std::vector>(); - - log_req_mgr.print("Input tree size (%zu) Output tree size (%zu)", - inputSerializedTree.size(), - outputSerializedTree.size()); - { // Input tree - std::ostringstream oss; - // inputSerializedTree is the dfs_tree_inputs_map[guid] array og (token id, - // depth) pairs - for (auto const &pair : inputSerializedTree) { - oss << " " << pair.second << ":" << pair.first; - // log_req_mgr.print("(%d, %d)", pair.first, pair.second); - } - log_req_mgr.print("Input tree:%s", oss.str().c_str()); - } - { // Output tree - // log_req_mgr.print("========Output============"); - // outputSerializedTree is an array of (token id, depth + 1) pairs - std::ostringstream oss; - for (auto const &pair : outputSerializedTree) { - // log_req_mgr.print("(%d, %d)", pair.first, pair.second); - oss << " " << pair.second << ":" << pair.first; - } - log_req_mgr.print("Output tree:%s", oss.str().c_str()); - } - { - // log_req_mgr.print("========Committed============"); - // committed_tokens[guid] is an array of (depth, result_index) pairs for - // the given request - std::ostringstream oss; - 
for (auto const &pair : committed_tokens.at(guid)) { - // log_req_mgr.print("(%d, %d)", pair.first, pair.second); - oss << " " << pair.second << ":" << pair.first; - } - log_req_mgr.print("Committed tokens:%s", oss.str().c_str()); - } - - // It's safe to have inputSerializedTree.size() > outputSerializedTree.size() - // In this case the inputSeriedTree ends with padding 0s - assert(inputSerializedTree.size() >= outputSerializedTree.size()); - - int *treeLayers = new int[inputSerializedTree.size()]; - int node_num = 1; - int layer_num = 0; - for (int token_id = 0; token_id < inputSerializedTree.size(); token_id++) { - if (token_id == (inputSerializedTree.size() - 1) || - inputSerializedTree.at(token_id + 1).second != - inputSerializedTree.at(token_id).second) { - treeLayers[layer_num] = node_num; - layer_num += 1; - node_num = 1; - } else { - node_num++; } + // Update profiling statistics before returning + profiling.ssm_step_times.push_back( + (Realm::Clock::current_time_in_microseconds() - + profiling.ssm_step_start) * + 1e-3); + profiling.ssm_steps.push_back(current_ssm_step); + return true; } + return false; +} - // to avoid branch switch when same tokens in input tree. - // todo, only checked for N->1->1->1 cases +/* --------- Bitmask Related Functions --------- */ - bool findFirst = false; - layer_num = -1; - int first_layer_slot = 0; - int first_layer_slot_total = 0; - int processed_whole_layer_tokens = 0; +void RequestManager::init_bitmask_prompt(RequestGuid guid, int prompt_length) { + // This method is called by load_pending_request_to_batch when there is a + // new request to load into the batch + Request &request = all_requests[guid]; + BatchConfig::BitMask &bitmask = request.causal_mask; - for (int i = 0; i < outputSerializedTree.size(); i++) { - auto input = inputSerializedTree.at(i); - auto output = outputSerializedTree.at(i); + // Clear because the prompt kernel doesn't use mask + bitmask.clear_bitmask(); + // Set the info for the mask which is used to store the KV cache + bitmask.tree_or_prompt_size = prompt_length; + bitmask.current_layer_size = prompt_length; + bitmask.non_tree_cache_size = 0; +} - if (i == 0 || inputSerializedTree.at(i - 1).second != - inputSerializedTree.at(i).second) { - layer_num += 1; - processed_whole_layer_tokens += i == 0 ? 0 : treeLayers[layer_num - 1]; - } +void RequestManager::update_bitmask_prompt(RequestGuid guid, + int num_committed_tokens) { + // This method modifies the bitmask in place + // This method is called by update_llm_verify_results + // 1. Clear the causal mask because the first SSM inference uses the prompt + // kernel and it doesn't use mask. + // 2. Maintain all other fields. 
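Note: for readers new to the bitmask bookkeeping, the sketch below restates what init_bitmask_prompt and update_bitmask_prompt do using a simplified stand-in for BatchConfig::BitMask (the struct layout and function names here are illustrative, not the actual FlexFlow types). During prompt processing the mask itself stays cleared because the prompt kernel is already causal; only the size fields that locate the tree/prompt window relative to the committed KV cache are maintained.

```cpp
#include <bitset>

// Illustrative stand-in for the patch's BatchConfig::BitMask.
constexpr int kMaxSpecTreeTokens = 64;

struct SimpleBitMask {
  std::bitset<kMaxSpecTreeTokens> rows[kMaxSpecTreeTokens]; // rows[i]: positions token i attends to
  int tree_or_prompt_size = 0;  // tokens currently in the tree (or prompt chunk)
  int current_layer_size = 0;   // tokens in the newest layer
  int non_tree_cache_size = 0;  // committed KV-cache entries that sit outside the tree
  void clear() {
    for (auto &row : rows) {
      row.reset();
    }
  }
};

// Prefill: the prompt kernel is causal on its own, so only the sizes matter.
void init_prompt_mask(SimpleBitMask &m, int prompt_length) {
  m.clear();
  m.tree_or_prompt_size = prompt_length;
  m.current_layer_size = prompt_length;
  m.non_tree_cache_size = 0; // nothing committed to the KV cache yet
}

// After a verify step, the freshly committed tokens are decoded like a short
// prompt; everything the request already holds counts as non-tree cache.
void update_prompt_mask(SimpleBitMask &m, int request_len, int num_committed) {
  m.clear();
  m.tree_or_prompt_size = num_committed;
  m.current_layer_size = num_committed;
  if (m.non_tree_cache_size == 0) { // the request just finished prefilling
    m.non_tree_cache_size = request_len - num_committed;
  }
}
```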
+ Request &request = all_requests[guid]; + BatchConfig::BitMask &bitmask = request.causal_mask; + // Clear because the prompt kernel doesn't use mask + bitmask.clear_bitmask(); + bitmask.tree_or_prompt_size = num_committed_tokens; + bitmask.current_layer_size = num_committed_tokens; - if (i == 0) { - verifiedTree.push_back(output); + // If the request just finishes the prefilling phase, we need to set the + // non_tree_cache_size to the size of the prompt + if (bitmask.non_tree_cache_size == 0) { + bitmask.non_tree_cache_size = request.tokens.size() - num_committed_tokens; + } +} + +void RequestManager::init_bitmask_spec(RequestGuid guid) { + // This method modifies the bitmask in place + // This method is called by the first call of update_ssm_inference_results + // in a speculative iteration CAUTION: You should still call + // append_bitmask() after this method + // 1. Clear the causal mask and add a root into it, because the tree is + // currently empty but we have a root. + // 2. Maintain all other fields. + assert(current_ssm_step == 1 && "The current speculation step should be 1"); + Request &request = all_requests[guid]; + request.causal_mask = BatchConfig::BitMask(); + // Set the mask for the root + request.causal_mask.bit_mask[0].set_bit(0); + request.causal_mask.tree_or_prompt_size = 1; + request.causal_mask.non_tree_cache_size = request.tokens.size() - 1; + request.causal_mask.current_layer_size = 1; +} - new_committed_tokens.push_back(std::make_pair( - input.second, - committed_tokens.at(guid).at(i).second)); // - // std::cout << committed_tokens.at(guid).at(i).first << ", " - // << committed_tokens.at(guid).at(i).second << std::endl; - // std::cout << input.first << ", " << input.second << std::endl; +void RequestManager::append_bitmask(RequestGuid guid) { + // This method changes the bitmask in place + // This method is called by update_ssm_inference_results(), after the new + // tokens are added to the token tree + assert(current_ssm_step >= 1 && + "The current speculation step should be no less than 1"); - assert(committed_tokens.at(guid).at(i).first == input.second); - continue; - } + Request &request = all_requests[guid]; + BatchConfig::BitMask &bitmask = request.causal_mask; + TokenTree &token_tree = request.speculative_token_trees[0]; - if (input.first == verifiedTree.back().first && - input.second == verifiedTree.back().second) { - if (findFirst) { - // must in this branch. 
- int layer_slot = i - processed_whole_layer_tokens; - int layer_slot_total = treeLayers[layer_num]; - if ((first_layer_slot == layer_slot)) { - verifiedTree.push_back(output); - new_committed_tokens.push_back(std::make_pair( - input.second, committed_tokens.at(guid).at(i).second)); - // at this point, you'll not go other branches - // std::cout << "verify tree push back: " << output.first - // << ", tree size is: " << verifiedTree.size() - // << ", ??: " << input.first << ", " << input.second << - // "\n"; + if (token_tree.tree_layers.size() <= current_ssm_step) { + // This request has no token added in this and the following small model + // inference steps, skip it + return; + } + std::vector> &tree_layer = + request.speculative_token_trees[0].tree_layers.back(); + int new_layer_size = tree_layer.size(); + int last_layer_size = bitmask.current_layer_size; + int previous_tree_size = bitmask.tree_or_prompt_size; + bitmask.current_layer_size = new_layer_size; + bitmask.tree_or_prompt_size += new_layer_size; + + assert(bitmask.tree_or_prompt_size <= get_max_spec_tree_token_num()); + + int parent_offset = previous_tree_size - last_layer_size; + int child_offset = previous_tree_size; + + int child_idx = 0; + for (auto const &child_ptr : tree_layer) { + // Each child copy its parent's mask + bitmask.bit_mask[child_offset + child_idx] = + bitmask.bit_mask[parent_offset + child_ptr->parent_pos]; + // Each child attend to itself + bitmask.bit_mask[child_offset + child_idx].set_bit(child_offset + + child_idx); + child_idx++; + } +} + +BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) { + // This method creates a new bitmask for LLM verification model's bitmask, + // it does not modify the small model's bitmask This method is called by + // prepare_verify_batch_config(). 
+ Request &request = all_requests[guid]; + TokenTree &token_tree = request.speculative_token_trees[0]; + BatchConfig::BitMask llm_bitmask = BatchConfig::BitMask(); + + int abs_index_in_tree = 0; + std::vector parent_pos_2_abs_index; + std::vector current_layer_abs_index; + for (auto const &tree_layer : token_tree.tree_layers) { + for (auto const &tree_node : tree_layer) { + current_layer_abs_index.push_back(abs_index_in_tree); + if (tree_node->included == true) { + if (abs_index_in_tree == 0) { + // The root token, set itself + llm_bitmask.bit_mask[0].set_bit(0); } else { - printf("not correct slot\n"); + // Copy from the parent, and set itself + int parent_abs_index = parent_pos_2_abs_index[tree_node->parent_pos]; + llm_bitmask.bit_mask[abs_index_in_tree] = + llm_bitmask.bit_mask[parent_abs_index]; + llm_bitmask.bit_mask[abs_index_in_tree].set_bit(abs_index_in_tree); } - } else { - verifiedTree.push_back(output); - first_layer_slot = i - processed_whole_layer_tokens; - first_layer_slot_total = treeLayers[layer_num]; - findFirst = true; - new_committed_tokens.push_back(std::make_pair( - input.second, - committed_tokens.at(guid).at(i).second)); // - // at this point, you'll not go other branches - // std::cout << "verify tree push back: " << output.first - // << ", tree size is: " << verifiedTree.size() - // << ", ??: " << input.first << ", " << input.second << "\n"; + abs_index_in_tree++; } - - assert(committed_tokens.at(guid).at(i).first == input.second); } + parent_pos_2_abs_index.clear(); + parent_pos_2_abs_index.swap(current_layer_abs_index); } - committed_tokens[guid] = new_committed_tokens; - { - // log_req_mgr.print("========Verified============"); - std::ostringstream oss; - for (auto const &pair : verifiedTree) { - // log_req_mgr.print("(%d, %d)", pair.first, pair.second); - oss << " " << pair.second << ":" << pair.first; - } - log_req_mgr.print("Verified:%s", oss.str().c_str()); + + // Maintain other fields of llm_bitmask + llm_bitmask.non_tree_cache_size = request.causal_mask.non_tree_cache_size; + llm_bitmask.tree_or_prompt_size = request.causal_mask.tree_or_prompt_size; + // We don't need to set llm_bitmask.current_layer_size here because they are + // not used in LLM verification. 
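Both append_bitmask and create_llm_bitmask build attention rows by the same rule: a node copies its parent's row and then sets its own bit, so each token attends exactly to its ancestors and itself. The sketch below shows that rule in isolation, assuming simplified types (std::bitset rows, layers of nodes identified by a parent position) and ignoring the pruning flags and non-tree cache offsets that the patch handles.

```cpp
#include <bitset>
#include <cassert>
#include <vector>

constexpr int kMaxTreeTokens = 64;
using Row = std::bitset<kMaxTreeTokens>;

struct Node {
  int parent_pos; // position of the parent within the previous layer; unused for the root
};
using Layer = std::vector<Node>;

// Row i of the result is the set of tree positions token i may attend to,
// i.e. its ancestors plus itself.
std::vector<Row> build_tree_mask(std::vector<Layer> const &layers) {
  std::vector<Row> mask;
  std::vector<int> prev_abs; // absolute index of each node in the previous layer
  for (Layer const &layer : layers) {
    std::vector<int> cur_abs;
    for (Node const &node : layer) {
      int abs_idx = static_cast<int>(mask.size());
      Row row;
      if (!prev_abs.empty()) { // non-root: start from the parent's ancestors
        assert(node.parent_pos >= 0 && node.parent_pos < (int)prev_abs.size());
        row = mask[prev_abs[node.parent_pos]];
      }
      row.set(abs_idx); // every token attends to itself
      mask.push_back(row);
      cur_abs.push_back(abs_idx);
    }
    prev_abs.swap(cur_abs);
  }
  return mask;
}
// Example: layers {{root}, {A, B}, {C with parent A}} give C the row {root, A, C}.
```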
+ return llm_bitmask; +} + +/* --------- Page Attention Related Functions --------- */ +int RequestManager::get_num_blocks_allocated(Request &request) const { + // needs some assertion + return request.blocks.size(); +} + +int RequestManager::get_len_last_block(Request &request) const { + int num_tokens = request.blocks.back().get_num_tokens(); + if (request.blocks.empty()) { + return 0; } - { - // log_req_mgr.print("========New Committed============"); - std::ostringstream oss; - for (auto const &pair : committed_tokens.at(guid)) { - // log_req_mgr.print("(%d, %d)", pair.first, pair.second); - oss << " " << pair.second << ":" << pair.first; - } - log_req_mgr.print("New committed:%s", oss.str().c_str()); + return request.blocks.back().get_num_tokens(); +} + +// get the index of the last token in the request +int RequestManager::get_idx_last_logical_token(Request &request) const { + if (request.blocks.empty()) { + printf("Error: request.blocks is empty\n"); + return -1; + } else { + return (request.blocks.size() - 1) * kPagesize + + request.blocks.back().get_num_tokens() - 1; } +} - return verifiedTree; +int RequestManager::idx_logical_to_physical(Request &request, int idx_logical) { + // get physical indices + PageManager *page_manager = PageManager::get_page_manager(); + std::vector block_table_indices = + page_manager->get_block_table_indices(request.guid); + return block_table_indices[idx_logical / kPagesize] * kPagesize + + idx_logical % kPagesize; } -std::vector> - RequestManager::traverse_beam_tree(BeamSearchBatchConfig const &old_bc, - int request_index, - int first_token_depth_in_request) { - if (verbose) { - std::cout << "[Traverse Beam Tree] request_index: " << request_index - << "\n"; - std::cout << "[Traverse Beam Tree] max_depth: " - << old_bc.beamRequestsInfo[request_index].max_depth << "\n"; - std::cout << "[Traverse Beam Tree] current_depth: " - << old_bc.beamRequestsInfo[request_index].current_depth << "\n"; - std::cout << "[Traverse Beam Tree] beam_width: " - << old_bc.beamRequestsInfo[request_index].beam_size << "\n"; - std::cout << "[Traverse Beam Tree] start index: " - << first_token_depth_in_request << "\n"; - } - - auto guid = old_bc.requestsInfo[request_index].request_guid; - Request &request = all_requests[guid]; - // std::cout << "request.beam_trees.size(): " << request.beam_trees.size() - // << std::endl; - BeamTree tree = request.beam_trees.at(old_bc.model_id); - - // std::cout << "print beam tree: " - // << "\n"; - std::vector> serializedTree; - for (int i = 0; i <= old_bc.beamRequestsInfo[request_index].max_depth; i++) { - // std::cout << "tree layer: " << i - // << ", num_nodes: " << tree.treeLayers[i].nodes_num_this_layer - // << "\n"; - // push tokens into tree - for (int j = 0; j < tree.treeLayers[i].nodes_num_this_layer; j++) { - // std::cout << "token: " << tree.treeLayers[i].tokens[j] << "\n"; - serializedTree.push_back(std::make_pair(tree.treeLayers[i].tokens[j], i)); - } - } - // token, index - // todo make this one global for different stages - - // PreOrder(tree, - // old_bc.beamRequestsInfo[request_index].max_depth, - // 0, - // old_bc.beamRequestsInfo[request_index].beam_size, - // 0, - // serializedTree, - // verbose); - - // print it - if (verbose) { - std::cout << "Print serialized tree: size:" << request_index - << serializedTree.size() << "\n"; - } - for (int k = 0; k < serializedTree.size(); k++) { - serializedTree.at(k).second += first_token_depth_in_request; - if (verbose) { - std::cout << "token id: " << serializedTree.at(k).first - << ", 
depth: " << serializedTree.at(k).second << "\n"; - } +// this will allocate one logical block and one physical block to the request +void RequestManager::_append_block_to_request(Request &request, + bool is_commit) { + PageManager *page_manager = PageManager::get_page_manager(); + // Append the logical block to the request + // page attention: in this function we need to remember the last logical block + // number that still contains committed tokens + LogicalTokenBlock block(request.blocks.size(), kPagesize); + request.blocks.push_back(block); + page_manager->allocate_one_block(request.guid); + std::vector block_table_indices = + page_manager->get_block_table_indices(request.guid); + // update page_id_commit + if (is_commit) { + request.page_last_committed++; + int size_blocks = request.blocks.size(); } +} - // if (dfs_tree_inputs.find(old_bc.requestsInfo[request_index].request_guid) - // != - // dfs_tree_inputs.end()) { - // dfs_tree_inputs[old_bc.requestsInfo[request_index].request_guid] = - // serializedTree; - // } else { - // dfs_tree_inputs.insert(std::make_pair( - // old_bc.requestsInfo[request_index].request_guid, serializedTree)); - // } +// this function is used for appending a token to the last logical block and +// also the last physical block it will return the physical position of this +// token +int RequestManager::append_token_to_block(Request &request, + TokenId token, + bool is_commit) { + PageManager *page_manager = PageManager::get_page_manager(); + if (request.blocks.empty() || request.blocks.back().is_full()) { + // Append a new logical block + _append_block_to_request(request, is_commit); + // also allocate one physical page + } + // insert token to both logical block and physical block + request.blocks.back().append_tokens({token}, is_commit); + int idx_logical = get_idx_last_logical_token(request); + assert(idx_logical >= 0); + int idx_physical = idx_logical_to_physical(request, idx_logical); + assert(idx_physical >= 0); + return idx_physical; +} - return serializedTree; - // } +void RequestManager::reset_block_table(Request &request) { + // get the indices of original physical block table for request + PageManager *page_manager = PageManager::get_page_manager(); + assert(request.page_last_committed < static_cast(request.blocks.size())); + std::vector block_table_indices = + page_manager->get_block_table_indices(request.guid); + // reset the block table according to the request's page_last_commit + page_manager->free_multiple_blocks(request.guid, + block_table_indices.size() - + request.page_last_committed - 1); + // reset this request's logical block table + if (request.page_last_committed < static_cast(request.blocks.size())) { + request.blocks.erase(request.blocks.begin() + request.page_last_committed + + 1, + request.blocks.end()); + } + request.blocks.back().reset_num_spec_tokens(); + // the indices of block table should be the same as the number of blocks + std::vector block_table = + page_manager->get_block_table_indices(request.guid); + return; } -std::vector> - RequestManager::merge_dfs_trees( - std::vector>> - input_trees, - int root_depth, - RequestGuid guid) { - assert(input_trees.size() == 1 && "currently using one ssm"); - dfs_tree_inputs[guid] = input_trees.at(0); - return input_trees.at(0); +/* --------- Bitmask Related Functions --------- */ +void RequestManager::gumbel_conditioned_on_max( + double target_max, std::vector> &logits) { + // Assume the logits are sorted in descending order + if (logits.size() == 0) { + return; + } + double max_logit 
= logits[0].first; + for (auto &logit_n_idx : logits) { + logit_n_idx.first = + -log(exp(-target_max) - exp(-max_logit) + exp(-logit_n_idx.first)); + } +} - std::vector> merged_tree; +void RequestManager::renormalize(std::vector> &D, + std::unordered_map &R, + TokenId token_id) { + float token_prob; + for (auto &kv : D) { + TokenId d_token_id = kv.first; + float d_prob = kv.second; + if (R.find(d_token_id) != R.end()) { + float r_prob = R[d_token_id]; + R[d_token_id] = max(0.0f, r_prob - d_prob); + } + if (d_token_id == token_id) { + token_prob = d_prob; + kv.second = 0.0f; + } + } + // Normalize R + float sum_r = 0.0f; + for (auto &kv : R) { + sum_r += kv.second; + } + for (auto &kv : R) { + kv.second /= (sum_r + 1e-6); + } + // Normalize D + for (auto &kv : D) { + kv.second /= (1.0f - token_prob - 1e-6); + } +} - std::unordered_map> childrens; - std::unordered_map curr_path; +std::tuple + RequestManager::reject_sampling(std::vector> &D, + std::unordered_map &R, + int k) { + assert(D.size() == k); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(0.0, 1.0); + double r; + for (int i = 0; i < k; ++i) { + // Generate a random number in the range [0, 1) + r = dis(gen); + double d_prob = (double)D[i].second; + if (R.find(D[i].first) != R.end()) { + double r_prob = (double)R[D[i].first]; + if (r < d_prob / d_prob + 1e-6) { + return {i, D[i].first, true}; + } + } + // else, r_prob = 0.0, reject the token + renormalize(D, R, D[i].first); + } + std::vector r_probs; + std::vector r_tokens; + for (auto &kv : R) { + r_probs.push_back(kv.second); + r_tokens.push_back(kv.first); + } + std::discrete_distribution<> r_dist(r_probs.begin(), r_probs.end()); + int sampled_index = r_dist(gen); + return {-1, r_tokens[sampled_index], false}; +} - // convert pair to an integer - auto root = input_trees.at(0).at(0); - int root_id = root.first * 10000 + root.second; +void RequestManager::get_verify_results_sample( + InferenceResult const &llm_verify_result) { + // This function maintain the generated token list of the request and the + // committed tokens. + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + + int llm_result_offset = + request.first_token_offset_in_batch * BatchConfig::MAX_K_LOGITS; + int llm_input_offset = request.first_token_offset_in_batch; + int committed_token_index = request.tokens.size() - 1; + + TokenTree &token_tree = request.speculative_token_trees[0]; + // First add the root to the committed tokens + request.committed_tokens.push_back(Request::CommittedToken( + llm_input_offset, committed_token_index, request.tokens.back())); + committed_token_index++; + // Don't add it to request.tokens because it has already been added. 
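renormalize and reject_sampling above generalize the accept/reject test of speculative sampling to several draft candidates per tree position, renormalizing both distributions after each rejection. For reference, here is the single-candidate textbook form of that test under simplified, illustrative types; the multi-candidate bookkeeping in the patch follows the same accept-or-sample-from-the-residual pattern but differs in the exact renormalization, so treat this as a sketch rather than a drop-in equivalent.

```cpp
#include <algorithm>
#include <random>
#include <unordered_map>
#include <utility>
#include <vector>

using TokenId = int;
using Dist = std::unordered_map<TokenId, double>; // token -> probability

// One accept/reject step of speculative sampling in its textbook single-draft
// form: accept the draft token with probability min(1, p_target / p_draft),
// otherwise sample a replacement from the renormalized residual max(p_target - p_draft, 0).
std::pair<TokenId, bool> verify_one(Dist const &p_draft,
                                    Dist p_target,
                                    TokenId draft_token,
                                    std::mt19937 &gen) {
  std::uniform_real_distribution<double> uni(0.0, 1.0);
  double q = p_draft.count(draft_token) ? p_draft.at(draft_token) : 0.0;
  double p = p_target.count(draft_token) ? p_target.at(draft_token) : 0.0;
  if (q > 0.0 && uni(gen) < std::min(1.0, p / q)) {
    return {draft_token, true}; // keep the draft token
  }
  // Rejected: build the residual distribution and sample a replacement from it.
  double norm = 0.0;
  for (auto &kv : p_target) {
    double qv = p_draft.count(kv.first) ? p_draft.at(kv.first) : 0.0;
    kv.second = std::max(0.0, kv.second - qv);
    norm += kv.second;
  }
  if (norm <= 0.0) {
    return {draft_token, false}; // degenerate residual; sketch-level fallback
  }
  std::vector<TokenId> tokens;
  std::vector<double> weights;
  for (auto const &kv : p_target) {
    tokens.push_back(kv.first);
    weights.push_back(kv.second / norm);
  }
  std::discrete_distribution<int> residual(weights.begin(), weights.end());
  return {tokens[residual(gen)], false};
}
```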
+ + // The position of the last accepted token in its tree layer (includeing + // the pruned tokens) + int last_accepted_token_index_in_layer = 0; + // The index of the last accepted token in the entire tree (excluding the + // pruned tokens) + int last_accepted_token_index = 0; + float last_accepted_token_accumulated_log_prob = 0.0f; + int current_token_index = 1; // Because we skip the root + bool rejected = false; + + auto layer_it = token_tree.tree_layers.begin(); + ++layer_it; + for (; layer_it != token_tree.tree_layers.end(); ++layer_it) { + // We skip the first layer + std::vector> const &tree_layer = *layer_it; + std::vector> D; + std::unordered_map R; + // Data format: + std::unordered_map> d_token_info; + + int current_token_index_in_layer = 0; + + // Iterate through the tokens in the current layer to find the candidate + // tokens whose parent is the last accepted token + for (auto const &node_ptr : tree_layer) { + if (!node_ptr->included) { + // Don't increase current_token_index here + current_token_index_in_layer++; + continue; + } + if (node_ptr->parent_pos != last_accepted_token_index_in_layer) { + // The token's parent is not accepted + current_token_index++; + current_token_index_in_layer++; + continue; + } else { + // The token's parent is accepted + float prob = std::exp(node_ptr->log_accumulated_prob - + last_accepted_token_accumulated_log_prob); + D.push_back({node_ptr->id, prob}); + d_token_info[node_ptr->id] = {current_token_index, + current_token_index_in_layer, + node_ptr->log_accumulated_prob}; + current_token_index++; + current_token_index_in_layer++; + } + } - for (int i = 0; i < input_trees.size(); i++) { - auto tree = input_trees.at(i); - // all trees should have the same root - assert(tree.at(0) == root); + int result_offset = llm_result_offset + + last_accepted_token_index * BatchConfig::MAX_K_LOGITS; + for (int i = 0; i < BatchConfig::MAX_K_LOGITS; ++i) { + TokenId token_id = llm_verify_result.token_ids[result_offset + i]; + R[token_id] = llm_verify_result.probs[result_offset + i]; + } - for (auto const &pair : tree) { - int id = pair.first * 10000 + pair.second; // current node - curr_path[pair.second] = id; // log node in current search + auto [sampled_index, token_id, accepted] = + reject_sampling(D, R, D.size()); + if (accepted) { + // The token's parent is accepted, and this token's id equals the + // llm's sample at its parent's position. We accept this token. 
+ // from_index: the index of the token in the tree (excluding the + // pruned tokens) + // to_index: the committed token index in the request + request.committed_tokens.push_back(Request::CommittedToken( + llm_input_offset + std::get<0>(d_token_info[token_id]), + committed_token_index, + token_id)); + request.tokens.push_back(token_id); + + last_accepted_token_index = std::get<0>(d_token_info[token_id]); + last_accepted_token_index_in_layer = + std::get<1>(d_token_info[token_id]); + last_accepted_token_accumulated_log_prob = + std::get<2>(d_token_info[token_id]); + committed_token_index++; + } else { + request.committed_tokens.push_back( + Request::CommittedToken(-1, committed_token_index, token_id)); + rejected = true; + break; + } + } - if (childrens.find(id) == childrens.end()) { - // init empty set - childrens[id] = std::set(); + // Add the last token (that is not in the cache of the LLM) if the + // sampling procedure succeed in the last layer from_index: since this + // token is not in the token tree, the llm doesn't have its KV cache, so + // the from_index should be a place holder, which is -1 + if (!rejected) { + std::unordered_map R; + std::vector> D; + int result_offset = llm_result_offset + + last_accepted_token_index * BatchConfig::MAX_K_LOGITS; + for (int i = 0; i < BatchConfig::MAX_K_LOGITS; ++i) { + TokenId token_id = llm_verify_result.token_ids[result_offset + i]; + R[token_id] = llm_verify_result.probs[result_offset + i]; } + auto [sampled_index, token_id, accepted] = + reject_sampling(D, R, D.size()); + request.committed_tokens.push_back( + Request::CommittedToken(-1, committed_token_index, token_id)); + request.tokens.push_back(token_id); + } - if (pair.second > root_depth) { - int parent_id = curr_path[pair.second - 1]; - childrens[parent_id].insert(id); + if (verbose) { + std::cout << "Request " << request.guid << " committed tokens: "; + for (auto const &committed_token : request.committed_tokens) { + std::cout << committed_token.token_id << " (" + << tokenizer_->Decode({committed_token.token_id}) << ") "; } + std::cout << std::endl; + std::string output = this->tokenizer_->Decode(request.tokens); + // std::cout << "Output sequence: " << output << std::endl; } } +} - std::stack q; - q.push(root_id); - - while (!q.empty()) { - int curr = q.top(); - q.pop(); - merged_tree.push_back(std::make_pair(curr / 10000, curr % 10000)); - for (int child : childrens[curr]) { - q.push(child); +void RequestManager::get_verify_results_greedy( + InferenceResult const &llm_verify_result) { + // This function maintain the generated token list of the request and the + // committed tokens. + int total_nb_generated_tokens = 0; + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + + int llm_result_offset = request.first_token_offset_in_batch; + int llm_cache_size = request.tokens.size() - 1; + int committed_token_index = request.tokens.size() - 1; + + TokenTree &token_tree = request.speculative_token_trees[0]; + // First add the root to the committed tokens + request.committed_tokens.push_back(Request::CommittedToken( + llm_cache_size, committed_token_index, request.tokens.back())); + committed_token_index++; + // Don't add it to request.tokens because it has already been added. 
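The greedy path that follows walks the speculative tree layer by layer and accepts at most one token per layer: a child survives only if its parent was accepted and its id matches the large model's greedy prediction at the parent's position, and the large model's prediction at the last accepted position is appended as a bonus token. A condensed sketch over a flat node array, with hypothetical field names and with EOS handling and pruning flags omitted:

```cpp
#include <vector>

using TokenId = int;

struct TreeNode {
  TokenId id;
  int parent; // index of the parent node in `nodes`, -1 for the root
  int layer;  // depth in the tree (root = 0)
};

// nodes: speculative tree in layer order (root first); target_argmax[i] is the
// target model's greedy token conditioned on the prefix ending at node i
// (target_argmax must cover every node index).
// Returns the accepted token ids, excluding the root and including one bonus token.
std::vector<TokenId> greedy_verify(std::vector<TreeNode> const &nodes,
                                   std::vector<TokenId> const &target_argmax) {
  std::vector<TokenId> accepted;
  int last_accepted = 0; // start at the root
  int depth = 1;
  while (true) {
    int match = -1;
    for (int i = 0; i < (int)nodes.size(); ++i) {
      if (nodes[i].layer == depth && nodes[i].parent == last_accepted &&
          nodes[i].id == target_argmax[last_accepted]) {
        match = i; // this child agrees with the target's argmax
        break;
      }
    }
    if (match == -1) {
      break; // no child in this layer survives verification
    }
    accepted.push_back(nodes[match].id);
    last_accepted = match;
    ++depth;
  }
  // Bonus token: the target's own prediction at the last accepted position.
  accepted.push_back(target_argmax[last_accepted]);
  return accepted;
}
```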
+ + // The position of the last accepted token in its tree layer (includeing + // the pruned tokens) + int last_accepted_token_index_in_layer = 0; + // The index of the last accepted token in the entire tree (excluding the + // pruned tokens) + int last_accepted_token_index = 0; + + int current_token_index = 1; // Because we skip the root + // We skip the first layer + bool found_eos = false; + for (auto layer_it = token_tree.tree_layers.begin() + 1; + layer_it != token_tree.tree_layers.end(); + ++layer_it) { + std::vector> const &tree_layer = *layer_it; + + bool token_accepted_this_layer = false; + int current_token_index_in_layer = 0; + + for (auto const &node_ptr : tree_layer) { + if (!node_ptr->included) { + current_token_index_in_layer++; + continue; + } + if ((node_ptr->parent_pos != last_accepted_token_index_in_layer) || + token_accepted_this_layer) { + // The token's parent is not accepted, or there is already another + // token accepted in this layer + current_token_index++; + current_token_index_in_layer++; + continue; + } else { + // The token's parent is accepted, and no token has been accepted in + // this layer yet + if (node_ptr->id == + llm_verify_result + .token_ids[llm_result_offset + last_accepted_token_index]) { + // The token's parent is accepted, and this token's id equals the + // llm's sample at its parent's position. We accept this token. + + // from_index: the index of the token in the tree (excluding the + // pruned tokens) + // to_index: the committed token index in the request + request.committed_tokens.push_back( + Request::CommittedToken(llm_cache_size + current_token_index, + committed_token_index, + node_ptr->id)); + request.tokens.push_back(node_ptr->id); + + token_accepted_this_layer = true; + last_accepted_token_index = current_token_index; + last_accepted_token_index_in_layer = current_token_index_in_layer; + committed_token_index++; + if (is_eos_token(node_ptr->id)) { + found_eos = true; + } + } + current_token_index++; + current_token_index_in_layer++; + } + if (found_eos) { + break; + } + } + if (!token_accepted_this_layer) { + // No token is accepted in this layer, we should stop the traversal + break; + } + if (found_eos) { + break; + } } - } - if (verbose) { - for (auto &pair : merged_tree) { - std::cout << pair.first << ", depth: " << pair.second << std::endl; + // Add the last token (that is not verified by the LLM) + // from_index: since this token is not in the token tree, the llm + // doesn't have its KV cache, so the from_index should be a place + // holder, which is -1 + if (!found_eos) { + request.committed_tokens.push_back(Request::CommittedToken( + -1, + committed_token_index, + llm_verify_result + .token_ids[llm_result_offset + last_accepted_token_index])); + request.tokens.push_back( + llm_verify_result + .token_ids[llm_result_offset + last_accepted_token_index]); } - } - dfs_tree_inputs[guid] = merged_tree; + assert(request.committed_tokens.size() >= 2); + int nb_generated_tokens = (int)request.committed_tokens.size() - + 1; // exclude previous bonus token + int accepted_tokens = (int)request.committed_tokens.size() - + 1; // exclude previous bonus token + if (!found_eos) { + accepted_tokens--; // exclude the last bonus token (if we found eos, we + // don't add it) + } + total_nb_generated_tokens += nb_generated_tokens; + + NewProfileInfo new_profile_info; + new_profile_info.timestamp = Realm::Clock::current_time_in_microseconds(); + new_profile_info.request_guid = guid; + new_profile_info.request_step_idx = + 
profiling_requests[guid].llm_decoding_steps - + 1; // check if this has already been incremented + new_profile_info.num_speculated_tokens = get_tree_size(request); + new_profile_info.num_accepted_tokens = accepted_tokens; + new_profile_info.speculation_score = -1.0; + new_profile_info.num_generated_tokens = nb_generated_tokens; + new_profile_info.speculation_start_timestamp = + profiling_requests[guid].speculation_start_timestamp; + new_profile_info.speculation_end_timestamp = + profiling_requests[guid].speculation_end_timestamp; + new_profiling_info.push_back(new_profile_info); - return merged_tree; + if (verbose) { + std::cout << "Request " << request.guid << " committed tokens: "; + for (auto const &committed_token : request.committed_tokens) { + std::cout << committed_token.token_id << " (" + << tokenizer_->Decode({committed_token.token_id}) << ") "; + } + std::cout << std::endl; + std::string output = this->tokenizer_->Decode(request.tokens); + std::cout << "Output sequence: " << output << std::endl; + } + } + profiling.generated_tokens_per_step.push_back(total_nb_generated_tokens); } std::vector - FFModel::generate(std::vector &prompts, int max_seq_length) { + FFModel::generate(std::vector &requests, + EmissionMachine &emission_machine) { RequestManager *rm = RequestManager::get_request_manager(); std::vector guids; - for (int i = 0; i < prompts.size(); i++) { - RequestManager::RequestGuid guid = - rm->register_new_request(prompts.at(i), max_seq_length); + + // Wait until the request manager is ready + while (!rm->is_background_server_serving()) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + for (size_t i = 0; i < requests.size(); i++) { + requests[i].slo_ratio = emission_machine.sample_slo_ratio(); + requests[i].emission_time_ms = emission_machine.get_elapsed_time_ms(); + printf("Prompt[%ld]: %s\n", i, requests[i].prompt.c_str()); + RequestManager::RequestGuid guid = rm->register_new_request(requests[i]); if (guid != RequestManager::INVALID_GUID) { guids.push_back(guid); } + emission_machine.wait_until_next_request(); } std::vector results; for (int i = 0; i < guids.size(); i++) { @@ -2314,9 +2901,18 @@ std::vector return results; } +std::vector + FFModel::generate(std::vector &prompts, + EmissionMachine &emission_machine) { + std::vector requests; + for (std::string &prompt : prompts) { + requests.push_back(GenerationRequest(prompt, -1.0, 0)); + } + return generate(requests, emission_machine); +} + void RequestManager::start_background_server(FFModel *model) { - assert(request_manager_status == INITIALIZED); - request_manager_status = SERVING; + assert(background_server_status == INITIALIZED); // Start background task Runtime *runtime = Runtime::get_runtime(); Context ctx = Runtime::get_context(); @@ -2358,17 +2954,23 @@ void RequestManager::background_serving_task( ssm->config.lg_ctx = ctx; } } - if (rm->get_num_ssms() == 0) { + // page attention: initalize the page manager here + int kv_cache_size = rm->get_max_kv_cache_size(); + PageManager::get_page_manager(llm, rm->get_max_kv_cache_size()); + if (rm->decoding_mode == INCREMENTAL_DECODING) { // No SSMs: perform incremental decoding - rm->serve_incr_decoding(llm); + rm->serve_decoding(llm); } else { // Registered SSMs: perform speculative inference rm->serve_spec_infer(llm); } +#ifdef FF_USE_NCCL + llm->finish_nccl_comms(); +#endif } /*static*/ -void RequestManager::serve_incr_decoding(FFModel *llm) { +void RequestManager::serve_decoding(FFModel *llm) { Context ctx = llm->config.lg_ctx; Runtime *runtime 
= llm->config.lg_hlr; // Compile the llm @@ -2377,50 +2979,45 @@ void RequestManager::serve_incr_decoding(FFModel *llm) { assert(im->model_weights_loaders.find(llm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[llm]->load_weights(llm); + im->model_weights_loaders[llm]->load_weights_parallel(llm, ctx, runtime); // init operators im->init_operators_inference(llm); // Legion futures for inc_decoding and spec_infer - BatchConfigFuture last_bcf; InferenceResultFuture last_irf; { // Initialize futures for incr decoding - BatchConfig bc; InferenceResult ir; - last_bcf = Future::from_value(bc); last_irf = Future::from_value(ir); } - std::queue> - batch_pipeline; - { batch_pipeline.push(std::make_pair(last_bcf, last_irf)); } + std::queue batch_pipeline; + { batch_pipeline.push(last_irf); } + // reset_profiling_statistics(); + background_server_status = SERVING; while (!is_background_server_terminated()) { if (batch_pipeline.size() >= 4) { // Block here to avoid launching too many batches - auto const &batch = batch_pipeline.front(); - batch.second.get_void_result(); + auto const &ir = batch_pipeline.front(); + ir.get_void_result(); } // deque finished batches while (batch_pipeline.size() > 1) { - auto const &batch = batch_pipeline.front(); - if (batch.second.is_ready()) { + auto const &ir = batch_pipeline.front(); + if (ir.is_ready()) { batch_pipeline.pop(); } else { break; } } runtime->begin_trace(ctx, 12346 /*trace_id*/); - auto const &next_batch = batch_pipeline.back(); - BatchConfigFuture bcf = - prepare_next_batch(next_batch.first, next_batch.second, ctx, runtime); + InferenceResultFuture next_ir = batch_pipeline.back(); + BatchConfigFuture bcf = get_next_batch_config(next_ir, ctx, runtime); FutureMap fm = im->inference(llm, 0, bcf); assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture irf = fm.get_future(0); - batch_pipeline.push(std::make_pair(bcf, irf)); - last_bcf = bcf; - last_irf = irf; + batch_pipeline.push(irf); runtime->end_trace(ctx, 12346 /*trace_id*/); } } @@ -2436,91 +3033,132 @@ void RequestManager::serve_spec_infer(FFModel *llm) { assert(im->model_weights_loaders.find(llm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[llm]->load_weights(llm); + im->model_weights_loaders[llm]->load_weights_parallel(llm, ctx, runtime); // init operators im->init_operators_inference(llm); } for (size_t i = 0; i < get_num_ssms(); i++) { // Compile the i-th ssm FFModel *ssm = get_ssm_model(i); - im->compile_model_and_allocate_buffer(ssm); - assert(im->model_weights_loaders.find(llm) != + im->compile_model_and_allocate_buffer(ssm, false); + assert(im->model_weights_loaders.find(ssm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[ssm]->load_weights(ssm); + im->model_weights_loaders[ssm]->load_weights_parallel(ssm, ctx, runtime); // init operators im->init_operators_inference(ssm); } - std::queue> - batch_pipeline; - // Legion futures for inc_decoding and spec_infer - TreeVerifyBatchConfigFuture last_tree_bcf; - InferenceResultFuture last_tree_irf; + InferenceResultFuture irf_0; { - // Initialize futures for spec infer - TreeVerifyBatchConfig tree_bc; - InferenceResult tree_ir; - last_tree_bcf = Future::from_value(tree_bc); - last_tree_irf = Future::from_value(tree_ir); + // Initialize futures for incr decoding + InferenceResult ir_0; + irf_0 = Future::from_value(ir_0); } - batch_pipeline.push(std::make_pair(last_tree_bcf, last_tree_irf)); - while 
(!is_background_server_terminated()) { + request_manager_status = PREFILLING; + prefill_model = SSM; + ssm_tree_depth = get_max_tree_depth(); - if (batch_pipeline.size() >= 4) { + std::queue infer_result_future_pipeline; + infer_result_future_pipeline.push(irf_0); + + // reset_profiling_statistics(); + background_server_status = SERVING; + while (!is_background_server_terminated()) { + if (infer_result_future_pipeline.size() >= 4) { // Block here to avoid launching too many batches - auto const &batch = batch_pipeline.front(); - batch.second.get_void_result(); + auto const &ir = infer_result_future_pipeline.front(); + ir.get_void_result(); } // deque finished batches - while (batch_pipeline.size() > 1) { - auto const &batch = batch_pipeline.front(); - if (batch.second.is_ready()) { - batch_pipeline.pop(); + while (infer_result_future_pipeline.size() > 1) { + auto const &ir = infer_result_future_pipeline.front(); + if (ir.is_ready()) { + infer_result_future_pipeline.pop(); } else { break; } } - auto const &next_batch = batch_pipeline.back(); - BeamSearchBatchConfigFuture beam_bcf = prepare_next_batch_init( - next_batch.first, next_batch.second, 0, ctx, runtime); - std::vector beam_bcf_vec(get_num_ssms()); - for (size_t ssm_id = 0; ssm_id < get_num_ssms(); ssm_id++) { - beam_bcf_vec[ssm_id] = beam_bcf; - } + runtime->begin_trace(ctx, 12345 /*trace_id*/); + for (int ssm_step_i = 0; ssm_step_i < get_max_tree_depth(); ssm_step_i++) { + InferenceResultFuture irf = infer_result_future_pipeline.back(); + BatchConfigFuture bcf = get_next_batch_config(irf, ctx, runtime); + FutureMap fm = im->inference(get_ssm_model(0), 0, bcf); + infer_result_future_pipeline.push(fm.get_future(0)); + } + InferenceResultFuture irf = infer_result_future_pipeline.back(); + BatchConfigFuture bcf = get_next_batch_config(irf, ctx, runtime); + FutureMap fm = im->inference(llm, 0, bcf); + infer_result_future_pipeline.push(fm.get_future(0)); + runtime->end_trace(ctx, 12345 /*trace_id*/); + } +} + +/*static*/ +void RequestManager::serve_spec_infer_sync(FFModel *llm) { + Context ctx = llm->config.lg_ctx; + Runtime *runtime = llm->config.lg_hlr; + InferenceManager *im = InferenceManager::get_inference_manager(); + { + // Compile the llm + im->compile_model_and_allocate_buffer(llm); + assert(im->model_weights_loaders.find(llm) != + im->model_weights_loaders.end()); + // Load model weights + im->model_weights_loaders[llm]->load_weights_parallel(llm, ctx, runtime); + // init operators + im->init_operators_inference(llm); + } + for (size_t i = 0; i < get_num_ssms(); i++) { + // Compile the i-th ssm + FFModel *ssm = get_ssm_model(i); + im->compile_model_and_allocate_buffer(ssm, false); + assert(im->model_weights_loaders.find(ssm) != + im->model_weights_loaders.end()); + // Load model weights + im->model_weights_loaders[ssm]->load_weights_parallel(ssm, ctx, runtime); + // init operators + im->init_operators_inference(ssm); + } + + InferenceResultFuture irf_0; + { + // Initialize futures for incr decoding + InferenceResult ir_0; + irf_0 = Future::from_value(ir_0); + } - for (size_t i = 0; i < get_num_ssms(); i++) { - for (int depth = 0; depth < BeamSearchBatchConfig::MAX_BEAM_DEPTH; - depth++) { - beam_bcf = beam_bcf_vec[i]; + request_manager_status = PREFILLING; + prefill_model = SSM; - FutureMap fm = im->inference(get_ssm_model(i), 0, beam_bcf_vec[i]); - assert(fm.get_future_map_domain().get_volume() == 1); - BeamInferenceResultFuture beam_irf = fm.get_future(0); - beam_bcf_vec[i] = - prepare_next_batch_beam(beam_bcf_vec[i], 
beam_irf, ctx, runtime); - } - } - // Token Tree Verification - { - TreeVerifyBatchConfigFuture tree_bcf = - prepare_next_batch_verify(beam_bcf_vec, ctx, runtime); - FutureMap fm = im->inference(llm, 0, tree_bcf); - assert(fm.get_future_map_domain().get_volume() == 1); - InferenceResultFuture tree_irf = fm.get_future(0); - batch_pipeline.push(std::make_pair(tree_bcf, tree_irf)); - last_tree_bcf = tree_bcf; - last_tree_irf = tree_irf; + background_server_status = SERVING; + while (!is_background_server_terminated()) { + BatchConfigFuture bcf = get_next_batch_config(irf_0, ctx, runtime); + bcf.get_void_result(); + if ((request_manager_status == PREFILLING and prefill_model == LLM) or + request_manager_status == LLM_VERIFY) { + runtime->begin_trace(ctx, 12345 /*trace_id*/); + FutureMap fm = im->inference(llm, 0, bcf); + irf_0 = fm.get_future(0); + runtime->end_trace(ctx, 12345 /*trace_id*/); + } else if ((request_manager_status == PREFILLING and + prefill_model == SSM) or + request_manager_status == SSM_SPEC) { + runtime->begin_trace(ctx, 23456 /*trace_id*/); + FutureMap fm = im->inference(get_ssm_model(0), 0, bcf); + irf_0 = fm.get_future(0); + runtime->end_trace(ctx, 23456 /*trace_id*/); + } else { + assert(false && "Invalid request manager status"); } - runtime->end_trace(ctx, 12345 /*trace_id*/); } } void RequestManager::trigger_request_completion_future( RequestGuid const &guid) { - const std::lock_guard lock(request_to_promise_mutex); + std::lock_guard const lock(request_to_promise_mutex); assert(request_to_promise.find(guid) != request_to_promise.end()); // Set the completion promise in case other threads are waiting request_to_promise[guid]->set_value(); @@ -2533,8 +3171,222 @@ void RequestManager::terminate_background_server_at_exit() { } void RequestManager::terminate_background_server() { - if (request_manager_status == SERVING) { - request_manager_status = TERMINATED; + if (is_background_server_serving()) { + assert(profiling.llm_step_times.size() == + profiling.requests_per_step.size()); + // Write the last profiling statistics to output file + std::string str = "[Profiling Statistics]"; + + profiling.server_end_time = Realm::Clock::current_time_in_microseconds(); + long long total_time = + profiling.server_end_time - profiling.server_start_time; + int total_requests = 0; + for (auto const &profiling_info : profiling_requests) { + int request_id = profiling_info.first; + Request &request = all_requests[request_id]; + if (request.status == Request::COMPLETED) { + total_requests++; + } + } + int total_tokens = 0; + for (int num_tokens : profiling.generated_tokens_per_step) { + total_tokens += num_tokens; + } + + if (profiling_requests.size() != all_requests.size()) { + std::cerr << "profiling_requests.size()=" << profiling_requests.size() + << " != all_requests.size()=" << all_requests.size() + << std::endl; + } + assert(profiling_requests.size() == all_requests.size()); + str += "\nDecoding Steps: "; + for (auto const &profiling_info : profiling_requests) { + int request_id = profiling_info.first; + Request &request = all_requests[request_id]; + str += "Request " + std::to_string(request_id) + ": "; + str += std::to_string(profiling_info.second.llm_decoding_steps); + str += "/"; + str += std::to_string(request.decode_length()); + float speedup = (float)request.decode_length() / + profiling_info.second.llm_decoding_steps; + str += " " + std::to_string(speedup) + "\n"; + } + str += "\n total_time_ms(" + std::to_string(total_time / 1000.0) + ")"; + str += "\n total_requests(" + 
std::to_string(total_requests) + "/" + + std::to_string(all_requests.size()) + ")"; + str += "\n total_tokens(" + std::to_string(total_tokens) + ")"; + // throughput + str += "\n throughput_requests_per_sec(" + + std::to_string(total_requests / (total_time / 1e6)) + ")"; + str += "\n throughput_tokens_per_sec(" + + std::to_string(total_tokens / (total_time / 1e6)) + ")"; + + double average_latency_per_request = 0; + std::string latency_per_request_ms = "\n latency_per_request_ms( "; + for (auto const &profiling_info : profiling_requests) { + double latency_ms = (profiling_info.second.finish_time - + profiling_info.second.start_time) / + 1000.0; + + // latency_per_request_ms += "[" + std::to_string(profiling_info.first) + // + + // ","; latency_per_request_ms += std::to_string(latency_ms) + "] "; + latency_per_request_ms += std::to_string(latency_ms) + " "; + average_latency_per_request += latency_ms; + } + latency_per_request_ms += ")"; + str += latency_per_request_ms; + + average_latency_per_request /= total_requests; + str += "\n average_latency_per_request_ms(" + + std::to_string(average_latency_per_request) + ")"; + + std::string ttft_per_request_ms = "\n ttft_per_request_ms( "; + for (auto const &profiling_info : profiling_requests) { + double prefilling_time_ms = 0; + auto const &profiling = profiling_info.second; + if (profiling.start_decoding_time != 0) { + prefilling_time_ms = + (profiling.start_decoding_time - profiling.start_time) / 1000.0; + } else { + prefilling_time_ms = + (profiling.finish_time - profiling.start_time) / 1000.0; + } + ttft_per_request_ms += std::to_string(prefilling_time_ms) + " "; + } + ttft_per_request_ms += ")"; + str += ttft_per_request_ms; + + std::unordered_map> tpots; + std::string tpot_per_request_ms = "\n tpot_per_request_ms( "; + for (auto const &profiling_info : profiling_requests) { + double per_token_time_ms = 0; + auto const &request = all_requests[profiling_info.first]; + auto const &profiling = profiling_info.second; + if (profiling.start_decoding_time != 0) { + per_token_time_ms = + (profiling.finish_time - profiling.start_decoding_time) / 1000.0 / + request.decode_length(); + } + tpot_per_request_ms += std::to_string(per_token_time_ms) + " "; + auto &tpot = tpots[request.slo_ratio]; + tpot.first++; + tpot.second += per_token_time_ms; + } + tpot_per_request_ms += ")"; + str += tpot_per_request_ms; + + std::string average_tpot_per_slo_ms = "\n average_tpot_per_slo_ms( "; + for (auto const &kv : tpots) { + double average_tpot = kv.second.second / kv.second.first; + average_tpot_per_slo_ms += + std::to_string(kv.first) + ":" + std::to_string(average_tpot) + " "; + } + average_tpot_per_slo_ms += ")"; + str += average_tpot_per_slo_ms; + + std::string req_per_step = "\n requests_per_step( "; + for (int nb : profiling.requests_per_step) { + req_per_step += std::to_string(nb) + " "; + } + req_per_step += ")"; + str += req_per_step; + + if (profiling.ssm_step_times.size() > 0) { + // assert(profiling.ssm_step_times.size() == + // profiling.llm_step_times.size()); + std::string ssm_step_times_ms = "\n ssm_step_times_ms( "; + for (double time : profiling.ssm_step_times) { + ssm_step_times_ms += std::to_string(time) + " "; + } + ssm_step_times_ms += ")"; + str += ssm_step_times_ms; + } + + if (profiling.ssm_steps.size() > 0) { + std::string ssm_steps = "\n ssm_steps( "; + for (int nb : profiling.ssm_steps) { + ssm_steps += std::to_string(nb) + " "; + } + ssm_steps += ")"; + str += ssm_steps; + } + + std::string llm_step_times_ms = "\n llm_step_times_ms( 
"; + for (double time : profiling.llm_step_times) { + llm_step_times_ms += std::to_string(time) + " "; + } + llm_step_times_ms += ")"; + str += llm_step_times_ms; + + std::string generated_tokens_per_step = "\n generated_tokens_per_step( "; + for (int nb : profiling.generated_tokens_per_step) { + generated_tokens_per_step += std::to_string(nb) + " "; + } + generated_tokens_per_step += ")"; + str += generated_tokens_per_step; + + std::string mean_generated_tokens_per_step = + "\n mean_generated_tokens_per_step( "; + double mean_generated_tokens = + (double)std::accumulate(profiling.generated_tokens_per_step.begin(), + profiling.generated_tokens_per_step.end(), + 0); + double total_request_steps = + (double)std::accumulate(profiling.requests_per_step.begin(), + profiling.requests_per_step.end(), + 0); + mean_generated_tokens /= total_request_steps; + mean_generated_tokens_per_step += std::to_string(mean_generated_tokens); + mean_generated_tokens_per_step += ")"; + str += mean_generated_tokens_per_step; + + double attainment = 0, goodput = 0; + for (auto request_pair : all_requests) { + Request &request = request_pair.second; + if (request.attained) { + attainment += 1; + goodput += request.decode_length(); + } + } + attainment /= total_requests; + goodput /= total_time / 1e6; + + std::string slo_attainment = "\n slo_attainment( "; + slo_attainment += std::to_string(attainment); + slo_attainment += ")"; + str += slo_attainment; + + std::string goodput_str = "\n goodput( "; + goodput_str += std::to_string(goodput); + goodput_str += ")"; + str += goodput_str; + + if (get_eval_overhead_breakdown()) { + eval_process_latency_us -= + eval_schedule_latency_us + eval_other_latency_us; + std::string eval_overhead_breakdown_str = "\n eval_overhead_breakdown( "; + eval_overhead_breakdown_str += + "\n ssm_prefill_us: " + std::to_string(eval_ssm_prefill_latency_us); + eval_overhead_breakdown_str += + "\n ssm_spec_us: " + std::to_string(eval_ssm_spec_latency_us); + eval_overhead_breakdown_str += + "\n llm_prefill_us: " + std::to_string(eval_llm_prefill_latency_us); + eval_overhead_breakdown_str += + "\n llm_verify_us: " + std::to_string(eval_llm_verify_latency_us); + eval_overhead_breakdown_str += + "\n process_us: " + std::to_string(eval_process_latency_us); + eval_overhead_breakdown_str += + "\n scheduling_us: " + std::to_string(eval_schedule_latency_us); + eval_overhead_breakdown_str += + "\n other_us: " + std::to_string(eval_other_latency_us); + eval_overhead_breakdown_str += ")"; + str += eval_overhead_breakdown_str; + } + + write_to_output_file("", str); + background_server_status = TERMINATED; + request_queue_cv.notify_all(); // Wait for the background server to terminate Runtime *runtime = Runtime::get_runtime(); Context ctx = Runtime::get_context(); @@ -2542,8 +3394,12 @@ void RequestManager::terminate_background_server() { } } +bool RequestManager::is_background_server_serving() { + return background_server_status == SERVING; +} + bool RequestManager::is_background_server_terminated() { - return request_manager_status == TERMINATED; + return background_server_status == TERMINATED; } RequestManager *request_manager_singleton = nullptr; @@ -2556,4 +3412,554 @@ RequestManager *RequestManager::get_request_manager() { return request_manager_singleton; } +/* --------- Request Token Tree Related Functions --------- */ +void RequestManager::init_token_tree(RequestGuid guid) { + Request &request = all_requests[guid]; + request.speculative_token_trees.clear(); + // Assume we only use one small model for 
speculation + request.speculative_token_trees.emplace_back(); +} + +void RequestManager::add_root_to_spec_token_tree( + RequestGuid guid, BatchConfig::TokenId token_id) { + // This method is called by update_llm_verify_results() + // The last token in the accepted sequence should be the root of the next + // speculation tree. The reason is that the KV cache of this token is not + // computed yet, and we need the large model to decode the logit of this + // token to verify its childs (the tokens in the first layer). This method + // should: construct and add the root token to the empty speculative token + // tree, with parent_pos being -1 and log_accumulated_prob being 0.0 + Request &request = all_requests[guid]; + TokenTree &speculative_token_tree = request.speculative_token_trees[0]; + speculative_token_tree.add_layer(); + auto node_ptr = std::make_shared(token_id, 0.0, -1); + node_ptr->included = true; + if (speculative_sampling) { + node_ptr->gumbel = true; + } + speculative_token_tree.tree_layers[0].push_back(node_ptr); +} + +void RequestManager::add_tokens_to_spec_token_tree( + InferenceResult const &ssm_inference_result) { + // TODO: parameterize MAX_SPECULATIVE_TREE_BRANCHES + // TODO: support gumbel sampling + + int tree_width = + min(get_max_tokens_per_ssm_batch() / get_num_active_requests(), + get_max_tree_width()); + assert(tree_width >= 1); + + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + // Request in this slot is unavailable + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + + int parent_num = request.num_tokens_in_batch; + if (parent_num == 0) { + continue; + } + + // ssm_first_step only decode the last token (the root of the tree) + int result_offset = + (request.first_token_offset_in_batch + + (current_ssm_step == 1 ? 
(request.num_tokens_in_batch - 1) : 0)) * + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + TokenTree &spec_token_tree = request.speculative_token_trees[0]; + std::vector> &last_layer = + spec_token_tree.tree_layers.back(); + std::vector> child_probs_v; + child_probs_v.reserve(BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES * + get_max_tree_width()); + int parent_pos = 0; + for (auto const &parent_ptr : last_layer) { + double parent_log_prob = parent_ptr->log_accumulated_prob; + int child_start_idx = + result_offset + + parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + for (int result_idx = child_start_idx; + result_idx < + child_start_idx + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + result_idx++) { + double log_prob = log((double)ssm_inference_result.probs[result_idx]); + if (log_prob == -std::numeric_limits::infinity()) { + continue; + } + if (log_prob == 0.0) { + // Slightly perturb the log prob to make it strictly less than 0 + log_prob -= 1e-10; + } + + double accumulated_log_prob = log_prob + parent_log_prob; + + child_probs_v.emplace_back(accumulated_log_prob, result_idx); + } + parent_pos++; + } + + spec_token_tree.add_layer(); + int actual_width = min(tree_width, (int)child_probs_v.size()); + if (actual_width == 0) { + continue; + } + std::partial_sort(child_probs_v.begin(), + child_probs_v.begin() + actual_width, + child_probs_v.end(), + std::greater>()); + for (int i = 0; i < actual_width; i++) { + auto [accumulated_log_prob, result_idx] = child_probs_v[i]; + int parent_pos = (result_idx - result_offset) / + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + std::shared_ptr node_ptr = std::make_shared( + ssm_inference_result.token_ids[result_idx], + accumulated_log_prob, + parent_pos); + spec_token_tree.tree_layers.back().push_back(node_ptr); + request.token_tree_nodes_acc_prob_pair_pq.push( + std::make_pair(node_ptr, accumulated_log_prob)); + } + } +} + +void RequestManager::add_tokens_to_spec_token_tree_old_version( + InferenceResult const &ssm_inference_result) { + + std::vector tree_width_vector = { + 1, 1, this->expansion_degree, 1, 1, 1, 1, 1}; + + int expand_width = tree_width_vector[current_ssm_step - 1]; + + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + // Request in this slot is unavailable + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + + int parent_num = request.num_tokens_in_batch; + if (parent_num == 0) { + continue; + } + + // ssm_first_step only decode the last token (the root of the tree) + int result_offset = + (request.first_token_offset_in_batch + + (current_ssm_step == 1 ? 
(request.num_tokens_in_batch - 1) : 0)) * + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + TokenTree &spec_token_tree = request.speculative_token_trees[0]; + std::vector> &last_layer = + spec_token_tree.tree_layers.back(); + spec_token_tree.add_layer(); + + int parent_pos = 0; + for (auto const &parent_ptr : last_layer) { + double parent_log_prob = parent_ptr->log_accumulated_prob; + int child_start_idx = + result_offset + + parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + std::vector> child_probs_v; + child_probs_v.reserve(BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); + for (int result_idx = child_start_idx; + result_idx < + child_start_idx + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + result_idx++) { + double log_prob = log((double)ssm_inference_result.probs[result_idx]); + if (log_prob == -std::numeric_limits::infinity()) { + continue; + } + if (log_prob == 0.0) { + // Slightly perturb the log prob to make it strictly less than 0 + log_prob -= 1e-10; + } + + double accumulated_log_prob = log_prob + parent_log_prob; + + child_probs_v.emplace_back(accumulated_log_prob, result_idx); + } + int actual_width = min(expand_width, (int)child_probs_v.size()); + if (actual_width == 0) { + continue; + } + std::partial_sort(child_probs_v.begin(), + child_probs_v.begin() + actual_width, + child_probs_v.end(), + std::greater>()); + for (int i = 0; i < actual_width; i++) { + auto [accumulated_log_prob, result_idx] = child_probs_v[i]; + std::shared_ptr node_ptr = + std::make_shared( + ssm_inference_result.token_ids[result_idx], + accumulated_log_prob, + parent_pos); + node_ptr->included = true; + spec_token_tree.tree_layers.back().push_back(node_ptr); + } + parent_pos++; + } + } +} + +void RequestManager::prune_token_tree() { + if (get_greedy_schedule()) { + return prune_token_tree_greedy(); + } else if (get_equal_schedule()) { + return prune_token_tree_equal(); + } + + // Each reqeust has at least one token + int budget = get_max_tokens_per_batch() - num_available_requests; + assert(budget >= 0); + + std::vector> num_tokens_to_decode_2_request_index; + num_tokens_to_decode_2_request_index.reserve(get_max_requests_per_batch()); + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + if (request.get_slo_ratio() > 999) { // infinity + continue; + } + double num_tokens_to_decode_per_step = + (ssm_spec_latency_ms + llm_verify_latency_ms) * correction_factor / + get_slo_constraint(request); + double expected_num_tokens_decoded = + request.decode_latency_ms / get_slo_constraint(request); + double num_tokens_to_decode = + max(1.0, + num_tokens_to_decode_per_step + expected_num_tokens_decoded - + request.decode_length()); + num_tokens_to_decode = + min(num_tokens_to_decode, (double)ssm_tree_depth + 1); + num_tokens_to_decode_2_request_index.push_back( + std::make_pair(num_tokens_to_decode, request_index)); + } + + // Sort the requests by spare latency in ascending order + std::sort(num_tokens_to_decode_2_request_index.begin(), + num_tokens_to_decode_2_request_index.end(), + std::less>()); + + for (auto const &spare_latency_request_index_pair : + num_tokens_to_decode_2_request_index) { + int request_index = spare_latency_request_index_pair.second; + RequestGuid guid = guid_of_requests[request_index]; + if (all_requests[guid].get_slo_ratio() < 0) { + continue; + } 
+ add_tokens_toward_slo( + guid, budget, num_tokens_to_decode_2_request_index.size()); + } + + assert(budget >= 0); + if (budget > 0) { + if (memory_occupancy) { + add_tokens_toward_memory_occupancy(budget); + } else { + add_tokens_toward_goodput(budget); + } + } +} + +void RequestManager::prune_token_tree_equal() { + // Each reqeust has at least one token + int const equal_budget = + get_max_tokens_per_batch() / get_num_active_requests(); + assert(equal_budget >= 0); + + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + int budget = equal_budget; + assert(budget >= 0); + if (budget > 0) { + add_tokens_toward_goodput_per_request(budget, request_index); + } + } +} + +void RequestManager::prune_token_tree_greedy() { + // Each reqeust has at least one token + int budget = get_max_tokens_per_batch(); + assert(budget >= 0); + + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + } + + if (budget > 0) { + add_tokens_toward_goodput(budget); + } +} + +void RequestManager::add_tokens_toward_slo(RequestGuid guid, + int &budget, + int num_req_with_slo) { + Request &request = all_requests[guid]; + double num_tokens_to_decode_per_step = + (ssm_spec_latency_ms + llm_verify_latency_ms) * correction_factor / + get_slo_constraint(request); + double expected_num_tokens_decoded = + request.decode_latency_ms / get_slo_constraint(request); + + double num_tokens_to_decode = + max(1.0, + num_tokens_to_decode_per_step + expected_num_tokens_decoded - + request.decode_length()); + num_tokens_to_decode = min(num_tokens_to_decode, (double)ssm_tree_depth + 1); + + // The root is already included + // In function add_root_to_spec_token_tree + double current_added = 1.0; + + // The max token that can be added to the token tree when fulfilling the SLO + int max_token_toward_slo = + int(get_max_tokens_per_batch() * 1.2 / num_available_requests); + + while (budget > 0 and max_token_toward_slo > 0 and + current_added < num_tokens_to_decode) { + if (request.token_tree_nodes_acc_prob_pair_pq.empty()) { + break; + } + auto [node_ptr, log_acc_prob] = + request.token_tree_nodes_acc_prob_pair_pq.top(); + request.token_tree_nodes_acc_prob_pair_pq.pop(); + node_ptr->included = true; + current_added += exp(log_acc_prob); + budget--; + max_token_toward_slo--; + } +} + +void RequestManager::add_tokens_toward_memory_occupancy(int budget) { + // This is a helper data structure to store help the pruning of the token + // trees across different requests. 
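// --- Editorial sketch (not part of the patch) --------------------------------
// The per-request budget used above and in prune_token_tree() comes from a
// simple latency model: at an SLO of `slo_constraint` ms per token, one
// speculation + verification round costing
// (ssm_spec_latency_ms + llm_verify_latency_ms) * correction_factor ms must
// produce that many tokens, plus any deficit accumulated so far
// (decode_latency_ms / slo_constraint - decode_length). The result is clamped
// to [1, ssm_tree_depth + 1]. Parameter names mirror the fields used in the
// surrounding code; the helper itself is hypothetical.
#include <algorithm>

static double tokens_to_decode_this_step(double ssm_spec_latency_ms,
                                         double llm_verify_latency_ms,
                                         double correction_factor,
                                         double slo_constraint_ms_per_token,
                                         double decode_latency_ms,
                                         double decode_length,
                                         int ssm_tree_depth) {
  double const per_step = (ssm_spec_latency_ms + llm_verify_latency_ms) *
                          correction_factor / slo_constraint_ms_per_token;
  double const expected_so_far =
      decode_latency_ms / slo_constraint_ms_per_token;
  double target = std::max(1.0, per_step + expected_so_far - decode_length);
  // Never ask for more tokens than one speculative tree can contribute.
  return std::min(target, static_cast<double>(ssm_tree_depth + 1));
}
// --- End editorial sketch -----------------------------------------------------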
+ std::vector, RequestGuid>> + global_token_tree_node_vector; + global_token_tree_node_vector.reserve(get_max_requests_per_batch()); + std::priority_queue< + std::pair, RequestGuid>, + std::vector, RequestGuid>>, + SharedTokenTreeNodePtrRequestGuidWeightedLess> + global_token_tree_node_pq(SharedTokenTreeNodePtrRequestGuidWeightedLess(), + std::move(global_token_tree_node_vector)); + + // Initialie the priority queue with the top element in each request's token + // tree + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + if (request.token_tree_nodes_acc_prob_pair_pq.empty()) { + continue; + } + if (!request.token_tree_nodes_acc_prob_pair_pq.empty()) { + global_token_tree_node_pq.push( + {request.token_tree_nodes_acc_prob_pair_pq.top().first, guid}); + request.token_tree_nodes_acc_prob_pair_pq.pop(); + } + } + + // Perform dequeue and enqueue until the budget is used up + while (budget > 0 and !global_token_tree_node_pq.empty()) { + auto [node_ptr, guid] = global_token_tree_node_pq.top(); + global_token_tree_node_pq.pop(); + node_ptr->included = true; + if (!get_request_with_guid(guid) + .token_tree_nodes_acc_prob_pair_pq.empty()) { + global_token_tree_node_pq.push( + {get_request_with_guid(guid) + .token_tree_nodes_acc_prob_pair_pq.top() + .first, + guid}); + get_request_with_guid(guid).token_tree_nodes_acc_prob_pair_pq.pop(); + } + budget--; + } + + // Clear the priority queue in each requests + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + std::vector, double>> + _prealloc_vector; + _prealloc_vector.reserve(BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); + request.token_tree_nodes_acc_prob_pair_pq = std::priority_queue< + std::pair, double>, + std::vector, double>>, + SharedTokenTreeNodePtrDoubleLess>(SharedTokenTreeNodePtrDoubleLess(), + std::move(_prealloc_vector)); + } +} + +void RequestManager::add_tokens_toward_goodput(int budget) { + // This is a helper data structure to store help the pruning of the token + // trees across different requests. 
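// --- Editorial sketch (not part of the patch) --------------------------------
// add_tokens_toward_memory_occupancy() above and add_tokens_toward_goodput()
// below follow the same k-way-merge pattern: seed a global heap with the best
// candidate from every available request, repeatedly pop the global best, mark
// it as included, and refill the heap from the owning request until the budget
// is exhausted. A compact stand-alone illustration over plain scores (the real
// code carries shared_ptr tree nodes and request guids); the helper name
// `spend_budget_greedily` is hypothetical.
#include <cstddef>
#include <queue>
#include <utility>
#include <vector>

static int spend_budget_greedily(
    std::vector<std::priority_queue<double>> &per_request, int budget) {
  // Global max-heap of (score, owning request index).
  std::priority_queue<std::pair<double, std::size_t>> global_pq;
  for (std::size_t r = 0; r < per_request.size(); ++r) {
    if (!per_request[r].empty()) {
      global_pq.push({per_request[r].top(), r});
      per_request[r].pop();
    }
  }
  int taken = 0;
  while (budget > 0 && !global_pq.empty()) {
    std::size_t const r = global_pq.top().second;
    global_pq.pop();
    ++taken; // in the real code: node_ptr->included = true;
    --budget;
    if (!per_request[r].empty()) { // refill from the same request
      global_pq.push({per_request[r].top(), r});
      per_request[r].pop();
    }
  }
  return taken;
}
// --- End editorial sketch -----------------------------------------------------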
+ std::vector, double, RequestGuid>> + global_token_tree_node_vector; + global_token_tree_node_vector.reserve(get_max_requests_per_batch()); + std::priority_queue< + std::tuple, double, RequestGuid>, + std::vector< + std::tuple, double, RequestGuid>>, + SharedTokenTreeNodePtrDoubleRequestGuidLess> + global_token_tree_node_pq(SharedTokenTreeNodePtrDoubleRequestGuidLess(), + std::move(global_token_tree_node_vector)); + + // Initialie the priority queue with the top element in each request's token + // tree + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + if (request.token_tree_nodes_acc_prob_pair_pq.empty()) { + continue; + } + if (!request.token_tree_nodes_acc_prob_pair_pq.empty()) { + global_token_tree_node_pq.push( + {request.token_tree_nodes_acc_prob_pair_pq.top().first, + request.token_tree_nodes_acc_prob_pair_pq.top().second, + guid}); + request.token_tree_nodes_acc_prob_pair_pq.pop(); + } + } + + // Perform dequeue and enqueue until the budget is used up + while (budget > 0 and !global_token_tree_node_pq.empty()) { + auto [node_ptr, acc_log_prob, guid] = global_token_tree_node_pq.top(); + global_token_tree_node_pq.pop(); + node_ptr->included = true; + if (!get_request_with_guid(guid) + .token_tree_nodes_acc_prob_pair_pq.empty()) { + global_token_tree_node_pq.push( + {get_request_with_guid(guid) + .token_tree_nodes_acc_prob_pair_pq.top() + .first, + get_request_with_guid(guid) + .token_tree_nodes_acc_prob_pair_pq.top() + .second, + guid}); + get_request_with_guid(guid).token_tree_nodes_acc_prob_pair_pq.pop(); + } + budget--; + } + + // Clear the priority queue in each requests + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + std::vector, double>> + _prealloc_vector; + _prealloc_vector.reserve(BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); + request.token_tree_nodes_acc_prob_pair_pq = std::priority_queue< + std::pair, double>, + std::vector, double>>, + SharedTokenTreeNodePtrDoubleLess>(SharedTokenTreeNodePtrDoubleLess(), + std::move(_prealloc_vector)); + } +} + +void RequestManager::add_tokens_toward_goodput_per_request(int budget, + int request_index) { + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + if (request.token_tree_nodes_acc_prob_pair_pq.empty()) { + return; + } + + auto &pq = request.token_tree_nodes_acc_prob_pair_pq; + + // Perform dequeue and enqueue until the budget is used up + while (budget > 0 and !pq.empty()) { + auto [node_ptr, acc_log_prob] = pq.top(); + pq.pop(); + node_ptr->included = true; + budget--; + } + + // Clear the priority queue in the request + std::vector, double>> + _prealloc_vector; + _prealloc_vector.reserve(BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); + request.token_tree_nodes_acc_prob_pair_pq = std::priority_queue< + std::pair, double>, + std::vector, double>>, + SharedTokenTreeNodePtrDoubleLess>(SharedTokenTreeNodePtrDoubleLess(), + std::move(_prealloc_vector)); +} + +std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree) { + os << "Token tree: " << std::endl; 
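// --- Editorial sketch (not part of the patch) --------------------------------
// The "clear" step at the end of the functions above does not pop elements one
// by one; it rebuilds the per-request priority queue from a freshly reserved
// vector via the std::priority_queue(const Compare&, Container&&) constructor,
// so the next scheduling round starts with preallocated storage. Minimal
// illustration; `make_reset_pq` is a hypothetical helper name.
#include <cstddef>
#include <functional>
#include <queue>
#include <vector>

static std::priority_queue<double, std::vector<double>, std::less<double>>
    make_reset_pq(std::size_t capacity) {
  std::vector<double> storage;
  storage.reserve(capacity); // e.g. BatchConfig::MAX_SPEC_TREE_TOKEN_NUM
  return std::priority_queue<double, std::vector<double>, std::less<double>>(
      std::less<double>(), std::move(storage));
}
// --- End editorial sketch -----------------------------------------------------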
+ int layer_idx = 0; + for (auto const &layer : token_tree.tree_layers) { + os << "Layer: " << layer_idx << std::endl; + int token_pos = 0; + for (auto const &node : layer) { + os << std::fixed << std::setprecision(12); + os << "token pos: " << token_pos << "\ttoken id: " << node->id + << "\tparent pos: " << node->parent_pos + << "\tlog prob: " << node->log_accumulated_prob + << (node->included ? " included" : " not included") << std::endl; + token_pos++; + } + layer_idx++; + } + return os; +} + +/* --------- Request Token Tree Related Functions --------- */ + +/* --------- Profiling Related Functions --------- */ +void RequestManager::reset_profiling_statistics() { + profiling.llm_step_times.clear(); + profiling.requests_per_step.clear(); + profiling.ssm_step_times.clear(); + profiling.ssm_steps.clear(); + profiling.generated_tokens_per_step.clear(); + profiling.llm_step_start = 0; + profiling.ssm_step_start = 0; + profiling.server_start_time = Realm::Clock::current_time_in_microseconds(); +} +/* --------- Profiling Related Functions --------- */ }; // namespace FlexFlow diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index fadbf80d6..e3e5a5d5f 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/request_manager.h" +#include "flexflow/ffconst.h" #include "flexflow/utils/hip_helper.h" #include @@ -35,9 +36,14 @@ void RequestManager::load_tokens_task( // Extreme long prompts are not supported, only load up to // max_tokens_per_batch as prompt - if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch()) { + int max_tokens_per_batch = + std::max(batch_config->get_mode() == TREE_SEARCH_MODE + ? BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()); + if (batch_config->num_tokens > max_tokens_per_batch) { printf("Warning: too many tokens in prompt, only load up to %d tokens\n", - BatchConfig::max_tokens_per_batch()); + max_tokens_per_batch); printf("Got: %d tokens\n", batch_config->num_tokens); } @@ -90,53 +96,30 @@ void RequestManager::load_batch_config_task( total_copy_size += sizeof(BatchConfig::requestsInfo); // load speculative metadata - if (batch_config->get_mode() == BEAM_SEARCH_MODE) { - BeamSearchBatchConfig const *beam_batch_config = - static_cast(batch_config); - - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, - &(beam_batch_config->beamTokenInfo), - sizeof(BeamSearchBatchConfig::beamTokenInfo), - hipMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - + if (batch_config->get_mode() == TREE_SEARCH_MODE) { checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamRequestsInfo), - sizeof(BeamSearchBatchConfig::beamRequestsInfo), - hipMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, - &(beam_batch_config->causalMask), + &(batch_config->causalMask), sizeof(BatchConfig::causalMask), hipMemcpyHostToDevice, stream)); total_copy_size += sizeof(BatchConfig::causalMask); } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { - TreeVerifyBatchConfig const *tree_batch_config = - static_cast(batch_config); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + total_copy_size, - 
&(tree_batch_config->causalMask), + &(batch_config->causalMask), sizeof(BatchConfig::causalMask), hipMemcpyHostToDevice, stream)); total_copy_size += sizeof(BatchConfig::causalMask); checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->committed_tokens), - sizeof(TreeVerifyBatchConfig::committed_tokens), + &(batch_config->committed_tokens), + sizeof(BatchConfig::committed_tokens), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); + total_copy_size += sizeof(BatchConfig::committed_tokens); } // add a size check @@ -160,7 +143,7 @@ void RequestManager::load_positions_task( int dram_copy[BatchConfig::MAX_NUM_TOKENS]; for (int i = 0; i < batch_config->num_tokens; i++) { - dram_copy[i] = batch_config->tokensInfo[i].abs_depth_in_request + offset; + dram_copy[i] = batch_config->tokensInfo[i].abs_index_in_request + offset; } hipStream_t stream; checkCUDA(get_legion_stream(&stream)); diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 8380d6be7..be09ee7b2 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -13,13 +13,25 @@ * limitations under the License. */ +#include "flashinfer/decode_attention_decl.cuh" +#include "flashinfer/prefill_attention_decl.cuh" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/request_manager.h" #include "flexflow/utils/cuda_helper.h" +#include namespace FlexFlow { using namespace Legion; +using flashinfer::BatchDecodeHandler; +using flashinfer::BatchPrefillHandler; +using flashinfer::LogitsPostHook; +using flashinfer::paged_kv_t; +using flashinfer::PageStorage; +using flashinfer::PosEncodingMode; +using flashinfer::QKVLayout; + void RequestManager::load_tokens_task( Task const *task, std::vector const ®ions, @@ -35,36 +47,295 @@ void RequestManager::load_tokens_task( // Extreme long prompts are not supported, only load up to // BatchConfig::max_tokens_per_batch() as prompt - if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch() && - batch_config->get_mode() == INC_DECODING_MODE) { + int max_tokens_per_batch = + std::max(batch_config->get_mode() == TREE_SEARCH_MODE + ? BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()); + if (batch_config->num_tokens > max_tokens_per_batch) { printf("Warning: too many tokens in prompt, only load up to %d tokens\n", - BatchConfig::max_tokens_per_batch()); - printf("Got: %d tokens\n", batch_config->num_tokens); - } else if (batch_config->num_tokens > - BatchConfig::max_verify_tokens_per_batch()) { - printf("Warning: Speculative decoding. 
too many tokens in prompt, only " - "load up to %d tokens\n", - BatchConfig::max_verify_tokens_per_batch()); + max_tokens_per_batch); printf("Got: %d tokens\n", batch_config->num_tokens); } - for (int i = 0; i < batch_config->num_tokens; i++) { - dram_copy[i] = batch_config->tokensInfo[i].token_id; + if (batch_config->num_tokens > 0) { + for (int i = 0; i < batch_config->num_tokens; i++) { + dram_copy[i] = batch_config->tokensInfo[i].token_id; + } + TokenId *fb_ptr = helperGetTensorPointerWO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + Domain domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + assert(batch_config->num_tokens <= domain.get_volume()); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDA(cudaMemcpyAsync(fb_ptr, + dram_copy, + sizeof(TokenId) * batch_config->num_tokens, + cudaMemcpyHostToDevice, + stream)); } - TokenId *fb_ptr = helperGetTensorPointerWO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - Domain domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - assert(batch_config->num_tokens <= domain.get_volume()); - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkCUDA(cudaMemcpyAsync(fb_ptr, - dram_copy, - sizeof(TokenId) * batch_config->num_tokens, +} + +void prepare_inference_params_kernel_h(BatchConfig const *batch_config, + PageManager *pm, + AttentionMetaData *attention_metadata, + cudaStream_t stream, + uint32_t const max_num_pages, + int32_t *q_indptr_h, + int32_t *kv_indptr_h, + int32_t *kv_indices_h, + int32_t *kv_last_page_len_h, + int32_t *qk_indptr_h) { + int batch_size = batch_config->num_active_requests(); + // we just search for the page number for each request + q_indptr_h[0] = 0; + kv_indptr_h[0] = 0; + qk_indptr_h[0] = 0; + int q_lens = 0, qk_lens = 0; + int indices_offset = 0, indices_lens = 0; + for (int req_idx = 0, indptr_idx = 0; + req_idx < batch_config->max_requests_per_batch(); + req_idx++) { + if (batch_config->request_available[req_idx]) { + int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch; + int kv_len = + batch_config->requestsInfo[req_idx].num_tokens_in_batch + + batch_config->requestsInfo[req_idx].first_token_index_in_request; + + q_lens += q_len; + qk_lens += (q_len * kv_len + 7) / 8; + indices_offset = indices_lens; + indices_lens += (kv_len + kPagesize - 1) / kPagesize; + q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len; + kv_indptr_h[indptr_idx + 1] = + round_up_pages(kv_len) + kv_indptr_h[indptr_idx]; + std::vector kv_indices = pm->get_block_table_indices( + batch_config->requestsInfo[req_idx].request_guid); + for (int i = indices_offset; i < indices_lens; i++) { + kv_indices_h[i] = kv_indices[i - indices_offset]; + } + kv_last_page_len_h[indptr_idx] = (kv_len - 1) % kPagesize + 1; + qk_indptr_h[indptr_idx + 1] = qk_lens; + indptr_idx++; + } + } + + // do the copy + checkCUDA(cudaMemcpyAsync(attention_metadata->q_indptr, + q_indptr_h, + sizeof(int32_t) * (batch_size + 1), + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(attention_metadata->kv_indptr, + kv_indptr_h, + sizeof(int32_t) * (batch_size + 1), + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(attention_metadata->kv_indices, + kv_indices_h, + sizeof(int32_t) * batch_size * max_num_pages, + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(attention_metadata->kv_last_page_len, + kv_last_page_len_h, + sizeof(int32_t) * batch_size, + 
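// --- Editorial worked example (not part of the patch) -------------------------
// How the host-side index arrays computed above fit together, for a
// hypothetical batch of two available requests and an illustrative page size
// of 64 (the actual kPagesize is whatever the build defines):
//   request A: num_tokens_in_batch = 4, first_token_index = 60  -> kv_len = 64
//   request B: num_tokens_in_batch = 1, first_token_index = 129 -> kv_len = 130
//   q_indptr         = [0, 4, 5]    (prefix sums of q_len)
//   kv_indptr        = [0, 1, 4]    (prefix sums of ceil(kv_len / 64))
//   kv_indices       = 1 page id for A followed by 3 page ids for B, taken
//                      from PageManager::get_block_table_indices()
//   kv_last_page_len = [64, 2]      ((kv_len - 1) % 64 + 1)
//   qk_indptr        = [0, 32, 49]  (prefix sums of ceil(q_len * kv_len / 8),
//                                    i.e. bytes of the bit-packed custom mask)
// ------------------------------------------------------------------------------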
cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(attention_metadata->qk_indptr, + qk_indptr_h, + sizeof(int32_t) * (batch_size + 1), cudaMemcpyHostToDevice, stream)); } +// q_indptr: the start offset of q in the batch for each request, +// the length is `num_requests + 1`: [0, num_q_0, num_q_0 + num_q_1, +// ..., num_q_0 + num_q_1 + ... + num_q_{num_requests - 1}] +// kv_indptr: the start offset of kv page_indices for each request, +// the length is `num_requests + 1`. +// kv_indices: the page indices for kv, the length is `num_kv_pages`. +// kv_last_page_len: the cache length in the last page for each request, +// the length is `num_requests`. +// qk_indptr: the start offset of custom_mask in the flattened mask for each +// request, the length is `num_requests + 1`. It can be calculated as +// accumulative `ceil(qk_len / 8)`. +__global__ void + prepare_inference_params_kernel(int const num_requests, + BatchConfig::PerRequestInfo *request_infos, + bool *request_available, + uint32_t const max_num_pages, + int32_t *q_indptr, + int32_t *kv_indptr, + int32_t *kv_indices, + int32_t *kv_last_page_len, + int32_t *qk_indptr) { + int const request_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (request_idx >= num_requests) { + return; + } + + // request id in batch config + int requext_idx_in_batch = -1; + int cnt_1 = 0, q_lens = 0, qk_lens = 0; + int indices_offset = 0, indices_lens = 0, kv_len = 0; + while (cnt_1 < request_idx + 1) { + requext_idx_in_batch++; + if (request_available[requext_idx_in_batch]) { + cnt_1++; + int q_len = request_infos[requext_idx_in_batch].num_tokens_in_batch; + q_lens += q_len; + kv_len = request_infos[requext_idx_in_batch].num_tokens_in_batch + + request_infos[requext_idx_in_batch].first_token_index_in_request; + qk_lens += (q_len * kv_len + 7) / 8; + indices_offset = indices_lens; + indices_lens += (kv_len + kPagesize - 1) / kPagesize; + } + } + + if (request_idx == 0) { + q_indptr[0] = 0; + kv_indptr[0] = 0; + qk_indptr[0] = 0; + } + __syncthreads(); + q_indptr[request_idx + 1] = q_lens; + kv_indptr[request_idx + 1] = indices_lens; + for (int i = indices_offset; i < indices_lens; i++) { + kv_indices[i] = max_num_pages * requext_idx_in_batch + (i - indices_offset); + } + kv_last_page_len[request_idx] = (kv_len - 1) % kPagesize + 1; + qk_indptr[request_idx + 1] = qk_lens; +} + +#define test_bit_orig(bit_mask, idx, pos) \ + (((bit_mask)[idx].bits[(pos) / 64] & (1ULL << ((pos) % 64))) != 0) + +// cache = (global-sink) % window + sink +#define cache_2_global_index(cache_info, cache_index) \ + do { \ + if (cache_index >= (cache_info).sink_cache_size) { \ + cache_index -= (cache_info).sink_cache_size; \ + int num_window = \ + ((cache_info).total_len - (cache_info).sink_cache_size) / \ + (cache_info).window_cache_size - \ + ((cache_info).window_back <= cache_index); \ + cache_index += (cache_info).sink_cache_size + \ + num_window * (cache_info).window_cache_size; \ + } \ + } while (0) + +__global__ void + update_custom_mask_kernel(uint8_t *custom_mask, + int32_t const *qk_indptr, + BatchConfig::BitMask *causalMask, + BatchConfig::PerRequestInfo *request_infos, + bool *request_available, + uint32_t const num_requests, + StreamingCacheInfo *streaming_cache_infos, + bool streaming_cache) { + int byte_idx = blockIdx.x * blockDim.x + threadIdx.x; + int request_idx = 0; + while (request_idx < num_requests) { + if (qk_indptr[request_idx + 1] > byte_idx) { + break; + } + request_idx++; + } + + if (request_idx >= num_requests) { + return; + } + byte_idx -= 
qk_indptr[request_idx]; + + // request id in batch config + int requext_idx_in_batch = -1, cnt_1 = 0; + while (cnt_1 < request_idx + 1) { + requext_idx_in_batch++; + if (request_available[requext_idx_in_batch]) { + cnt_1++; + } + } + + BatchConfig::PerRequestInfo const &request_info = + request_infos[requext_idx_in_batch]; + BatchConfig::BitMask &causal_mask = causalMask[requext_idx_in_batch]; + + int const q_length = request_info.num_tokens_in_batch, + q_start = request_info.first_token_index_in_request - + causal_mask.non_tree_cache_size, + non_tree_cache_size = causal_mask.non_tree_cache_size, + kv_len = request_info.num_tokens_in_batch + + request_info.first_token_index_in_request; + + uint8_t packed_bits = 0; + for (int bit_idx = 0; bit_idx < 8; bit_idx++) { + int const bit_offset = byte_idx * 8 + bit_idx, q_idx = bit_offset / kv_len; + int kv_idx = bit_offset % kv_len; + if (streaming_cache) { // recover to the original index + if (kv_idx < streaming_cache_infos[requext_idx_in_batch].commit_len) { + cache_2_global_index(streaming_cache_infos[requext_idx_in_batch], + kv_idx); + } else { + kv_idx += streaming_cache_infos[requext_idx_in_batch].total_len - + streaming_cache_infos[requext_idx_in_batch].commit_len; + } + } + if (kv_idx < non_tree_cache_size || q_idx >= q_length) { + packed_bits |= 1 << bit_idx; + } else { + if (test_bit_orig(causal_mask.bit_mask, + q_start + q_idx, + kv_idx - non_tree_cache_size)) { + packed_bits |= 1 << bit_idx; + } + } + } + custom_mask[qk_indptr[request_idx] + byte_idx] = packed_bits; +} + +// Passing the CPU-side causalMask, then output the bit-packed custom_mask for +// attention forward. +// Layout of causalMask: [num_requests][tree_size][tree_size] +// Layout of custom_mask: [num_requests][q_length][kv_length] (bit-packed) +// Note that for spec-decoding, q_length == last_layer_length != tree_size +// Also we should consider the influence of StreamingCache +void update_custom_mask(BatchConfig const *batch_config, + AttentionMetaData *metadata, + BatchConfig::BitMask *causalMask, + BatchConfig::PerRequestInfo *request_infos, + bool *request_available, + int batch_size, + StreamingCacheInfo *streaming_cache_infos, + cudaStream_t stream) { + InferenceMode mode = batch_config->get_mode(); + assert(mode == TREE_SEARCH_MODE || mode == TREE_VERIFY_MODE); + bool streaming_cache = + mode == TREE_SEARCH_MODE && batch_config->streaming_cache(); + int parallelism = 0; + for (int req_idx = 0; req_idx < batch_config->max_requests_per_batch(); + req_idx++) { + if (batch_config->request_available[req_idx]) { + int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch; + int kv_len = + batch_config->requestsInfo[req_idx].num_tokens_in_batch + + batch_config->requestsInfo[req_idx].first_token_index_in_request; + parallelism += (q_len * kv_len + 7) / 8; + } + } + update_custom_mask_kernel<<>>(metadata->custom_mask, + metadata->qk_indptr, + causalMask, + request_infos, + request_available, + batch_size, + streaming_cache_infos, + streaming_cache); +} + void RequestManager::load_batch_config_task( Task const *task, std::vector const ®ions, @@ -81,86 +352,406 @@ void RequestManager::load_batch_config_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); size_t total_copy_size = 0; - checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata, - &(batch_config->tokensInfo), - sizeof(BatchConfig::tokensInfo), - cudaMemcpyHostToDevice, - stream)); + if (batch_config->num_tokens > 0) { + // The tokensInfo is compact + 
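// --- Editorial sketch (not part of the patch) --------------------------------
// update_custom_mask_kernel above flattens each request's
// [q_length x kv_length] visibility matrix row-major and packs 8 entries per
// byte (least-significant bit first), starting at byte qk_indptr[request] of
// the shared custom_mask buffer. A host-side helper that reads one entry back
// out of that layout; `custom_mask_bit` is hypothetical and for illustration
// only.
#include <cstdint>

static bool custom_mask_bit(uint8_t const *custom_mask,
                            int32_t const *qk_indptr,
                            int request_idx,
                            int q_idx,
                            int kv_idx,
                            int kv_len) {
  int const bit_offset = q_idx * kv_len + kv_idx;            // row-major flatten
  uint8_t const byte =
      custom_mask[qk_indptr[request_idx] + bit_offset / 8];  // per-request base
  return (byte >> (bit_offset % 8)) & 1;                     // LSB-first packing
}
// --- End editorial sketch -----------------------------------------------------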
checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata, + &(batch_config->tokensInfo), + batch_config->num_tokens * + sizeof(BatchConfig::PerTokenInfo), + cudaMemcpyHostToDevice, + stream)); + } total_copy_size += sizeof(BatchConfig::tokensInfo); + for (int request_idx = 0; request_idx < BatchConfig::max_requests_per_batch(); + request_idx++) { + if (batch_config->request_available[request_idx]) { + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size + + request_idx * sizeof(BatchConfig::PerRequestInfo), + &(batch_config->requestsInfo[request_idx]), + sizeof(BatchConfig::PerRequestInfo), + cudaMemcpyHostToDevice, + stream)); + } + } + total_copy_size += sizeof(BatchConfig::requestsInfo); + checkCUDA(cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->requestsInfo), - sizeof(BatchConfig::requestsInfo), + &(batch_config->request_available), + sizeof(BatchConfig::request_available), cudaMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::requestsInfo); + total_copy_size += sizeof(BatchConfig::request_available); + + for (int request_idx = 0; request_idx < BatchConfig::max_requests_per_batch(); + request_idx++) { + if (batch_config->request_available[request_idx]) { + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size + + request_idx * sizeof(BatchConfig::BitMask), + &(batch_config->causalMask[request_idx]), + sizeof(BatchConfig::BitMask), + cudaMemcpyHostToDevice, + stream)); + } + } + total_copy_size += sizeof(BatchConfig::causalMask); - // load speculative metadata - if (batch_config->get_mode() == BEAM_SEARCH_MODE) { - BeamSearchBatchConfig const *beam_batch_config = - static_cast(batch_config); + checkCUDA(cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(batch_config->streamingCacheInfo), + sizeof(BatchConfig::streamingCacheInfo), + cudaMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::streamingCacheInfo); + if (batch_config->num_tokens_to_commit > 0) { checkCUDA(cudaMemcpyAsync( static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamTokenInfo), - sizeof(BeamSearchBatchConfig::beamTokenInfo), + &(batch_config->committed_tokens), + batch_config->num_tokens_to_commit * + sizeof(BatchConfig::CommittedTokensInfo), cudaMemcpyHostToDevice, stream)); + } + total_copy_size += sizeof(BatchConfig::committed_tokens); - total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); + checkCUDA(cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(batch_config->num_tokens_to_commit), + sizeof(int), + cudaMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(int); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamRequestsInfo), - sizeof(BeamSearchBatchConfig::beamRequestsInfo), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); + // load attention metadata + if (batch_config->get_mode() == INC_DECODING_MODE) { + PageManager *pm = PageManager::get_page_manager(); + static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1], + kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1]; + static int32_t kv_indices_h[BatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_NUM_TOKENS]; + static int32_t qk_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1]; + static int32_t kv_last_page_len_h[BatchConfig::MAX_NUM_REQUESTS]; + if 
(handle.incr_attention_metadata->enabled()) { + // calculate the attention meta data + { + int batch_size = batch_config->num_active_requests(); + uint32_t const max_num_pages = + round_up_pages(BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BatchConfig::causalMask); + prepare_inference_params_kernel_h(batch_config, + pm, + handle.incr_attention_metadata, + stream, + max_num_pages, + q_indptr_h, + kv_indptr_h, + kv_indices_h, + kv_last_page_len_h, + qk_indptr_h); + } - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->request_completed), - sizeof(BatchConfig::request_completed), - cudaMemcpyHostToDevice, - stream)); + // prepare attention forward handler + { + int batch_size = batch_config->num_active_requests(); + if (!batch_config->prompt_phase) { + BatchDecodeHandler *handler = nullptr; + if (handle.incr_attention_metadata->decode_handler_collections.count( + batch_size) == 0) { + handle.incr_attention_metadata + ->decode_handler_collections[batch_size] = static_cast( + new flashinfer::BatchDecodeHandler(true, batch_size)); + } + handler = static_cast( + handle.incr_attention_metadata + ->decode_handler_collections[batch_size]); + + handler->SetCUDAStream(stream); + DISPATCH_HEADDIM( + handle.incr_attention_metadata->head_dim(), HEAD_DIM, { + handler->BeginForwardDispatched( + static_cast( + handle.incr_attention_metadata->float_workspace), + handle.incr_attention_metadata->float_workspace_size, + static_cast( + handle.incr_attention_metadata->int_workspace), + handle.incr_attention_metadata->int_workspace_size, + static_cast(kv_indptr_h), + static_cast(kv_last_page_len_h), + batch_size, + handle.incr_attention_metadata->num_q_heads(), + handle.incr_attention_metadata->num_kv_heads(), + kPagesize); + }); + } else { + BatchPrefillHandler *handler = nullptr; + if (handle.incr_attention_metadata->prompt_handler_collections.count( + batch_size) == 0) { + handle.incr_attention_metadata + ->prompt_handler_collections[batch_size] = + static_cast(new flashinfer::BatchPrefillHandler(true)); + } + handler = static_cast( + handle.incr_attention_metadata + ->prompt_handler_collections[batch_size]); + + handler->SetCUDAStream(stream); + handler->BeginForward( + static_cast( + handle.incr_attention_metadata->float_workspace), + handle.incr_attention_metadata->float_workspace_size, + static_cast( + handle.incr_attention_metadata->int_workspace), + handle.incr_attention_metadata->int_workspace_size, + static_cast(q_indptr_h), + static_cast(kv_indptr_h), + batch_size, + handle.incr_attention_metadata->num_q_heads(), + handle.incr_attention_metadata->num_kv_heads(), + handle.incr_attention_metadata->head_dim(), + kPagesize); + } + } + } + } else if (batch_config->get_mode() == TREE_SEARCH_MODE) { + if (handle.tree_search_attention_metadata->enabled()) { + // calculate the attention meta data + { + BatchConfig::PerRequestInfo *request_infos = + reinterpret_cast( + static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo)); + bool *request_available = reinterpret_cast( + static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo)); + BatchConfig::BitMask *causalMask = + reinterpret_cast( + 
static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::request_available)); + StreamingCacheInfo *streaming_cache_infos = + reinterpret_cast( + static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::request_available) + + sizeof(BatchConfig::causalMask)); + int batch_size = batch_config->num_active_requests(); + uint32_t const max_num_pages = + round_up_pages(BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); + + int parallelism = batch_size; + prepare_inference_params_kernel<<>>( + batch_size, + request_infos, + request_available, + max_num_pages, + handle.tree_search_attention_metadata->q_indptr, + handle.tree_search_attention_metadata->kv_indptr, + handle.tree_search_attention_metadata->kv_indices, + handle.tree_search_attention_metadata->kv_last_page_len, + handle.tree_search_attention_metadata->qk_indptr); - total_copy_size += sizeof(BatchConfig::request_completed); + // Update gpu-side custom mask referring from CaualMask + if (!batch_config->prompt_phase) { + update_custom_mask(batch_config, + handle.tree_search_attention_metadata, + causalMask, + request_infos, + request_available, + batch_size, + streaming_cache_infos, + stream); + } + } + + // prepare attention forward handler + { + int batch_size = batch_config->num_active_requests(); + BatchPrefillHandler *handler = nullptr; + + if (!batch_config->prompt_phase) { + if (handle.tree_search_attention_metadata->decode_handler_collections + .count(batch_size) == 0) { + handle.tree_search_attention_metadata + ->decode_handler_collections[batch_size] = + static_cast(new flashinfer::BatchPrefillHandler(true)); + } + handler = static_cast( + handle.tree_search_attention_metadata + ->decode_handler_collections[batch_size]); + } else { + if (handle.tree_search_attention_metadata->prompt_handler_collections + .count(batch_size) == 0) { + handle.tree_search_attention_metadata + ->prompt_handler_collections[batch_size] = + static_cast(new flashinfer::BatchPrefillHandler(true)); + } + handler = static_cast( + handle.tree_search_attention_metadata + ->prompt_handler_collections[batch_size]); + } + + static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1], + kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1]; + q_indptr_h[0] = 0; + kv_indptr_h[0] = 0; + for (int req_idx = 0, indptr_idx = 0; + req_idx < batch_config->max_requests_per_batch(); + req_idx++) { + if (batch_config->request_available[req_idx]) { + int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch; + int kv_len = + batch_config->requestsInfo[req_idx].num_tokens_in_batch + + batch_config->requestsInfo[req_idx] + .first_token_index_in_request; + q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len; + kv_indptr_h[indptr_idx + 1] = + kv_indptr_h[indptr_idx] + round_up_pages(kv_len); + indptr_idx++; + } + } + + handler->SetCUDAStream(stream); + handler->BeginForward( + static_cast( + handle.tree_search_attention_metadata->float_workspace), + handle.tree_search_attention_metadata->float_workspace_size, + static_cast( + handle.tree_search_attention_metadata->int_workspace), + handle.tree_search_attention_metadata->int_workspace_size, + static_cast(q_indptr_h), + static_cast(kv_indptr_h), + batch_size, + handle.tree_search_attention_metadata->num_q_heads(), + handle.tree_search_attention_metadata->num_kv_heads(), + handle.tree_search_attention_metadata->head_dim(), + 
kPagesize); + } + } } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { - TreeVerifyBatchConfig const *tree_batch_config = - static_cast(batch_config); + PageManager *pm = PageManager::get_page_manager(); + static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1], + kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1]; + static int32_t kv_indices_h[BatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_NUM_TOKENS]; + static int32_t qk_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1]; + static int32_t kv_last_page_len_h[BatchConfig::MAX_NUM_REQUESTS]; + if (handle.tree_verify_attention_metadata->enabled()) { + // calculate the attention meta data + { + BatchConfig::PerRequestInfo *request_infos = + reinterpret_cast( + static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo)); + bool *request_available = reinterpret_cast( + static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo)); + BatchConfig::BitMask *causalMask = + reinterpret_cast( + static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::request_available)); + StreamingCacheInfo *streaming_cache_infos = + reinterpret_cast( + static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::request_available) + + sizeof(BatchConfig::causalMask)); + int batch_size = batch_config->num_active_requests(); + uint32_t const max_num_pages = + round_up_pages(BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BatchConfig::causalMask); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->committed_tokens), - sizeof(TreeVerifyBatchConfig::committed_tokens), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); + // int parallelism = batch_size; + prepare_inference_params_kernel_h(batch_config, + pm, + handle.tree_verify_attention_metadata, + stream, + max_num_pages, + q_indptr_h, + kv_indptr_h, + kv_indices_h, + kv_last_page_len_h, + qk_indptr_h); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->request_completed), - sizeof(BatchConfig::request_completed), - cudaMemcpyHostToDevice, - stream)); + // Update gpu-side custom mask referring from CaualMask + if (!batch_config->prompt_phase) { + update_custom_mask(batch_config, + handle.tree_verify_attention_metadata, + causalMask, + request_infos, + request_available, + batch_size, + streaming_cache_infos, + stream); + } + } + + // prepare attention forward handler + { + int batch_size = batch_config->num_active_requests(); + BatchPrefillHandler *handler = nullptr; + + if (!batch_config->prompt_phase) { + if (handle.tree_verify_attention_metadata->decode_handler_collections + .count(batch_size) == 0) { + handle.tree_verify_attention_metadata + ->decode_handler_collections[batch_size] = + static_cast(new flashinfer::BatchPrefillHandler(true)); + } + handler = static_cast( + handle.tree_verify_attention_metadata + ->decode_handler_collections[batch_size]); + } else { + if (handle.tree_verify_attention_metadata->prompt_handler_collections + 
.count(batch_size) == 0) { + handle.tree_verify_attention_metadata + ->prompt_handler_collections[batch_size] = + static_cast(new flashinfer::BatchPrefillHandler(true)); + } + handler = static_cast( + handle.tree_verify_attention_metadata + ->prompt_handler_collections[batch_size]); + } - total_copy_size += sizeof(BatchConfig::request_completed); + handler->SetCUDAStream(stream); + handler->BeginForward( + static_cast( + handle.tree_verify_attention_metadata->float_workspace), + handle.tree_verify_attention_metadata->float_workspace_size, + static_cast( + handle.tree_verify_attention_metadata->int_workspace), + handle.tree_verify_attention_metadata->int_workspace_size, + static_cast(q_indptr_h), + static_cast(kv_indptr_h), + batch_size, + handle.tree_verify_attention_metadata->num_q_heads(), + handle.tree_verify_attention_metadata->num_kv_heads(), + handle.tree_verify_attention_metadata->head_dim(), + kPagesize); + } + } } // add a size check diff --git a/src/runtime/simulator.cc b/src/runtime/simulator.cc index d94337641..b71af0d47 100644 --- a/src/runtime/simulator.cc +++ b/src/runtime/simulator.cc @@ -31,10 +31,10 @@ namespace FlexFlow { using namespace Legion; -LegionRuntime::Logger::Category log_sim("sim"); -LegionRuntime::Logger::Category log_ps_sim("ps_sim"); -LegionRuntime::Logger::Category log_xfer_sim("xfer_sim"); -LegionRuntime::Logger::Category log_xfer_est("xfer_est"); +Legion::Logger log_sim("sim"); +Legion::Logger log_ps_sim("ps_sim"); +Legion::Logger log_xfer_sim("xfer_sim"); +Legion::Logger log_xfer_est("xfer_est"); // template class std::map; // for debugging in gdb // template class std::map; // for debugging in gdb diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index c0804d6e1..176133c49 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -54,8 +54,8 @@ namespace FlexFlow::PCG { using namespace Legion; -LegionRuntime::Logger::Category log_xfers("xfers"); -LegionRuntime::Logger::Category log_xfer_matches("xfer_matches"); +Legion::Logger log_xfers("xfers"); +Legion::Logger log_xfer_matches("xfer_matches"); const TensorX TensorX::NO_TX = TensorX(); @@ -3826,9 +3826,13 @@ bool FFModel::convert_graph_to_operators( case OP_SIGMOID_SILU_MULTI: { assert(inList.size() == 2); SigmoidSiluMulti *ssm = (SigmoidSiluMulti *)node.ptr; - SigmoidSiluMultiParams params = ssm->get_params(); - new_op = new SigmoidSiluMulti( - *this, ssm->layer_guid, inputs[0], inputs[1], NULL); + new_op = new SigmoidSiluMulti(*this, + ssm->layer_guid, + inputs[0], + inputs[1], + ssm->intermediate_size, + ssm->tensor_parallelism_degree, + NULL); break; } default: { diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc deleted file mode 100644 index 841c735f5..000000000 --- a/src/runtime/tree_verify_batch_config.cc +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright 2023 CMU, Stanford, Facebook, LANL - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "flexflow/batch_config.h" -#include "flexflow/request_manager.h" -#include "legion.h" -#include -#include - -namespace FlexFlow { - -LegionRuntime::Logger::Category log_tree_bc("TreeVerifyBatchConfig"); - -TreeVerifyBatchConfig::TreeVerifyBatchConfig() : BatchConfig() {} - -TreeVerifyBatchConfig::~TreeVerifyBatchConfig() {} - -InferenceMode TreeVerifyBatchConfig::get_mode() const { - return TREE_VERIFY_MODE; -} - -std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { - os << "@@@@@@@@@@@@@@ TreeVerifyBatchConfig (mode " << bc.get_mode() - << ") @@@@@@@@@@@@@@" << std::endl; - // Max values - os << "Max number of requests: " << bc.max_requests_per_batch() << std::endl; - os << "Max number of tokens: " << bc.max_tokens_per_batch() << std::endl; - os << "Max sequence length: " << bc.max_sequence_length() << std::endl; - // Current values - os << "Number of tokens: " << bc.num_active_tokens() << std::endl; - os << "Number of requests: " << bc.num_active_requests() << std::endl; - os << "Number of tokens to commit: " << bc.num_tokens_to_commit << std::endl; - - os << "Per-request info:\n"; - for (int i = 0; i < bc.max_requests_per_batch(); i++) { - if (!bc.request_completed[i]) { - os << " Request " << i << ":\n"; - os << " First token depth in request: " - << bc.requestsInfo[i].first_token_depth_in_request << std::endl; - os << " First token offset in batch: " - << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; - os << " Number of tokens in batch: " - << bc.requestsInfo[i].num_tokens_in_batch << std::endl; - os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; - os << " Request completed: " << bc.request_completed[i] << std::endl; - os << " Request running: " << bc.request_running[i] << std::endl; - } - } - - os << "Per-token info:\n"; - for (int i = 0; i < bc.num_tokens; i++) { - os << " Token " << i << ":\n"; - os << " Absolute depth in request: " - << bc.tokensInfo[i].abs_depth_in_request << std::endl; - os << " Request index: " << bc.tokensInfo[i].request_index << std::endl; - os << " Token id: " << bc.tokensInfo[i].token_id << std::endl; - } - - os << "Tokens to commit info:\n"; - for (int i = 0; i < bc.num_tokens_to_commit; i++) { - os << " Token " << i << ":\n"; - os << " token_index: " << bc.committed_tokens[i].token_index - << std::endl; - os << " request_index: " << bc.committed_tokens[i].request_index - << std::endl; - os << " token_depth: " << bc.committed_tokens[i].token_depth - << std::endl; - } - - os << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" << std::endl; - return os; -} - -void TreeVerifyBatchConfig::print() const { - std::cout << *this << std::endl; -} - -void TreeVerifyBatchConfig::save_to_file(std::string const &filename) const { - std::ofstream outputFile(filename); - if (outputFile.is_open()) { - outputFile << *this << std::endl; - outputFile.close(); - } else { - std::cerr << "Error: Unable to open the batch config output file: " - << filename << std::endl; - assert(false); - } -} - -}; // namespace FlexFlow diff --git a/src/utils/communication_buffer.cu b/src/utils/communication_buffer.cu new file mode 100644 index 000000000..83b0385a3 --- /dev/null +++ b/src/utils/communication_buffer.cu @@ -0,0 +1,147 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/utils/communication_buffer.h" +#include "flexflow/utils/cuda_helper.h" +#include "tensorrt_llm/custom_allreduce_kernels.h" +#include +#include + +// Given a local CUDA data pointer, return the peer memory pointers group. +// For the i-th pointer, if i is the worker id of the given device, +// then the returned i-th ptr_group is the local pointer, +// or otherwise it is an peer memory pointer from the remote device. +std::vector create_peer_ptr_group(Legion::Context ctx, + Legion::Runtime *runtime, + int num_devices, + int device_id, + ncclComm_t ncclComm, + void *allgather_src, + void *allgather_dst, + void *local_ptr, + cudaStream_t stream) { + // Ensure we are on the correct device + int device = 0; + checkCUDA(cudaGetDevice(&device)); + assert(device == device_id && "Device ID does not match current device."); + + // Next we all-gather the peer memory pointers across all distributed workers. + // On each worker, we copy the peer pointers to GPU memory. And nccl AllGather + // is used to all-gather the pointers. Finally the all-gathered pointers + // on each worker are copied from GPU to CPU. + + checkCUDA(cudaMemcpyAsync(allgather_src, + &local_ptr, + sizeof(void *), + cudaMemcpyHostToDevice, + stream)); + + runtime->concurrent_task_barrier(ctx); + checkNCCL(ncclAllGather(allgather_src, + allgather_dst, + sizeof(void *), + ncclChar, + ncclComm, + stream)); + runtime->concurrent_task_barrier(ctx); + + std::vector peer_pointers(num_devices); + checkCUDA(cudaMemcpyAsync(peer_pointers.data(), + allgather_dst, + sizeof(void *) * num_devices, + cudaMemcpyDeviceToHost, + stream)); + checkCUDA(cudaStreamSynchronize(stream)); + + return peer_pointers; +} + +// Free the peer memory pointers group. +void free_peer_ptr_group(std::vector ptr_group, + int device_id, + bool free_local) { + for (int i = 0; i < static_cast(ptr_group.size()); ++i) { + if (i == device_id && free_local) { + // Free the local buffer. + checkCUDA(cudaFree(ptr_group[i])); + } + // No need to do anything for other devices. + } +} + +// Given a local CUDA data pointer, return the CommunicationBuffer of the +// pointer. The CommunicationBuffer contains the local pointer and the peer +// memory pointers group. It contains the barrier helpers for synchronization +// across distributed workers, which should also be peer-based. The +// allgather_src and allgather_dst are device buffers, which are used for +// all-gathering peer pointers across devices. The size of allgather_src should +// be sizeof(void*), and the size of allgather_dst should be sizeof(void*) * +// num_devices. 
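// --- Editorial note (not part of the patch) -----------------------------------
// Caller-side sizing of the scratch buffers used by the pointer all-gather
// described in the comment above (buffer names follow the parameters; the
// cudaMalloc calls are only an illustrative sketch):
//   void *allgather_src;  // device buffer of size sizeof(void *)
//   void *allgather_dst;  // device buffer of size sizeof(void *) * num_devices
//   checkCUDA(cudaMalloc(&allgather_src, sizeof(void *)));
//   checkCUDA(cudaMalloc(&allgather_dst, sizeof(void *) * num_devices));
// create_peer_ptr_group() stages the local pointer into allgather_src,
// ncclAllGather()s sizeof(void *) bytes from every rank into allgather_dst,
// and then copies the num_devices pointers back to a host-side vector.
// ------------------------------------------------------------------------------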
+CommunicationBuffer *create_comm_buf_with_local_ptr(Legion::Context ctx, + Legion::Runtime *runtime, + int num_devices, + int device_id, + ncclComm_t ncclComm, + void *allgather_src, + void *allgather_dst, + void *local_ptr, + void *barrier_in_ptr, + void *barrier_out_ptr, + int *barrier_flag, + cudaStream_t stream) { + assert(local_ptr != nullptr && "Local pointer is nullptr."); + CommunicationBuffer *comm_buf = new CommunicationBuffer(); + comm_buf->num_devices = num_devices; + comm_buf->device_id = device_id; + comm_buf->local_ptr = local_ptr; + comm_buf->comm_ptrs = create_peer_ptr_group(ctx, + runtime, + num_devices, + device_id, + ncclComm, + allgather_src, + allgather_dst, + local_ptr, + stream); + comm_buf->barrier_in = create_peer_ptr_group(ctx, + runtime, + num_devices, + device_id, + ncclComm, + allgather_src, + allgather_dst, + barrier_in_ptr, + stream); + comm_buf->barrier_out = create_peer_ptr_group(ctx, + runtime, + num_devices, + device_id, + ncclComm, + allgather_src, + allgather_dst, + barrier_out_ptr, + stream); + comm_buf->barrier_flag = barrier_flag; + + return comm_buf; +} + +// Release the CommunicationBuffer. +void release_comm_buf(CommunicationBuffer *comm_buf) { + free_peer_ptr_group(comm_buf->comm_ptrs, comm_buf->device_id, false); + free_peer_ptr_group(comm_buf->barrier_in, comm_buf->device_id, false); + free_peer_ptr_group(comm_buf->barrier_out, comm_buf->device_id, false); + delete comm_buf; +} diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh index 8beea5599..cceca7845 100755 --- a/tests/inference/cpp_inference_tests.sh +++ b/tests/inference/cpp_inference_tests.sh @@ -10,26 +10,26 @@ cd "${BASH_SOURCE[0]%/*}" ############################################################################################### # LLAMA -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 20000 -ll:zsize 30000 --verbose --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 # LLAMA (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 # OPT -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision 
-llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 # OPT (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json 
-output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 fi ############################################################################################### @@ -37,63 +37,63 @@ fi ############################################################################################### # LLAMA (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 ../../build/inference/incr_decoding/incr_decoding -ll:gpu 1 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 1 # LLAMA (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 # LLAMA (big model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 20000 -ll:zsize 30000 --verbose --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4 # LLAMA (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4 # OPT (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m 
-prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 # OPT (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 # OPT (big model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 # OPT (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 # Falcon (full precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # Falcon (half precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # # StarCoder (full precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision 
-llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4 # # StarCoder (half precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B_half.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B_half.txt -pipeline-parallelism-degree 4 # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (big model) - 
../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # OPT (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file 
../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # OPT (big model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 fi ############################################################################################### diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index 6857b5cbc..8fa17f153 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -87,7 +87,7 @@ def main(): # Get Tokenizer hf_config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=True) hf_arch = getattr(hf_config, "architectures")[0] - if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM" or hf_arch == "MistralForCausalLM": tokenizer = LlamaTokenizer.from_pretrained(args.model_name, use_fast=True) else: tokenizer = AutoTokenizer.from_pretrained(args.model_name) diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index 895b74c79..5fb142282 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -10,9 +10,9 @@ cleanup() { cd "${BASH_SOURCE[0]%/*}" # Enable Python tests (on by default) -PYTHON_INFERENCE_TESTS=${PYTHON_INFERENCE_TESTS:-ON} +PYTHON_INFERENCE_TESTS=${PYTHON_INFERENCE_TESTS:-OFF} # Enable C++ tests, (off by default) -CPP_INFERENCE_TESTS=${CPP_INFERENCE_TESTS:-OFF} +CPP_INFERENCE_TESTS=${CPP_INFERENCE_TESTS:-ON} # Enable model parallelism tests in C++, if desired TENSOR_PARALLELISM_TESTS=${TENSOR_PARALLELISM_TESTS:-OFF} @@ -25,9 +25,6 @@ fi # Clean up before test (just in case) cleanup -# Make sure supported version of protobuf is installed -pip3 install protobuf==3.20.3 - # Create test prompt file mkdir -p ../inference/prompt echo '["Three tips for staying healthy are: "]' > ../inference/prompt/test.json diff --git a/tests/ops/batch_matmul_test.cc b/tests/ops/batch_matmul_test.cc index 
7931f4412..f61048feb 100644 --- a/tests/ops/batch_matmul_test.cc +++ b/tests/ops/batch_matmul_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("bmm_test"); +Legion::Logger log_app("bmm_test"); struct BMMTestMeta { int m, k, n, d; diff --git a/tests/ops/concat_test.cc b/tests/ops/concat_test.cc index c67b718e0..b0489d1ad 100644 --- a/tests/ops/concat_test.cc +++ b/tests/ops/concat_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("concat_test"); +Legion::Logger log_app("concat_test"); struct ConcatTestMeta { int batch_size, i_dim, num_channels, projected_num_channels, diff --git a/tests/ops/flat_test.cc b/tests/ops/flat_test.cc index 428893a0d..61de83b6b 100644 --- a/tests/ops/flat_test.cc +++ b/tests/ops/flat_test.cc @@ -7,7 +7,7 @@ #include using namespace Legion; -LegionRuntime::Logger::Category log_app("Flat_test"); +Legion::Logger log_app("Flat_test"); struct FlatTestMeta { int i_dim, o_dim; diff --git a/tests/ops/linear_test.cc b/tests/ops/linear_test.cc index 5b65de3a5..7c84ad107 100644 --- a/tests/ops/linear_test.cc +++ b/tests/ops/linear_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("linear_test"); +Legion::Logger log_app("linear_test"); struct LinearTestMeta { int batch_size, i_dim, num_channels, dense_projection_o_dim, diff --git a/tests/ops/reshape_test.cc b/tests/ops/reshape_test.cc index e8f4586b2..a8aa046a6 100644 --- a/tests/ops/reshape_test.cc +++ b/tests/ops/reshape_test.cc @@ -6,7 +6,7 @@ #include #define PRECISION 16 using namespace Legion; -LegionRuntime::Logger::Category log_app("Reshape_test"); +Legion::Logger log_app("Reshape_test"); struct ReshapeTestMeta { int i_dim, o_dim; diff --git a/tests/ops/tanh_test.cc b/tests/ops/tanh_test.cc index 1c24d96aa..1e86934f8 100644 --- a/tests/ops/tanh_test.cc +++ b/tests/ops/tanh_test.cc @@ -6,7 +6,7 @@ #include #define PRECISION 16 using namespace Legion; -LegionRuntime::Logger::Category log_app("Tanh_test"); +Legion::Logger log_app("Tanh_test"); struct TanhTestMeta { int i_dim, o_dim; diff --git a/tests/ops/transpose_test.cc b/tests/ops/transpose_test.cc index 10481aa14..045f28479 100644 --- a/tests/ops/transpose_test.cc +++ b/tests/ops/transpose_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("transpose_test"); +Legion::Logger log_app("transpose_test"); struct TransposeTestMeta { int m, k, d;
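
The create_comm_buf_with_local_ptr and release_comm_buf routines added in this patch bundle a GPU buffer's local pointer with the peer pointers exchanged over NCCL via create_peer_ptr_group, plus the device-side barrier state and host-side barrier flag consumed by the multi-GPU kernels. The following C++ sketch illustrates one plausible call pattern for a per-GPU task. It is a minimal sketch, not part of the patch: the header path, the allocation and sizing of the allgather scratch space and barrier buffers, the barrier_flag initialization, and the ownership assumptions are all hypothetical; only the two function signatures and the CommunicationBuffer field names come from the code above.

#include <cuda_runtime.h>
#include <nccl.h>
#include "legion.h"
#include "flexflow/utils/communication_buffer.h" // hypothetical header path for the code above

void setup_and_teardown_comm_buffer(Legion::Context ctx,
                                    Legion::Runtime *runtime,
                                    int num_devices,
                                    int device_id,
                                    ncclComm_t nccl_comm,
                                    cudaStream_t stream,
                                    void *local_buffer_ptr, // device memory this GPU exposes to its peers
                                    size_t barrier_bytes) {
  // Device scratch space for the pointer allgather performed inside
  // create_peer_ptr_group (sizes are assumptions, not taken from the patch).
  void *allgather_src = nullptr, *allgather_dst = nullptr;
  cudaMalloc(&allgather_src, sizeof(void *));
  cudaMalloc(&allgather_dst, sizeof(void *) * num_devices);

  // Device-side barrier state shared by the kernels that consume the peer pointers.
  void *barrier_in = nullptr, *barrier_out = nullptr;
  cudaMalloc(&barrier_in, barrier_bytes);
  cudaMalloc(&barrier_out, barrier_bytes);
  cudaMemsetAsync(barrier_in, 0, barrier_bytes, stream);
  cudaMemsetAsync(barrier_out, 0, barrier_bytes, stream);

  // Host-side flag; assumed to be bumped on every kernel invocation.
  int *barrier_flag = new int(1);

  CommunicationBuffer *comm_buf = create_comm_buf_with_local_ptr(
      ctx, runtime, num_devices, device_id, nccl_comm,
      allgather_src, allgather_dst, local_buffer_ptr,
      barrier_in, barrier_out, barrier_flag, stream);

  // ... launch kernels that read comm_buf->comm_ptrs, comm_buf->barrier_in,
  //     and comm_buf->barrier_out ...

  // release_comm_buf drops the peer handles; the local allocations above appear
  // to stay owned by the caller (inferred from the final false argument passed
  // to free_peer_ptr_group), so the sketch frees them here.
  release_comm_buf(comm_buf);
  delete barrier_flag;
  cudaFree(barrier_out);
  cudaFree(barrier_in);
  cudaFree(allgather_dst);
  cudaFree(allgather_src);
}

The teardown ordering (release the CommunicationBuffer before freeing the underlying device allocations) is an inference from release_comm_buf only touching the peer pointer groups; treat it as an assumption rather than documented behavior.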
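
The tests/ops changes above replace the old LegionRuntime::Logger::Category declarations with Legion::Logger. The short sketch below shows the new declaration style next to the old spelling for comparison; the report_shapes helper and the message text are illustrative only, and the printf-style print call reflects the logging helpers Legion's logger generally exposes rather than anything introduced by this patch.

#include "legion.h"

using namespace Legion;

// Old spelling removed by this patch:
//   LegionRuntime::Logger::Category log_app("bmm_test");
// New spelling:
Legion::Logger log_app("bmm_test");

// Illustrative helper (not part of the patch) showing typical usage.
void report_shapes(int m, int k, int n) {
  log_app.print("batch matmul test with m=%d k=%d n=%d", m, k, n);
}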