diff --git a/.gitignore b/.gitignore index 7f6a3c413..d7917b34d 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,8 @@ python/flexflow/core/flexflow_cffi_header.py *.pb.h *.o *.a +*.nsys-rep +*.nfs* # Byte-compiled / optimized / DLL files __pycache__/ @@ -188,3 +190,8 @@ python/flexflow/version.txt inference_tensors tests/inference/python_test_configs/*.json + +core.* +*.out +sharegpt.json +wildchat.json diff --git a/.gitmodules b/.gitmodules index c68582d4a..6b437e036 100644 --- a/.gitmodules +++ b/.gitmodules @@ -22,4 +22,10 @@ [submodule "deps/tokenizers-cpp"] path = deps/tokenizers-cpp url = https://github.com/mlc-ai/tokenizers-cpp.git - fetchRecurseSubmodules = true \ No newline at end of file + fetchRecurseSubmodules = true +[submodule "deps/flashinfer"] + path = deps/flashinfer + url = https://github.com/flashinfer-ai/flashinfer.git +[submodule "deps/raft"] + path = deps/raft + url = https://github.com/rapidsai/raft.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 43ce4f704..978d84de4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,12 @@ project(FlexFlow) include(ExternalProject) +enable_language(CXX) +enable_language(CUDA) +if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8) + message(FATAL_ERROR "Your C++ compiler is too old. Please upgrade to version 8 or higher.") +endif() + # Set policy CMP0074 to eliminate cmake warnings cmake_policy(SET CMP0074 NEW) cmake_policy(SET CMP0077 NEW) @@ -128,6 +134,9 @@ list(APPEND CC_FLAGS list(APPEND NVCC_FLAGS -std=c++17) +list(APPEND NVCC_FLAGS + --expt-relaxed-constexpr + --extended-lambda) add_compile_options(${CC_FLAGS}) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS}) @@ -201,6 +210,12 @@ if(NOT BUILD_LEGION_ONLY) # optional include(optional) + set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/deps/raft/cpp/build/install) + find_package(raft) + list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/raft/cpp/include) + + list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/flashinfer/include) + if (FF_GPU_BACKEND STREQUAL "cuda") list(APPEND FF_CC_FLAGS -DFF_USE_CUDA) @@ -290,6 +305,12 @@ if(NOT BUILD_LEGION_ONLY) LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/src/*.cu) + # tensorrt_llm custom allreduce + if(FF_USE_NCCL) + list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/tensorrt_llm) + list(APPEND FLEXFLOW_GPU_SRC ${CMAKE_CURRENT_SOURCE_DIR}/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu) + endif() + add_compile_definitions(FF_USE_CUDA) if(BUILD_SHARED_LIBS) @@ -397,6 +418,8 @@ if(NOT BUILD_LEGION_ONLY) target_link_libraries(flexflow ${LEGION_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional) endif() + target_link_libraries(flexflow raft::raft) + #library api version, bump from time to time set(SOVERSION 1) @@ -425,7 +448,7 @@ if(NOT BUILD_LEGION_ONLY) # generate the Legion Python bindings library. 
When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library add_custom_command(TARGET flexflow POST_BUILD - COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} + COMMAND CMAKE_BUILD_DIR=${Legion_BINARY_DIR}/runtime CMAKE_INSTALL_PREFIX=${Legion_BINARY_DIR} ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python ) # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead. @@ -557,7 +580,9 @@ if(NOT BUILD_LEGION_ONLY) if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(inference/spec_infer) + add_subdirectory(inference/simplified_infer) add_subdirectory(inference/incr_decoding) + add_subdirectory(inference/trace_generator) endif() diff --git a/FlexFlow.mk b/FlexFlow.mk index 14f32a763..fadcf4de3 100644 --- a/FlexFlow.mk +++ b/FlexFlow.mk @@ -95,9 +95,12 @@ ifneq ($(strip $(FF_USE_PYTHON)), 1) endif -INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/inference -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src +INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/inference -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src \ + -I${FF_HOME}/deps/raft/cpp/include -I${FF_HOME}/deps/rmm/include -I${FF_HOME}/deps/spdlog/include \ + -I${FF_HOME}/deps/flashinfer/include CC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 -NVCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 +NVCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 \ + --expt-relaxed-constexpr --extended-lambda HIPCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 GASNET_FLAGS += # For Point and Rect typedefs diff --git a/benchmarking/average_accepted_tokens.pdf b/benchmarking/average_accepted_tokens.pdf new file mode 100644 index 000000000..717e6e68a Binary files /dev/null and b/benchmarking/average_accepted_tokens.pdf differ diff --git a/benchmarking/benchmark_incr_dec.sh b/benchmarking/benchmark_incr_dec.sh new file mode 100755 index 000000000..3a75fa61d --- /dev/null +++ b/benchmarking/benchmark_incr_dec.sh @@ -0,0 +1,88 @@ +#! 
/usr/bin/env bash +set -x +set -e + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}/../build" + +# export BUILD_TYPE=Debug +# ../config/config.linux +make -j install + +model_name=meta-llama/Llama-3.1-70B-Instruct +NGPUS=8 +NCPUS=16 +FSIZE=36000 +ZSIZE=200000 +CSIZE=100000 + +# comment these lines in for debugging +# model_name=meta-llama/Llama-3.1-8B-Instruct +# NGPUS=8 +# FSIZE=36000 +# ZSIZE=30000 +# CSIZE=100000 + + + +MAX_SEQ_LEN=7000 +tokens_per_batch=1024 + +batch_sizes=( + 8 + 4 +) + +request_per_second_values=( + -1 + 1 + 2 + 4 + 8 +) + +dataset_name="sharegpt" +dataset_fp="../benchmarking/${dataset_name}.json" +partition_name="all" + +export LEGION_BACKTRACE=1 + +# python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='meta-llama/Llama-3.1-70B-Instruct', allow_patterns='*.safetensors', max_workers=30)" +# python ../inference/utils/download_hf_model.py --half-precision-only $model_name --refresh-cache + +for k in "${!request_per_second_values[@]}"; do +for j in "${!batch_sizes[@]}"; do + batch_size=${batch_sizes[$j]} + request_per_second=${request_per_second_values[$k]} + + echo "Running dataset ${dataset_fp} with model ${model_name}, batch size ${batch_size}, tokens per batch ${tokens_per_batch}, and request per second ${request_per_second}" + # create model name version where "/" is replaced with "-" + model_name_=$(echo $model_name | tr / -) + if [ $request_per_second -gt 0 ]; then + rate=$request_per_second + else + rate="offline" + fi + log_fp="/usr/FlexFlow/inference/output/incr_dec_llm_${model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.log" + output_fp="/usr/FlexFlow/inference/output/incr_dec_llm_${model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.json" + metrics_fp="/usr/FlexFlow/inference/output/incr_dec_llm_${model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.csv" + rm $metrics_fp $output_fp $log_fp || true + + time ./inference/simplified_infer/incr_dec \ + -ll:gpu $NGPUS -ll:cpu $NCPUS -ll:util $NCPUS \ + -tensor-parallelism-degree $NGPUS \ + -ll:fsize $FSIZE -ll:zsize $ZSIZE -ll:csize $CSIZE \ + --fusion \ + --max-sequence-length $MAX_SEQ_LEN \ + --max-requests-per-batch $batch_size \ + --max-tokens-per-batch $tokens_per_batch \ + --max-output-length 1024 \ + --request-per-second ${request_per_second} \ + -llm-model $model_name \ + -trace ${dataset_fp} \ + -trace-output-path ${output_fp} \ + -csv-output-path $metrics_fp \ + -target-partition ${partition_name} \ + 2>&1 | tee ${log_fp} +done +done \ No newline at end of file diff --git a/benchmarking/benchmark_specinfer.sh b/benchmarking/benchmark_specinfer.sh new file mode 100755 index 000000000..e0c8e39d7 --- /dev/null +++ b/benchmarking/benchmark_specinfer.sh @@ -0,0 +1,109 @@ +#! 
/usr/bin/env bash +set -x +set -e + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}/../build" + +# export BUILD_TYPE=Debug +# ../config/config.linux +make -j +source ./set_python_envs.sh +# reset + +model_name=meta-llama/Llama-3.1-70B-Instruct +NGPUS=8 +NCPUS=16 +FSIZE=36000 +ZSIZE=200000 +CSIZE=100000 + +# comment these lines in for debugging +# model_name=meta-llama/Llama-3.1-8B-Instruct +# NGPUS=8 +# FSIZE=36000 +# ZSIZE=30000 +# CSIZE=100000 +###################################### + +small_model_names=( + Zhuominc/Llama-3-330M + meta-llama/Llama-3.2-1B-Instruct + meta-llama/Llama-3.2-3B-Instruct + meta-llama/Llama-3.1-8B-Instruct +) + +MAX_SEQ_LEN=7000 +tokens_per_batch=1024 +max_tree_depth=8 +expansion_degree=3 + +batch_sizes=( + 8 + 4 +) + +request_per_second_values=( + -1 + 1 + 2 + 4 + 8 +) + +dataset_name="sharegpt" +dataset_fp="../benchmarking/${dataset_name}.json" +partition_name="all" + +export LEGION_BACKTRACE=1 + +# python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='meta-llama/Llama-3.1-70B-Instruct', allow_patterns='*.safetensors', max_workers=30)" +python ../inference/utils/download_hf_model.py --half-precision-only $model_name +for small_model_name in "${small_model_names[@]}"; do + python ../inference/utils/download_hf_model.py --half-precision-only $small_model_name +done + +for k in "${!request_per_second_values[@]}"; do +for j in "${!batch_sizes[@]}"; do +for i in "${!small_model_names[@]}"; do + small_model_name=${small_model_names[$i]} + batch_size=${batch_sizes[$j]} + request_per_second=${request_per_second_values[$k]} + + echo "Running dataset ${dataset_fp} with model ${model_name}, draft model ${small_model_name}, batch size ${batch_size}, tokens per batch ${tokens_per_batch}, and request per second ${request_per_second}" + # create model name version where "/" is replaced with "-" + model_name_=$(echo $model_name | tr / -) + small_model_name_=$(echo $small_model_name | tr / -) + if [ $request_per_second -gt 0 ]; then + rate=$request_per_second + else + rate="offline" + fi + log_fp="/usr/FlexFlow/inference/output/specinfer_llm_${model_name_}_ssm_${small_model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.log" + output_fp="/usr/FlexFlow/inference/output/specinfer_llm_${model_name_}_ssm_${small_model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.json" + metrics_fp="/usr/FlexFlow/inference/output/specinfer_llm_${model_name_}_ssm_${small_model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.csv" + rm $metrics_fp $output_fp $log_fp || true + + time ./inference/suffix_decoding/specinfer \ + -ll:gpu $NGPUS -ll:cpu $NCPUS -ll:util $NCPUS \ + -tensor-parallelism-degree $NGPUS \ + -ssm-tp-degree $NGPUS \ + -ll:fsize $FSIZE -ll:zsize $ZSIZE -ll:csize $CSIZE \ + --fusion \ + --max-sequence-length $MAX_SEQ_LEN \ + --max-requests-per-batch $batch_size \ + --max-tokens-per-batch $tokens_per_batch \ + --max-output-length 1024 \ + --max-tree-depth ${max_tree_depth} \ + --expansion-degree ${expansion_degree} \ + --request-per-second ${request_per_second} \ + -llm-model $model_name \ + -ssm-model $small_model_name \ + -trace ${dataset_fp} \ + -trace-output-path ${output_fp} \ + -csv-output-path $metrics_fp \ + -target-partition ${partition_name} \ + 2>&1 | tee ${log_fp} +done +done +done \ No newline at end of file diff --git a/benchmarking/get_sharegpt_trace.py b/benchmarking/get_sharegpt_trace.py new file mode 100644 index 000000000..dbe8f4d3b --- /dev/null +++ 
b/benchmarking/get_sharegpt_trace.py @@ -0,0 +1,206 @@ +from dataclasses import asdict, dataclass, field +import json +import os +import random +import requests +from tqdm.asyncio import tqdm +from typing import List, Optional +from collections import OrderedDict +from transformers import AutoTokenizer + +SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json" + +@dataclass +class TraceEntry: + prompt: str + response: str + prompt_length: int + response_length: int + +@dataclass +class TracePartition: + partition_name: str + model_name: str + num_warmup_requests: int + training_entries: List[TraceEntry] + eval_entries: List[TraceEntry] + +@dataclass +class TraceMetadata: + avg_entries_per_partition: float + max_prompt_length: int + min_prompt_length: int + avg_prompt_length: float + max_response_length: int + min_response_length: int + avg_response_length: float + max_total_length: int + +@dataclass +class Trace: + partitions: List[TracePartition] + metadata: TraceMetadata = field(default_factory=lambda: TraceMetadata(0, 0, 0, 0, 0, 0, 0,0)) + +def download_and_cache_file(url: str, filename: Optional[str] = None): + """Read and cache a file from a url.""" + if filename is None: + filename = os.path.join("/tmp", url.split("/")[-1]) + + # Check if the cache file already exists + if os.path.exists(filename): + return filename + + print(f"Downloading from {url} to {filename}") + + # Stream the response to show the progress bar + response = requests.get(url, stream=True) + response.raise_for_status() # Check for request errors + + # Total size of the file in bytes + total_size = int(response.headers.get("content-length", 0)) + chunk_size = 1024 # Download in chunks of 1KB + + # Use tqdm to display the progress bar + with open(filename, "wb") as f, tqdm( + desc=filename, + total=total_size, + unit="B", + unit_scale=True, + unit_divisor=1024, + ) as bar: + for chunk in response.iter_content(chunk_size=chunk_size): + f.write(chunk) + bar.update(len(chunk)) + + return filename + +def get_warmup_entries(model_name: str, num_warmup_requests: int) -> List[TraceEntry]: + """ + Get a list of warmup entries for a model. + + Args: + model_name (str): The name of the model. + num_warmup_requests (int): The number of warmup requests to generate. + + Returns: + List[TraceEntry]: A list of warmup entries. + """ + warmup_entries = [] + tokenizer = AutoTokenizer.from_pretrained(model_name) + for i in range(num_warmup_requests): + prompt = "Hello, how are you?" + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + response = "I'm doing well, thank you for asking." + prompt_length = len(tokenizer(prompt)["input_ids"]) + response_length = len(tokenizer(response)["input_ids"]) + warmup_entries.append(TraceEntry(prompt, response, prompt_length, response_length)) + return warmup_entries + +def build_trace(model_name: str, num_entries: int, num_warmup_requests: int, seed: int): + # Download sharegpt if necessary + dataset_path = download_and_cache_file(SHAREGPT_URL) + + # Load the dataset. + with open(dataset_path) as f: + dataset = json.load(f, object_pairs_hook=OrderedDict) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data["conversations"]) >= 2] + # Only keep the first two turns of each conversation. 
+ dataset = [ + (data["conversations"][0]["value"], data["conversations"][1]["value"]) + for data in dataset + if data["conversations"][0]["from"] == "human" and data["conversations"][1]["from"] == "gpt" + ] + + # Shuffle the dataset. + random.seed(seed) + random.shuffle(dataset) + + tokenizer = AutoTokenizer.from_pretrained(model_name) + + trace = Trace(partitions=[]) + partition = TracePartition( + partition_name="all", + model_name=model_name, + num_warmup_requests=num_warmup_requests, + training_entries=[], + eval_entries=[], + ) + trace_metadata = TraceMetadata( + avg_entries_per_partition=0, + max_prompt_length=0, + min_prompt_length=float("inf"), + avg_prompt_length=0, + max_response_length=0, + min_response_length=float("inf"), + avg_response_length=0, + max_total_length=0, + ) + + partition.eval_entries += get_warmup_entries(model_name, num_warmup_requests) + + for i in tqdm(range(len(dataset))): + if len(partition.eval_entries) == num_entries: + break + + # Tokenize the prompts and completions. + prompt = dataset[i][0] + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + response = dataset[i][1] + prompt_length = len(tokenizer(prompt)["input_ids"]) + response_length = len(tokenizer(response)["input_ids"]) + new_entry = TraceEntry(prompt, response, prompt_length, response_length) + partition.eval_entries.append(new_entry) + trace_metadata.max_prompt_length = max(trace_metadata.max_prompt_length, prompt_length) + trace_metadata.min_prompt_length = min(trace_metadata.min_prompt_length, prompt_length) + trace_metadata.avg_prompt_length += prompt_length + trace_metadata.max_response_length = max(trace_metadata.max_response_length, response_length) + trace_metadata.min_response_length = min(trace_metadata.min_response_length, response_length) + trace_metadata.avg_response_length += response_length + trace_metadata.max_total_length = max(trace_metadata.max_total_length, prompt_length + response_length) + trace_metadata.avg_prompt_length /= len(partition.eval_entries) + trace_metadata.avg_response_length /= len(partition.eval_entries) + trace_metadata.avg_entries_per_partition = len(partition.eval_entries) + + trace.partitions.append(partition) + trace.metadata = trace_metadata + + return trace + +def save_trace(trace: Trace, output_path: str): + """ + Save a Trace instance to a JSON file. + + Args: + trace (Trace): The trace to save. + output_path (str): The path where the JSON file will be saved. 
+ """ + # Convert the Trace instance to a dictionary + trace_dict = asdict(trace) + + # Save the dictionary as a JSON file + with open(output_path, 'w') as f: + json.dump(trace_dict, f, indent=2) + + print(f"Trace saved to {output_path}") + +if __name__ == "__main__": + # Change directory to that holding this script + os.chdir(os.path.dirname(os.path.abspath(__file__))) + + num_entries=125 + num_warmup_requests=8 + seed=42 + + trace = build_trace("meta-llama/Llama-3.1-70B-Instruct", num_entries, num_warmup_requests, seed) + print(trace.metadata) + # Save prompts list to a json file + save_trace(trace, "sharegpt.json") \ No newline at end of file diff --git a/benchmarking/get_wildchat_trace.py b/benchmarking/get_wildchat_trace.py new file mode 100644 index 000000000..53ee46efb --- /dev/null +++ b/benchmarking/get_wildchat_trace.py @@ -0,0 +1,64 @@ +import datasets +from transformers import AutoTokenizer +from tqdm import tqdm +import json, os + +def build_trace(dataset: datasets.Dataset, model_name: str, num_entries: int, seed: int): + tokenizer = AutoTokenizer.from_pretrained(model_name) + + dataset = dataset["train"].filter( + lambda x: x["model"] == "gpt-4" and x["turn"] == 1 and x["language"] == "English" + ).shuffle(seed=seed).select(range(num_entries)) + pairs = [] + for row in dataset: + assert len(row["conversation"]) == 2 + assert row["conversation"][0]["role"] == "user" + assert row["conversation"][1]["role"] == "assistant" + pairs.append(( + row["conversation"][0]["content"], + row["conversation"][1]["content"], + )) + + prompts = [] + avg_prompt_length = 0 + min_prompt_length = float("inf") + max_prompt_length = 0 + avg_response_length = 0 + min_response_length = float("inf") + max_response_length = 0 + max_total_length = 0 + for prompt, response in tqdm(pairs, desc="Processing HF trace"): + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + prompt_length = len(tokenizer(prompt)["input_ids"]) + response_length = len(tokenizer(response)["input_ids"]) + prompts.append(prompt) + avg_prompt_length += prompt_length + avg_response_length += response_length + min_prompt_length = min(min_prompt_length, prompt_length) + min_response_length = min(min_response_length, response_length) + max_prompt_length = max(max_prompt_length, prompt_length) + max_response_length = max(max_response_length, response_length) + max_total_length = max(max_total_length, prompt_length + response_length) + avg_prompt_length /= len(prompts) + avg_response_length /= len(prompts) + + return prompts, max_prompt_length, max_response_length, avg_prompt_length, avg_response_length, min_prompt_length, min_response_length, max_total_length + +if __name__ == "__main__": + # Change directory to that holding this script + os.chdir(os.path.dirname(os.path.abspath(__file__))) + + dataset = datasets.load_dataset("allenai/WildChat") + prompts, max_prompt_length, max_response_length, avg_prompt_length, avg_response_length, min_prompt_length, min_response_length, max_total_length = build_trace(dataset, "meta-llama/Llama-3.1-70B-Instruct", 250, 42) + print(f"Number of prompts: {len(prompts)}") + print(f"Prompt lengths: [{min_prompt_length} -> {max_prompt_length}] (avg: {avg_prompt_length})") + print(f"Response lengths: [{min_response_length} -> {max_response_length}] (avg: {avg_response_length})") + print(f"Max total length: {max_total_length}") + # Save prompts list to a json file + + with open("wildchat.json", "w") as f: + json.dump(prompts, f, 
indent=2) \ No newline at end of file diff --git a/benchmarking/plot_results.ipynb b/benchmarking/plot_results.ipynb new file mode 100644 index 000000000..c7dcff18c --- /dev/null +++ b/benchmarking/plot_results.ipynb @@ -0,0 +1,776 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/FlexFlow/inference/output\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import os\n", + "os.chdir(\"/usr/FlexFlow/inference/output\")\n", + "print(os.getcwd())" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "small_model_names = [\n", + " \"Zhuominc/Llama-3-330M\",\n", + " \"meta-llama/Llama-3.2-1B-Instruct\",\n", + " # \"meta-llama/Llama-3.2-3B-Instruct\",\n", + " \"meta-llama/Llama-3.1-8B-Instruct\",\n", + "]\n", + "batch_sizes=[4,8]\n", + "arrival_rates=[\"offline\", \"1\", \"2\", \"4\", \"8\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def get_speculation_len(filepath):\n", + " df = pd.read_csv(filepath)\n", + " # remove entries where is_warmup_request is 1 or request_step_idx is < 0\n", + " df = df[(df[\"is_warmup_request\"] == 0) & (df[\"request_step_idx\"] >= 0)]\n", + " return df[\"num_speculated_tokens\"].mean()\n", + "\n", + "def get_accepted_len(filepath):\n", + " df = pd.read_csv(filepath)\n", + " # remove entries where is_warmup_request is 1 or request_step_idx is < 0\n", + " df = df[(df[\"is_warmup_request\"] == 0) & (df[\"request_step_idx\"] >= 0)]\n", + " return df[\"num_accepted_tokens\"].mean()\n", + "\n", + "def get_acceptance_rates(filepath):\n", + " df = pd.read_csv(filepath)\n", + " # remove entries where is_warmup_request is 1 or request_step_idx is < 0\n", + " df = df[(df[\"is_warmup_request\"] == 0) & (df[\"request_step_idx\"] >= 0)]\n", + " # group = df.groupby(\"request_guid\", as_index=False)\n", + " num_speculated_tokens = df[\"num_speculated_tokens\"].sum()\n", + " num_accepted_tokens = df[\"num_accepted_tokens\"].sum()\n", + " return num_accepted_tokens/num_speculated_tokens\n", + "\n", + "def get_tpot(filepath):\n", + " df = pd.read_csv(filepath)\n", + " # remove entries where is_warmup_request is 1 or request_step_idx is < 0\n", + " df = df[(df[\"is_warmup_request\"] == 0) & (df[\"request_step_idx\"] >= 0)]\n", + " group = df.groupby(\"request_guid\", as_index=False)\n", + " min_time = group[\"timestamp\"].min()[\"timestamp\"]\n", + " max_time = group[\"timestamp\"].max()[\"timestamp\"]\n", + " num_tokens = group[\"num_generated_tokens\"].sum()[\"num_generated_tokens\"]\n", + " tpots = (max_time - min_time) / num_tokens / 1000\n", + " return tpots.mean()\n", + "\n", + "def get_throughput(filepath):\n", + " df = pd.read_csv(filepath)\n", + " # remove entries where is_warmup_request is 1 or request_step_idx is < 0\n", + " df = df[(df[\"is_warmup_request\"] == 0) & (df[\"request_step_idx\"] >= 0)]\n", + " num_tokens = df[\"num_generated_tokens\"].sum()\n", + " total_time = df[\"timestamp\"].max() - df[\"timestamp\"].min() # in microseconds\n", + " total_time = total_time / 1000000 # convert to seconds\n", + " throughput = num_tokens / total_time # (tokens/sec)\n", + " return throughput\n", + "\n", + "def get_ttft(filepath):\n", + " df = pd.read_csv(filepath)\n", + " # remove entries where is_warmup_request is 1\n", + " df = df[(df[\"is_warmup_request\"] 
== 0)]\n", + " group = df.groupby(\"request_guid\", as_index=False)\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + " # convert to milliseconds from microseconds\n", + " return ttft.mean()[1] / 1000\n", + "\n", + "def get_queueing_time(filepath):\n", + " df = pd.read_csv(filepath)\n", + " # remove entries where is_warmup_request is 1\n", + " df = df[(df[\"is_warmup_request\"] == 0)]\n", + " group = df.groupby(\"request_guid\", as_index=False)\n", + " # in each group, find the difference between the timestampt at request_step_idx=-1 and the timestamp at request_step_idx=-2.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + " # convert to seconds from microseconds\n", + " return queueing_time.mean()[1] / 1000000\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA9wAAAPECAYAAABc1TPrAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACsUklEQVR4nOzdeXxM1//H8fdkMUlI7EEIiVC72rWU2PdYqlVULW1RS7X17V6K0qKtLqpFUZRYaqmuqrSxdaHW6rf2SilaSxGEiMz5/eGX+ZpOkIm5YvT1fDzyYM49c+9n7owb7zn3nmszxhgBAAAAAACv8svuAgAAAAAAuBURuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQC3hAYNGqhixYrZXUamzZo1S2XLllVgYKDy5MmT3eX4tAYNGqhBgwY3ZFvDhw+XzWbTsWPHbsj2AAC+jcANAP/w3nvvyWazqXbt2tldyk0nKipKNptNjz76qNuylStXymazaeHChdlQmW/ZsWOHevbsqZiYGE2ZMkXvv/9+pp739NNPy2az6b777rO4Qu/7/vvvNXz4cJ08efKGbzs9JF/r50aF9n+jzz77TLGxsQoPD1dISIhKliypTp066auvvnL2OXTokIYPH64tW7ZkX6EA4GUB2V0AANxs4uPjFRUVpfXr12vPnj0qVapUdpd005kyZYqee+45RUREZHcpPmnlypVyOBx6++23M/35MsZo7ty5ioqK0meffabTp08rNDTU4kq95/vvv9eIESPUs2fPGz6if/fdd7vs5zNnzqhfv37q0KGD7r77bmd7oUKFbmhd/xavv/66nnrqKcXGxuq5555TSEiI9uzZoxUrVmjevHlq0aKFpEuBe8SIEYqKilKVKlWyt2gA8BICNwBcZt++ffr++++1ePFi9e3bV/Hx8Ro2bNgNrcHhcOjChQsKCgq6odvNrAoVKmjnzp0aM2aMxo8fn93l3FDeem+OHDkiSR4Fz5UrV+qPP/7Qt99+q+bNm2vx4sXq0aPHddXxb1G5cmVVrlzZ+fjYsWPq16+fKleurG7dumVjZbcGY4zOnz+v4OBgt2UXL17UyJEj1bRpU3399dduy9P/LQDArYpTygHgMvHx8cqbN69at26te+65R/Hx8c5lqampypcvn3r16uX2vKSkJAUFBenJJ590tqWkpGjYsGEqVaqU7Ha7IiMj9fTTTyslJcXluTabTQMHDlR8fLwqVKggu93uPM3y9ddfV506dZQ/f34FBwerevXqGZ6yfe7cOQ0aNEgFChRQaGio2rZtq4MHD8pms2n48OEufQ8ePKgHH3xQhQoVkt1uV4UKFfTBBx9keh9FRUWpe/fumjJlig4dOnTVvj179lRUVJRbe/opvhnthwULFqh8+fIKDg7WnXfeqW3btkmSJk+erFKlSikoKEgNGjRQYmJihtvcuHGj6tSpo+DgYEVHR2vSpElufbzx3lzJe++95+wbERGhAQMGuJxGHRUV5fwSp2DBghm+RxmJj49X+fLl1bBhQzVp0sTls3m5gwcP6qGHHlJERITsdruio6PVr18/Xbhwwdnn5MmTeuKJJxQVFSW73a5ixYqpe/fuLtclZ2UflSlTRkFBQapevbpWr17t7DN8+HA99dRTkqTo6GjnKdyXv4ezZ89W9erVFRwcrHz58qlz5846cOCA2+t7//33FRMTo+DgYNWqVUtr1qy55r7LrG+//Vb16tVTzpw5lSdPHrVr107bt2+/5vN+//13lSpVShUrVtRff/0l6dI+fvzxxxUZGSm73a5SpUpp7NixcjgczuclJibKZrPp9ddfd74uu92umjVr6qeffnLZxp9//qlevXqpWLFistvtKlKkiNq1a3fFfwfpevbsqVy5cum3335T8+bNlTNnTkVEROill16SMcalr8Ph0FtvvaUKFSooKChIhQoVUt++fXXixAmXflFRUWrTpo2WLVumGjVqKDg4WJMnT85w+8eOHVNSUpLq1q2b4fLw8HBJl75QqlmzpiSpV69ezs/IjBkznH3XrVunFi1aKHfu3AoJCVFsbKy+++47l/WlH1t27NihTp06KSwsTPnz59djjz2m8+fPX3VfAYAlDADAqWzZsuahhx4yxhizevVqI8msX7/eufzBBx80efLkMSkpKS7PmzlzppFkfvrpJ2OMMWlpaaZZs2YmJCTEPP7442by5Mlm4MCBJiAgwLRr187luZJMuXLlTMGCBc2IESPMu+++azZv3myMMaZYsWKmf//+ZsKECeaNN94wtWrVMpLM559/7rKOTp06GUnmgQceMO+++67p1KmTuf32240kM2zYMGe/P//80xQrVsxERkaal156yUycONG0bdvWSDJvvvnmNfdPiRIlTOvWrc3evXtNQECAefTRR
53LEhISjCSzYMECZ1uPHj1MiRIl3NYzbNgw889fQZJM5cqVTWRkpBkzZowZM2aMyZ07tylevLiZMGGCKV++vBk3bpwZMmSIyZEjh2nYsKHL82NjY01ERIQJDw83AwcONOPHjzd33XWXkWSmTZvm7Oet9yYj6a+rSZMm5p133jEDBw40/v7+pmbNmubChQvGGGM+/vhj06FDByPJTJw40cyaNcts3br1qvv9/PnzJk+ePGbkyJHGGGM+/PBD4+/vbw4fPuzS7+DBgyYiIsL52iZNmmSGDh1qypUrZ06cOGGMMeb06dOmYsWKxt/f3/Tu3dtMnDjRjBw50tSsWdP52jzdRxUrVjQFChQwL730khk7dqwpUaKECQ4ONtu2bTPGGLN161bTpUsX5+ds1qxZZtasWebMmTPGGGNGjRplbDabue+++8x7771nRowYYQoUKGCioqKcdRtjzNSpU40kU6dOHTN+/Hjz+OOPmzx58piSJUua2NjYq+7Dyx09etTt38by5ctNQECAue2228yrr77qrCFv3rxm3759zn7p7/HRo0eNMcbs2bPHFC9e3FSpUsXZdvbsWVO5cmWTP39+8/zzz5tJkyaZ7t27G5vNZh577DHnuvbt22ckmapVq5pSpUqZsWPHmldffdUUKFDAFCtWzPmZMcaYOnXqmNy5c5shQ4aYqVOnmldeecU0bNjQrFq16qqvtUePHiYoKMiULl3aPPDAA2bChAmmTZs2RpIZOnSoS9+HH37YBAQEmN69e5tJkyaZZ555xuTMmdPl82vMpeNAqVKlTN68ec2zzz5rJk2aZBISEjLcflpamgkODjbVq1c3x48fv2Kdf/75p3nppZeMJNOnTx/nZ2Tv3r3GGGO++eYbkyNHDnPnnXeacePGmTfffNNUrlzZ5MiRw6xbt87t/alUqZKJi4szEyZMMN26dXMeHwHgRiNwA8D/27Bhg5Fkli9fbowxxuFwmGLFirn8B3nZsmVGkvnss89cntuqVStTsmRJ5+NZs2YZPz8/s2bNGpd+kyZNMpLMd99952yTZPz8/Mx///tft5qSk5NdHl+4cMFUrFjRNGrUyNm2ceNGI8k8/vjjLn179uzpFioeeughU6RIEXPs2DGXvp07dza5c+d2294/pQduY4zp1auXCQoKMocOHTLGeCdw2+12l3AzefJkI8kULlzYJCUlOdufe+45I8mlb2xsrJFkxo0b52xLSUkxVapUMeHh4c7A4K335p+OHDlicuTIYZo1a2bS0tKc7RMmTDCSzAcffOD2+tMD2rUsXLjQSDK7d+82xhiTlJRkgoKC3L4k6d69u/Hz83N+8XM5h8NhjDHmxRdfNJLM4sWLr9jH030kyWzYsMHZ9vvvv5ugoCDToUMHZ9trr73m9p4ZY0xiYqLx9/c3L7/8skv7tm3bTEBAgLP9woULJjw83FSpUsXlC6/333/fSLruwJ3+Obk8FG7dutX4+fmZ7t27O9suf++2b99uIiIiTM2aNc3ff//t7DNy5EiTM2dOs2vXLpftPvvss8bf39/s37/fGPO/wJ0/f36X53/yyScux5kTJ04YSea1117L9GtM16NHDyPJ5csxh8NhWrdubXLkyOH8DK5Zs8ZIMvHx8S7P/+qrr9zaS5QoYSSZr776KlM1pH/mcubMaVq2bGlefvlls3HjRrd+P/30k5Fkpk+f7tLucDhM6dKlTfPmzZ2fUWMuHR+jo6NN06ZNnW3p70/btm1d1tG/f38j6ZpfbgGAt3FKOQD8v/j4eBUqVEgNGzaUJOds0PPmzVNaWpokqVGjRipQoIDmz5/vfN6JEye0fPlyl5mjFyxYoHLlyqls2bI6duyY86dRo0aSpISEBJdtx8bGqnz58m41XX5N5IkTJ3Tq1CnVq1dPmzZtcrann+Lcv39/l+f+cyZxY4wWLVqkuLg4GWNc6mrevLlOnTrlst5rGTJkiC5evKgxY8Zk+jnX0rhxY5dT0NNniu/YsaPLBGHp7b/99pvL8wMCAtS3b1/n4xw5cqhv3746cuSINm7cKMl7780/rVixQhcuXNDjjz8uP7///Xrt3bu3wsLC9MUXX2RmF2QoPj5eNWrUcE78FRoaqtatW7ucVu5wOLRkyRLFxcWpRo0abutIP4V/0aJFuv3229WhQ4cr9vF0H915552qXr2683Hx4sXVrl07LVu2zPlv50oWL14sh8OhTp06uWyrcOHCKl26tHNbGzZs0JEjR/TII48oR44czuf37NlTuXPnvuo2ruXw4cPasmWLevbsqXz58jnbK1eurKZNm+rLL790e84vv/yi2NhYRUVFacWKFcqbN69z2YIFC1SvXj3lzZvX5TU1adJEaWlpLqfbS9J9993n8vx69epJ+t/nOzg4WDly5NDKlSvdTu/OrIEDBzr/nn4ZwIULF7RixQpnzblz51bTpk1daq5evbpy5crl9p5HR0erefPmmdr2iBEjNGfOHFWtWlXLli3TCy+8oOrVq6tatWqZOmV/y5Yt2r17t7p27arjx487azt79qwaN26s1atXu5yqL0kDBgxweZx+PMzovQQAKzFpGgBISktL07x589SwYUPt27fP2V67dm2NGzdO33zzjZo1a6aAgAB17NhRc+bMUUpKiux2uxYvXqzU1FSXwL17925t375dBQsWzHB7/5woKDo6OsN+n3/+uUaNGqUtW7a4XDt7+fXPv//+u/z8/NzW8c/Zr48ePaqTJ0/q/fffv+JtqDyZwKhkyZJ64IEH9P777+vZZ5/N9POupnjx4i6P04NUZGRkhu3/DB8RERHKmTOnS9ttt90m6dL1snfccYfX3pt/+v333yVJZcqUcWnPkSOHSpYs6VzuqZMnT+rLL7/UwIEDtWfPHmd73bp1tWjRIu3atUu33Xabjh49qqSkpGvei3zv3r3q2LHjVft4uo9Kly7t1ue2225TcnKyjh49qsKFC191W8aYDNchSYGBgZL+t3//2S8wMFAlS5a88ovJhCu9d5JUrlw5LVu2TGfPnnX5bMXFxalQoUJatmyZcuXK5fKc3bt36+eff870/vvn5z49fKd/vu12u8aOHav//Oc/KlSokO644w61adNG3bt3v+q+Tefn5+e2jy7/d5Fe86lTp5zXVF+r5sz+u0jXpUsXdenSRUlJSVq3bp1mzJihOXPmKC4uTr/88stVJyLcvXu3JF11ksBTp065fGnxz89JTEyM/Pz8rnnNOwB4G4EbAHRpsqTDhw9r3rx5mjdvntvy+Ph4NWvWTJLUuXNnTZ48WUuXLlX79u310UcfqWzZsrr99tud/R0OhypVqqQ33ngjw+39M0BmNLvvmjVr1LZtW9WvX1/vvfeeihQposDAQE2fPl1z5szx+DWmjwB169btiv9xvXwm58x44YUXNGvWLI0dO1bt27d3W/7PidHSXWnU09/f36N2849JnzLDG+/NjbRgwQKlpKRo3LhxGjdunNvy+Ph4jRgxwqvb9HQfXe+2bDabli5dmuH7/M8we7Po2LGjZs6cqfj4eJezKqRLr6lp06Z6+umnM3xu
ethNl5nP9+OPP664uDgtWbJEy5Yt09ChQzV69Gh9++23qlq16nW+mks1h4eHX3Eyvn9+eZDVfxdhYWFq2rSpmjZtqsDAQM2cOVPr1q1TbGzsVWuTpNdee+2Ktwu71ufkSsciALAagRsAdCm0hIeH691333VbtnjxYn388ceaNGmSgoODVb9+fRUpUkTz58/XXXfdpW+//VYvvPCCy3NiYmK0detWNW7cOMv/0Vu0aJGCgoK0bNky2e12Z/v06dNd+pUoUUIOh0P79u1zGdW5fDRUuvQf5tDQUKWlpalJkyZZqumfYmJi1K1bN02ePNl5mvfl8ubN6zJDd7qsjvZey6FDh9xGInft2iVJzlPVvfHeZKREiRKSpJ07d7qMJl64cEH79u3L8j6Pj49XxYoVM7w93eTJkzVnzhyNGDFCBQsWVFhYmH755Zerri8mJiZTfTzZR+kjkJfbtWuXQkJCnEHtSuuJiYmRMUbR0dFuQfRy6ft39+7dzlPbpUt3D9i3b5/LF16euvy9+6cdO3aoQIECbmdOvPbaawoICFD//v0VGhqqrl27urymM2fOeO3f2eXr/c9//qP//Oc/2r17t6pUqaJx48Zp9uzZV32ew+HQb7/95rJ/M/p3sWLFCtWtW/eGfclUo0YNzZw5U4cPH5Z09c+IdCmsZ3af7t6922UUfs+ePXI4HBneNQEArMQ13AD+9c6dO6fFixerTZs2uueee9x+Bg4cqNOnT+vTTz+VdOn0zHvuuUefffaZZs2apYsXL7qcTi5JnTp10sGDBzVlypQMt3f27Nlr1uXv7y+bzeYyGpyYmKglS5a49Eu/jvK9995zaX/nnXfc1texY0ctWrQow8B19OjRa9aUkSFDhig1NVWvvvqq27KYmBidOnVKP//8s7Pt8OHD+vjjj7O0rWu5ePGiy+2JLly4oMmTJ6tgwYLOa4y98d5kpEmTJsqRI4fGjx/vMjI5bdo0nTp1Sq1bt/Z4nQcOHNDq1avVqVOnDD+bvXr10p49e7Ru3Tr5+fmpffv2+uyzz7Rhwwa3daXX1LFjR23dujXD9yC9j6f76IcffnC5/v/AgQP65JNP1KxZM+fobXpg/ecXMHfffbf8/f01YsQItzMWjDE6fvy4pEvhrGDBgpo0aZLLLc5mzJiR4Zc6nihSpIiqVKmimTNnuqzrl19+0ddff61WrVq5Pcdms+n999/XPffcox49ejiPD9Kl/ffDDz9o2bJlbs87efKkLl686FF9ycnJbre0iomJUWhoqNtt2q5kwoQJzr8bYzRhwgQFBgaqcePGzprT0tI0cuRIt+devHgxy/s4OTlZP/zwQ4bLli5dKul/p/Jf6TNSvXp1xcTE6PXXX9eZM2fc1pPRseufX56mHw9btmzp2QsAgOvECDeAf71PP/1Up0+fVtu2bTNcfscdd6hgwYKKj493Buv77rtP77zzjoYNG6ZKlSqpXLlyLs954IEH9NFHH+mRRx5RQkKC6tatq7S0NO3YsUMfffSR8/61V9O6dWu98cYbatGihbp27aojR47o3XffValSpVwCbPXq1dWxY0e99dZbOn78uO644w6tWrXKOYJ1+ajRmDFjlJCQoNq1a6t3794qX768/v77b23atEkrVqzQ33//7fH+Sx/lnjlzptuyzp0765lnnlGHDh00aNAgJScna+LEibrttts8mqAtsyIiIjR27FglJibqtttu0/z587Vlyxa9//77zmuBvfHeZKRgwYJ67rnnNGLECLVo0UJt27bVzp079d5776lmzZrq1q2bx+ucM2eOjDFX/Gy2atVKAQEBio+PV+3atfXKK6/o66+/VmxsrPr06aNy5crp8OHDWrBggdauXas8efLoqaee0sKFC3XvvffqwQcfVPXq1fX333/r008/1aRJk3T77bd7vI8qVqyo5s2ba9CgQbLb7c4vfy4/1T39C48XXnhBnTt3VmBgoOLi4hQTE6NRo0bpueeeU2Jiotq3b6/Q0FDt27dPH3/8sfr06aMnn3xSgYGBGjVqlPr27atGjRrpvvvu0759+zR9+vTrvoZbujRi3bJlS91555166KGHdO7cOb3zzjvKnTv3Fe+T7ufnp9mzZ6t9+/bq1KmTvvzySzVq1EhPPfWUPv30U7Vp00Y9e/ZU9erVdfbsWW3btk0LFy5UYmKiChQokOnadu3apcaNG6tTp04qX768AgIC9PHHH+uvv/5S586dr/n8oKAgffXVV+rRo4dq166tpUuX6osvvtDzzz/vPAMhNjZWffv21ejRo7VlyxY1a9ZMgYGB2r17txYsWKC3335b99xzT6ZrTpecnKw6derojjvuUIsWLRQZGamTJ09qyZIlWrNmjdq3b+88JT4mJkZ58uTRpEmTFBoaqpw5c6p27dqKjo7W1KlT1bJlS1WoUEG9evVS0aJFdfDgQSUkJCgsLEyfffaZy3b37duntm3bqkWLFvrhhx80e/Zsde3a9brOhACALMmWudEB4CYSFxdngoKCzNmzZ6/Yp2fPniYwMNB5Oy2Hw2EiIyONJDNq1KgMn3PhwgUzduxYU6FCBWO3203evHlN9erVzYgRI8ypU6ec/SSZAQMGZLiOadOmmdKlSxu73W7Kli1rpk+fnuEttc6ePWsGDBhg8uXLZ3LlymXat29vdu7caSSZMWPGuPT966+/zIABA0xkZKQJDAw0hQsXNo0bNzbvv//+NffV5bcFu9zu3buNv7+/223BjDHm66+/NhUrVjQ5cuQwZcqUMbNnz77ibcH+uR/Sb5v0z9shZXQLstjYWFOhQgWzYcMGc+edd5qgoCBTokQJM2HCBLd6vfHeXMmECRNM2bJlTWBgoClUqJDp16+fy72kjcn8bcEqVapkihcvftU+DRo0MOHh4SY1NdUYc+mWXN27dzcFCxY0drvdlCxZ0gwYMMDlVlrHjx83AwcONEWLFjU5cuQwxYoVMz169HC5XZyn+2j27NnOz2rVqlUzvC/zyJEjTdGiRY2fn5/bLcIWLVpk7rrrLpMzZ06TM2dOU7ZsWTNgwACzc+dOl3W89957Jjo62tjtdlOjRg2zevVqExsbe923BTPGmBUrVpi6deua4OBgExYWZuLi4syvv/7q0iej9y45OdnExsaaXLlymR9//NEYc+l+588995wpVaqUyZEjhylQoICpU6eOef311523qLvS5zt9v6bXd+zYMTNgwABTtmxZkzNnTpM7d25Tu3Zt89FHH13ztfbo0cPkzJnT7N2713lv9UKFCplhw4a53L4u3fvvv2+qV69ugoODTWhoqKlUqZJ5+umnnbf/M+bKx4GMpKammilTppj27dubEiVKGLvdbkJCQkzVqlXNa6+95vK5NObSLdHKly9vAgIC3G4RtnnzZnP33Xeb/PnzG7vdbkqUKGE6depkvvnmG2ef9Pfn119/Nffcc48JDQ01efPmNQMHDjTnzp3LVM0A4E02Y7Iw4wwA4Ka3ZcsWVa1aVbNnz9b999+f3eXgFmWz2TRgwACXU5Zx8+jZs6cWLlyY4anYt6Lhw4drxIgROnr
0qEdnEQCAVbiGGwBuAefOnXNre+utt+Tn56f69etnQ0UAAADgGm4AuAW8+uqr2rhxoxo2bKiAgAAtXbpUS5cuVZ8+fbx6CycAAABkHoEbAG4BderU0fLlyzVy5EidOXNGxYsX1/Dhw91uVwYAAIAbh2u4AQAAAACwANdwAwAAAABgAQI3AAAAAAAWIHADAAAAAGABAjcA4KpmzJghm82mDRs2XLFPYmKibDabXn/99auuKyoqSjabTU2aNMlw+ZQpU2Sz2a65vasZPny4bDabjh07dsU+K1eulM1m08KFCzO93k6dOslms+mZZ5656jptNptmz56dYZ+6devKZrOpYsWKGS5PS0tTRESEbDabli5dmunaJOmJJ55QtWrVlC9fPoWEhKhcuXIaPnx4pu+/PHHiRN17770qXry4bDabevbs6dH209/bjH5Kly7t1n/atGkqV66cgoKCVLp0ab3zzjtufXr27OmynoCAAEVGRqpz58769ddfM1VXZj6/1+PXX3/V8OHDlZiYaMn6faUGAEDGmKUcAHBDBQUFKSEhQX/++acKFy7ssiw+Pl5BQUE6f/58NlWXsaSkJH322WeKiorS3LlzNWbMGNlstgz7BgUFac6cOerWrZtLe2Jior7//nsFBQVdcTvffvutDh8+rKioKMXHx6tly5aZrvGnn35SvXr11KtXLwUFBWnz5s0aM2aMVqxYodWrV8vP7+rfsY8dO1anT59WrVq1dPjw4UxvN91bb73lFu5///13DRkyRM2aNXNpnzx5sh555BF17NhRgwcP1po1azRo0CAlJye7faFht9s1depUSdLFixe1d+9eTZo0SV999ZV+/fVXRUREeFyrN/36668aMWKEGjRooKioqH9tDQCAjBG4AQA3VN26dfXTTz9p/vz5euyxx5ztf/zxh9asWaMOHTpo0aJF2Vihu0WLFiktLU0ffPCBGjVqpNWrVys2NjbDvq1atdKnn36qY8eOqUCBAs72OXPmqFChQipdurROnDiR4XNnz56tatWqqUePHnr++ed19uxZ5cyZM1M1rl271q0tJiZGTz75pNavX6877rjjqs9ftWqVc3Q7V65cmdrm5dq3b+/WNmrUKEnS/fff72w7d+6cXnjhBbVu3dp5hkHv3r3lcDg0cuRI9enTR3nz5nX2DwgIcPvy4o477lCbNm30xRdfqHfv3h7Xml2MMTp//ryCg4OzuxQAwA3CKeUAgBsqKChId999t+bMmePSPnfuXOXNm1fNmzd3e05qaqp27NiRpZFXb4iPj1fTpk3VsGFDlStXTvHx8Vfs265dO9ntdi1YsMClfc6cOerUqZP8/f0zfN65c+f08ccfq3PnzurUqZPOnTunTz755LrqTh/tPHny5DX7lihR4oqj9lk1Z84cRUdHq06dOs62hIQEHT9+XP3793fpO2DAAJ09e1ZffPHFNdebfmZEQEDWxg169uypXLly6eDBg2rfvr1y5cqlggUL6sknn1RaWppL33nz5ql69eoKDQ1VWFiYKlWqpLffflvSpdPV7733XklSw4YNnae+r1y5UtKl/d+mTRstW7ZMNWrUUHBwsCZPnuy8BGPGjBlutdlsNg0fPtyl7eDBg3rooYcUEREhu92u6Oho9evXTxcuXLhmDQCA7EXgBgDccF27dtX69eu1d+9eZ9ucOXN0zz33KDAw0K3/wYMHVa5cOT333HM3skxJ0qFDh5SQkKAuXbpIkrp06aKFCxfqwoULGfYPCQlRu3btNHfuXGfb1q1b9d///lddu3a94nY+/fRTnTlzRp07d1bhwoXVoEGDqwb7jFy8eFHHjh3ToUOH9PXXX2vIkCEKDQ1VrVq1PFqPN2zevFnbt293e82bN2+WJNWoUcOlvXr16vLz83Muv9yxY8d07Ngx/fXXX/rhhx/0xBNPKH/+/GrTpk2W60tLS1Pz5s2VP39+vf7664qNjdW4ceP0/vvvO/ssX75cXbp0Ud68eTV27FiNGTNGDRo00HfffSdJql+/vgYNGiRJev755zVr1izNmjVL5cqVc65j586d6tKli5o2baq3335bVapU8ajOQ4cOqVatWpo3b57uu+8+jR8/Xg888IBWrVql5OTkTNUAAMg+nFIOALjhGjVqpMKFC2vu3LkaMmSItm/fri1btujtt9/Wb7/9lt3luZg7d67sdrvatWsnSercubNefPFFffnllxmeRi1d+kIhLi5OBw4cUGRkpOLj41WyZMmrntY9e/Zs1alTR5GRkc7t9O/fX0ePHlXBggUzVeuGDRt05513Oh+XKVNGn376qfLly5fJV+s96V8WXH46uSQdPnxY/v7+Cg8Pd2nPkSOH8ufPr0OHDrm0nz171u31Fy1aVF9//XWm90tGzp8/r/vuu09Dhw6VJD3yyCOqVq2apk2bpn79+kmSvvjiC4WFhWnZsmUZnplQsmRJ1atXT+PHj1fTpk3VoEEDtz579uzRV1995XLmhieTmz333HP6888/tW7dOpcvKV566SUZY5QnT55r1gAAyD6McAMAbjh/f3916tTJOQocHx+vyMhI1atXL8P+UVFRMsZkeAqu1eLj49W6dWuFhoZKkkqXLq3q1atfdfS5WbNmypcvn+bNmydjjObNm+ccIc/I8ePHtWzZMpc+HTt2lM1m00cffZTpWsuXL6/ly5dryZIlevrpp5UzZ85Mz1LuTQ6HQ/PmzVPVqlXdRlrPnTunHDlyZPi8oKAgnTt3zq1t+fLlWr58uZYtW6bJkycrV65catWqlXbt2nVddT7yyCMuj+vVq+fyhU+ePHl09uxZLV++PMvbiI6OzvAyicxwOBxasmSJ4uLi3M4IkOT1SwAAAN7HCDcAIFt07dpV48eP19atWzVnzhx17tz5pgsQ27dv1+bNm9W9e3ft2bPH2d6gQQO9++67SkpKUlhYmNvzAgMDde+992rOnDmqVauWDhw4cNXTyefPn6/U1FRVrVrVZTu1a9dWfHy8BgwYIEn6+++/XU5lDw4OVu7cuZ2Pw8LCnLdca9eunebMmaN27dpp06ZNuv3227O+I/7fuXPndOrUKZe2f840L12agO3gwYN64okn3JYFBwdf8XT8jCYU8/f3d7uNXKtWrVS6dGk999xzzgntjh496tInX758Vwz20qUg/88R8rx587pMaNe/f3999NFHatmypYoWLapmzZqpU6dOatGixRXX+0/R0dGZ7vtPR48eVVJS0hVvIwcAuPkxwg0AyBa1a9dWTEyMHn/8ce3bt++qgTS7pN9P+4knnlDp0qWdP+PGjdP58+evOpt6165dtWXLFg0fPly33367ypcvf8W+6aPldevWddnO2rVr9cMPPzhHXe+++24VKVLE+XP5LO8ZufvuuyVdmvjLG+bPn++y/SJFilzx9fj5+WU4ql+kSBGlpaXpyJEjLu0XLlzQ8ePHM3Wbr2LFiqlMmTJavXq1JOnAgQNudX3//fdXXceVJq+7XHh4uLZs2aJPP/1Ubdu2VUJCglq2bKkePXpc87npMpqR/EpfLP1zwjYAgO9jhBsAkG
26dOmiUaNGqVy5ch5PJmU1Y4zmzJmjhg0bus2oLUkjR45UfHy8evXqleHz77rrLhUvXlwrV67U2LFjr7idffv26fvvv9fAgQPdbjXmcDj0wAMPaM6cORoyZIjGjRvnMgJ7rXCakpIih8PhNiqdVc2bN7/m6dUpKSlatGiRGjRokGF96e/zhg0b1KpVK2f7hg0b5HA4Mv05uHjxovN0+cKFC7vV5Y0RfenSteVxcXGKi4uTw+FQ//79NXnyZA0dOlSlSpXK0lkZ6bc9++fs8b///rvL44IFCyosLEy//PLLVdd3s50ZAgD4HwI3ACDbPPzww/L391ft2rWv2i81NVV79+5V7ty5rziq6m3fffedEhMT9dJLL+mee+5xW75r1y4NHTpUhw4dyjBY2mw2jR8/Xps3b9YDDzxwxe2kj24//fTTzgnTLjd16lTFx8dryJAhql69eobrOHnypHLmzOk2w/vUqVMluc4InpycrP3796tAgQIu9wnPjKuNaqf78ssvdfLkSbfJ0tI1atRI+fLl08SJE10C98SJExUSEqLWrVtfs45du3Zp586dzv0RFBTkdtq5Nxw/flz58+d3Pvbz81PlypUlXfpiQZLzPumZufVaurCwMBUoUECrV6/W448/7mx/7733XPr5+fmpffv2mj17tjZs2OB2HbcxRjabLUs1AABuDAI3ACBTPvjgA3311Vdu7Zef1vzNN9/o/Pnzbn3at2+f4XWoJUqUcLvncEbSbwvWo0ePTE+c9sYbbygkJMSlzc/PT88//7zz8aJFi7Rjxw635/bo0UPx8fHy9/e/YgBs27atXnjhBc2bN0+DBw/OsE+7du2cs5tfSXx8vKpUqZJh2E7fzqOPPqpNmzapWrVqGfZZuXKlBg0apHvuuUelS5fWhQsXtGbNGi1evFg1atRQt27dnH3Xr1+vhg0batiwYS77/rPPPtPWrVslXfqC4+eff9aoUaOcNaQHzWuJj4+X3W5Xx44dM1weHByskSNHasCAAbr33nvVvHlzrVmzRrNnz9bLL7/sNqP6xYsXnaf2OxwOJSYmatKkSXI4HBo2bFimasqqhx9+WH///bcaNWqkYsWK6ffff9c777yjKlWqOCeDq1Klivz9/TV27FidOnVKdrtdjRo1cpuFPaN1jxkzRg8//LBq1Kih1atXZzgJ3CuvvKKvv/5asbGx6tOnj8qVK6fDhw9rwYIFWrt2rfLkyZPlGgAA1iNwAwAyZeLEiRm29+zZ0/n3r776KsNQHhUVdcMnfho9erRbm7+/v0vgvtK1zbGxsVqwYIHq1KlzxVtqVaxYUdHR0Zo9e/YVA/e1bNq0STt27HDemiojcXFxevTRRzV79uwrBu5KlSqpYcOG+uSTT3T48GEZYxQTE6MXX3xRTz311FUnD0u3aNEizZw50/l48+bNzntiFytWLFOBOykpSV988YVat27tMpnbP/Xv31+BgYEaN26cPv30U0VGRurNN9/M8Jr0lJQUlzMEwsLCVLNmTc2aNUuNGze+Zk3Xo1u3bnr//ff13nvv6eTJkypcuLDuu+8+DR8+XH5+l6bBKVy4sCZNmqTRo0froYceUlpamhISEq4Zdl988UUdPXpUCxcudE7MtnTpUrfnFS1aVOvWrdPQoUMVHx+vpKQkFS1aVC1btnR+oZTVGgAA1rMZY0x2FwEAAAAAwK2GWcoBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAMhGM2bMkM1m04YNGyzfls1m0/Dhwy3fDgAAuITADQD410gPt5f/hIeHq2HDhlq6dGmW1/vKK69oyZIl3ivUQ2vXrlXLli1VtGhRBQUFqXjx4oqLi9OcOXOyrSZva9q0qWw2mwYOHHhd69m4caPatGmjwoULK1euXKpcubLGjx+vtLQ0L1UKAMD/BGR3AQAA3GgvvfSSoqOjZYzRX3/9pRkzZqhVq1b67LPP1KZNG4/X98orr+iee+5R+/btvV/sNSxYsED33XefqlSposcee0x58+bVvn37tHr1ak2ZMkVdu3Z19j137pwCAnzvV//ixYv1ww8/XPd6Nm7cqDp16qh06dJ65plnFBISoqVLl+qxxx7T3r179fbbb3uhWgAA/sf3fusCAHCdWrZsqRo1ajgfP/TQQypUqJDmzp2bpcCdnYYPH67y5cvrxx9/VI4cOVyWHTlyxOVxUFDQjSzNK86fP6///Oc/euaZZ/Tiiy9e17omT54sSVq9erXy5csnSerbt69iY2M1Y8YMAjcAwOs4pRwA8K+XJ08eBQcHu43+vv7666pTp47y58+v4OBgVa9eXQsXLnTpY7PZdPbsWc2cOdN5mnrPnj2dyw8ePKiHHnpIERERstvtio6OVr9+/XThwgWX9aSkpGjw4MEqWLCgcubMqQ4dOujo0aPXrH3v3r2qWbOmW9iWpPDwcLda06/hTkxMdDu9/vKfy61bt04tWrRQ7ty5FRISotjYWH333XcufU6fPq3HH39cUVFRstvtCg8PV9OmTbVp0yZnn+TkZO3YsUPHjh275utK9+qrr8rhcOjJJ5/M9HOuJCkpSUFBQcqTJ49Le5EiRRQcHHzd6wcA4J8Y4QYA/OucOnVKx44dkzFGR44c0TvvvKMzZ86oW7duLv3efvtttW3bVvfff78uXLigefPm6d5779Xnn3+u1q1bS5JmzZqlhx9+WLVq1VKfPn0kSTExMZKkQ4cOqVatWjp58qT69OmjsmXL6uDBg1q4cKGSk5NdQvKjjz6qvHnzatiwYUpMTNRbb72lgQMHav78+Vd9LSVKlNA333yjP/74Q8WKFcv0PihYsKBmzZrl0paamqonnnjCpa5vv/1WLVu2VPXq1TVs2DD5+flp+vTpatSokdasWaNatWpJkh555BEtXLhQAwcOVPny5XX8+HGtXbtW27dvV7Vq1SRJ69evV8OGDTVs2LBMTd62f/9+jRkzRh988IFXAnGDBg00f/589e3bV4MHD3aeUr548WK99tpr171+AADcGAAA/iWmT59uJLn92O12M2PGDLf+ycnJLo8vXLhgKlasaBo1auTSnjNnTtOjRw+353fv3t34+fmZn376yW2Zw+FwqalJkybONmOMeeKJJ4y/v785efLkVV/TtGnTjCSTI0cO07BhQzN06FCzZs0ak5aW5tZXkhk2bNgV19W/f3/j7+9vvv32W2eNpUuXNs2bN3epLTk52URHR5umTZs623Lnzm0GDBhw1VoTEhKuWcPl7rnnHlOnTh2X+q+1jau5ePGiGThwoAkMDHS+9/7+/mbixIlZXicAAFfDCDcA4F/n3Xff1W233SZJ+uuvvzR79mw9/PDDCg0N1d133+3sd/mo6okTJ5SWlqZ69epp7ty519yGw+HQkiVLFBcX53K9eLp/nrbdp08fl7Z69erpzTff1O+//67KlStfcTsPPvigihYtqjfeeEMJCQlKSEjQyJEjVbJkSc2aNUt16tS5Zq2S9OGHH+q9997TuHHj1LBhQ0nSli1btHv3b
g0ZMkTHjx936d+4cWPNmjVLDodDfn5+ypMnj9atW6dDhw4pIiIiw200aNBAxphM1ZOQkKBFixZp3bp1meqfGf7+/oqJiVHz5s117733KigoSHPnztWjjz6qwoULZ8ukdwCAWxuBGwDwr1OrVi2XENylSxdVrVpVAwcOVJs2bZynVH/++ecaNWqUtmzZopSUFGf/f4bljBw9elRJSUmqWLFipmoqXry4y+O8efNKuhT0r6V58+Zq3ry5kpOTtXHjRs2fP1+TJk1SmzZttGPHDrdruf9py5YteuSRR9SlSxcNHjzY2b57925JUo8ePa743FOnTilv3rx69dVX1aNHD0VGRqp69epq1aqVunfvrpIlS16z/n+6ePGiBg0apAceeEA1a9b0+PlXMmbMGL399tvavXu3cuXKJUnq1KmTGjZsqAEDBqhNmzY+OYs7AODmxaRpAIB/PT8/PzVs2FCHDx92hsw1a9aobdu2CgoK0nvvvacvv/xSy5cvV9euXTM9SusJf3//DNs92VZISIjq1aunCRMmaMiQITpx4sQ17y9+4sQJdezYUbfddpumTp3qsszhcEiSXnvtNS1fvjzDn8uD62+//aZ33nlHEREReu2111ShQoUs3d/8ww8/1M6dO9W3b18lJiY6f6RLk7MlJiYqOTnZ4/W+9957atSokbPmdG3bttWhQ4ec2wAAwFv4GhcAAF0aVZWkM2fOSJIWLVqkoKAgLVu2THa73dlv+vTpbs/NaMS7YMGCCgsL0y+//GJRxVeXPoJ/+PDhK/ZxOBy6//77dfLkSa1YsUIhISEuy9MnfwsLC1OTJk2uuc0iRYqof//+6t+/v44cOaJq1arp5ZdfVsuWLT2qff/+/UpNTVXdunXdln344Yf68MMP9fHHH3t8Cvhff/2ltLQ0t/bU1FRJ//sMAADgLYxwAwD+9VJTU/X1118rR44cKleunKRLI842m80loCUmJmrJkiVuz8+ZM6dOnjzp0ubn56f27dvrs88+04YNG9ye461R8m+++SbD9i+//FKSVKZMmSs+d8SIEVq2bJnmzp2r6Ohot+XVq1dXTEyMXn/9decXEZdLv21ZWlqaTp065bIsPDxcERERLqfiZ/a2YJ07d9bHH3/s9iNJrVq10scff6zatWtfdR0Zue2227R8+XKX69HT0tL00UcfKTQ01PkFAwAA3sIINwDgX2fp0qXasWOHJOnIkSOaM2eOdu/erWeffVZhYWGSpNatW+uNN95QixYt1LVrVx05ckTvvvuuSpUqpZ9//tllfdWrV9eKFSv0xhtvKCIiQtHR0apdu7ZeeeUVff3114qNjVWfPn1Urlw5HT58WAsWLNDatWvd7gedFe3atVN0dLTi4uIUExOjs2fPasWKFfrss89Us2ZNxcXFZfi8bdu2aeTIkapfv76OHDmi2bNnuyzv1q2b/Pz8NHXqVLVs2VIVKlRQr169VLRoUR08eFAJCQkKCwvTZ599ptOnT6tYsWK65557dPvttytXrlxasWKFfvrpJ40bN865zszeFqxs2bIqW7Zshsuio6PdRrYbNGigVatWXfNLjGeffVbdunVT7dq11adPHwUHB2vu3LnauHGjRo0apcDAwKs+HwAATxG4AQD/Oi+++KLz70FBQSpbtqwmTpyovn37OtsbNWqkadOmacyYMXr88ccVHR2tsWPHKjEx0S1wv/HGG+rTp4+GDBmic+fOqUePHqpdu7aKFi2qdevWaejQoYqPj1dSUpKKFi2qli1bup2+nVVTp07VJ598oo8++kiHDh2SMUYlS5bUCy+8oGeeeeaKk4AdP35cxhitWrVKq1atcluefk/yBg0a6IcfftDIkSM1YcIEnTlzRoULF1bt2rWd+yskJET9+/fX119/rcWLF8vhcKhUqVJ677331K9fP6+8zqtJr+la7r//fhUoUECjR4/Wa6+9pqSkJJUpU0aTJk1yee8BAPAWm7Fi5hcAAIAb4PTp08qXL5/eeustDRgwILvLAQDABddwAwAAn7V69WoVLVpUvXv3zu5SAABwwwg3AAAAAAAWYIQbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsIBP34fb4XDo0KFDCg0Nlc1my+5yAAAAAAC3OGOMTp8+rYiICPn5XX0M26cD96FDhxQZGZndZQAAAAAA/mUOHDigYsWKXbWPTwfu0NBQSZdeaFhYWDZXg1tZamqqvv76azVr1kyBgYHZXQ4AXDeOawBuNRzXcKMkJSUpMjLSmUevxqcDd/pp5GFhYQRuWCo1NVUhISEKCwvjAA7glsBxDcCthuMabrTMXNbMpGkAAAAAAFiAwA0AAAAAgAUI3AAAAAAAWMCnr+HOrLS0NKWmpmZ3GfBhqampCggI0Pnz55WWlpbl9QQGBsrf39+LlQEAAAC4Wd3SgdsYoz///FMnT57M7lLg44wxKly4sA4cOHDd93zPkyePChcuzL3jAQAAgFvcLR2408N2eHi4QkJCCDjIMofDoTNnzihXrlzXvLn9lRhjlJycrCNHjkiSihQp4s0SAQAAANxkbtnAnZaW5gzb+fPnz+5y4OMcDocuXLigoKCgLAduSQoODpYkHTlyROHh4ZxeDgAAANzCbtlJ09Kv2Q4JCcnmSgBX6Z9J5hUAAAAAbm23bOBOx2nkuNnwmQQAAAD+HW75wA3fEBUVpbfeeiu7y8iS4cOHq0qVKtldBgAAAICbDIH7JvbDDz/I399frVu3zu5SMnQjQ/LNEmr9/f21ZMmS7C4DAAAAgA+4ZSdNu5qoZ7+4YdtKHJP1sDxt2jQ9+uijmjZtmg4dOqSIiAgvVgYAAAAAsBIj3DepM2fOaP78+erXr59at26tGTNmuPX57LPPVLNmTQUFBalAgQLq0KGDc1lKSoqeeeYZRUZGym63q1SpUpo2bZpz+S+//KKWLVsqV65cKlSokB544AEdO3bMubxBgwYaOHCgBg4cqNy5c6tAgQIaOnSojDHO5b///rueeOIJ2Ww2l+uS165dq3r16ik4OFiRkZEaNGiQzp4961x+5MgRxcXFKTg4WNHR0YqPj7/u/XXgwAF16tRJefLkUb58+dSuXTslJiY6l/fs2VPt27fX66+/riJFiih//vwaMGCAy8Rlhw8fVuvWrZ11zZkzx2UUv3LlypKkDh06yGazKSoqyqWGWbNmKSoqSrlz51bnzp11+vTp635dAAAAAHwXgfsm9dFHH6ls2bIqU6aMunXrpg8++MAZdiXpiy++UIcOHdSqVStt3rxZ33zzjWrVquVc3r17d82dO1fjx4/X9u3bNXnyZOXKlUuSdPLkSTVq1EhVq1bVhg0b9NVXX+mvv/5Sp06dXGqYOXOmAgICtH79er399tt64403NHXqVEnS4sWLVaxYMb300ks6fPiwDh8+LEnau3evWrRooY4dO+rnn3/W/PnztXbtWg0cONC53p49e+rAgQNKSEjQwoUL9d577znvTZ0Vqampat68uUJDQ7VmzRp99913ypUrl1q0aKELFy44+yUkJGjv3r1KSEjQzJkz
NWPGDJcvMrp3765Dhw5p5cqVWrRokd5//32Xur799ltJ0vTp03X48GH99NNPzmV79+7VkiVL9Pnnn+vzzz/XqlWrNGbMmCy/JgAAAAC+7195SrkvmDZtmrp16yZJatGihU6dOqVVq1apQYMGkqSXX35ZnTt31ogRI5zPuf322yVJu3bt0kcffaTly5erSZMmkqSSJUs6+02YMEFVq1bVK6+84mz74IMPFBkZqV27dum2226TJEVGRurNN9+UzWZTmTJltG3bNr355pvq3bu38uXLJ39/f4WGhqpw4cLO9YwePVr333+/Hn/8cUlS6dKlNX78eMXGxmrixInav3+/li5dqvXr16tmzZrO11quXLks76v58+fL4XBo6tSpzpH26dOnK0+ePFq5cqWaNWsmScqbN68mTJggf39/lS1bVq1bt9Y333yj3r17a8eOHVqxYoV++ukn1ahRQ5I0depUlS5d2rmdAgUKSJLy5Mnj8pqlS/fpnjFjhkJDQyVJDzzwgL755hu9/PLLWX5dAAAAAHwbI9w3oZ07d2r9+vXq0qWLJCkgIED33XefyynhW7ZsUePGjTN8/pYtW+Tv76/Y2NgMl2/dulUJCQnKlSuX86ds2bKSLo3UprvjjjtcThW/8847tXv3bqWlpV2x9q1bt2rGjBku627evLkcDof27dun7du3KyAgQNWrV3c+p2zZssqTJ8+1d8xVtrlnzx6FhoY6t5kvXz6dP3/e5fVUqFBB/v7+zsdFihRxjmDv3LlTAQEBqlatmnN5qVKllDdv3kzVEBUV5Qzb/1w3AAAAgH8nRrhvQtOmTdPFixddJkkzxshut2vChAnKnTu3goODr/j8qy2TLl0fHhcXp7Fjx7otK1KkSNYL//919+3bV4MGDXJbVrx4ce3ateu61n+lbVavXj3Da8ELFizo/HtgYKDLMpvNJofD4ZUarFw3AAAAAN9E4L7JXLx4UR9++KHGjRvnPBU6Xfv27TV37lw98sgjqly5sr755hv16tXLbR2VKlWSw+HQqlWrnKeUX65atWpatGiRoqKiFBBw5Y/AunXrXB7/+OOPKl26tHOUOEeOHG6j3dWqVdOvv/6qUqVKZbjOsmXL6uLFi9q4caPzlPKdO3fq5MmTV6zjWqpVq6b58+crPDxcYWFhWVpHmTJldPHiRW3evNk5+r5nzx6dOHHCpV9gYOBVR/gBAAAAIB2nlN9kPv/8c504cUIPPfSQKlas6PLTsWNH52nlw4YN09y5czVs2DBt375d27Ztc45YR0VFqUePHnrwwQe1ZMkS7du3TytXrtRHH30kSRowYID+/vtvdenSRT/99JP27t2rZcuWqVevXi5hcv/+/Ro8eLB27typuXPn6p133tFjjz3mXB4VFaXVq1fr4MGDzhnOn3nmGX3//fcaOHCgtmzZot27d+uTTz5xTppWpkwZtWjRQn379tW6deu0ceNGPfzww9cclZekc+fOacuWLS4/e/fu1f33368CBQqoXbt2WrNmjfP1Dho0SH/88Uem9nvZsmXVpEkT9enTR+vXr9fmzZvVp08fBQcHu5xWHxUVpW+++UZ//vmnWxgHAAAAgMsRuG8y06ZNU5MmTZQ7d263ZR07dtSGDRv0888/q0GDBlqwYIE+/fRTValSRY0aNdL69eudfSdOnKh77rlH/fv3V9myZdW7d2/nrbkiIiL03XffKS0tTc2aNVOlSpX0+OOPK0+ePPLz+99Honv37jp37pxq1aqlAQMG6LHHHlOfPn2cy1966SUlJiYqJibGeep25cqVtWrVKu3atUv16tVT1apV9eKLL7qcHj99+nRFREQoNjZWd999t/r06aPw8PBr7ptdu3apatWqLj99+/ZVSEiIVq9ereLFi+vuu+9WuXLl9NBDD+n8+fMejXh/+OGHKlSokOrXr68OHTqod+/eCg0NVVBQkLPPa6+9puXLlysyMlJVq1bN9LoBAAAA/PvYzOX3mvIxSUlJyp07t06dOuUWrM6fP699+/YpOjraJTAhcxo0aKAqVao470H9b/THH38oMjJSK1asUMOGDZWUlKSwsDCXLyWygs8mgJtBamqqvvzyS7Vq1cptHgoA8EUc13CjXC2H/hPXcAP/79tvv9WZM2dUqVIlHT58WE8//bSioqJUv3797C4NAAAAgA8icAP/LzU1Vc8//7x+++03hYaGqk6dOoqPj1dgYCAzjgMAAADwGIEbGVq5cmV2l3DDNW/eXM2bN8/uMgAAAADcIpg0DQAAAAAAC2Rr4B4+fLhsNpvLT9myZbOzJAAAAAAAvCLbTymvUKGCVqxY4XwcEODdknx4EnbcovhMAgCAzIh69ovsLsGn2P2NXq0lVRy+TClptuwux6ckjmmd3SXcsrI9cAcEBKhw4cJeX2/6rQCSk5MVHBzs9fUDWZWcnCxJ3K4CAAAAuMVle+DevXu3IiIiFBQUpDvvvFOjR49W8eLFM+ybkpKilJQU5+OkpCRJl2aXTk1NdesfGhqqv/76Sw6HQyEhIbLZ+KYLWWOM0YULF3Tu3Lksf46MMUpOTtbRo0cVFhYmh8PB7OcAsk36782Mfn8CuDnY/TkrzhN2P+PyJzKP3wWe8WR/2Uw2nt+6dOlSnTlzRmXKlNHhw4c1YsQIHTx4UL/88otCQ0Pd+g8fPlwjRoxwa58zZ45CQkIy3EZoaKhCQ0Pl58f8cMh+DodDp0+f1unTp7O7FAAAAABZkJycrK5du+rUqVMKCwu7at9sDdz/dPLkSZUoUUJvvPGGHnroIbflGY1wR0ZG6tixY1d9oWlpabp48SLXziLLLl68qO+//1516tTJ8jwDNptNAQEB8vf393J1AOC51NRULV++XE2bNuUSF+AmVXH4suwuwafY/YxG1nBo6AY/pTg4s9UTvwzn1rieSEpKUoECBTIVuLP9lPLL5cmTR7fddpv27NmT4XK73S673e7WHhgYeNX/LPAfCVyv1NRUXbx4Ubly5eLzBOCWcq3foQCyDxN/ZU2Kw8a+8xC/Bzzjyf66qc6zPnPmjPbu3asiRYpkdykAAAAAAFyXbA3cTz75pFatWqXExER9//336tChg/z9/dWlS5fsLAsAAAAAgOuWraeU//HHH+rSpYuOHz+uggUL6q677tKPP/6oggULZmdZAAAAAABct2wN3PPmzcvOzQMAAAAAYJmb6hpuAAAAAABuFQRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAA
sQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACN03gHjNmjGw2mx5//PHsLgUAAAAAgOt2UwTun376SZMnT1blypWzuxQAAAAAALwi2wP3mTNndP/992vKlCnKmzdvdpcDAAAAAIBXZHvgHjBggFq3bq0mTZpkdykAAAAAAHhNQHZufN68edq0aZN++umnTPVPSUlRSkqK83FSUpIkKTU1VampqZbUCEhyfr74nAG4VXBcA25+dn+T3SX4FLufcfkTmcfvAs94sr+yLXAfOHBAjz32mJYvX66goKBMPWf06NEaMWKEW/vXX3+tkJAQb5cIuFm+fHl2lwAAXsVxDbh5vVoruyvwTSNrOLK7BJ/z5ZdfZncJPiU5OTnTfW3GmGz5CmjJkiXq0KGD/P39nW1paWmy2Wzy8/NTSkqKyzIp4xHuyMhIHTt2TGFhYTesdvz7pKamavny5WratKkCAwOzuxwAuG4c14CbX8Xhy7K7BJ9i9zMaWcOhoRv8lOKwZXc5PuWX4c2zuwSfkpSUpAIFCujUqVPXzKHZNsLduHFjbdu2zaWtV69eKlu2rJ555hm3sC1JdrtddrvdrT0wMJD/LOCG4LMG4FbDcQ24eaWkERqzIsVhY995iN8DnvFkf2Vb4A4NDVXFihVd2nLmzKn8+fO7tQMAAAAA4GuyfZZyAAAAAABuRdk6S/k/rVy5MrtLAAAAAADAKxjhBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxwU02aBgBAVkU9+0V2l+BT7P5Gr9aSKg5fxv1qPZQ4pnV2lwAA8BGMcAMAAAAAYAECNwAAAAAAFiBwAwAAAABgAQI3AAAAAAAWIHADAAAAAGABAjcAAAAAABYgcAMAAAAAYAECNwAAAAAAFiBwAwAAAABgAQI3AAAAAAAWIHADAAAAAGABAjcAAAAAABYgcAMAAAAAYAECNwAAAAAAFiBwAwAAAABgAQI3AAAAAAAWIHADAAAAAGABAjcAAAAAABYgcAMAAAAAYAECNwAAAAAAFiBwAwAAAABgAQI3AAAAAAAW8Dhwb9q0Sdu2bXM+/uSTT9S+fXs9//zzunDhgleLAwAAAADAV3kcuPv27atdu3ZJkn777Td17txZISEhWrBggZ5++mmvFwgAAAAAgC/yOHDv2rVLVapUkSQtWLBA9evX15w5czRjxgwtWrTI2/UBAAAAAOCTPA7cxhg5HA5J0ooVK9SqVStJUmRkpI4dO+bd6gAAAAAA8FEeB+4aNWpo1KhRmjVrllatWqXWrVtLkvbt26dChQp5vUAAAAAAAHyRx4H7rbfe0qZNmzRw4EC98MILKlWqlCRp4cKFqlOnjtcLBAAAAADAFwV4+oTKlSu7zFKe7rXXXpO/v79XigIAAAAAwNd5HLjTXbhwQUeOHHFez52uePHi110UAAAAAAC+zuPAvWvXLj300EP6/vvvXdqNMbLZbEpLS/NacQAAAAAA+CqPA3evXr0UEBCgzz//XEWKFJHNZrOiLgAAAAAAfJrHgXvLli3auHGjypYta0U9AAAAAADcEjyepbx8+fLcbxsAAAAAgGvwOHCPHTtWTz/9tFauXKnjx48rKSnJ5QcAAAAAAGThlPImTZpIkho3buzSzqRpAAAAAAD8j8eBOyEhwYo6AAAAAAC4pXgcuGNjY62oAwAAAACAW4rH13BL0po1a9StWzfVqVNHBw8elCTNmjVLa9eu9WpxAAAAAAD4Ko8D96JFi9S8eXMFBwdr06ZNSklJkSSdOnVKr7zyitcLBAAAAADAF3kcuEeNGqVJkyZpypQpCgwMdLbXrVtXmzZt8mpxAAAAAAD4Ko8D986dO1W/fn239ty5c+vkyZPeqAkAAAAAAJ/nceAuXLiw9uzZ49a+du1alSxZ0itFAQAAAADg6zwO3L1799Zjjz2mdevWyWaz6dChQ4qPj9eTTz6pfv36WVEjAAAAAAA+x+Pbgj377LNyOBxq3LixkpOTVb9+fdntdj355JN69NFHragRAAAAAACf43Hgvnjxol544QU99dRT2rNnj86cOaPy5csrV65cOnbsmAoUKGBFnQAAAAAA+BSPTynv3LmzjDHKkSOHypcvr1q1ailXrlz666+/1KBBAwtKBAAAAADA93gcuPfv36+HH37Ype3w4cNq0KCBypYt67XCAAAAAADwZR4H7i+//FLff/+9Bg8eLEk6dOiQGjRooEqVKumjjz7yeoEAAAAAAPgij6/hLliwoL7++mvdddddkqTPP/9c1apVU3x8vPz8PM7vAAAAAADckjwO3JIUGRmp5cuXq169emratKlmzZolm83m7doAAAAAAPBZmQrcefPmzTBQJycn67PPPlP+/PmdbX///bf3qgMAAAAAwEdlKnC/9dZbFpcBAAAAAMCtJVOBu0ePHlbXAQAAAADALSVL13CnpaVpyZIl2r59uySpQoUKatu2rfz9/b1aHAAAAAAAvsrjwL1nzx61atVKBw8eVJkyZSRJo0ePVmRkpL744gvFxMR4vUgAAAAAAHyNx/fxGjRokGJiYnTgwAFt2rRJmzZt0v79+xUdHa1BgwZZUSMAAAAAAD7H4xHuVatW6ccff1S+fPmcbfnz59eYMWNUt25drxYHAAAAAICv8niE22636/Tp027tZ86cUY4cObxSFAAAAAAAvi7TgXv16tVKTU1VmzZt1KdPH61bt07GGBlj9OOPP+qRRx5R27ZtrawVAAAAAACfkenA3bBhQ504cU
[... base64-encoded PNG data omitted: bar plot "Average Number of Accepted Tokens per Step" (LLM: LLAMA-3.1-70B-Instruct, batch sizes 4 and 8; x-axis: Model, y-axis: Average Number of Accepted Tokens) ...]",
+      "text/plain": [
+       "<Figure size 1200x800 with 1 Axes>"
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "accepted_lengths = []\n", + "\n", + "for ssm in small_model_names:\n", + " for batch_size in batch_sizes:\n", + " for arrival_rate in arrival_rates:\n", + " model_name = ssm.replace(\"/\", \"-\")\n", + " filepath = f\"/usr/FlexFlow/inference/output/specinfer_llm_meta-llama-Llama-3.1-70B-Instruct_ssm_{model_name}_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n", + " if os.path.exists(filepath):\n", + " accepted_lengths.append({\n", + " 'Model': model_name,\n", + " 'Batch Size': batch_size,\n", + " 'Arrival Rate': arrival_rate,\n", + " 'Accepted Length': get_accepted_len(filepath)\n", + " })\n", + "\n", + "accepted_df = pd.DataFrame(accepted_lengths)\n", + "\n", + "# # Create a bar plot\n", + "# fig, ax = plt.subplots(figsize=(12, 8))\n", + "# accepted_df.pivot_table(index=['Model', 'Batch Size'], columns='Arrival Rate', values='Accepted Length').plot(kind='bar', ax=ax)\n", + "# plt.title('Accepted Length by Model, Batch Size, and Arrival Rate')\n", + "# plt.ylabel('Accepted Length')\n", + "# plt.xlabel('Model and Batch Size')\n", + "# plt.legend(title='Arrival Rate')\n", + "# plt.show()\n", + "# Group by model and calculate the mean of accepted lengths\n", + "average_accepted_df = accepted_df.groupby('Model')['Accepted Length'].mean().reset_index()\n", + "\n", + "# Sort the dataframe by 'Accepted Length' in ascending order\n", + "average_accepted_df = average_accepted_df.sort_values(by='Accepted Length')\n", + "\n", + "# Create a bar plot\n", + "fig, ax = plt.subplots(figsize=(12, 8))\n", + "average_accepted_df.plot(x='Model', y='Accepted Length', kind='bar', ax=ax)\n", + "plt.title('Average Number of Accepted Tokens per Step\\nLLM: LLAMA-3.1-70B-Instruct\\nBatch Sizes: 4, 8')\n", + "plt.ylabel('Average Number of Accepted Tokens')\n", + "plt.xlabel('Model')\n", + "plt.grid(True) # Turn the grid on\n", + "\n", + "# Save the plot as a PDF\n", + "plt.savefig('/usr/FlexFlow/benchmarking/average_accepted_tokens.pdf', bbox_inches='tight')\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAB8UAAAHvCAYAAADNQw6XAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1gUV9sG8HupSwcVRBABkfiCXewNsCEidrELRmOPGjUajQXFGnvXaAKKEOy9koixJZbYYi8RNTZEpQhSd74/+HbisksVXBbv33VxwZ45M/PM7DqP8Mw5IxEEQQAREREREREREREREREREVEppKXuAIiIiIiIiIiIiIiIiIiIiIoLi+JERERERERERERERERERFRqsShORERERERERERERERERESlFoviRERERERERERERERERERUarEoTkREREREREREREREREREpRaL4kREREREREREREREREREVGqxKE5ERERERERERERERERERKUWi+JERERERERERERERERERFRqsShORERERERERERERERERESlFoviRERERERUYp04cQISiQQ7duxQdyiFEh0dDYlEgkWLFqk7FCoi7969w+DBg2FtbQ2JRIKxY8eqOyQlEokEgYGBCm0XLlxAkyZNYGRkBIlEgitXrgAAjhw5gtq1a0MqlUIikSAuLu6Tx6upAgMDIZFIinUfDg4OCAgIKNZ9EBERERERfQ5YFCciIiIiok9KIpHk6+vEiRPqDlXjHTp0SKk4qg7y4mFeXx4eHgCAgIAAhXZTU1PUqlULixcvRmpqqtL2z5w5gy5duqB8+fLQ19eHg4MDhg4disePH4t95Dco5OcrOjo6x2OZO3cuQkJCMHz4cISGhqJ///5FfboUODg4iHFpaWnB3NwcNWrUwJAhQ3Du3Ll8bSM9PR09evTAmzdvsHTpUoSGhsLe3h6vX7+Gn58fDAwMsHr1aoSGhsLIyKhYj6ewnj17hsDAQLGYXxBr1qyBRCJBw4YNiz6wEiL7Z9jU1BTu7u44ePBgobcZHh6OZcuWFV2QREREREREaqSj7gCIiIiIiOjzEhoaqvB68+bNiIyMVGp3cXHBrVu3PmVopc6hQ4ewevVqtRfGu3btiipVqoiv3717h+HDh6NLly7o2rWr2F6+fHnxZ319fWzcuBEAEBcXh507d2LChAm4cOECIiIixH4rV67EmDFjULlyZXz99deoUKECbt26hY0bN2Lr1q04dOgQmjRpAktLS6XP2OLFi/Hvv/9i6dKlCu2WlpY5Hsvx48fRqFEjzJgxo3AnoxBq166N8ePHAwASExNx69YtbN++HRs2bMA333yDJUuWKPR///49dHT++3X/wYMHePToETZs2IDBgweL7UeOHEFiYiKCgoLQunXrT3MwhfTs2TPMnDkTDg4OqF27doHWDQsLg4ODA86fP4/79+8rfBY/xtSpU/Hdd98VybaKQps2bTBgwAAIgoBHjx5h7dq18PX1xeHDh+Hl5VXg7YWHh+P69eslcjYEIiIiIiKigmJRnIiIiIiIPql+/fopvP7zzz8RGRmp1A7go4viycnJMDQ0/Kht0MerWbMmatasKb6OjY3F8OHDUbNmTZXvOwDo6OgoLBsxYgQaNmyIrVu3YsmSJbCxscGZM2cwduxYNGvWDEeOHFF4r4cPH46mTZuie/fuuHHjBiwsLJT2FRERgbdv3+YYgyoxMTFwdXXNd/+8ZGRkQCaTQU9PL8c+tra2SjEuWLAAffr0wdKlS+Hs7Izhw4eLy6RSqVLMAGBubp6v9o+RlJRUokabP3z4EGfPnsWuXbswdOhQhIWF5euGhtzeF/kx6ujoKNx8oG5ffPGFwuekW7ducHV1xfLlywtVFCciIiIiIipNOH06ERERERGVeDKZDHPmzEHFihUhlUrRqlUr3L9/X6GPh4cHqlevjr/++gstWrSAoaEhpkyZAiCr+Ddo0CCUL18eUqkUtWrVwqZNmxTWlz+/PPu07fJpt0NCQhTat2/fDldXV0ilUlSvXh27d+9GQEAAHBwcVB7Djz/+CCcnJ+jr66N+/fq4cOGCwvKAgAAYGxvjn3/+gZeXF4yMjGBjY4NZs2ZBEIQCxxkQEIDVq1cDUJxaOScdOnRA5cqVVS5r3Lgx6tWrJ76OjIxEs2bNYG5uDmNjY1StWlU818VFS0tLnF5dPr15UFAQJBIJNm3apHTzg5OTE3744Qc8f/4c69ev/+j9y8/7w4cPcfDgQaWp1vPzGfvwGfPLli0TPw83b94scDwGBgYIDQ1FmTJlMGfOHIXPyIfPFA8ICIC7uzsAoEePHuI09R4eHvD39wcA1K9fHxKJROHZ1efOnUO7du1gZmYGQ0NDuLu748yZMwoxyKfFv3nzJvr06QMLCws0a9ZMXL5lyxa4ubnBwMAAZcqUQa9evfDkyROFbcj/3d68eROenp4wNDSEra0tfvjhB4VzX79+fQDAwIEDxXOf/d+kKmFhYbCwsICPjw+6d++OsLAwpT65vS+5HWP2Z4pXr14dnp6eStuXyWSwtbVF9+7dxbZFixahSZMmKFu2LAwMDODm5oYdO3bkeTwF4eLignLlyuHBgwcK7Xv37oWPjw9sbGygr68PJycnBAUFITMzU+zj4eGBgwcP4tGjR+L5/vDalpqaihkzZqBKlSrQ19eHnZ0dJk6cqPR4A3VcK4iIiIiIiFQpObc0ExERERER5WD+/PnQ0tLChAkTEB8fjx9++AF9+/ZVeqby69ev4e3tjV69eqFfv34oX7483r9/Dw8PD9y/fx+jRo2Co6Mjtm/fjoCAAMTFxWHMmDEFjufgwYPo2bMnatSogXnz5uHt27cYNGgQbG1tVfYPDw9HYmIihg4dColEgh9++AFdu3bFP//8A11dXbFfZmYm2rVrh0aNGuGHH37AkSNHMGPGDGRkZGDWrFkFinHo0KF49uyZyqnpVenZsycGDBiACxcuiAVIAHj06BH+/PNPLFy4EABw48YNdOjQATVr1sSsWbOgr6+P+/fvKxVMi4O8uFe2bFkkJyfjt99+Q/PmzeHo6JjjMQ0ZMgQHDhz46GmuXVxcEBoaim+++QYVK1YUpzO3tLQs8GcsODgYKSkpGDJkCPT19VGmTJlCxWRsbIwuXbrgp59+ws2bN1GtWjWlPkOHDoWtrS3mzp2L0aNHo379+uI09VWrVsWPP/6IWbNmwdHREU5OTgCypoj39vaGm5sbZsyYAS0tLQQHB6Nly5Y4deoUGjRooLCPHj16wNnZGXPnzhWL83PmzMG0adPg5+eHwYMH49WrV1i5ciVatGiBy5cvK4xOf/v2Ldq1a4euXbvCz88PO3bswKRJk1CjRg14e3vDxcUFs2bNwvTp0zFkyBA0b94cANCkSZM8z1FYWBi6du0KPT099O7dG2vXrlX6jMvl9r6oOsbsevbsicDAQLx48QLW1tZi++nTp/Hs2TP06tVLbFu+fDk6duyIvn37Ii0tDREREejRowcOHDgAHx+fPI8rP+Lj4/H27VvxfZULCQmBsbExxo0bB2NjYx
w/fhzTp09HQkKC+O/8+++/R3x8vMLjBYyNjQFkFfk7duyI06dPY8iQIXBxccHff/+NpUuX4u7du9izZw8A9V4riIiIiIiIlAhERERERERqNHLkSCGnX02ioqIEAIKLi4uQmpoqti9fvlwAIPz9999im7u7uwBAWLduncI2li1bJgAQtmzZIralpaUJjRs3FoyNjYWEhASFfUVFRSms//DhQwGAEBwcLLbVqFFDqFixopCYmCi2nThxQgAg2NvbK61btmxZ4c2bN2L73r17BQDC/v37xTZ/f38BgPD111+LbTKZTPDx8RH09PSEV69eFTjO3M5tdvHx8YK+vr4wfvx4hfYffvhBkEgkwqNHjwRBEISlS5cKAMR4CuPVq1cCAGHGjBkql/v7+wtGRkbCq1evhFevXgn3798X5s6dK0gkEqFmzZqCIAjClStXBADCmDFjct1XzZo1hTJlyqhc5uPjo/B+5Ye9vb3g4+Oj0Jbfz5j8PTI1NRViYmIKvb8Pyd+PvXv3im3Zz638M7N9+3aFdYODgwUAwoULF8Q2mUwmODs7C15eXoJMJhPbk5OTBUdHR6FNmzZi24wZMwQAQu/evRW2Gx0dLWhrawtz5sxRaP/7778FHR0dhXb5v9vNmzeLbampqYK1tbXQrVs3se3ChQtKn++8XLx4UQAgREZGisdWsWJFpc9Mbu9LTsf44TK5O3fuCACElStXKvQbMWKEYGxsLCQnJ4ttH/4sCFmfl+rVqwstW7ZUaLe3txf8/f3zPFYAwqBBg4RXr14JMTExwsWLF4V27doJAISFCxcq9M2+b0EQhKFDhwqGhoZCSkqK2JbTv4/Q0FBBS0tLOHXqlEL7unXrBADCmTNnBEEommsFERERERFRUeH06UREREREVOINHDhQ4dm+8pGi//zzj0I/fX19DBw4UKHt0KFDsLa2Ru/evcU2XV1djB49Gu/evcPvv/9eoFiePXuGv//+GwMGDBBHTgKAu7s7atSooXKdnj17wsLCIs/4AWDUqFHizxKJBKNGjUJaWhp+/fXXAsVZUKampvD29sa2bdsURsJu3boVjRo1QqVKlQD89/zpvXv3QiaTFVs8SUlJsLS0hKWlJapUqYIpU6agcePG2L17NwAgMTERAGBiYpLrdkxMTJCQkFBscQIF/4x169YNlpaWRbJv+WdQfj4+1pUrV3Dv3j306dMHr1+/RmxsLGJjY5GUlIRWrVrh5MmTSu/7sGHDFF7v2rULMpkMfn5+4vqxsbGwtraGs7MzoqKilI7hw2dh6+npoUGDBir/fRREWFgYypcvL05pLpFI0LNnT0RERChMFS6X2/uS/RhV+eKLL1C7dm1s3bpVbMvMzMSOHTvg6+sLAwMDsf3Dn9++fYv4+Hg0b94cly5dyvfxZffTTz/B0tISVlZWqFevHn777TdMnDgR48aNU+j34b4TExMRGxuL5s2bIzk5Gbdv385zP9u3b4eLiwv+97//Kby/LVu2BADx/f1U1woiIiIiIqL8YFGciIiIiIhKPHlBVk5eYH779q1Cu62trULxHMia/tvZ2RlaWoq//ri4uIjLC0Lev0qVKkrLVLUB+Y9fS0tL6bneX3zxBYD/nqNdnHr27IknT57gjz/+AJA1Xflff/2Fnj17KvRp2rQpBg8ejPLly6NXr17Ytm1bkRe9pFIpIiMjERkZiZMnT+LJkyc4c+aMeH7kxfC8isGJiYl5Fs4/VkE/YzlN914Y7969A5D3zQH5de/ePQCAv7+/eFOC/Gvjxo1ITU1FfHy8wjrZj+fevXsQBAHOzs5K27h16xZiYmIU+lesWFHpefcWFhZK/z4KIjMzExEREfD09MTDhw9x//593L9/Hw0bNsTLly/x22+/Ka2T2/uS3/esZ8+eOHPmDJ4+fQog63noMTExCv+GAODAgQNo1KgRpFIpypQpA0tLS6xdu1bp3BZEp06dEBkZiYMHD4rPO09OTlb6XN64cQNdunSBmZkZTE1NYWlpKd6UkJ/937t3Dzdu3FB6b+XXKvn7+6muFURERERERPnBZ4oTEREREVGJp62trbJdyPZs3w9HQBZU9qKcnKoRpQWV3/jzozjj9PX1haGhIbZt24YmTZpg27Zt0NLSQo8ePcQ+BgYGOHnyJKKionDw4EEcOXIEW7duRcuWLXHs2LEcj7WgtLW10bp16xyXV6lSBTo6Orh27VqOfVJTU3Hnzh3Uq1evSGIqKh/zOc3u+vXrAHK+IaOg5AXLhQsXonbt2ir7fDhDAqB8PDKZDBKJBIcPH1b5eci+flH++5A7fvw4nj9/joiICERERCgtDwsLQ9u2bRXacntf8vue9ezZE5MnT8b27dsxduxYbNu2DWZmZmjXrp3Y59SpU+jYsSNatGiBNWvWoEKFCtDV1UVwcDDCw8PzeYTKKlasKP6bad++PcqVK4dRo0bB09MTXbt2BQDExcXB3d0dpqammDVrFpycnCCVSnHp0iVMmjQpXwVrmUyGGjVqYMmSJSqX29nZAfh01woiIiIiIqL8YFGciIiIiIhKNXt7e1y7dg0ymUxhxKR8mmB7e3sA/43ejouLU1g/+yhfef/79+8r7UtVW0HIZDL8888/4ohLALh79y4AwMHBoUBxAjkX0HNiZGSEDh06YPv27ViyZAm2bt2K5s2bw8bGRqGflpYWWrVqhVatWmHJkiWYO3cuvv/+e0RFReVayC5KRkZG8PT0xPHjx/Ho0SPxffnQtm3bkJqaig4dOhRrLPn9jBW1d+/eYffu3bCzsxNHpX8sJycnAFnT6Rf2vXRycoIgCHB0dFT4LH+Mgn6Ww8LCYGVlhdWrVyst27VrF3bv3o1169YV6Q0KQNaI8gYNGmDr1q0YNWoUdu3ahc6dO0NfX1/ss3PnTkilUhw9elShPTg4uEhjGTp0KJYuXYqpU6eiS5cukEgkOHHiBF6/fo1du3ahRYsWYt+HDx8qrZ/TOXdycsLVq1fRqlWrPN+XknCtICIiIiIiAjh9OhERERERlXLt27fHixcvFJ7zm5GRgZUrV8LY2Bju7u4AsgqX2traOHnypML6a9asUXhtY2OD6tWrY/PmzeLU1QDw+++/4++///7oeFetWiX+LAgCVq1aBV1dXbRq1apAcQJZhWNAuYCem549e+LZs2fYuHEjrl69qjTt85s3b5TWkY8oTk1Nzfd+isLUqVMhCAICAgLw/v17hWUPHz7ExIkTUaFCBQwdOrRY48jvZ6wovX//Hv3798ebN2/w/fffF7honBM3Nzc4OTlh0aJFCp9vuVevXuW5ja5du0JbWxszZ85UGu0tCAJev35d4LgK8ll+//49du3ahQ4dOqB79+5KX6NGjUJiYiL27dtX4Djyo2fPnvjzzz/x888/IzY2VunfkLa2NiQSicLsDtHR0dizZ0+RxqGjo4Px48fj1q1b2Lt3r7hvQHEUflpaWo7XD1XTqfv5+eHp06fYsGGD0rL3798jKSkJQMm6VhAREREREXGkOBERERERlWpDhgzB+vXrERAQgL/++gsODg7YsWMHzpw5g2XLl
onPYjYzM0OPHj2wcuVKSCQSODk54cCBA0rPPwaAuXPnolOnTmjatCkGDhyIt2/fYtWqVahevbrKQmJ+SaVSHDlyBP7+/mjYsCEOHz6MgwcPYsqUKbC0tCxwnG5ubgCA0aNHw8vLC9ra2ujVq1euMbRv3x4mJiaYMGECtLW10a1bN4Xls2bNwsmTJ+Hj4wN7e3vExMRgzZo1qFixIpo1a1boYy+MFi1aYNGiRRg3bhxq1qyJgIAAVKhQAbdv38aGDRsgk8lw6NAhcXR9ccnvZ6ywnj59ii1btgDIGh1+8+ZNbN++HS9evMD48eOLtOivpaWFjRs3wtvbG9WqVcPAgQNha2uLp0+fIioqCqampti/f3+u23BycsLs2bMxefJkREdHo3PnzjAxMcHDhw+xe/duDBkyBBMmTChQXE5OTjA3N8e6detgYmICIyMjNGzYUOWzvvft24fExER07NhR5bYaNWoES0tLhIWFKRWsi4Kfnx8mTJiACRMmoEyZMkojon18fLBkyRK0a9cOffr0QUxMDFavXo0qVark+jiAwggICMD06dOxYMECdO7cGU2aNIGFhQX8/f0xevRoSCQShIaGqpyq3s3NDVu3bsW4ceNQv359GBsbw9fXF/3798e2bdswbNgwREVFoWnTpsjMzMTt27exbds2HD16FPXq1StR1woiIiIiIiIWxYmIiIiIqFQzMDDAiRMn8N1332HTpk1ISEhA1apVERwcjICAAIW+K1euRHp6OtatWwd9fX34+flh4cKFqF69ukI/X19f/PLLLwgMDMR3330HZ2dnhISEYNOmTbhx40ahY9XW1saRI0cwfPhwfPvttzAxMcGMGTMwffr0QsXZtWtXfP3114iIiMCWLVsgCEKeRXGpVIqOHTsiLCwMrVu3hpWVlcLyjh07Ijo6WhwFW65cObi7u2PmzJkwMzMr9LEX1jfffIN69eph8eLFWLZsGeLj41GhQgX06NED33//fbFNXf6hgnzGCuPKlSvo378/JBIJTExMYGdnB19fXwwePBgNGjT4+APIxsPDA3/88QeCgoKwatUqvHv3DtbW1mjYsGG+C/DfffcdvvjiCyxduhQzZ84EkPWs6bZt2+ZYrM6Nrq4uNm3ahMmTJ2PYsGHIyMhAcHCwyqJ4WFgYpFIp2rRpo3JbWlpa8PHxQVhYWKFGreelYsWKaNKkCc6cOYPBgwdDV1dXYXnLli3x008/Yf78+Rg7diwcHR2xYMECREdHF3lR3MDAAKNGjUJgYCBOnDgBDw8PHDhwAOPHj8fUqVNhYWGBfv36oVWrVvDy8lJYd8SIEbhy5QqCg4OxdOlS2Nvbw9fXF1paWtizZw+WLl2KzZs3Y/fu3TA0NETlypUxZswYccr8knatICIiIiKiz5tEUHU7MBERERERERVY7dq1YWlpicjIyAKvGxAQgB07dnzUSHMiIiIiIiIiIlLGZ4oTEREREREVUHp6OjIyMhTaTpw4gatXr8LDw0M9QRERERERERERkUqcPp2IiIiIiKiAnj59itatW6Nfv36wsbHB7du3sW7dOlhbW2PYsGHqDo+IiIiIiIiIiD7AojgREREREVEBWVhYwM3NDRs3bsSrV69gZGQEHx8fzJ8/H2XLllV3eERERERERERE9AE+U5yIiIiIiIiIiIiIiIiIiEotPlOciIiIiIiIiIiIiIiIiIhKLRbFiYiIiIiIiIiIiIiIiIio1GJRnIiIiIiIiIiIiIiIiIiISi0WxYmIiIiIiEqokJAQSCQSXLx4Mcc+0dHRkEgkWLRoUa7bcnBwgEQiQevWrVUu37BhAyQSSZ77y01gYCAkEgliY2Nz7HPixAlIJBLs2LEj39v18/ODRCLBpEmTct2mRCLBli1bVPZp2rQpJBIJqlevrnJ5ZmYmbGxsIJFIcPjw4XzHBgDffPMN6tatizJlysDQ0BAuLi4IDAzEu3fv8rX+2rVr0aNHD1SqVAkSiQQBAQEF2r/8vVX15ezsrNT/p59+gouLC6RSKZydnbFy5UqlPgEBAQrb0dHRgZ2dHXr16oWbN2/mK678fH4/xs2bNxEYGIjo6Ohi2b6mxEBERERERER501F3AERERERERPRpSKVSREVF4cWLF7C2tlZYFhYWBqlUipSUFDVFp1pCQgL2798PBwcH/PLLL5g/fz4kEonKvlKpFOHh4ejXr59Ce3R0NM6ePQupVJrjfo4fP47nz5/DwcEBYWFh8Pb2zneMFy5cQPPmzTFw4EBIpVJcvnwZ8+fPx6+//oqTJ09CSyv3+9EXLFiAxMRENGjQAM+fP8/3fuWWLVumVIB/9OgRpk6dirZt2yq0r1+/HsOGDUO3bt0wbtw4nDp1CqNHj0ZycrLSTQf6+vrYuHEjACAjIwMPHjzAunXrcOTIEdy8eRM2NjYFjrUo3bx5EzNnzoSHhwccHBw+2xiIiIiIiIgobyyKExERERERfSaaNm2KCxcuYOvWrRgzZozY/u+//+LUqVPo0qULdu7cqcYIle3cuROZmZn4+eef0bJlS5w8eRLu7u4q+7Zv3x779u1DbGwsypUrJ7aHh4ejfPnycHZ2xtu3b1Wuu2XLFtStWxf+/v6YMmUKkpKSYGRklK8YT58+rdTm5OSECRMm4Pz582jUqFGu6//+++/iKHFjY+N87fNDnTt3VmqbPXs2AKBv375i2/v37/H999/Dx8dHHKn/1VdfQSaTISgoCEOGDIGFhYXYX0dHR+kGg0aNGqFDhw44ePAgvvrqqwLHqi6CICAlJQUGBgbqDoWIiIiIiIjUgNOnExERERERfSakUim6du2K8PBwhfZffvkFFhYW8PLyUlonPT0dt2/fLtQI5qIQFhaGNm3awNPTEy4uLggLC8uxb6dOnaCvr4/t27crtIeHh8PPzw/a2toq13v//j12796NXr16wc/PD+/fv8fevXs/Km75qOG4uLg8+9rb2+c4+r2wwsPD4ejoiCZNmohtUVFReP36NUaMGKHQd+TIkUhKSsLBgwfz3K58hgEdncLdYx8QEABjY2M8ffoUnTt3hrGxMSwtLTFhwgRkZmYq9I2IiICbmxtMTExgamqKGjVqYPny5QCypmbv0aMHAMDT01Oc5v3EiRMAss5/hw4dcPToUdSrVw8GBgZYv369+LiBkJAQpdgkEgkCAwMV2p4+fYpBgwbBxsYG+vr6cHR0xPDhw5GWlpZnDERERERERFRysChORERERET0GenTpw/Onz+PBw8eiG3h4eHo3r07dHV1lfo/ffoULi4umDx58qcMEwDw7NkzREVFoXfv3gCA3r17Y8eOHUhLS1PZ39DQEJ06dcIvv/witl29ehU3btxAnz59ctzPvn378O7dO/Tq1QvW1tbw8PDItfiuSkZGBmJjY/Hs2TMcO3YMU6dOhYmJCRo0aFCg7RSFy5cv49atW0rHfPnyZQBAvXr1FNrd3NygpaUlLv9QbGwsYmNj8fLlS/zxxx/45ptvULZsWXTo0KHQ8WVmZsLLywtly5bFokWL4O7ujsWLF+PHH38U+0RGRqJ3796wsLDA
ggULMH/+fHh4eODMmTMAgBYtWmD06NEAgClTpiA0NBShoaFwcXERt3Hnzh307t0bbdq0wfLly1G7du0Cxfns2TM0aNAAERER6NmzJ1asWIH+/fvj999/R3Jycr5iICIiIiIiopKB06cTERERERF9Rlq2bAlra2v88ssvmDp1Km7duoUrV65g+fLl+Oeff9QdnoJffvkF+vr66NSpEwCgV69emD59Og4dOqRyynAgq+jv6+uLJ0+ewM7ODmFhYahcuXKuU5hv2bIFTZo0gZ2dnbifESNG4NWrV7C0tMxXrBcvXkTjxo3F11WrVsW+fftQpkyZfB5t0ZEX9D+cOh0Anj9/Dm1tbVhZWSm06+npoWzZsnj27JlCe1JSktLx29ra4tixY/k+L6qkpKSgZ8+emDZtGgBg2LBhqFu3Ln766ScMHz4cAHDw4EGYmpri6NGjKkf4V65cGc2bN8eKFSvQpk0beHh4KPW5f/8+jhw5ojADQnR0dL7jnDx5Ml68eIFz584p3Egwa9YsCIIAc3PzPGMgIiIiIiKikoEjxYmIiIiIiD4j2tra8PPzE0dTh4WFwc7ODs2bN1fZ38HBAYIgqJxuuriFhYXBx8cHJiYmAABnZ2e4ubnlOoq7bdu2KFOmDCIiIiAIAiIiIsSR5qq8fv0aR48eVejTrVs3SCQSbNu2Ld+xurq6IjIyEnv27MHEiRNhZGSEd+/e5Xv9oiKTyRAREYE6deoojVh+//499PT0VK4nlUrx/v17pbbIyEhERkbi6NGjWL9+PYyNjdG+fXvcvXv3o+IcNmyYwuvmzZsr3JRhbm6OpKQkREZGFnofjo6OKh8JkB8ymQx79uyBr6+v0sh6AEU+3T0REREREREVL44UJyIiIiIi+sz06dMHK1aswNWrVxEeHo5evXqVuCLfrVu3cPnyZQwYMAD3798X2z08PLB69WokJCTA1NRUaT1dXV306NED4eHhaNCgAZ48eZLr1Olbt25Feno66tSpo7Cfhg0bIiwsDCNHjgQAvHnzRmHadgMDA5iZmYmvTU1N0bp1awBZzzYPDw9Hp06dcOnSJdSqVavwJ+L/vX//HvHx8Qpt8ud7f+j333/H06dP8c033ygtMzAwyHHq+ZSUFBgYGCi0aWtri8ck1759ezg7O2Py5MnYuXMnMjMz8erVK4U+ZcqUybH4DmQV27OPNLewsMDbt2/F1yNGjMC2bdvg7e0NW1tbtG3bFn5+fmjXrl2O283O0dEx332ze/XqFRISElC9evVCb4OIiIiIiIhKDo4UJyIiIiIi+sw0bNgQTk5OGDt2LB4+fJhr0VhdtmzZAgD45ptv4OzsLH4tXrwYKSkp2LlzZ47r9unTB1euXEFgYCBq1aoFV1fXHPvKR503bdpUYT+nT5/GH3/8IY5e7tq1KypUqCB+jRkzJtf4u3btCgCIiIgo0HHnZOvWrQr7r1ChQo7Ho6WlpXJ0fIUKFZCZmYmYmBiF9rS0NLx+/Ro2NjZ5xlGxYkVUrVoVJ0+eBAA8efJEKa6zZ8/mug1V06FnZ2VlhStXrmDfvn3o2LEjoqKi4O3tDX9//zzXlcte5AdyHuGdmZmZ7+0SERERERGR5uFIcSIiIiIios9Q7969MXv2bLi4uKB27drqDkeBIAgIDw+Hp6cnRowYobQ8KCgIYWFhGDhwoMr1mzVrhkqVKuHEiRNYsGBBjvt5+PAhzp49i1GjRsHd3V1hmUwmQ//+/REeHo6pU6di8eLFCiOZ8yogp6amQiaTKY3uLiwvL688pxJPTU3Fzp074eHhoTI++ft88eJFtG/fXmy/ePEiZDJZvj8HGRkZ4tTw1tbWSnEVxch4IOtZ576+vvD19YVMJsOIESOwfv16TJs2DVWqVCnU7AYWFhYAgLi4OIX2R48eKby2tLSEqakprl+/nuv2StoMC0RERERERKQai+JERERERESfocGDB0NbWxsNGzbMtV96ejoePHgAMzOzHEcnF7UzZ84gOjoas2bNQvfu3ZWW3717F9OmTcOzZ89UFn8lEglWrFiBy5cvo3///jnuRz5KfOLEibCzs1NavnHjRoSFhWHq1Klwc3NTuY24uDgYGRlBV1dXaV0ACs+jTk5OxuPHj1GuXDmUK1cux7hUyW10uNyhQ4cQFxeHvn37qlzesmVLlClTBmvXrlUoiq9duxaGhobw8fHJM467d+/izp074vmQSqVKU6wXhdevX6Ns2bLiay0tLdSsWRNAVvEfAIyMjAAoF7hzY2pqinLlyuHkyZMYO3as2L5mzRqFflpaWujcuTO2bNmCixcvKj1XXBAESCSSQsVAREREREREnx6L4kRERERERCXczz//jCNHjii1fziF92+//YaUlBSlPp07d1b5XGR7e3sEBgbmue+nT5/CxcUF/v7+CAkJyVe8S5YsgaGhoUKblpYWpkyZIr7euXMnbt++rbSuv78/wsLCoK2tnWORtmPHjvj+++8RERGBcePGqezTqVMndOrUKdc4w8LCULt2bZUFcfl+vv76a1y6dAl169ZV2efEiRMYPXo0unfvDmdnZ6SlpeHUqVPYtWsX6tWrh379+ol9z58/D09PT8yYMUPh3O/fvx9Xr14FkHUTwrVr1zB79mwxBnkxOC9hYWHQ19dHt27dVC43MDBAUFAQRo4ciR49esDLywunTp3Cli1bMGfOHJQpU0ahf0ZGhjiNvUwmQ3R0NNatWweZTIYZM2bkK6bCGjx4MN68eYOWLVuiYsWKePToEVauXInatWvDxcUFQNbId21tbSxYsADx8fHQ19dHy5YtYWVllee258+fj8GDB6NevXo4efIk7t69q9Rv7ty5OHbsGNzd3TFkyBC4uLjg+fPn2L59O06fPg1zc/NCx0BERERERESfFoviREREREREJdzatWtVtgcEBIg/HzlyRGXh3MHBQWVRvDjNmzdPqU1bW1uhKJ7Ts7bd3d2xfft2NGnSRKlIK1e9enU4Ojpiy5YtORbF83Lp0iXcvn0b06ZNy7GPr68vvv76a2zZsiXHoniNGjXg6emJvXv34vnz5xAEAU5OTpg+fTq+/fZb6Onp5RnLzp07sWnTJvH15cuXcfnyZQBZz/DOT1E8ISEBBw8ehI+PD8zMzHLsN2LECOjq6mLx4sXYt28f7OzssHTpUpXPSE9NTVUYaW9qaor69esjNDQUrVq1yjOmj9GvXz/8+OOPWLNmDeLi4mBtbY2ePXsiMDAQWlpaALKmbl+3bh3mzZuHQYMGITMzE1FRUXkWpKdPn45Xr15hx44d2LZtG7y9vXH48GGl9WxtbXHu3DlMmzYNYWFhSEhIgK2tLby9vcWbPgobAxEREREREX1aEkEQBHUHQUREREREREREREREREREVBy01B0AERERERERERERERERERFRcWFRnIiIiIiIiIiIiIiIiIiISi0WxYmIiIiIiIiIiIiIiIiIqNRiUZyIiIiIiIiIiIiIiIiIiEotFsWJiIiIiIiIiIiIiIiIiKjUYlGciIiIiIiIiIiIiIiIiIhKLRbFiYiIiIiI6LMUEhI
CiUSCixcvFvu+JBIJAgMDi30/RERERERERKSMRXEiIiIiIiIqdvIC9IdfVlZW8PT0xOHDhwu93blz52LPnj1FF2gBnT59Gt7e3rC1tYVUKkWlSpXg6+uL8PBwtcVU1Nq0aQOJRIJRo0Z91Hb++usvdOjQAdbW1jA2NkbNmjWxYsUKZGZmFlGkRERERERERKrpqDsAIiIiIiIi+nzMmjULjo6OEAQBL1++REhICNq3b4/9+/ejQ4cOBd7e3Llz0b17d3Tu3Lnog83D9u3b0bNnT9SuXRtjxoyBhYUFHj58iJMnT2LDhg3o06eP2Pf9+/fQ0dG8X8F37dqFP/7446O389dff6FJkyZwdnbGpEmTYGhoiMOHD2PMmDF48OABli9fXgTREhEREREREammeb+RExERERERkcby9vZGvXr1xNeDBg1C+fLl8csvvxSqKK5OgYGBcHV1xZ9//gk9PT2FZTExMQqvpVLppwytSKSkpGD8+PGYNGkSpk+f/lHbWr9+PQDg5MmTKFOmDABg6NChcHd3R0hICIviREREREREVKw4fToRERERERGpjbm5OQwMDJRGUS9atAhNmjRB2bJlYWBgADc3N+zYsUOhj0QiQVJSEjZt2iROyR4QECAuf/r0KQYNGgQbGxvo6+vD0dERw4cPR1pamsJ2UlNTMW7cOFhaWsLIyAhdunTBq1ev8oz9wYMHqF+/vlJBHACsrKyUYpU/Uzw6OlppKvkPvz507tw5tGvXDmZmZjA0NIS7uzvOnDmj0CcxMRFjx46Fg4MD9PX1YWVlhTZt2uDSpUtin+TkZNy+fRuxsbF5HpfcDz/8AJlMhgkTJuR7nZwkJCRAKpXC3Nxcob1ChQowMDD46O0TERERERER5YYjxYmIiIiIiOiTiY+PR2xsLARBQExMDFauXIl3796hX79+Cv2WL1+Ojh07om/fvkhLS0NERAR69OiBAwcOwMfHBwAQGhqKwYMHo0GDBhgyZAgAwMnJCQDw7NkzNGjQAHFxcRgyZAj+97//4enTp9ixYweSk5MVCtlff/01LCwsMGPGDERHR2PZsmUYNWoUtm7dmuux2Nvb47fffsO///6LihUr5vscWFpaIjQ0VKEtPT0d33zzjUJcx48fh7e3N9zc3DBjxgxoaWkhODgYLVu2xKlTp9CgQQMAwLBhw7Bjxw6MGjUKrq6ueP36NU6fPo1bt26hbt26AIDz58/D09MTM2bMEIvzuXn8+DHmz5+Pn3/+uUiK1h4eHti6dSuGDh2KcePGidOn79q1CwsXLvzo7RMRERERERHlhkVxIiIiIiIi+mRat26t8FpfXx8///wz2rRpo9B+9+5dhWLsqFGjULduXSxZskQsivfr1w/Dhg1D5cqVlYrqkydPxosXL3Du3DmF6dpnzZoFQRAU+pYtWxbHjh0TR2nLZDKsWLEC8fHxMDMzy/FYJk2ahEGDBsHJyQlNmzZFs2bN0LZtWzRp0gRaWjlPzGZkZKQU78iRI/Hu3TtERkYCAARBwLBhw+Dp6YnDhw+LsQ0dOhTVqlXD1KlTcezYMQDAwYMH8dVXX2Hx4sXi9iZOnJjj/vNj/PjxqFOnDnr16vVR25H76quvcOPGDaxfvx4bN24EAGhra2PVqlUYNmxYkeyDiIiIiIiIKCcsihMREREREdEns3r1anzxxRcAgJcvX2LLli0YPHgwTExM0LVrV7HfhwXxt2/fIjMzE82bN8cvv/yS5z5kMhn27NkDX19fhYK4XPYpyocMGaLQ1rx5cyxduhSPHj1CzZo1c9zPl19+CVtbWyxZsgRRUVGIiopCUFAQKleujNDQUDRp0iTPWAFg8+bNWLNmDRYvXgxPT08AwJUrV3Dv3j1MnToVr1+/VujfqlUrhIaGQiaTQUtLC+bm5jh37hyePXsGGxsblfvw8PBQuhkgJ1FRUdi5cyfOnTuXr/75oa2tDScnJ3h5eaFHjx6QSqX45Zdf8PXXX8Pa2hqdO3cusn0RERERERERZceiOBEREREREX0yDRo0UChU9+7dG3Xq1MGoUaPQoUMHcfrwAwcOYPbs2bhy5QpSU1PF/tkL2qq8evUKCQkJqF69er5iqlSpksJrCwsLAFnF+Lx4eXnBy8sLycnJ+Ouvv7B161asW7cOHTp0wO3bt5WeLZ7dlStXMGzYMPTu3Rvjxo0T2+/duwcA8Pf3z3Hd+Ph4WFhY4IcffoC/vz/s7Ozg5uaG9u3bY8CAAahcuXKe8WeXkZGB0aNHo3///qhfv36B18/J/PnzsXz5cty7dw/GxsYAAD8/P3h6emLkyJHo0KGD0nPliYiIiIiIiIpKzvO5ERERERERERUzLS0teHp64vnz52Ih+NSpU+jYsSOkUinWrFmDQ4cOITIyEn369Mn3aOeC0NbWVtlekH0ZGhqiefPmWLVqFaZOnYq3b9/i8OHDua7z9u1bdOvWDV988YU4pbicTCYDACxcuBCRkZEqvz4sLv/zzz9YuXIlbGxssHDhQlSrVi3P/auyefNm3LlzB0OHDkV0dLT4BQCJiYmIjo5GcnJygbe7Zs0atGzZUoxZrmPHjnj27Jm4DyIiIiIiIqLiwNuwiYiIiIiISK0yMjIAAO/evQMA7Ny5E1KpFEePHoW+vr7YLzg4WGldVSPHLS0tYWpqiuvXrxdTxLmTj4R//vx5jn1kMhn69u2LuLg4/PrrrzA0NFRY7uTkBAAwNTVVeg67KhUqVMCIESMwYsQIxMTEoG7dupgzZw68vb0LFPvjx4+Rnp6Opk2bKi3bvHkzNm/ejN27dxd4uvOXL18iMzNTqT09PR3Af58BIiIiIiIiouLAkeJERERERESkNunp6Th27Bj09PTg4uICIGvktkQiUSiiRkdHY8+ePUrrGxkZIS4uTqFNS0sLnTt3xv79+3Hx4kWldYpqtPlvv/2msv3QoUMAgKpVq+a47syZM3H06FH88ssvcHR0VFru5uYGJycnLFq0SLxZ4EOvXr0CAGRmZiI+Pl5hmZWVFWxsbBSmnU9OTsbt27cRGxub6zH16tULu3fvVvoCgPbt22P37t1o2LBhrttQ5YsvvkBkZKTC89EzMzOxbds2mJiYiDcBEBERERERERUHjhQnIiIiIiKiT+bw4cO4ffs2ACAmJgbh4eG4d+8evvvuO5iamgIAfHx8sGTJErRr1w59+vRBTEwMVq9ejSpVquDatWsK23Nzc8Ovv/6KJUuWwMbGBo6OjmjYsCHmzp2LY8eOwd3dHUOGDIGLiwueP3+O7du34/Tp0zA3N//oY+nUqRMcHR3h6+sLJycnJCUl4ddff8X+/ftRv359+Pr6qlzv77//RlBQEFq0aIGYmBhs2bJFYXm/fv2gpaWFjRs3wtvbG9WqVcPAgQNha2uLp0+fIioqCqampti/fz8SExNRsWJFdO/eHbVq1YKxsTF+/fVXXLhwAYsXLxa3ef78eXh6emLGjBkIDAzM8Zj+97//4X//+5/KZY6OjkojxD08PPD777/neaPBd999h379+qFhw4YYMmQIDAwM8Msvv+Cvv/7C7Nmzoa
urm+v6RERERERERB+DRXEiIiIiIiL6ZKZPny7+LJVK8b///Q9r167F0KFDxfaWLVvip59+wvz58zF27Fg4OjpiwYIFiI6OViqKL1myBEOGDMHUqVPx/v17+Pv7o2HDhrC1tcW5c+cwbdo0hIWFISEhAba2tvD29laaqrywNm7ciL1792Lbtm149uwZBEFA5cqV8f3332PSpEnQ0VH9K/fr168hCAJ+//13/P7770rL+/XrByCr4PzHH38gKCgIq1atwrt372BtbY2GDRuK58vQ0BAjRozAsWPHsGvXLshkMlSpUgVr1qzB8OHDi+Q4cyOPKS99+/ZFuXLlMG/ePCxcuBAJCQmoWrUq1q1bp/DeExERERERERUHiVBU88YRERERERER0WcjMTERZcqUwbJlyzBy5Eh1h0NERERERESUIz5TnIiIiIiIiIgK7OTJk7C1tcVXX32l7lCIiIiIiIiIcsWR4kREREREREREREREREREVGpxpDgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosilOpFxgYCIlEUqz7cHBwQEBAQLHuQxMtXLgQlStXhra2NmrXrg0AyMjIwMSJE2FnZwctLS107twZACCRSBAYGCiuGxISAolEgujo6E8eNxEVL16XiYgoO+YGIiLKjrmBiIg+xLxARB+LRXEqMdasWQOJRIKGDRuqO5RiI5FIFL5MTU3h7u6OgwcPFnqb4eHhWLZsWdEFWUSOHTuGiRMnomnTpggODsbcuXMBAD///DMWLlyI7t27Y9OmTfjmm2/UHCkR5YTX5cL51NflY8eOYdCgQahevTq0tbXh4ODwyfatCW7evInAwMCPvslq/PjxcHV1LZqgiDQYc0PhfMrckJycjNWrV6Nt27aoUKECTExMUKdOHaxduxaZmZmfJIaSjrmBqGgxNxSOOv+eExcXBysrK0gkEuzYsUMtMZQ0Z8+eRWBgIOLi4j5qO926dUP79u2LJigiDcW8UDifOi/IZDKsW7cOtWvXhrGxMcqXLw9vb2+cPXv2k8VQkjEvlE4silOJERYWBgcHB5w/fx73798vsu1OnToV79+/L7Ltfaw2bdogNDQUmzdvxsSJE3H//n34+vri6NGjhdpeSS2KHz9+HFpaWvjpp58wYMAA8cJ//Phx2NraYunSpejfvz/c3d1Vrt+/f3+8f/8e9vb2nzJsIvoAr8uacV0ODw9HeHg4zMzMYGNj88n2qylu3ryJmTNnfnTh4+DBg/Dx8SmaoIg0GHNDyc8N//zzD77++msIgoBx48Zh0aJFcHR0xIgRI/Dll19+khhKOuYGoqLF3FDyc0N206dPR3Jyslr2XVKdPXsWM2fO/KjiR3p6OiIjI5kb6LPHvKAZeeHbb7/F8OHDUaNGDSxZsgTjx4/H3bt34e7ujvPnz3+yOEoq5oXSiUVxKhEePnyIs2fPYsmSJbC0tERYWFi+1svIyEBaWprKZUlJSQAAHR0dSKXSIov1Y33xxRfo168f+vfvj6lTp+LXX3+FIAhYvny5ukMrUjExMTAwMICenp5Su7m5eZ7ra2trQyqVFvuUOESkGq/LmnNdnjt3LhISEnDmzBnUqlXro7eX23v4ufrnn39w584d/hJDnz3mBs3IDdbW1vj7778RGRmJb7/9FkOHDsWuXbswcOBAbN68uVB/mGRuUMbcQJSFuUEzcsOHrl+/jrVr12LSpEkftZ2UlBTIZLIiiqp0OHXqFBITE5kb6LPGvKAZeSEjIwNr165F9+7dERoaiiFDhmDixIn49ddfkZGRke/37UPMC8qYF0oeFsWpRAgLC4OFhQV8fHzQvXt3lRfd6OhoSCQSLFq0CMuWLYOTkxP09fXFqe8kEglu3ryJPn36wMLCAs2aNQOg/KyR6tWrw9PTU2n7MpkMtra26N69u9i2aNEiNGnSBGXLloWBgQHc3NyKfFopFxcXlCtXDg8ePFBo37t3L3x8fGBjYwN9fX04OTkhKChIYcpDDw8PHDx4EI8ePRKnavlw2tzU1FTMmDEDVapUgb6+Puzs7DBx4kSkpqYq7Cs2Nha3b9/O113CGRkZCAoKEs+/g4MDpkyZorBNiUSC4OBgJCUliXHJnxEeFRWFGzduiO0nTpxQuR9VzxR3cHBAhw4dcPr0aTRo0ABSqRSVK1fG5s2bldaPi4vD2LFjYWdnB319fVSpUgULFixgYibKJ16XNee6bGNjA11d3UIda27vIQDcvn0b3bt3R5kyZSCVSlGvXj3s27dPaTs3btxAy5YtYWBggIoVK2L27Nn4+eefla7jEokEgYGBSuuremZXfq/jERERcHNzg4mJCUxNTVGjRg3xF9CQkBD06NEDAODp6amUey5evAgvLy+UK1cOBgYGcHR0VDmK8uDBgzAzMxM/w4mJiRg7diwcHBygr68PKysrtGnTBpcuXVJY79y5c2jXrh3MzMxgaGgId3d3nDlzRmn7T58+xaBBg8TPlqOjI4YPH84CFJU4zA2akRvKlSuHatWqKbV36dIFAHDr1q1c12duYG4gKgjmBs3IDR8aM2YMunTpgubNm+d7nRMnTkAikSAiI
gJTp06Fra0tDA0NkZCQACD/17bTp0+jfv36kEqlcHJywvr165XeZ/nnJSQkRGl9VTnj6dOn+PLLL1G+fHno6+ujWrVq+Pnnn5XWXblyJapVqwZDQ0NYWFigXr16CA8PB5D1Wfv2228BAI6OjuJ7Is9XkZGRaNasGczNzWFsbIyqVatiypQpSvs4ePAgXF1dxffyxYsXGDhwICpWrAh9fX1UqFABnTp1Upqp5PDhw2jevDmMjIxgYmICHx8f3LhxQ2n7t2/fhp+fHywtLWFgYICqVavi+++/V+pHpE7MC5qRF9LT0/H+/XuUL19eod3KygpaWlowMDDIdX3mBeYFTaWj7gCIgKxk2bVrV+jp6aF3795Yu3YtLly4gPr16yv1DQ4ORkpKCoYMGQJ9fX2UKVNGXNajRw84Oztj7ty5EARB5b569uyJwMBAvHjxAtbW1mL76dOn8ezZM/Tq1UtsW758OTp27Ii+ffsiLS0NERER6NGjBw4cOFBkd/fEx8fj7du3cHJyUmgPCQmBsbExxo0bB2NjYxw/fhzTp09HQkICFi5cCAD4/vvvER8fj3///RdLly4FABgbGwPISv4dO3bE6dOnMWTIELi4uODvv//G0qVLcffuXezZs0fc16pVqzBz5kxERUXBw8Mj13gHDx6MTZs2oXv37hg/fjzOnTuHefPm4datW9i9ezcAIDQ0FD/++CPOnz+PjRs3AgDq1KmD0NBQzJkzB+/evcO8efMAZP1noSDu37+P7t27Y9CgQfD398fPP/+MgIAAuLm5iX/8S05Ohru7O54+fYqhQ4eiUqVKOHv2LCZPnoznz5+XyOnmiUoaXpc157pcFFS9hzdu3EDTpk1ha2uL7777DkZGRti2bRs6d+6MnTt3isWVFy9ewNPTExkZGWK/H3/8Mc9foHKT3+t4ZGQkevfujVatWmHBggUAsoo9Z86cwZgxY9CiRQuMHj0aK1aswJQpU8Sc4+LigpiYGLRt2xaWlpb47rvvYG5ujujoaOzatUspnkOHDqFNmzbQ0cn6r/OwYcOwY8cOjBo1Cq6urnj9+jVOnz6NW7duoW7dugCyHhfi7e0NNzc3zJgxA1paWggODkbLli1x6tQpNGjQAADw7NkzNGjQAHFxcRgyZAj+97//4enTp9ixYweSk5OVZlwhUifmBs3ODS9evACQVTTPD+YG5gai/GBu0KzcsH37dpw9exa3bt0q1CMkgoKCoKenhwkTJiA1NRV6enr5vrb9/fff4jU2MDAQGRkZmDFjhlJBpiBevnyJRo0aQSKRYNSoUbC0tMThw4cxaNAgJCQkYOzYsQCADRs2YPTo0ejevTvGjBmDlJQUXLt2DefOnUOfPn3QtWtX3L17F7/88guWLl0q5kpLS0vcuHEDHTp0QM2aNTFr1izo6+vj/v37Kos7hw4dQocOHcTX3bp1w40bN/D111/DwcEBMTExiIyMxOPHj8UCSWhoKPz9/eHl5YUFCxYgOTkZa9euRbNmzXD58mWx37Vr19C8eXPo6upiyJAhcHBwwIMHD7B//37MmTOn0OeQqKgxL2hGXjAwMEDDhg0REhKCxo0bo3nz5oiLi0NQUBAsLCwwZMiQfB0z8wLzgsYRiNTs4sWLAgAhMjJSEARBkMlkQsWKFYUxY8Yo9Hv48KEAQDA1NRViYmIUls2YMUMAIPTu3Vtp+/Jlcnfu3BEACCtXrlToN2LECMHY2FhITk4W2z78WRAEIS0tTahevbrQsmVLhXZ7e3vB398/z2MFIAwaNEh49eqVEBMTI1y8eFFo166dAEBYuHChQt/s+xYEQRg6dKhgaGgopKSkiG0+Pj6Cvb29Ut/Q0FBBS0tLOHXqlEL7unXrBADCmTNnxDb5OYqKiso1/itXrggAhMGDByu0T5gwQQAgHD9+XGzz9/cXjIyMlLbh7u4uVKtWTakdgDBjxgzxdXBwsABAePjwodhmb28vABBOnjwptsXExAj6+vrC+PHjxbagoCDByMhIuHv3rsI+vvvuO0FbW1t4/PhxrsdJ9LnjdVlzrsvZ5bTvnOT2HrZq1UqoUaOGwrHJZDKhSZMmgrOzs9g2duxYAYBw7tw5sS0mJkYwMzNTuo5nv9bLZX+/8nsdHzNmjGBqaipkZGTkeIzbt29XeS53794tABAuXLiQ47qCIAhJSUmCVCoVgoODxTYzMzNh5MiROa4jk8kEZ2dnwcvLS5DJZGJ7cnKy4OjoKLRp00ZsGzBggKClpaUyjg/XJVI35gbNzQ2CIAipqamCq6ur4OjoKKSnp+fal7mBuYEov5gbNCs3JCcnC5UqVRImT54sCIIgREVFCQCE7du357muvG/lypUVjq8g17bOnTsLUqlUePTokdh28+ZNQVtbW+F9ln9ePrzGymXPGYMGDRIqVKggxMbGKvTr1auXYGZmJsbaqVMnlX+L+tDChQuVcpQgCMLSpUsFAMKrV69yXf+ff/5ReC/evn2r8vPxocTERMHc3Fz46quvFNpfvHghmJmZKbS3aNFCMDExUTh/gsC8QCUL84Jm5YV79+4JdevWFQCIX5UrVxZu376d57rMC8wLmorTp5PahYWFoXz58uJUJxKJBD179kRERITCFCJy3bp1g6WlpcptDRs2LM/9ffHFF6hduza2bt0qtmVmZmLHjh3w9fVVGL3w4c9v375FfHw8mjdvrjQFXkH89NNPsLS0hJWVFerVq4fffvsNEydOxLhx4xT6fbjvxMRExMbGonnz5khOTsbt27fz3M/27dvh4uKC//3vf4iNjRW/WrZsCQCIiooS+wYGBkIQhDzvKj506BAAKMU6fvx4AFnTgRQ3V1dXhSm+LC0tUbVqVfzzzz9i2/bt29G8eXNYWFgoHHvr1q2RmZmJkydPFnucRJqM12XNuS4Xlezv4Zs3b3D8+HH4+fmJxxobG4vXr1/Dy8sL9+7dw9OnTwFk5YZGjRqJd/oCWdfmvn37Fjqe/F7Hzc3NkZSUhMjIyALvw9zcHABw4MABpKen59jv+PHjSE1Nhbe3t8K6586dw7Nnz1Suc+XKFdy7dw99+vTB69evxfiTkpLQqlUrnDx5EjKZDDKZDHv27IGvry/q1auntJ0PpwsjUjfmBs3ODaNGjcLNmzexatUqcWRzXpgbmBuI8sLcoFm5Yf78+UhPT1c5vWt++fv7Kxxffq9tmZmZOHr0KDp37oxKlSqJ67u4uMDLy6tQsQiCgJ07d8LX1xeCICicKy8vL8THx4vvt7m5Of79919cuHChwPuR54a9e/fm+ki+7I/VMDAwgJ6eHk6cOIG3b9+qXCcyMhJxcXHo3bu3Qvza2tpo2LCh+F6/evUKJ0+exJdffqlw/gDmBSpZmBc0Ky+YmJigWrVqGDlyJHbt2oU1a9YgIyMDnTt3RmxsbL7OAfMC84Km4fTppFaZmZmIiIiAp6cnHj58KLY3bNgQixcvxm+//Ya2bdsqrOPo6Jjj9nJb9qGePXtiypQpePr0KWxtbXHixAnExMSgZ8+eCv0OHDiA2bNn48qVK0rPzC6sTp06YdSoUUhLS8OF
Cxcwd+5cJCcnQ0tL8R6VGzduYOrUqTh+/Lj4LA65+Pj4PPdz79493Lp1K8f/WMTExBQ49kePHkFLSwtVqlRRaLe2toa5uTkePXpU4G0WVPaLPABYWFgoJJJ79+7h2rVrRXrsRJ8LXpc167pcVLK/T/fv34cgCJg2bRqmTZumcp2YmBjY2tri0aNHaNiwodLyqlWrFjqe/F7HR4wYgW3btsHb2xu2trZo27Yt/Pz80K5duzz34e7ujm7dumHmzJlYunQpPDw80LlzZ/Tp0wf6+vpiv4MHD6JevXoK03f98MMP8Pf3h52dHdzc3NC+fXsMGDAAlStXFuMHsn45zEl8fDzS0tKQkJCA6tWr531SiNSIuUGzc8PChQuxYcMGBAUFoX379vlej7mBuYEoN8wNmpUboqOjsXDhQqxevVqcjrcwsr9P+b22paam4v3793B2dlZaXrVqVXEQRkG8evUKcXFx+PHHH/Hjjz+q7CM/V5MmTcKvv/6KBg0aoEqVKmjbti369OmDpk2b5rmfnj17YuPGjRg8eDC+++47tGrVCl27dkX37t0V3vuDBw+ibdu24s1n+vr6WLBgAcaPH4/y5cujUaNG6NChAwYMGCBO8yw/f/LCVnampqYAIA4EYW6gkox5QbPyQkZGBlq3bg0PDw+sXLlSbG/dujWqVauGhQsXio8iyg3zAvOCpmFRnNTq+PHjeP78OSIiIhAREaG0PCwsTClZ5vYcuvw+o65nz56YPHkytm/fjrFjx2Lbtm0wMzNT+EPJqVOn0LFjR7Ro0QJr1qxBhQoVoKuri+DgYISHh+fzCJVVrFgRrVu3BgC0b98e5cqVw6hRo+Dp6YmuXbsCAOLi4uDu7g5TU1PMmjULTk5OkEqluHTpEiZNmpTrHUhyMpkMNWrUwJIlS1Qut7OzK/QxqPNuI21tbZXtwgfPlpHJZGjTpg0mTpyosu8XX3xRLLERlQa8LmvmdfljZX+f5MczYcKEHO/QzX6D1MfIfsd4fq/jVlZWuHLlCo4ePYrDhw/j8OHDCA4OxoABA7Bp06Zc9ymRSLBjxw78+eef2L9/P44ePYovv/wSixcvxp9//in+sfDQoUMYOHCgwrp+fn5o3rw5du/ejWPHjom/LO7atQve3t7i+Vu4cCFq166tcv/GxsZ48+ZNnueGqCRgbtDc3BASEoJJkyZh2LBhmDp1aoHWZW5gbiDKDXODZuWG6dOnw9bWFh4eHuKzxF+8eAEgq4gQHR2NSpUqKRVyssspN+R1bfuwAJWXnP7mpCovAEC/fv1yLL7UrFkTQNbIwzt37uDAgQM4cuQIdu7ciTVr1mD69OmYOXNmrvEYGBjg5MmTiIqKwsGDB3HkyBFs3boVLVu2xLFjx6CtrY3k5GScOHECa9euVVh37Nix8PX1xZ49e3D06FFMmzYN8+bNw/Hjx1GnTh3xGEJDQxWehyyX39ldiEoC5gXNygsnT57E9evXlbbp7OwMFxcXlc/HVoV5gXlB0/AMklqFhYXBysoKq1evVlq2a9cu7N69G+vWrct3EswvR0dHNGjQAFu3bsWoUaOwa9cudO7cWeHu/507d0IqleLo0aMK7cHBwUUay9ChQ7F06VJMnToVXbp0gUQiwYkTJ/D69Wvs2rULLVq0EPt+eJedXE5JwcnJCVevXkWrVq2KrIhtb28PmUyGe/fuwcXFRWx/+fIl4uLiYG9vXyT7+VhOTk549+6d+J8SIso/Xpc167pcXOSj2nR1dfO8ltrb24t3sn7ozp07Sm0WFhaIi4tTaEtLS8Pz588V2gpyHdfT04Ovry98fX0hk8kwYsQIrF+/HtOmTUOVKlXyPNeNGjVCo0aNMGfOHISHh6Nv376IiIjA4MGDcf36dTx+/Bg+Pj5K61WoUAEjRozAiBEjEBMTg7p162LOnDnw9vaGk5MTgKw7eHM7BktLS5iamuL69et5HieROjE3aGZu2Lt3LwYPHoyuXbuqfO8KirmBuYHoQ8wNmpUbHj9+jPv374vX8g+NGDECQNZ0wvIpYfOrINc2AwODfOUGCwsLAFDKDdlnJ7S0tISJiQkyMzPzlRuMjIzQs2dP9OzZE2lpaejatSvmzJmDyZMnQyqV5nqutbS00KpVK7Rq1QpLlizB3Llz8f333yMqKgqtW7dW+VgNOScnJ4wfPx7jx4/HvXv3ULt2bSxevBhbtmwRz5+VlVWuxyB/35gbqCRjXtCsvPDy5UsAyoVlAEhPT0dGRkahtsu8wLxQ0vGZ4qQ279+/x65du9ChQwd0795d6WvUqFFITEzEvn37imX/PXv2xJ9//omff/4ZsbGxSlOqaGtrQyKRKCSG6Oho7Nmzp0jj0NHRwfjx43Hr1i3s3btX3DegOPo5LS0Na9asUVrfyMhI5TQrfn5+ePr0KTZs2KC07P3790hKShJfx8bG4vbt20hOTs41Vvl0i8uWLVNol99RpuoPQ+rg5+eHP/74A0ePHlVaFhcXV+ikTlTa8bqcRZOuy8XFysoKHh4eWL9+vVJRAsgaTSLXvn17/Pnnnzh//rzC8rCwMKX1nJycxGe+yv34449Kv4Tl9zr++vVrhWVaWlriXb/yu46NjIzE9T709u1bhfcTgHgXs3zdQ4cOoXz58grPdM3MzFR6f62srGBjYyOu5+bmBicnJyxatAjv3r1TOgb5+dPS0kLnzp2xf/9+XLx4Ualf9viI1IG5IYum5YaTJ0+iV69eaNGiBcLCwvIc+ZcfzA3MDURyzA1ZNCk3zJ49G7t371b4CgoKAgBMnDgRu3fvFq+NBZHfa5u2tja8vLywZ88ePH78WFx+69Ytpeu6qakpypUrp5Qbsp9DbW1tdOvWDTt37lRZFPgwL2XPDXp6enB1dYUgCEhPTweQc25QNYOHqtyQ/bEaycnJSElJUVjPyckJJiYm4npeXl4wNTXF3LlzxThUHYOlpSVatGiBn3/+WeH8AcwLVDIwL2TRpLwgn2Up+6j+S5cu4c6dO6hTp06u6+eEeYF5oaTjSHFSm3379iExMREdO3ZUubxRo0awtLREWFiYUiIrCn5+fpgwYQImTJiAMmXKKN154+PjgyVLlqBdu3bo06cPYmJisHr1alSpUgXXrl0r0lgCAgIwffp0LFiwAJ07d0aTJk1gYWEBf39/jB49GhKJBKGhoSovaG5ubti6dSvGjRuH+vXrw9jYGL6+vujfvz+2bduGYcOGISoqCk2bNkVmZiZu376Nbdu24ejRo+IfclatWoWZM2ciKioKHh4eOcZZq1Yt+Pv748cffxSnfjl//jw2bdqEzp07w9PTs0jPS2F9++232LdvHzp06ICAgAC4ubkhKSkJf//9N3bs2IHo6GiUK1dO3WESlTi8Lv9HU67LAHDt2jXxF8v79+8jPj4es2fPBpB13fb19S3UOVi9ejWaNWuGGjVq4KuvvkLlypXx8uVL/PHHH/j3339x9epVAFl/RAsNDUW7du0wZswYGBkZ4ccff4S9vb3S+zJ48GAMGzYM3bp1Q5s2bXD16lUcPXpU6Zqc3+v44MGD8ebNG7Rs2RI
VK1bEo0ePsHLlStSuXVuc0aR27drQ1tbGggULEB8fD319fbRs2RLh4eFYs2YNunTpAicnJyQmJmLDhg0wNTUVbwI7ePAgvL29Fe4OTkxMRMWKFdG9e3fUqlULxsbG+PXXX3HhwgUsXrwYQFZBY+PGjfD29ka1atUwcOBA2Nra4unTp4iKioKpqSn2798PAJg7dy6OHTsGd3d3DBkyBC4uLnj+/Dm2b9+O06dPF3i0DlFRY274j6bkhkePHqFjx46QSCTo3r07tm/frrC8Zs2aYpG4oJgbmBuIAOaGD2lKbmjWrJlSm/xaUr9+fXTu3LlQx1+Qa9vMmTNx5MgRNG/eHCNGjEBGRgZWrlyJatWqqcwN8+fPx+DBg1GvXj2cPHkSd+/eVdr//PnzERUVhYYNG+Krr76Cq6sr3rx5g0uXLuHXX38VCxdt27aFtbU1mjZtivLly+PWrVtYtWoVfHx8YGJiAiDr/QCA77//Hr169YKuri58fX0xa9YsnDx5Ej4+PrC3t0dMTAzWrFmDihUriudV1WM17t69i1atWsHPzw+urq7Q0dHB7t278fLlS/Tq1QtAVqFn7dq16N+/P+rWrYtevXrB0tISjx8/xsGDB9G0aVOsWrUKALBixQo0a9YMdevWxZAhQ+Do6Ijo6GgcPHgQV65cKdT7R1RUmBf+oyl5wc3NDW3atMGmTZuQkJCAtm3b4vnz51i5ciUMDAwwduzYQh0/8wLzQoknEKmJr6+vIJVKhaSkpBz7BAQECLq6ukJsbKzw8OFDAYCwcOFCpX4zZswQAAivXr3KcZkqTZs2FQAIgwcPVrn8p59+EpydnQV9fX3hf//7nxAcHKxye/b29oK/v38uR5sFgDBy5EiVywIDAwUAQlRUlCAIgnDmzBmhUaNGgoGBgWBjYyNMnDhROHr0qEIfQRCEd+/eCX369BHMzc0FAIK9vb24LC0tTViwYIFQrVo1QV9fX7CwsBDc3NyEmTNnCvHx8Urn6MPt5iQ9PV2YOXOm4OjoKOjq6gp2dnbC5MmThZSUFIV+/v7+gpGRkdL67u7uQrVq1VSemxkzZoivg4ODBQDCw4cPxTZ7e3vBx8dH5Tbd3d0V2hITE4XJkycLVapUEfT09IRy5coJTZo0ERYtWiSkpaXleZxEnyNelxVpynVZfr1U9ZXXOcjtPRQEQXjw4IEwYMAAwdraWtDV1RVsbW2FDh06CDt27FDod+3aNcHd3V2QSqWCra2tEBQUJPz0009K1/HMzExh0qRJQrly5QRDQ0PBy8tLuH//vsr3Kz/X8R07dght27YVrKysBD09PaFSpUrC0KFDhefPnytsa8OGDULlypUFbW1t8bxeunRJ6N27t1CpUiVBX19fsLKyEjp06CBcvHhREARBiIuLE3R0dIRt27YpbCs1NVX49ttvhVq1agkmJiaCkZGRUKtWLWHNmjVK5+/y5ctC165dhbJlywr6+vqCvb294OfnJ/z2228K/R49eiQMGDBAsLS0FPT19YXKlSsLI0eOFFJTU3N+84g+EeYGRZqQG6KionLMC9n/z60KcwNzA1FemBsUaUJuUEWeL7Zv3/7RffN7bfv9998FNzc3QU9PT6hcubKwbt06le9LcnKyMGjQIMHMzEwwMTER/Pz8hJiYGJV57OXLl8LIkSMFOzs7QVdXV7C2thZatWol/Pjjj2Kf9evXCy1atBDjc3JyEr799luFcykIghAUFCTY2toKWlpaYr767bffhE6dOgk2NjaCnp6eYGNjI/Tu3Vu4e/euIAiCcP36dQGAcP78eYVtxcbGCiNHjhT+97//CUZGRoKZmZnQsGFDpRwiP79eXl6CmZmZIJVKBScnJyEgIEDMP3LXr18XunTpIpibmwtSqVSoWrWqMG3aNJXvCdGnxLygSFPyQnJysjBr1izB1dVVMDAwEMzMzIQOHToIly9fznNd5gXmBU0lEQSOpSciIiKiohUSEoKBAwfi4cOHcHBwUHc4BbZt2zb07dsXsbGxMDMzU3c4RESlAnMDERFlFxgYiJkzZ2rsdK8//PADlixZgufPnxfZs36JiD5nzAtUnPhMcSIiIiKibMzNzbFixQoWPYiISMTcQERE2Tk4OGDp0qUsfBAREQDmhZKOzxQnIiIiIsqmbdu26g6BiIhKGOYGIiLKzs/PT90hEBFRCcK8ULJxpDgREREREREREREREREREZVafKY4ERERERERERERERERERGVWhwpTkREREREREREREREREREpRaL4kREREREREREREREREREVGrpqDuAkkAmk+HZs2cwMTGBRCJRdzhERJ8FQRCQmJgIGxsbaGmVrHu0mBeIiNSDuYGIiD5UkvMCwNxARKQOzA1ERJRdfnMDi+IAnj17Bjs7O3WHQUT0WXry5AkqVqyo7jAUMC8QEakXcwMREX2oJOYFgLmBiEidmBuIiCi7vHIDi+IATExMAGSdLFNTUzVHk7f09HQcO3YMbdu2ha6urrrDKRBNjV1T4wYYuzpoatzAp409ISEBdnZ24jW4JJHH9PDhQ/zxxx8a915q6mdQU+MGNDd2xv1pMe68aUJu0JTfGeT4ufu0NDVuQHNjZ9yf1qeOuyTnBeC/3LBx40Z07txZo95LQHM/hwBjVxdNjV1T4wYYuyqakhs06fcGTf6cAZodvybHDjB+dWP8/8lvbmBRHBCnMTE1NdWIRJWeng5DQ0OYmppq3AddU2PX1LgBxq4Omho3oJ7YS+JUUvKYTExMNPK91NTPoKbGDWhu7Iz702Lc+VeSc4Om/M4gx8/dp6WpcQOaGzvj/rTUFXdJzAvAf3Fp4nsJaO7nEGDs6qKpsWtq3ABjz01Jzw2a9HuDJn/OAM2OX5NjBxi/ujF+ZXnlhpL30A0iIiIiIiIiIiIiIiIiIqIiwqI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcVnin/GMjMzkZ6e/kn3mZ6eDh0dHaSkpCAzM/OT7vtjaGrcAGNXB02NGyja2HV1daGtrV1EkRERERERERERERERERUOi+KfIUEQ8OLFC8TFxall39bW1njy5EmeD7wvSTQ1boCxq4Omxg0Ufezm5uawtrbWuPNARERERERERERERESlB4vinyF5QdzKygqGhoaftFglk8nw7t07GBsbQ0tLc2bv19S4AcauDpoaN1B0sQuCgOTkZMTExAAAKlSoUFQhEhERERERERERERERFQiL4sVNlgk8Ogu8ewkYlwfsmwBa6ptOODMzUyyIly1b9pPvXyaTIS0tDVKpVKOKhZoaN8DY1UFT4waKNnYDAwMAQExMDKysrDiVOhERERERERER0ccoYfUGIiJNwqJ4cbq5DzgyCUh49l+bqQ3QbgHg2lEtIcmfIW5oaKiW/RPR50V+rUlPT2dRnIiIKD/4Ry4iIiIiIlKlBNYbiIg0CYvixeXmPmDbAACCYnvC86x2v81qTVR8vi8RfQq81hARERUA/8hFRER54c1TRESfpxJebyAi0gSaNa+vppBlZv0xK3uCAv5rO/JdVj8iDX
bixAlIJBLExcXlex0HBwcsW7as2GIiIiIi0kjyP3J9WBAH/vsj18196omLiIhKjpv7gGXVgU0dgJ2Dsr4vq84cQURU2rHeQERUJFgULw6Pzir/MUuBACQ8zepHVIwCAgKgra2Nb775RmnZyJEjIZFIEBAQ8OkDIyIiIqL/8I9cRESUF948RUT0+WK9gYioSLAoXhzevSzafkQfwc7ODrt27cL79+/FtpSUFISHh6NSpUpqjIyIAGQVOB6eAv7ekfWdBQ8ios8P/8hFRES54c1TRESfN9YbiIiKBIvixcG4fNH2I/oIderUga2tLXbt2iW27dq1C5UqVUKdOnXEttTUVIwePRpWVlaQSqVo1qwZLly4oLCtQ4cO4YsvvoCBgQE8PT0RHR2ttL/Tp0+jefPmMDAwgJ2dHUaPHo2kpKRiOz4ijcbpD4mICOAfuYiIKHe8eYqI6PPGegMRUZFgUbw42DcBTG0ASHLoIAFMbbP6EX0C/fr1w6ZNm8TXP//8MwYOHKjQZ+LEidi5cyc2bdqES5cuoUqVKvDy8sKbN28AAE+ePEHXrl3h6+uLK1euYPDgwfjuu+8UtvHgwQO0a9cO3bp1w7Vr17B161acPn0ao0aNKv6DJNI0nP6QiIjk+EcuIiLKDW+eIiL6vLHeQERUJFgULw5a2kC7Bf//Inui+v/X7eZn9SP6BPz8/HD69Gk8evQIjx49wpkzZ9CvXz9xeVJSEtauXYuFCxfC29sbrq6u2LBhAwwMDPDTTz8BANauXQsnJycsXrwYVatWRd++fZWeRz5v3jz07dsXY8eOhbOzM5o0aYIVK1Zg8+bNSElJ+ZSHTFSycfpDIiL6EP/IRUREueHNU0REnzeFekN2rDcQEeUXi+LFxbUj4LcZMLFWbDe1yWp37aieuOizVK5cObRv3x4hISEIDg6Gj48PypUrJy5/8OAB0tPT0bRpU7FNV1cXDRo0wK1btwAAt27dQsOGDRW227hxY4XXV69eRUhICIyNjcUvLy8vyGQyPHz4sBiPkEjDcPpDIiL6EP/IRUREueHNU0REJK836BoqtrPeQESUbzrqDqBUc+0I2NYDlrpkvfbfD9g35R+zSC0GDhyI0aNHAwBWr15dLPt49+4dhg4dKu7nQ5UqVSqWfRJpJE5/SERE2cn/yLV7CJD+/r92U5usgjj/yEVE9PmS3zy1bYCKhbx5iojos+HaEbi+E7i5B6jZG6jTN+uGKF7/iYjyhSPFi1vG//9BS88EcGzBBEVq065dO6SlpSE9PR1eXl4Ky5ycnKCnp4czZ86Ibenp6bhw4QJcXV0BAC4uLjh//rzCen/++afC67p16+LmzZuoUqWK0peenl4xHRmRBuL0h0REpIprR8DBPevnOv0B/wPA2L9ZECciov9untLO9rs1RwgSEX1e0pOzvjs2Axybs95ARFQALIoXt7R3Wd/1jNQbB332tLW1cevWLdy8eRPa2or/WTIyMsLw4cPx7bff4siRI7h58ya++uorJCcnY9CgQQCAYcOG4d69e/j2229x584dhIeHIyQkRGE7kyZNwtmzZzFq1ChcuXIF9+7dw969ezFq1KhPdZhEmoHTHxIRUU7SErO+O7XkH7mIiEiRa0fA7P9nYWvxLW+eIiL6HKXK6w3G6o2DiEgDsShe3NKSsr7rM0mR+pmamsLU1FTlsvnz56Nbt27o378/6tati/v37+Po0aOwsLAAkDX9+c6dO7Fnzx7UqlUL69atw9y5cxW2UbNmTfz++++4e/cumjdvjjp16mD69OmwsbEp9mMj0igKz47NXhjn9IdERJ+1lISs71LV/2cjIqLP3Ps3Wd+rd+PNU0REnyP5TbSsNxARFRifKV7c5EVxjhQnNQgJCYFMJkNCQoLK5Xv27BF/lkqlWLFiBVasWJHj9jp06IAOHTootA0cOFDhdf369XHs2LEctxEdHZ134ESfA/n0h0cmAQnP/mvns2OJiD5vqfFZ3/XN1BsHERGVPLJM4P3brJ8Ny6o3FiIiUg9xpLiJeuMgItJAah0pPm/ePNSvXx8mJiawsrJC586dcefOHXH5mzdv8PXXX6Nq1aowMDBApUqVMHr0aMTHxytsRyKRKH1FRER86sNRLY3TmRARUQ5cOwLD//zvdd+dnP6QiOhzl/L/v+twpDgREWX3Pg6AkPWzgYU6IyEiInWR1xs4UpyIqMDUOlL8999/x8iRI1G/fn1kZGRgypQpaNu2LW7evAkjIyM8e/YMz549w6JFi+Dq6opHjx5h2LBhePbsGXbs2KGwreDgYLRr1058bW5u/omPJgepfKY4ERHlIvX/Z3LQ1gOqtAIkOT1nnIiISj1BAFLl0yGyKE5ERNkkv876LjUDtHXVGwsREakH6w1ERIWm1qL4kSNHFF6HhITAysoKf/31F1q0aIHq1atj586d4nInJyfMmTMH/fr1Q0ZGBnR0/gvf3Nwc1tbWnyz2fOP06URElBv59IcGFiyIExF97tLeAYIs62eOFCciouzkRXFOnU5E9HnKzAAy3mf9zOnTiYgKrEQ9U1w+LXqZMmVy7WNqaqpQEAeAkSNHYvDgwahcuTKGDRuGgQMHQpJDcSE1NRWpqania/nzltPT05Genv6xh6FAKyUB2gBkOobILKJty2MsTKzp6ekQBAEymQwymaxI4ikIQRDE7+rYf2FpatwAY1cHTY0bKPrYZTIZBEFAeno6tLW1FZYV9fX2Y+SWFz78Xhwk715BB4AgNUdGCcgT6qSpcQOaGzvj/rQYd/73VRJ8yt8ZRO/eQBeAoKWDDOgCRbAffu4+LU2NG9Dc2Bn3p/Wp4y5p5yen3AB8mlgliTHQASAzKFMkf2PS1M8hwNjVRVNj19S4Acae23ZLik/6e0NKAuTzhKRr6RfJ7wuAZn/OAM2OX5NjBxi/ujF+5W3lRSLIKyBqJpPJ0LFjR8TFxeH06dMq+8TGxsLNzQ39+vXDnDlzxPagoCC0bNkShoaGOHbsGGbMmIEffvgBo0ePVrmdwMBAzJw5U6k9PDwchoaGRXNA/8/l2XZ88XI/Hli2xfWK/Yp024Who6MDa2tr2NnZQU9PT93hEFEpl5aWhidPnuDFixfIyMhQWJacnIw+ffqINzup06fMC9lViLuABg9X4rXRFzj9xdRi3RcRUUn3uecGk/f/ouXtKUjVNsaRmmuKZR9ERJqkJOUFQL2/NwBApdgTqPPkZ7wwrYVzTuOLfX9ERCXR55wbpGlv4HVjLGQSbeyvHVyk2yYi0mT5zQ0lpig+fPhwHD58GKdPn0bFihWVlickJKBNmzYoU6YM9u3bB13dnJ+dNH36dAQHB+PJkycql6u6e8vOzg6xsbFFnki1jn4H7Ysbkdl0HGQeU4pkm+np6YiMjESbNm1yPQ+qpKSk4MmTJ3BwcIBUKi2SeApCEAQkJibCxMQkx5H8JZGmxg0wdnXQ1LiBoo89JSUF0dHRsLOzU7rmJCQkoFy5ciXil5ic8sLz589x7ty5Ql1v80tyeTN0Do2DzNkLmX5hRbLNj8kT6qSpcQOaGzvj/rQYd
940ITcUx+8McpJ/z0NnU3sI5g7IGHmxSLbJz92npalxA5obO+P+tD513CUpLwA554bw8HB06tSp2M+J1tnl0I4KgqxmL2T6rvro7Wnq5xBg7OqiqbFratwAY1dFU3JDsfzeEHsXuuubQDCwQMa4e0W2WU3+nAGaHb8mxw4wfnVj/P/Jb24oEdOnjxo1CgcOHMDJkydVFsQTExPRrl07mJiYYPfu3XmenIYNGyIoKAipqanQ19dXWq6vr6+yXVdXt+g/OP//jA9tqQm0i3jbhYk3MzMTEokEWlpa0NLSKtJ48kM+HbM8Bk2hqXEDjF0dNDVuoOhj19LSgkQiUXm9KkmJOre8IP9ebPGmZU2rpWVYFlolIE+UBJoaN6C5sTPuT4tx576PkuKT/s4gl54EAJBITYt8H/zcfVqaGjegubEz7k/rU8Vd0s5NTrkB+ETnJOUtAEDLqFyR/u6gqZ9DgLGri6bGrqlxA4w9+/ZKkk/6e0NmCgBAomdSLOdBkz9ngGbHr8mxA4xf3Rh//nODWqs1giBg1KhR2L17N44fPw5HR0elPgkJCWjbti309PSwb9++fI1uvnLlCiwsLHL8ReWTSnuX9V3PWL1xEBFRyfQ+6w9bMLBQbxxERKR+qf//bFqpmXrjICKikin5TdZ3w7LqjYOIiNQjLTHruz5rDUREhaHWkeIjR45EeHg49u7dCxMTE7x48QIAYGZmBgMDA7EgnpycjC1btiAhIQEJCVl/KLK0tIS2tjb279+Ply9folGjRpBKpYiMjMTcuXMxYcIEdR7af9KyRnswURERkUrv47K+syhOREQp8Vnf9dU/DSQREZVA71kUJyL6rKXKB+AZqTcOIiINpdaR4mvXrkV8fDw8PDxQoUIF8Wvr1q0AgEuXLuHcuXP4+++/UaVKFYU+8ueF6+rqYvXq1WjcuDFq166N9evXY8mSJZgxY4Y6D+0/8qI4E1WxO3HiBCQSCeLi4tQdikoeHh4YO3asWmNwcHDAsmXL1BoDEWUjjhQ3V2sYRERUAnCkOBER5Sb5ddZ3FsWJiD5PnJWWiOijqH36dFVfAQEBALKKiDn1cXBwAAC0a9cOly9fRmJiIt69e4crV65g6NChJec5vrx7q0jIC945fXl6eqo7xDzt2rULQUFBxbqPwMBA1K5du1j3UVzu3LkDT09PlC9fHlKpFJUrV8a0adOQnp6e63pr165FzZo1YWpqClNTUzRu3BiHDx9W6DN06FA4OTnBwMAAlpaW6NSpE27fvq3Q5/Hjx/Dx8YGhoSGsrKzw7bffIiMjQ1weEhICiUQCFxcXpRi2b98OiUQiXpeICoTTpxMRkVyKvCjOkeJERKQCi+JERJ83eVGcs9ISERWKWqdP/yyU4ru3MmUCzj98g5jEFFiZSNHAsQy0tSTFsq8mTZrg+fPnSu379u3DsGHDMGLEiGLZb1EqU6aMukMo0XR1dTFgwADUrVsX5ubmuHr1Kr766iu8f/8eixYtynG9ihUrYv78+XB2doYgCNi0aRM6deqEy5cvo1q1agAANzc39O3bF5UqVcKbN28QGBiItm3b4uHDh9DW1kZmZiZ8fHxgbW2Ns2fP4vnz5xgwYAB0dXUxd+5ccV9GRkaIiYnBH3/8gcaNG4vtP/30EypVqlR8J4dKN3H6dHN1RkFERCWBfKQ4p08nIiJVWBQnIvq8iQPwTNQbBxGRhiohw6lLsVI6ffqR68/RbMFx9N7wJ8ZEXEHvDX+i2YLjOHJduXBdFPT09GBtba3w9fbtW0yYMAFTpkxBjx49xL5//fUX6tWrB0NDQzRp0gR37twRlwUEBKBz584K2x47diw8PDzE16mpqRg9ejSsrKwglUrRrFkzXLhwQVwuH7V+9OhR1KlTBwYGBmjZsiViYmJw+PBhuLi4wNTUFH369EFycrK4Xvbp0x0cHDB37lx8+eWXMDExQaVKlfDjjz8qxPbvv/+id+/eKFOmDIyMjFCvXj2cO3fuI8/mf5YsWYIaNWrAyMgIdnZ2GDFiBN69eycuDwkJgbm5OQ4cOICqVavC0NAQ3bt3R3JyMjZt2gQHBwdYWFhg9OjRyMzMFNcLDQ1FvXr1YGJiAmtra/Tp0wcxMTG5xlK5cmUMHDgQtWrVgr29PTp27Ig+ffrgjz/+yHU9X19ftG/fHs7Ozvjiiy8wZ84cGBsb488//xT7DBkyBC1atICDgwPq1q2L2bNn48mTJ4iOjgYAHDt2DDdv3sSWLVtQu3ZteHt7IygoCKtXr0ZaWpq4HR0dHfTp0wc///yz2Pbvv//ixIkT6NOnT77OOZGSlLis7xwpTkRE8meKc6Q4ERFll5n+X55gUZyI6PPEkeJERB+FRfHiJhbFS0+iOnL9OYZvuYTn8SkK7S/iUzB8y6ViK4x/KC4uDp06dYKHh4fSlOTff/89Fi9ejIsXL0JHRwdffvllgbY9ceJE7Ny5E5s2bcKlS5dQpUoVeHt74+3btwr9AgMDsWrVKpw9exZPnjyBn58fli1bhvDwcBw8eBDHjh3DypUrc93X4sWLUa9ePVy+fBkjRozA8OHDxSL+u3fv4O7ujqdPn2Lfvn24evUqJk6cCJlMVqDjyY2WlhZWrFiBGzduYNOmTTh+/DgmTpyo0Cc5ORkrVqxAREQEjhw5ghMnTqBLly44dOgQDh06hNDQUKxfvx47duwQ10lPT0dQUBCuXr2KPXv2IDo6WnwsQn7dv38fR48eRdOmTfO9TmZmJiIiIpCUlKQwkvtDSUlJCA4OhqOjI+zs7AAAf/zxB2rUqIHy5cuL/by8vJCQkIAbN24orP/ll19i27Zt4g0PISEhaNeuncK6RAUinz5daq7WMIiIqARI4UhxIiLKgfz3Bkg4yxQR0ecqtfTOSktE9Clw+vTiJJMB6SW/KC4IAt6nZ+bdEVlTps/YdwOCqu0AkAAI3HcTTauUUzmVukwmw/u0TOikZUBLSwsGutqQSAo25bpMJkOfPn2go6ODsLAwpfXnzJkDd3d3AMB3330HHx8fpKSkQCqV5rntpKQkrF27FiEhIfD29gYAbNiwAZGRkQgNDcXUqVPFvrNnzxYLtoMGDcLkyZPx4MEDVK5cGQDQvXt3REVFYdKkSTnur3379uLU75MmTcLSpUsRFRWFqlWrIjw8HK9evcKFCxfEqderVKmS39OUL9lHrs+ePRvDhg3DmjVrxPb09HSsXbsWTk5O4nGFhobi5cuXMDY2hqurKzw9PREVFYWePXsCgMKNCJUrV8aKFStQv359vHv3DsbGuf9baNKkCS5duoTU1FR89dVXmDJlSp7H8ffff6Nx48ZISUmBsbExdu/eDVdXV4U+a9aswcSJE5GUlISqVasiMjISenp6AIAXL14oFbXlr1+8eKHQXqdOHVSuXBk7duxA//79ERISgiVLluCff/7JM04iJRlp/93ly5HiRESUymeKExFRDuRTpxtYAFra6o2FiIjUIy0x6ztHihMRFQqL4sVJXhAHSvT06e/TM+E6/WiRbEsA8CIhBTUCj+Wr/81ZXjDUK9jHcMqUKfjjjz9w
/vx5mJgoPz+lZs2a4s8VKlQAAMTExOTrmc8PHjxAenq6wuhkXV1d1K9fH3fv3s1xP+XLl4ehoaFYEJe3nT9/Ptf9fbgNiUQCa2trcZrxK1euoE6dOiqfRf748WOFou+UKVPyVTzO7tdff8W8efNw+/ZtJCQkICMjAykpKUhOToahoSEAwNDQUCyIy4/LwcFBobhdvnx5henR//rrLwQGBuLq1at4+/atOLpdHne1atXw6NEjAEDz5s1x+PBhcd2tW7ciMTERV69exbfffgtbW1tMmzYNp06dEm9UAID169ejb9++AICqVaviypUriI+Px44dO+Dv74/ff/9d4Rz17dsXbdq0wfPnz7Fo0SL4+fnhzJkz+bpZIrsvv/wSwcHBqFSpEpKSktC+fXusWrWqwNshEqdOhwSQmqkzEiIiKgnkI8WZE4iIKDs+T5yIiDhSnIjoo7AoXpzkU6dLtABdA/XGUkpERERg0aJFOHjwIJydnVX20dXVFX+WjyKXF2W1tLQgCIrj3NPT0wsVS/b9fPha3pbXVOe5rWNgkPNnxsbGBleuXBFfqyqc5yU6OhodOnTA8OHDMWfOHJQpUwanT5/GoEGDkJaWJhbFVcWYW9xJSUnw8vKCl5cXwsLCYGlpicePH8PLy0t8PvehQ4fE8579OOVTmru6uiI9PR3Dhg3DlClTUK9ePYVj/nBkt56enjiK3s3NDRcuXMDy5cuxfv16sY+ZmRnMzMzg7OyMRo0awcLCArt370bv3r1hbW2tdAPDy5cvAQDW1tZK565v376YOHEiAgMD0b9/f+jo8FJKhfQ+Luu71IyjPYiI6L+R4vosihMRUTYsihMRURqL4kREH4OVnOL04fPECzhF+KdkoKuNm7O88tX3/MM3CAi+kGe/kIH10cBRuVArk8mQmJAIE1MTcfr0/Lpy5QoGDRqE+fPnw8srf/FmZ2lpievXryttV17kdXJygp6eHs6cOQN7e3sAWUXzixcvYujQoYXaZ2HVrFkTGzduxJs3b5SK3jo6Oh89lfpff/0FmUyGxYsXQ0tLCwCwbdu2j9omANy+fRuvX7/G/PnzxQL3xYsXFfrIz21eZDIZ0tPTIZPJYGBgkO9jlslkSE1NzXG5IAgQBEHs07hxY8yZMwcxMTGwsrICAERGRsLU1FRpGnYg6yaEjh07Ytu2bVi3bl2+YiJSSf5cQD4TkIiIgA9GinP6dCIiykYsihf8pngiIiol5PUGTp9ORFQoLIoXJ/HOrZI7dTqQNco3v1OYN3e2RAUzKV7Ep6h8rrgEgLWZFM2dLXN8pniGnjYM9XTEQmx+xMbGonPnzvDw8EC/fv2UnvOsrZ2/4nrLli2xcOFCbN68GY0bN8aWLVtw/fp11KlTBwBgZGSE4cOH49tvv0WZMmVQqVIl/PDDD0hOTkb//v3zHW9R6N27N+bOnYvOnTtj3rx5qFChAi5fvgwbGxs0btw4x/Xev3+vMKJaJpNBIpGgVq1aCv2qVKmC9PR0rFy5Er6+vjhz5kyRFHgrVaoEPT09rFy5EsOGDcP169cRFBSU53phYWHQ1dVFjRo1oK+vj4sXL+L7779Hly5dlEamf2jy5Mnw9vZGpUqVkJiYiPDwcJw4cQJHj2Y9EuCff/7B1q1b0bZtW1haWuLff//F/PnzYWBggPbt2wMA2rZtC1dXV/Tv3x8//PADXrx4galTp2LkyJHQ19dXud+QkBCsWbMGZcvyLn36CGJRnM8TJyL67MlkH4wUZ1GciIiyYVGciIg4fToR0UdhUbw4iSPFS3ZRvCC0tSSY4euK4VsuQQIoFMblJfAZvq4qC+If4+DBg3j06BEePXokPif8Q/b29ggJCclzO15eXpg2bRomTpyIlJQUfPnllxgwYAD+/vtvsc/8+fMhk8nQv39/JCYmol69ejh8+DDMzc2L8Ijypqenh2PHjmH8+PFo3749MjIy4OrqitWrV+e63t27d8Uiv5y7uzuOHz+u0FarVi0sWbIECxYswOTJk9GiRQvMmzcPAwYM+Ki4LS0tERISgilTpmDFihWoW7cuFi1ahI4dO+a6no6ODhYsWIC7d+9CEATY29tj5MiR+PLLL3NdLyYmBgMGDMDz589hZmaGmjVr4ujRo2jTpg0AQCqV4tSpU1i2bBnevn2L8uXLo0WLFjh79qw4KlxbWxsHDhzA8OHD0bhxYxgZGcHf3x+zZs3Kcb8GBga5TnFPlC8sihMRkVxaIsT/XXOkOBERZZf8Jus7p08nIvp8pSVmfdc3UW8cREQaikXx4pSqGSPFC6pd9QpY268uZu6/iefxKWK7tZkUM3xd0a66ctH6Y/n7+8Pf3z/PftmfF167dm2ltpkzZ2LmzJk5bkMqlWLFihVYsWKF2CaTyZCQkDVyx8PDQ2mbAQEBCAgIUGgLDAxEYGCg+PrEiRMKy6Ojo5X2/eEIbyCr2L9jx44cY80u+z6zx559n9988w2++eYbhbYPR8Tn57gAKN2Q0Lt3b/Tu3VuhLfs5y65nz57o2bNnjrHn5Keffsp1uY2NDQ4dOpRrHyDrXOfWT9W5+NDYsWMxduzYPPdDpCAlLus7i+JERCSfOl1LF9CRqjcWIiIqefhMcSIi4khxIqKPUqii+OPHj/Ho0SMkJyfD0tIS1apVy3GK4c9aWulNUu2qV0AbV2ucf/gGMYkpsDKRooFjmSIfIU5ElB8am5fkI8Wl5moNg4ioNNK43CCfOl1qBkj4f2oioqKmcXkhOxbFiYiKnMblBnm9gc8UJyIqlHwXxaOjo7F27VpERETg33//VRj1qaenh+bNm2PIkCHo1q1bgZ4VXaqJ06eXziSlrSVBYyf+MkZE6lEq8hKnTyciKlIanRvkI8U5dToRUZHR6LyQHadPJyIqEhqbGzIzgIz/n7W1lNYbiIiKW76u6qNHj0atWrXw8OFDzJ49Gzdv3kR8fDzS0tLw4sULHDp0CM2aNcP06dNRs2ZNXLhwobjj1gyl8JniREQlQanJSyyKExEVGY3PDfKR4vosihMRFQWNzwvZcaQ4EdFH0+jcIH+eOMCiOBFRIeVrpLiRkRH++ecflC2r/B9vKysrtGzZEi1btsSMGTNw5MgRPHnyBPXr1y/yYDVOWul8pjgRkbqVmrz0Pi7rO4viREQfTeNzQ0p81neOFCciKhIanxey40hxIqKPptG5QT4AT1sP0NFTbyxERBoqX0XxefPm5XuD7dq1K3QwpU4pfqY4EZE6lZq8JI4UN1drGEREpYHG5wZ5UZwjxYmIioTG54UPZaT+N0LQsIx6YyEi0mAanRtSWWsgIvpYBX4oxsOHD3Hv3j2l9nv37iE6OrooYio9OH06EVGx0+i8xOnTiYiKhUbmBvn06VJztYZBRFQaaWRe+JB8lLhEG9A3U28sRESlhMblBvkAPH0WxYmICqvARfGAgACcPXtWqf3cuXMICAgoiphKD3lRnImKiKjYaHReYlGciKhYaGRuSJEXxTlSnIioqGlkXviQ+DzxMoBWgf+UR0REKmhcbkj9/xlD9EzUGwcRkQYr8P+kL1++jKZ
Nmyq1N2rUCFeuXCmKmEoPTp9ORFTsNDYvyWRASlzWzyyKExEVKY3MDfKR4pw+nYioyGlkXviQWBTn88SJiIqKxuUGjhQnIvpoBS6KSyQSJCYmKrXHx8cjMzOzSIIqNTh9OhFRsdPYvJSWCAiyrJ85VS4RUZHSyNzAkeJERMVGI/PCh1gUJyIqchqXG8RnirPWQERUWAUuirdo0QLz5s1TSAyZmZmYN28emjVrVqTBaTwmqs+Kh4cHxo4dK752cHDAsmXL1BbPxwgICEDnzp3VHQZRvmhsXpJPna5jAOhK1RsLEVEpo5G5ISU+6ztHihMRFTmNzAsf+nD6dCIiKhIalxs4Ky0R0UfTKegKCxYsQIsWLVC1alU0b94cAHDq1CkkJCTg+PHjRR6gRivtI8VlmcCjs8C7l4BxecC+CaClre6oCsTDwwO1a9fW2OJ1YZ04cQKenp54+/YtzM3N1R1OgXXs2BFXrlxBTEwMLCws0Lp1ayxYsAA2NjYq+7958wYzZszAsWPH8PjxY1haWqJz584ICgqCmZlZjvtJSUnBsGHD8Ndff+HWrVvo0KED9uzZo9AnJCQEAwcOFF8bGRmhatWqGDt2LPr27ZvrcTg4OGDs2LEKN1MUVnR0NBwdHXH58mXUrl37o7eXl8DAQOzZs6dETCelsXmJzxMnIio2GpkbUjlSnIiouGhkXvhQ8pus7wYsihMRFRWNyw3i9Ol8pjgRUWEVeKS4q6srrl27Bj8/P8TExCAxMREDBgzA7du3Ub169eKIUXOJd2+VwkR1cx+wrDqwqQOwc1DW92XVs9qJipmnpye2bduGO3fuYOfOnXjw4AG6d++eY/9nz57h2bNnWLRoEa5fv46QkBAcOXIEgwYNynU/mZmZMDAwwOjRo9G6desc+5mamuL58+d4/vw5Ll++jLZt22LgwIG4c+dOoY+xuKSlpak7hCKnsXnpfVzWdxbFiYiKnEbmBnH69Jxv2CMiosLRyLzwIU6fTkRU5DQuN6RypDgR0ccqcFEcAGxsbDB37lwcPHgQO3bswPTp01GmDO9WVVJaR4rf3AdsGwAkPFNsT3ie1V5MhXEPDw98/fXXGDt2LCwsLFC+fHls2LABSUlJGDhwIExMTFClShUcPnxYXOf69evw9vaGsbExypcvj/79+yM2NhZA1hThv//+O5YvXw6JRAKJRILo6GhkZmZi0KBBcHR0hIGBAapWrYoVK1Z8dPxLlixBjRo1YGRkBDs7O4wYMQLv3r0Tl4eEhMDc3BwHDhxA1apVYWhoiO7duyM5ORmbNm2Cg4MDLCwsMHr0aIVpfUJDQ1GvXj2YmJjA2toaffr0QUxMzEfH+6EjR46gWbNmMDc3R9myZdGhQwc8ePBAXB4dHQ2JRIJt27ahefPmMDAwQP369XH37l1cuHAB9erVg7GxMby9vfHq1StxvQsXLqBNmzYoV64czMzM4O7ujkuXLuUZzzfffINGjRrB3t4eTZo0wXfffYc///wT6enpKvtXr14dO3fuhK+vL5ycnNCyZUvMmTMH+/fvR0ZGRo77MTIywtq1a/HVV1/B2to6x34SiQTW1tawtraGs7MzgoKCoKWlhWvXruV5LNm3s3HjRnTp0gWGhoZwdnbGvn3//Xt6+/Yt+vbtC0tLSxgYGMDZ2RnBwcEAAEdHRwBAnTp1IJFI4OHhAeC/qfDnzJkDGxsbVK1aVdxX9lHv5ubmCAkJEV//+++/6N27N8qUKQMjIyPUq1cP586dQ0hICGbOnImrV6+K/3Y+XE8dNDIviSPFzdUaBhFRaaVxuUE+UpzTpxMRFQuNywsfYlGciKhYaFRuEEeKsyhORFRYhSqKnzp1Cv369UOTJk3w9OlTAFmFudOnTxdpcBpPU4rigpAVa36+UhKAwxMBCKo2lPXtyKSsfjltIz35v58FVdvJ2aZNm1CuXDmcP38eX3/9NYYPH44ePXqgSZMmuHTpEtq2bYv+/fsjOTkZcXFxaNmyJerUqYOLFy/iyJEjePnyJfz8/AAAy5cvR+PGjfHVV1+Jo3zt7Owgk8lQsWJFbN++HTdv3sT06dPx/fffY/fu3R91mrW0tLBixQrcuHEDmzZtwvHjxzFx4kSFPsnJyVixYgUiIiJw5MgRnDhxAl26dMGhQ4dw6NAhhIaGYv369dixY4e4Tnp6OoKCgnD16lXs2bMH0dHRCAgI+KhYs0tKSsK4ceNw8eJF/Pbbb9DS0kKXLl0gk8kU+s2YMQNTp07FpUuXoKOjgz59+mDixIlYvnw5Tp06hfv372P69Oli/8TERPj7++P06dP4888/4ezsjPbt2yMxMTHfsb158wZhYWFo0qQJdHV1871efHw8TE1NoaNT4KdI5CozMxObNm0CANStW7fA68+cORN+fn64du0a2rdvj759+/4fe/cd31S9/gH8k6Tp3ruVljIKpVCwyFYRZIMoilcEfwrKRcUNF+cFBQeiV1HR68A9QK5e50VFAcGBbASBslcZHbSlMx1pcn5/pOfQTdImOeebft6vl6/QND15Oj+ePOf7fFFQYBuVN2/ePGRkZOCHH37Avn378MYbbyAyMhIAsGXLFgDAmjVrkJWVhS+//FI55tq1a3HgwAGsXr0aK1eutKuO0tJSXHHFFTh9+jS+/fZb7Nq1Cw899BCsVismTZqEf/zjH+jevbvyuzNp0iSHP1dnEjKXOD6diMilhMsGrhQnInIp4XKhNjbFiYhcQqhs4EpxIqJWc7gb9MUXX+Dmm2/GTTfdhB07dqCyshKArcG0cOFCfP/9904vUkhWK2CWm+IaDyqzCVjY+F7MjpNsK8gXJTT6Xj2A0Np3PHbGoYsGevXqhblz5wIAHn30USxatAiRkZGYMWMGAODxxx/HG2+8gb/++gtr1qxBeno6Fi5cqHz8e++9h4SEBBw8eBBdunSBt7c3/P3966wCNhgMWLBggfJ2hw4d8Mcff+Drr7/G1KlT7a61vtr7RiclJeHpp5/GnXfeiddff12532w244033kCnTp0AANdffz0+/vhj5OTkIDAwEKmpqRg6dCjWrVunNCFvu+025eM7duyIJUuWoG/fvigtLUVgoHN+9iZOnFjn7ffeew9RUVHIyMioM05ozpw5GDVqFADg/vvvx+TJk7F27VpceumlAIDp06fXWVF85ZVX1jnu0qVLERoail9++QVXXXVVszU9/PDDeO2112AymTBgwAC7m70AkJeXh6eeegq333673R/TnKKiIuVrXV5eDqPRiJdffln5Pjpi2rRpmDx5MgBg4cKFWLJkCbZs2YLRo0cjMzMT6enp6NOnDwDbz5EsKioKABAREdFgVXtAQADeeecdeHt7213H8uXLcfbsWWzdulW5QrZz587K+wMDA+Hl5dXsCnp3ETaXuFKciMhlhMsGqwWoqrkokCvFiYicTrhcqK+8Zk9xNsWJiJxGuGxQzhc8cKtWIiI3cXil+NNPP40333wTb7/9dp1VmZdeeqldY4/bDLkhDmh/pbhAevbsqfzbYDAgIiICaWlpyn0xMTEAgNzcXOzatQvr1q
1DYGCg8l9KSgoA1Bn93Zh///vfuOSSSxAVFYXAwEC8/fbbOHXqFADbFYS1j7ls2TK7al+zZg2GDRuGiy66CEFBQbj55puRn58Pk8mkPMbf379OIzUmJgZJSUl1mtsxMTF1xqNv374d48ePR2JiIoKCgnDFFVcAADIzMwEAaWlpaNeuHYKDgzFmzBi7aq3v0KFDmDx5Mjp27Ijg4GClGSs/h6z290f+XtT//tSuPScnBzNmzEBycjJCQkIQHByM0tJS5bgzZ85Uaq/f4H/wwQfx559/4qeffoLBYMAtt9wCyY7JA8XFxRg3bhxSU1Mxf/585f7u3bsr31NHv05BQUHYuXMndu7ciT///BPPPPMMZs+ejf/9738AbM3t2j8z9b9utdX+GgYEBCA4OFj5ms2cORMrVqzAxRdfjIceegh//PGHXfWlpaU51BAHgF27diE9PV27I6NqETaXKgptt1wpTkTkdMJlgzw6HQB82RQnInI24XKhPhOb4kREziZcNigrxdlrICJqKYdXih84cACDBw9ucH9ISAgKCwudUZNnkEen6/SA0U/dWi7E6G9bsW2PE38Ay66/8ONu+i/QflCDu61WK4pLShAcFAS9Xm97bkdKrTceW6fT1blPp9Mpz1NaWorx48fjueeea3CcuLi4Jp9jxYoVmDNnDl588UUMHDgQQUFBeP7557Fx40YAQJ8+fbBz507l8XLztznHjx/HVVddhZkzZ+KZZ55BeHg4fv/9d0yfPh1VVVXw9/e36/OT75PHlpeVlWHUqFEYNWoUli1bhqioKGRmZmLUqFGoqqoCAKxcuRLnzp1DYGAgAgJa9j9N48ePR/v27fH2228jPj4eVqsVPXr0UJ5D1tj3ov59tUeuT506Ffn5+XjllVfQvn17+Pj4YODAgcpxFyxYgDvuuAOBgYG2n5daIiMjERkZiS5duqBbt25ISEjApk2bMHDgwCY/j5KSEowePRpBQUH46quv6tT2/fffK3uS+/k59jur1+vrrKLu0aMHfvjhB/zrX//CNddcgzvvvFMZ2w/Y9itqSnPf7zFjxuDEiRP4/vvvsXr1agwbNgx33303XnjhhWbra+z7rtPpGlxEUHtPdke/BmoSNpfkleK+oaqWQUTkiYTLBnl0usEH8PJRtxYiIg8kXC7Up4xP1/5Fy0REohAuG6o4Pp2IqLUcborHxsbi8OHDdcb2AsDvv/+Ojh07Oqsu8VXVGp1e0xzULJ3O/ivMOl0JBMcDxVlofF9xne39na4E9IaG77ZaAaPF9nz6Fm1pb7fevXvjiy++QFJSUpP7Rnt7e8NisdS5b8OGDRg0aBDuuusu5b6jR48q//bz86vTALXH9u3bYbVa8eKLLyrN3c8++8yhYzRm//79yM/Px6JFi5CQYBtZv23btjqPad++PcLCwhAcHNygsWyP/Px8HDhwAG+//TYuv/xyAHDavjobNmzA66+/jrFjxwIATp48iby8POX90dHR8PX1vWDtctNYHnPUmOLiYowaNQo+Pj749ttv4evrW+f97du3b82n0oBer0d5eTkAIDw83GkrrqOiojB16lRMnToVl19+OR588EG88MILykrw+j/PzR0nKytLefvQoUN1phakpaXh3XffRUFBQaO1N/a7oxZhc6m80HbLleJERE4nXDZUcj9xIiJXEi4Xaqsy2ba9A7hSnIjIiYTLBrnf4MOmOBFRSzncIZsxYwbuv/9+bN68GTqdDmfOnMGyZcswZ84czJw50xU1iqnKQ8eZ6A3AaHnldf1mf83boxc13hB3s7vvvhsFBQWYPHkytm7diiNHjuDHH3/ErbfeqjTzkpKSsHnzZhw/fhx5eXmwWq1ITk7Gtm3b8OOPP+LgwYOYN28etm7d2qpaOnfuDLPZjFdffRVHjx7Fxx9/jDfffLPVn2NiYiK8vb2V43777bd46qmn7P743bt3K2O/d+7ciV27djV4TFhYGCIiIrB06VIcPnwYP//8M2bPnt3q2gEgOTkZH3/8Mfbt24fNmzfjpptuuuAK5c2bN+O1117Dzp07ceLECfz888+YPHkyOnXqpKwSP336NFJSUrBlyxYAtob4yJEjUVZWhnfffRfFxcXIzs5Gdnb2BRu7GRkZ2LlzJwoKClBUVKR8rWqTJEk53rFjx7B06VL8/PPPuPrqq1v+xWnE448/jm+++QaHDx/G3r17sXLlSnTr1g2A7QICPz8/rFq1Cjk5OSgqKmr2WFdeeSVee+01/Pnnn9i2bRvuvPPOOqvUJ0+ejNjYWEyYMAEbNmzA0aNH8cUXXygTE5KSknDs2DHs3LkTeXl5zV6Q4GrC5pKypzib4kREziZcNsgrxTk6nYjIJYTLhdrk/cT1Ru4jS0TkRMJlQ2XNnuLezAIiopZyuCn+yCOPYMqUKRg2bBhKS0sxePBg/P3vf8cdd9yBe++916FjPfvss+jbty+CgoIQHR2NCRMm4MCBA3UeU1FRgbvvvhsREREIDAzExIkTkZOTU+cxmZmZGDduHPz9/REdHY0HH3wQ1dXVjn5qzuXJe3ykXg3c8BEQXG8EeXC87f5U5zYCWyo+Ph4bNmyAxWLByJEjkZaWhgceeAChoaHKquM5c+bAYDAgNTVVGT1+xx134LrrrsOkSZPQv39/5Ofnt/p/hHr16oXFixfjueeeQ48ePbBs2TI8++yzrf4co6Ki8MEHH+Dzzz9HamoqFi1adMFR2rUNHjwY6enpyn+XXHJJg8fo9XqsWLEC27dvR48ePTBr1iz861//anXtAPDuu+/i3Llz6N27N26++Wbcd999iI6ObvZj/P398eWXX2LYsGHo2rUrpk+fjp49e+KXX36Bj49t3KjZbMaBAweUlc87duzA5s2bsXv3bnTu3BlxcXHKfydPnmz2+caOHYv09HT873//w/r165WvVW3FxcXK8bp164aXXnoJjz76KB577LFWfHUa8vb2xqOPPoqePXti8ODBMBgMWLFiBQDAy8sLS5YswVtvvYX4+Hhcc801zR7rxRdfREJCAi6//HJMmTIFc+bMUcb4y8/1008/ITo6GmPHjkVaWhoWLVoEg8F2wcvEiRMxevRoDB06FFFRUfj000+d+rk6wpm55FZcKU5E5DLCZYO8UtyHTXEiIlcQLhdqU0anR2h/EiERkUCEywZ5ER5XihMRtZjD49N1Oh3++c9/4sEHH8Thw4dRWlqK1NRUBAY6/sf4l19+wd13342+ffuiuroajz32GEaOHImMjAxlD9xZs2bhu+++w+eff46QkBDcc889uO6667BhwwYAtlHB48aNQ2xsLP744w9kZWXhlltugdFoxMKFCx2uyWmU8eke2BQHbI3vlHG2PcZLc4DAGNse4i5cIb5+/foG9x0/frzBfbX3SU5OTsaXX37Z5DG7dOmirHyt7f3338f777+vvG21WvHII484VF/92mbNmoVZs2bVue/mm29W/j1t2jRMmzatzvvnz5+P+fPn17nvgw8+qPP25MmTMXny5Dr31d8rur4hQ4Y0+5j6zzF8+HBkZGQ0+RxJSUkNjtfYc
9T/HNPT0xuswr/++ub3rE9LS8PPP//c7GPq13Ohz7c5jf2M1dbY981qtaK4uFhpINt77MZqrL2H0dy5czF37twmj/f3v/8df//73+vcV/97KYuPj8ePP/7Y4Lnk2gHbSPn//ve/jX68j49Pk+9zN2fmklspK8VDVS2DiMgTCZcNXClORORSwuVCbbWb4kRE5DTCZUMl9xQnImoth5viMm9vb6SmpqK4uBhr1qxB165dlTG+9lq1alWdtz/44ANER0dj+/btGDx4MIqKivDuu+9i+fLluPLKKwHYmpXdunXDpk2bMGDAAPz000/IyMjAmjVrEBMTg4svvhhPPfUUHn74YcyfP1/ZZ9ftlPHpHjzORG8AOlyudhVERACck0tuxfHpREQuJ0w2VNRse8KV4kRELiVMLtRmqhmf7h+ubh1ERB5KiGywmAFLzdaFXClORNRiDjfFb7jhBgwePBj33HMPysvL0bdvXxw7dgySJGHFihWYOHFii4uR98AND7f9j/727dthNpsxfPhw5TEpKSlITEzExo0bMWDAAGzcuBFpaWmIiYlRHjNq1CjMnDkTe/fubTDmGAAqKyvr7H8rr4o0m80wm80trr82XXkxvABYjX6wOOmYMrnGltRqNpshSRKsViusVqtT67KHvBJWrkEUotYNsHY1iFo34PzarVYrJEmC2WxusHLeWX9vnZFLzeWCM2tVVFfAWF1uO7ZXEKChnFCTqHUD4tbOut2Lddv/XK3l6mxw9tdCbzoHAwCrd5Cmzh3UxLrdT9TaWbd7ubtuLeUC0HQ2OLPW+vQlubaM8AtnRtTC2tUhau2i1g2w9uaO21quzgannjeUn4Ox5p9mnQ9fR6pH5PpFrh1g/Wpj/Q2PdSE6ycGZwrGxsfjxxx/Rq1cvLF++HE888QR27dqFDz/8EEuXLsWff/7ZooKtViuuvvpqFBYW4vfffwcALF++HLfeemudUAGAfv36YejQoXjuuedw++2348SJE3XGAJtMJgQEBOD777/HmDFjGjzX/PnzsWDBggb3L1++vM6euq3RMfcnpJ3+BKdC+2N7h7udckxn8PLyQmxsLBISEtRbRU9EbUZVVRVOnjyJ7OxsVFdX13mfyWTClClTUFRUhODglq+Mc0YuuSMXavMxF2L0nvsgQYdvL34f0Omd/hxERCJqq9mQenoFknO/x+Go0djbbopTj01EJDIt5QLg/vMGAOia9SVSsr/Gscgr8VfCNJc8BxGRSNpiNvhV5WHk3tmw6IxYefG7TjkmEZEnsTcbHF4pXlRUpKzkXrVqFSZOnAh/f3+MGzcODz74YIsLvvvuu7Fnzx6lIe5Kjz76KGbPnq28XVxcjISEBIwcObJVQVqb/vf9wGkgvn1nxIwd65RjysxmM1avXo0RI0bAaDRe+ANqqaiowMmTJxEYGAhfX1+n1mUPSZJQUlKCoKAg6HQ6tz9/S4laN8Da1SBq3YDza6+oqICfnx8GDx7c4G9O7VUVreGMXGoqF4YOHYrNmze36O9ts87uB/YA8AvD2HFXOe+4NVqTE2oStW5A3NpZt3ux7gsTIRucec4g03+/BsgFOnTrhfaXa+fcQU2s2/1ErZ11u5e769ZSLgBNZwMAl31N9KvWA9lAYtd0tBvCjJCxdnWIWruodQOsvTGiZINTzxvO7gf2Anq/YIx1cq8BEPvnDBC7fpFrB1i/2lj/efZmg8NN8YSEBGzcuBHh4eFYtWoVVqxYAQA4d+5ci5us99xzD1auXIlff/0V7dq1U+6PjY1FVVUVCgsLERoaqtyfk5OD2NhY5TFbtmypc7ycnBzlfY3x8fGBj49Pg/uNRqPzfnAstrG4et9g6F30w9iSei0WC3Q6HfR6PfR6969MlMcxyzWIQtS6AdauBlHrBpxfu16vh06na/TvlbP+3jojl5rLBfnWqf9jYS4BAOj8wlz6PyxOr9tNRK0bELd21u1erLv553AGV2eD078OVaUAAIN/GAwaOnfQAtbtfqLWzrrdy111aykXgKazQa7VJV+TinMAAENQFDOiEaxdHaLWLmrdAGuvfzxncHU2OLfXUAEA0HkH8nWkZohcv8i1A6xfbazf/mxwuOPxwAMP4KabbkK7du0QHx+PIUOGAAB+/fVXpKWlOXQsSZJwzz334KuvvsLPP/+MDh061Hn/JZdcAqPRiLVr1yr3HThwAJmZmRg4cCAAYODAgdi9ezdyc3OVx6xevRrBwcFITU119NNznqoy261PoHo1EBG1Ac7MJbcpt72wBb8wdesgIvJQwmVDZc0VzT7OXYFOREQ2wuVCbaZ8261/hLp1EBF5GKGyocq2uAI+QerWQUQkOIdXit91113o168fTp48iREjRigrCTt27Iinn37aoWPdfffdWL58Ob755hsEBQUhOzsbABASEgI/Pz+EhIRg+vTpmD17NsLDwxEcHIx7770XAwcOxIABAwAAI0eORGpqKm6++WY8//zzyM7Oxty5c3H33Xc3efWuW8hNce8A9WogImoDnJlLblNeaLv1C1WzCiIijyVcNlQU2W592RQnInIF4XKhNlOB7dY/XN06iIg8jFDZUGmbLAXvtrMAz2K1YEfuDpw1nUWUfxR6R/eGQW9QuywiEpzdTfHLL78c11xzDa655hr06dMHffr0qfP+cePGOfzkb7zxBgAoV2HJ3n//fUybNg0A8NJLL0Gv12PixImorKzEqFGj8PrrryuPNRgMWLlyJWbOnImBAwciICAAU6dOxZNPPulwPU5VWXP1VhsKKiIid3JFLrlNG1opzpMYInInYbOhou2sFGcuEJE7CZsLtSlNca4UJyJyBiGzoWa7pbYylXbNiTVYtGURckw5yn0x/jF4pN8jGN5+uIqVEZHo7G6Kz5gxA9988w0WLFiAdu3a4eqrr8bVV1+NQYMGQafTtejJJUm64GN8fX3x73//G//+97+bfEz79u3x/ffft6gGl+FKcSIil3JFLrlNG2mK8ySGiNxN2GyQx6f7hqhbh4sxF4jI3YTNBZkktanx6bxwiojcQchsUFaKe36vYc2JNZi9fjYk1O0d5ZpyMXv9bCwespjnDkTUYnbvKX7LLbfgiy++QF5eHl588UUUFhbib3/7G2JjY3Hbbbfh66+/Rnl5uStrFQub4pAkCWXmMhRVFqHMXGbXRRAiGzJkCB544AHl7aSkJLz88suq1dMa06ZNw4QJE9Qug6hZQudSG2iKyycxtRsfwPmTmDUn1qhUGRF5MmGzQV4p7sHj05kLRKQGYXNBVlUGWCpt//bwpviaE2sw6otRuO3H2/Dwbw/jth9vw6gvRjEfiMjphMwGeU9xb8/eU9xitWDRlkUNGuIAlPue2/IcLFaLu0sjIg9hd1Nc5uPjg7Fjx+Ktt97CmTNn8O233yIuLg7z5s1DREQErrrqKmzYsMEVtYpFaYp77kgTi9WCrdlb8f3R77E1e2udMCquLMbBcwdxvOg4TpWcwvGi4zh47iBKzCUqVtxQ/UZ2W7F+/XrodDoUFhaqXUqLXH311UhMTISvry/i4uJw880348yZ
M81+zNKlSzFkyBAEBwc79Llv3boVw4YNQ2hoKMLCwjBq1Cjs2rVLeb/8tZT/8/PzQ1paGj744IMLHtvZP386nQ5ff/21047XnA8++AChoaFuea4LETKXKgptt76halbhMjyJISK1CZUNlmrAXHPu4OOZK8WZC0SkNqFyoTZ5lbiXL2D0V7cWF+KFU0SkBqGyQe41ePj49B25OxpkQW0SJGSbsrEjd4cbqyIiT+JwU7y+/v3745lnnsHu3buxe/duDBs2DFlZWc6oTWzyPh8e2hRv7gre4spinCw5iWprdZ2PqbZW41TJKZRbNXalHQln6NCh+Oyzz3DgwAF88cUXOHLkCK6//vpmP8ZkMmH06NF47LHH7H6e0tJSjB49GomJidi8eTN+//13BAUFYdSoUTCbzXUee+DAAWRlZSEjIwO33347/vGPf2Dt2rUt+vxcqaqqSu0SXE6IXPLwleI8iSEirdF0Nsij0wGPXSnOXCAirdF0LtRWe3S6Vkf6thIvnCIirdB0NlR6dq9BdtZ01qmPIyKqz+Gm+MmTJ3Hq1Cnl7S1btuCBBx7A0qVL0alTJ8yaNeuCzak2ocpz9/m40BW8Xx3+qtmPL7IWteh5hwwZgnvvvRcPPPAAwsLCEBMTg7fffhtlZWW49dZbERQUhM6dO+OHH35QPmbPnj0YM2YMAgMDERMTg5tvvhl5eXkAbCPCf/nlF7zyyivKKt/jx4/DYrFg+vTp6NChA/z8/NC1a1csWbKkRTXXtnjxYqSlpSEgIAAJCQm46667UFpaqrxfXnm7cuVKdO3aFf7+/rj++uthMpnw4YcfIikpCWFhYbjvvvtgsZw/Gfz444/Rp08fBAUFITY2FlOmTEFubm6r661t1apVuOyyyxAaGqpcKXnkyBHl/cePH4dOp8Nnn32Gyy+/HH5+fujbty8OHjyIrVu3ok+fPggMDMSYMWNw9uz5/2nZunUrRowYgcjISISEhOCKK67Ajh0XfjF01qxZGDBgANq3b49BgwbhkUcewaZNmxo0qmt74IEH8Mgjj2DAgAF2f9779+9HQUEBnnzySXTt2hXdu3fHE088gZycHJw4caLOY6OjoxEbG4sOHTrg3nvvRfv27fHnn3/a/VyAbeT+woULcdtttyEoKAiJiYlYunSp8v6qqircc889iIuLg6+vL9q3b49nn31W+VgAuPbaa6HT6ZS358+fj4svvhjvvPMOOnToAF9fX+Xx9cf7X3zxxViwYIHydmFhIe644w7ExMTA19cXPXr0wMqVK7F+/XrceuutKCoqUn535s+f79Dn6kxC5pKHN8V5EkNEahMqGypq/t/Yyw8wGNWtxUWYC0SkNqFyoTZTge3WP1zdOlyIF04RkVqEyga51+DhK8Wj/KOc+jgiovocbopPmTIF69atAwBkZ2dj+PDh2LJlC/75z3/iySefdHqBwhJoT3FJkmAym+z6r6SyBM9uebbJK3glSHhvz3swmU2oqK5o9L+y6jLkV+TDZDY5vM/4hx9+iMjISGzZsgX33nsvZs6cib/97W8YNGgQduzYgZEjR+Lmm2+GyWRCYWEhrrzySqSnp2Pbtm1YtWoVcnJycMMNNwAAXnnlFQwcOBAzZsxAVlYWsrKykJCQAKvVinbt2uHzzz9HRkYGHn/8cfzzn//EV1813+y/EL1ejyVLlmDv3r348MMP8fPPP+Ohhx6q8xiTyYQlS5ZgxYoVWLVqFdavX49rr70W33//Pb7//nt8/PHHeOutt/Df//5X+Riz2YynnnoKu3btwtdff43jx49j2rRpraq1vrKyMsyePRvbtm3D2rVrodfrce2118JqtdZ53BNPPIG5c+dix44d8PLywpQpU/DQQw/hlVdewW+//YbDhw/j8ccfVx5fUlKCqVOn4vfff8emTZuQnJyMsWPHoqTE/jH7BQUFWLZsGQYNGgSj0bkvJHft2hURERF49913UVVVhfLycrz77rvo1q2b0nSuT5IkrFq1CqdOnUK/fv0cfs4XX3wRffr0wZ9//om77roLM2fOxIEDBwAAS5Yswbfffquskl+2bJlSx9atWwEA77//PrKyspS3AeDw4cP44osv8OWXX2Lnzp121WG1WjFmzBhs2LABn3zyCTIyMrBo0SIYDAYMGjQIL7/8MoKDg5XfnTlz5jj8uTqLkLnk4U1xnsQQkdqEyoZKz99PnLlARGoTKhdqq71S3EPxwikiUotQ2VAp7ynu2U3x3tG9EeMfAx0an46igw6x/rHoHd3bzZURkafwcvQD9uzZozR6PvvsM6SlpWHDhg346aefcOedd9ZpeLVZVgtgNtn+LUBQlVeXo//y/k47XkFFAW776Ta7Hrt5ymb4O7AvVq9evTB37lwAwKOPPopFixYhMjISM2bMAAA8/vjjeOONN/DXX39hzZo1SE9Px8KFC5WPf++995CQkICDBw+iS5cu8Pb2hr+/P2JjY5XHGAyGOqtlO3TogD/++ANff/01pk6danet9dXeOzopKQlPP/007rzzTrz++uvK/WazGW+88QY6deoEALj++uvx8ccfIycnB4GBgUhNTcXQoUOxbt06TJo0CQBw223nv9YdO3bEkiVL0LdvX5SWliIw0Dk/fxMnTqzz9nvvvYeoqChkZGSgR48eyv1z5szBqFGjAAD3338/Jk+ejLVr1+LSSy8FAEyfPr3OXttXXnllneMuXboUoaGh+OWXX3DVVVc1W9PDDz+M1157DSaTCQMGDMDKlStb8yk2KigoCOvXr8eECRPw1FNPAQCSk5Px448/wsur7p/Pdu3aAQAqKythtVrx6KOPYvDgwQ4/59ixY3HXXXcBsH2OL730EtatW4euXbsiMzMTycnJuOyyy6DT6dC+fXvl46KibC8ih4aG1vl5BmwrzD/66CPlMfZYs2YNtmzZgn379qFLly4AbD9fspCQEOh0ugbPpQYhc6m80HbrF6pmFS4jn8TkmnIbvYhKBx1i/GN4EkNELiNUNlTITXHP3E8cYC4QkfqEyoXa2kBTnBdOEZFahMoGZaV4kLp1uJhBb8Aj/R7B7PWzG7xPbpQ/3O9hGPQGd5dGRB7C4ZXiZrMZPj4+AGxNk6uvvhoAkJKSop09NtQmN8QBjx9p4m49e/ZU/m0wGBAREYG0tDTlvpiYGABAbm4udu3ahXXr1iEwMFD5LyUlBQDqjP5uzL///W9ccskliIqKQmBgIN5++21lnM5vv/1W55jLli2zq/Y1a9Zg2LBhuOiiixAUFISbb74Z+fn5MJnO/7z4+/srDXH580lKSqrT3I6JiakzHn379u0YP348EhMTERQUhCuuuAIAkJmZCQBIS0tDu3btEBwcjDFjxthVa32HDh3C5MmT0bFjRwQHByurk+XnkNX+/sjfi/rfn9q15+TkYMaMGUhOTkZISAiCg4NRWlqqHHfmzJlK7fUb/A8++CD+/PNP/PTTTzAYDLjlllscnjxQmzxmPzAwEN27dwcAlJeXY/r06bj00kuxadMmbNiwAT169MC4ceNQXl5e5+N/++037Ny5Ezt37sTSpUv
x0ksv4Y033gAALFu2rM7PzG+//dZkHbW/hnLTWf6aTZs2DTt37kTXrl1x33334aeffrLrc2vfvr1DDXEA2LVrF9q1a6c0xLVMuFyyWs6PyvXQleLySUxjeBJDRO4gVDbIK8V9PHelOHOBiNQmVC7U1gaa4lwVSERqESobKj13q9b6hrcfjsVDFsOorzsRNMY/BouHLMbw9sNVqoyIPIHDK8W7d++ON998E+PGjcPq1auV1ZNnzpxBRITn/k+6Q+TR6To94OWrbi128PPyw+Ypm+167Pac7bhr7V0XfNzDfR9Gt/Bujb7PAAM6h3WGXq+Hn5efQ7XWH4+t0+nq3KfT2U6irFYrSktLMX78eDz33HMNjhMXF9fkc6xYsQJz5szBiy++iIEDByIoKAjPP/88Nm7cCADo06dPnRHUcvO3OcePH8dVV12FmTNn4plnnkF4eDh+//13TJ8+HVVVVfD397fr85Pvk8eWl5WVYdSoURg1ahSWLVuGqKgoZGZmYtSoUaiqqgIArFy5EufOnUNgYCACAlr2P07jx49H+/bt8fbbbyM+Ph5WqxU9evRQnkPW2Pei/n21R65PnToV+fn5eOWVV9C+fXv4+Phg4MCBynEXLFiAO+64A4GBgdDr617DExkZicjISHTp0gXdunVDQkICNm3ahIEDB7boc3znnXeURrdc8/Lly3H8+HFs3LhRef7ly5cjLCwM33zzDW688Ubl4zt06IDQ0FAAQLdu3fD777/j2Wefxd13342rr74a/fufn8Zw0UUXNVlHc9/v3r1749ixY/jhhx+wZs0a3HDDDRg+fHidcfqNaez7rtfrG1xEUHtPdj8/x3431SRcLlUUAfIqOd9QNStxKfkkZs4vc2CRLMr9Mf4xeLjfwzyJISKXEiobKjx/fDpgy4WH+z6MRVsX1bmfuUBE7iBULtQmN8X9PHdPca4KJCK1CJUN8kpxAabSOsMVCVdAX7Oe8x99/oHuEd3RO7o3s4CIWs3hpvhzzz2Ha6+9Fv/6178wdepU9OrVCwDw7bfftmj/XI9UWSukdI1f6aolOp3O7hHmg+IHXXD0YZR/FHpF9YJe1/gggnB9OPyN/g2anM7Wu3dvfPHFF0hKSmow6lrm7e0Ni8VS574NGzZg0KBByghrADh69Kjybz8/P3Tu3NmhWrZv3w6r1YoXX3xR+bw/++wzh47RmP379yM/Px+LFi1CQkICAGDbtm11HtO+fXuEhYUhODi4RV/z/Px8HDhwAG+//TYuv/xyAMDvv//e6toB29f69ddfx9ixYwEAJ0+eRF5envL+6Oho+Pr6XrB2uWlcWVnZ4loaa1SbTCbo9XqlwQ9Aebv+fur1GQwGpckeFBSEoCDnjDcKDg7GpEmTMGnSJFx//fUYPXo0CgoKEB4eDqPR2ODnuSlRUVF1rnotLi7GsWPHlLfT0tJw6tQpZauB+hr73VGLcLkk7yfuHQh4eatbi4tdetGlsEq235XH+j2GzmGdeRJDRG4hVDa0gZXisqCacY/Jocn4e9rfEeUfxVwgIrcQKhdqawMrxYHzF9T+8/d/wlR9fpoeL5wiIlcSKhvayPh02ZHCI6i0ViLQGIhbUm9pss9AROQoh5viQ4YMQV5eHoqLixEWdn7s6+23366sdm3zqjx3nEntK3h10NVpjMtX8D7a71HEB8Yjuyy7zsca9UbEBMRAKm/5iGtH3H333Xj77bcxefJkPPTQQwgPD8fhw4exYsUKvPPOOzAYDEhKSsLmzZtx/PhxBAYGIjw8HMnJyfjoo4/w448/okOHDvj444+xdetWJCYmtriWzp07w2w249VXX8X48eOxYcMGvPnmm63+HBMTE+Ht7Y1XX30Vd955J/bs2aNc1WiP3bt312nW6nQ65X8AZWFhYYiIiMDSpUsRFxeHzMxMPPJI4+MvHZWcnIyPP/4Yffr0QXFxMR588MELrlDevHkztm7dissuuwxhYWE4cuQI5s2bh06dOimrxE+fPo1hw4bho48+Uv4nNjs7G9nZ2Th8+HCdzz0xMRHh4Y1fdT9ixAg8+OCDuPvuu3HvvffCarVi0aJF8PLywtChQ+s8Njc3FxUVFaisrMSmTZvw2WefNdiLvbUWL16MuLg4pKenQ6/X4/PPP0dsbKyyQj0pKUnZw93Hx6fO3+j6rrzySnzwwQcYP348QkND8fjjj8NgOP+C9BVXXIHBgwdj4sSJWLx4MTp37oz9+/dDp9Nh9OjRSEpKQmlpKdauXYtevXrB399ftQwQLpcqCm23HrxKXHbw3EFIkBDhG4EbU26sc4EJEZErCZUN8pYaHr5SHAD25O0BAAyIH4CxHceqXA0RtSVC5UJtpgLbrb/nrhSXDW8/HB/t/Qh/nv0Tf+vyN4zpMIYXThGRSwmVDZVta6W4fN7QPbI7G+JE5FQt+otiMBgaNFuSkpIQHR3tlKKEJ49P99CQkq/gjfav+/1ubF8PXy9ftAtqh6SQJCSHJSPI6L6r2eLj47FhwwZYLBaMHDkSaWlpeOCBBxAaGqqsOp4zZw4MBgNSU1OV0eN33HEHrrvuOkyaNAn9+/dHfn4+Zs6c2apaevXqhcWLF+O5555Djx49sGzZMjz77LOt/hyjoqLwwQcf4PPPP0dqaioWLVqEF154we6PHzx4MNLT05X/LrnkkgaP0ev1WLFiBbZv344ePXpg1qxZ+Ne//tXq2gHg3Xffxblz59C7d2/cfPPNuO+++y74d8Tf3x9ffvklhg0bhq5du2L69Ono2bMnfvnlF2UfILPZjAMHDtTZr/3NN99Eeno6ZsyYUedz//bbb5t8rpSUFPzvf//DX3/9hYEDB+Lyyy/HmTNnsGrVqgYj+Lt27Yq4uDh07twZjz76KKZOnYolS5a09EvTKHmUf58+fdC3b18cP34c33//vfLz/OKLL2L16tVISEhAenp6s8d69NFHccUVV+Cqq67CuHHjMGHChDr72QPAF198gb59+2Ly5MlITU3FQw89pKwOHzRoEO68805MmjQJUVFReP755536uTpKqFySV4p76H7itWXkZwAAukV0Y0OciNxOmGxQmuIh6tbhBvKLWz0ieqhcCRG1RcLkQm1tZKU4AFisFuw/tx8AcFO3m9A3ti8b4kTkcsJkg7JS3DP7DfXxvIGIXMXhleI5OTmYM2cO1q5di9zc3AZ70mplnK6qlKa4560Ulw1vPxxDE4ZiR+4OnDWdbTD6sMxs+xoEewcjxOf8C3z1f14csX79+gb3HT9+vMF9tZ8jOTkZX375ZZPH7NKli7JXeG3vv/8+3n//feVtq9V6wdXR9eurX9usWbMwa9asOvfdfPPNyr+nTZuGadOm1Xn//PnzMX/+/Dr3ffDBB3Xenjx5MiZPnlznvgt9nYcMGdLsY+o/x/Dhw5GRkdHkcyQlJTU4XmPPUf9zTE9Px9atW+s85vrrr2+29rS0NPz888/NPqaxehr7WtpjxIgRGDFiRJPvb+zztFqtKC4uvuAe7hf6mQFQZ//6GTNmKE39xowfPx7jx4+vc19Tn3dwcDBWrFhR576pU6
cqtQNAeHg43nvvvSaf74033sAbb7zR5PvdRbhcKi+03fqFqlmFW8hN8dSIVJUrIaK2RqhsUMane3ZT3GwxY3+BrdnRI5IvbhGRewmVC7WVyyvFPb8pfrToKMqry+Hv5Y+k4CS1yyGiNkCYbKiuAixVtn976CK8+uSmeFpkmsqVEJGncbgpPm3aNGRmZmLevHmIi4vjyq/GVLWNcSYGvQF9Y/s2uF+SJKUpHujhXwMiUp9wudSGVorvy98HgE1xInI/obKhoqYp7uHj0w8WHoTZakawdzASghLULoeI2hihckEmSW1qpbjcAEmNSOUKcSJyC2GyQe41AB7fbwAAk9mEw4W27S95MS0ROZvDTfHff/8dv/32Gy6++GIXlOMhPHhPcXuUV5fDKllh0Bvga/BVuxwi8nDC5VIbWSleaanEkcIjAIDuEd1VroaI2hqhskFZKe7ZTfG9eXsB2F7Y0uwLjkTksYTKBVllMWCttv27DewpzlWBRORuwmRDZYnt1ssXMDjczhHO/oL9sEgWRPlFISYgRu1yiMjDOLyneEJCQqtGYLcJbWB8enNKzbaLAgKMAXzBi4hcTrhcaiMrxQ8WHES1VI1w33DE+PMkhojcS6hsaCMrxeVmBy+UIiI1CJULMnmVuDEAMPqpW4sb7M7bDQDoHsmcICL3ECYb2shUWpmynzhXiRORCzjcFH/55ZfxyCOPNLr3LdWQg8qnbQRVffLo9ABj27wogIjcS7hcaiNNcXk/8W7h3XiBFBG5nVDZUFFku/XwleJys4MvbhGRGoTKBZmp7ewnXmmpxKFzhwBwpTgRuY8w2SAvwGsjvQY2xYnIlRyetzFp0iSYTCZ06tQJ/v7+MBqNdd5fUFDgtOKEpawUbxtBVZvFakF5dTkAINDY9j5/InI/4XKpotB26+FN8X0F3E+ciNQjVDbI49N9Q9Stw4VMZhOOFh0FwGYHEalDqFyQKfuJe/7o9AMFB5QpU3EBcWqXQ0RthDDZII9P9w5Stw432ZPPpjgRuY7DTfGXX37ZBWV4mDY8Pt1UbYIkSTDqjTDqjRf+ACKiVhIul+SV4r6hqpbhavJKcTbFiUgNQmVDGxifvq9gH6ySFdH+0Yjyj1K7HCJqg4TKBZnSFPf8leK1p4lwyhQRuYsw2dCGptIWVhTiZMlJANx2iYhcw+Gm+NSpU11Rh2eplPf5aHtNcWV0ujf3Eyci9xAul9rA+PQqSxUOFdrGH7IpTkRqECYbLGagZsqSJ49PV0YgRnC1BxGpQ5hcqK0NNcX35u0FwJwgIvcSJhsq286e4nvzbXnQPrg9Qnw8d5IWEanH4T3FAeDIkSOYO3cuJk+ejNzcXADADz/8gL179zq1OGFVtZ2gqq/UbPvcOTqdiNxJqFxqA03xQ+cOodpajRCfEI4/JCLVCJEN8ipxoG00xTkCkYhUJEQu1NaGmuLySvHukVwVSETuJUQ2tKGV4rUnhxARuYLDTfFffvkFaWlp2Lx5M7788kuUltr+KO/atQtPPPGE0wsUUhvdU9xsNaOyuhIAEGBse6vkiUgdQuWSJAHlhbZ/e3BTPKOgZnR6eCqnhhCRKoTJhsoi260xADA4PMRLGHJTnM0OIlKLMLlQWxvZU7ykqgTHi48DYBOEiNxLmGxQ9hT3/NfbOWGKiFzN4ab4I488gqeffhqrV6+Gt7e3cv+VV16JTZs2ObU4YbXRPcVNZhMAwNfLF156z31Rj4i0RahcMpcDFtvFQ/ALVbUUV+J+4kSkNmGyoaKmKe7B+4kXVhTiVOkpANwXkIjUI0wu1GYqsN16eFNcHpV7UeBFCPf17M+ViLRFmGxQptIGqVuHi0mSxAlTRORyDjfFd+/ejWuvvbbB/dHR0cjLy3NKUcJrI01xyWJB2eYtKFr5Hco2b0FJzYt6XCXufGvXrkW3bt1gsVgAAPPnz8fFF1+sak0ffPABQkNDlbddUdOqVatw+eWXw2q1OvW45FmEyiV5dLrey6OnibApTkRqEyYb5PHpvp67Xx73BSQiLRAmF2prI+PT2QAhIrUIkw1yr8HDx6dnl2UjvyIfXjovpISnqF0OEXkoh5vioaGhyMrKanD/n3/+iYsuusgpRQmvSh5p4rlBVfzTTzg8bDgyp07FmTlzkDl1KkrGTwF+2eSypvi0adMwYcIElxy7OcePH4fBYEBYWBgMBgOCgoLQvXt33H333Th06JBbanjooYcwd+5cGAwGtzxfS8yZMwdr16516jFHjx4No9GIZcuWOfW45FmEyqXa+4l76Fhxs8WMQ+dsfxu7RXRTuRoiaquEyYbKmqZ4G9hPnKvEiUhNwuRCbW2kKb43z3bxFEflEpG7CZMNlfJKcc/tNQDAnnzbeUNyWDJ8vXxVroaIPJXDTfEbb7wRDz/8MLKzs6HT6WC1WrFhwwbMmTMHt9xyiytqFI+HrxQv/uknnL7/AVRnZ9e5X8rNh+6fL8CyboNKlbWOxWJpdkXy119/jdOnT2PXrl1YuHAh9u3bh169ejm9EVzf77//jiNHjmDixIkufZ7WCgwMRESE80/WJ0+ejNdee83pxyXPIVQuVRTabj14P/HDhYdhtpoR7B2MdoHt1C6HiNooYbJBWSnu+U1xrgAkIjUJkwu1tZGm+O683QCYE0TkfsJkg7wAz8NXist50D2SF9MSkes43BRfuHAhUlJSkJCQgNLSUqSmpmLw4MEYNGgQ5s6d64oaxSPYSBNJkmA1mez6z1JSgpynnwEkqcFxdDX/nV24CJaSkqaPU16u/Ftq5Dj2GjJkCO677z489NBDCA8PR2xsLObPn1/nMYWFhbjjjjsQExMDX19f9OjRAytXrgRwfvz3t99+i9TUVPj4+CAzM7PJ55Ofo2PHjrjmmmuwZs0a9O/fH9OnT1fGmgPAN998g969e8PX1xcdO3bEggULUF1dbVdNjVmxYgVGjBgBX9+GV8i99dZbSEhIgL+/P2644QYUFRUp79u6dStGjBiB6OhoJCYmYujQodixY4fyfkmSMH/+fCQmJsLHxwfx8fG47777lPdXVlZizpw5uOiiixAQEID+/ftj/fr1TdZZf3y6vLL/hRdeQFxcHCIiInD33XfDbDY79ByjR4/Gtm3bcOTIkSafm9o2oXJJXinuG6pqGa4kj07vFtENOg9dDU9E2idMNnj4SnFJkpQVH2mRaSpXQ0RtmTC5ILNaz587eHBT/KzpLHJMOdDr9Nx6iYjcTphsqGwbe4rLk0N43kBEruTl6Ad4e3vj7bffxuOPP47du3ejtLQU6enpSE5ORnl5Ofz8/FxRpzisFsBssv1bkJEmUnk5DvS+xGnHq87JwcG+/Zp9TE7Nbdcd26Hz92/xc3344YeYPXs2Nm/ejI0bN2LatGm49NJLMWLECFitVowZMwYlJSX45JNP0KlTJ2RkZNQZQW4ymfDcc8/hnXfeQUREBKKjo+1+br1ej/vvvx/XXnsttm/fjn79+uG3337DLbfcgiVLluDyyy/HkSNHcPvttwMAnnjiCbtqqu+3337DlClTGtx/+PBhfPbZZ/jf//6H4
uJiTJ8+HXfddZcyarykpARTp07FK6+8gpKSEixduhRjx47FoUOHEBQUhC+++AIvvfQSVqxYge7duyM7Oxu7du1Sjn/PPfcgIyMDK1asQHx8PL766iuMHj0au3fvRnJysl1fo3Xr1iEuLg7r1q3D4cOHMWnSJFx88cWYMWOG3c+RkJCAmJgY/Pbbb+jUqZN93xxqU4TKpdrj0z0U9xMnIi0QJhs8fKV4jikHeeV5MOgM6BreVe1yiKgNEyYXZBWFgFQzyc4vXNVSXEmeJtIxpCP8jS1/bYiIqCWEyYaqmqa4IAvwWsIqWbE3v2Y7DU4OISIXcrgpft9992HJkiVISEhAQkKCcn9ZWRmuuuoqrFu3zqkFCkduiAMeOz5dS3r27IknnngCAJCcnIzXXnsNa9euxYgRI7BmzRps2bIF+/btQ5cuXQAAHTt2rPPxZrMZr7/+Onr16tWi509JSQFg23e8X79+WLBgAR555BFMnTpVeb6nnnoKDz30EJ544gm7aqrvxIkTiI+Pb3B/RUUFPvroI2WPm1dffRXjxo3Diy++iNjYWFx55ZUAAKvViuLiYrz11lsIDw/HL7/8gquuugqZmZmIjY3F8OHDYTQakZiYiH79bBczZGZm4v3330dmZqby3HPmzMGqVavw/vvvY+HChXZ9fcLCwvDaa6/BYDAgJSUF48aNw9q1azFjxgyHniM+Ph4nTpyw6zmp7REql9pSUzycTXEiUo8w2VBRM+XHQ1eKy6s9Ood2hp+XRl5UJKI2SZhckJkKbLc+wYCXt7q1uJA8TYQNECJSgzDZoKwU99xew/Gi4ygzl8HPyw8dQ5p/rZyIqDUcbop/9913CAsLw4IFC5T7ysrKMHr0aKcWJiw5pHR6wKvhuGst0vn5oeuO7XY91rRtG07efscFH5ew9C349+nT4H6r1YrikhIEBwVBr9dD18or7nr27Fnn7bi4OOTm5gIAdu7ciXbt2inN58Z4e3s3OIYj5PHv8ojgXbt2YcOGDXjmmWeUx1gsFlRUVMBkMtlVU33l5eWNjk5PTExUGuIAMHDgQFitVhw4cACxsbHIycnB3LlzsX79euTk5MBqtcJkMikj4v/2t7/h5ZdfRseOHTF69GiMHTsW48ePh5eXF3bv3g2LxdKgzsrKSof2De/evXudVfBxcXHYvdu2P4wjz+Hn5weTyQSixgiVS+WFtlsPbYqbrWYcPHcQAFeKE5G6hMmGypqmuG+IunW4CPeJJSKtECYXZMp+4p67Shw4v1K8RwRzgojcT5hsqPL88enyeUO38G7w0jvcsiIispvDf2F++uknXH755QgLC8MDDzyAkpISjBo1Cl5eXvjhhx8cOtavv/6Kf/3rX9i+fTuysrLw1VdfYcKECcr7m9qL9Pnnn8eDDz4IAEhKSmqwgvTZZ5/FI4884tgn5izyfuLegYAge6nqdDq7R5gHXHopvGJjUZ2T0+i+4tDp4BUTg4BLL4WusZHgViv01dXQ+/tDr3d4S/sGjEZjvafXwWq1jRizZ8SNn59fq/a83bdvHwCgQ4cOAIDS0lIsWLAA1113XYPH+vr6tmjsTmRkJM6dO+fwx02dOhX5+fl46aWXEBERgYiICFx66aWoqqoCYBtLfuDAAaxZswarV6/GXXfdhX/961/45ZdfUFpaCoPBgO3btzcY7R4YaP+onua+P448R0FBAaKioux+XmpbnJlLLqesFA9VtQxXOVp4FFXWKgQZg5AQlHDhDyAichFhskEZn+6ZTXF5BWD3yO4qV0JEbZ0wuSBTmuKeu5+4JEnnR+VGsSlORO4nTDa0gfHpvJiWiNzF4aZ4p06dsGrVKgwdOhR6vR6ffvopfHx88N133yEgwLERHmVlZejVqxduu+22RpuIWVlZdd7+4YcfMH36dEycOLHO/U8++aSyRzEABAWpeNWUcuWWZ4aUzmBAzGOP4vT9D9ia/rUb4zW95ZjHHm28Ie5mPXv2xKlTp3Dw4EGHVmbby2q1YsmSJejQoQPS09MBAL1798aBAwfQuXNnp9WUnp6OjIyMBvdnZmbizJkzyujxTZs2Qa/Xo2tX236NGzZswOuvv46xY8eiuLgYRUVFyMvLq3MMPz8/jB8/HuPHj8fdd9+NlJQU7N69G+np6bBYLMjNzcXll19u99fEEfY+R0VFBY4cOaJ8jYnqc2YuuZyHj0+XR6d3i+jWqguOiIhaS5hsqKxpinvg+HSrZEVGni0X0iLTVK6GiNo6YXJB1gaa4qdKTqGosghGvRFdQp3/mg0R0YUIkw2Vnt1vAM5vu8TzBiJytRbNoujZsydWrlyJESNGoH///li5cmWLVsCOGTMGY8aMafL9sbGxdd7+5ptvMHTo0AZ7MAcFBTV4rGqUleIaCk4nCx45EnjlZeQsfBbV2dnK/YaYGMQ+9pjt/RpwxRVXYPDgwZg4cSIWL16Mzp07Y//+/dDpdC0ag1NQUIDs7GxUVFRgz549ePnll7FlyxZ89913ykrnxx9/HFdddRUSExNx/fXXQ6/XY9euXdizZw+efvrpFtU0atQofPjhhw3u9/X1xdSpU/HCCy+guLgY9913H2644QbldyE5ORkff/wxevfujaysLDz55JN1fk8/+OADWCwW9O/fH/7+/vjkk0/g5+eH9u3bIyIiAjfddBNuueUWvPjii0hPT8fZs2exdu1a9OzZE+PGjXP461dfly5d7HqObdu2wcfHBwMHDmz1c5LnclYuuZyHN8XllR7dwrupXAkRkSDZoKwU97ymeGZxJkrMJfAx+KBTaCe1yyEiEiMXZG2gKS6vCkwJT4HRYLzAo4mIXEPz2VBdCVjNtn976ErxKksV9p/bD4ATpojI9exqiqenpze64svHxwdnzpzBpZdeqty3Y8cO51VXS05ODr777rtGm4OLFi3CU089hcTEREyZMgWzZs2Cl1fTn1plZSUqKyuVt4uLbS9Gmc1mmM3mVtWpKy+CFwCr0R+WVh6rKXKNLanVbDZDkiRYrVZljHVLBA4fjoChQ1GwZQNyMw/AEBWBTldcA53B0Oxx5T245RocIUlSg49r7O3a933++ed48MEHMXnyZJSVlaFz585YuHBhnc//QnXI75dH+/v7+6N9+/YYMmQI3nzzTXTu3Fl5zIgRI/Dtt9/i6aefxnPPPQej0YiUlBTcdtttdtXUmMmTJ+Ohhx7Cvn37lFXgkiShc+fOmDBhAsaOHYuCggKMGzcOr732mnKct99+G3feeSf69OmDiy66CAsXLsRDDz2kfH2Cg4Px/PPPY/bs2bBYLEhLS8M333yDsLAwWK1WvPvuu3jmmWfwj3/8A6dPn0ZkZCT69++PsWPHNvr1k7+3td9u7PtT+zEXeg5JkvDf//4XU6ZMga+vb6t+Zt2pNT/nanN27fL30Ww2NxiT35q/t87OpeZyobW11uZVfg46ANXGIEguygjA+XXbS76yt2to1xZnVO1bkYhaO+t2L9Zt/3O1hDuzwWm5UFFkywUvf5flglo/d7tydgEAuoZ1BSyA2eLY8/P3xf1ErZ11u5e769ZSLgBN
Z0Nra61PX3oWBgAW31BYPfC8AQD+yv0LAJAanspzB4GIWruodQOsvbnjtoQ7s8Ep5w2mc5AvGzLrfAAPO2cAbFMHq63VCPUJRYxPDDNBMKxfXay/4bEuRCdJjW0MXdeCBQvsfuInnnjC7sfWKUSna7CneG3PP/88Fi1ahDNnzsDX11e5f/HixejduzfCw8Pxxx9/4NFHH8Wtt96KxYsXN/lc8+fPb/RzWr58Ofzt3Fu7KfHnNqHv8deRF5iCDcmPtepYruDl5YXY2FgkJCTA29u71ccrshah1FoKf50/wgyeufJRC+bNm4eSkhK8/PLLapfiVvn5+ejbty/WrVuH9u3bq10OtUBVVRVOnjyJ7OxsVFdX13mfyWTClClTUFRUhOBgx1bIOTuXXJkLtQ3fOxsBVXn4tcvjOBfQ+DYLorJIFjxV9BSqUY0Hgh5ApCFS7ZKISEBtLRtG7b4HvtXFWNf1aRT7JzrlmFrxnek7bKzaiIHeAzHOv/VThoiobdJSLgDuO2+4+MTbaF/wGzLi/oZDseOddlwtWVqyFJmWTEz0n4h0b26XRkT2a0vZ4F95FiMy/oFqvTe+6/VOq46lVZsqN2Fl+UokeyVjauBUtcshIkHZmw12NcXd4UJN8ZSUFIwYMQKvvvpqs8d57733cMcdd6C0tBQ+Pj6NPqaxq7cSEhKQl5fncJDWp9v5Cby+ewDWziNgmfRpq47VFLPZjNWrV2PEiBEwGh0bMVVRUYGTJ08iKSmpzsUFLXWs+BgqqisQHxiPEO+QCz5ekiSUlJQgKChIqP1m1a67sLAQb7zxBh5++GHo9XqHPlbt2ltj69at2Lt3L6ZOnSpU7SJ/zZ1de0VFBY4fP46EhIQGf3OKi4sRGRnZopMYZ2sqF7KysrB58+YW/b1tjNcLHaCrLIH5zk1AhOua4q3JiZY6VHgIk76fhACvAPzyt1+g1zn2twpQp25nEbV21u1erPvCRMgGZ5wzyLwWXQSdpRLme/4EQhKccsz61Pq5m/bTNPyV9xeeGvgUxnVwvCnO3xf3E7V21u1e7q5bS7kANJ0Ny5cvxzXXXOO0r4nhs5ugP/Qjqse8CKm36xoEav0cVlurMfjzwaiwVOCLcV+gQ0gHh48h6u8QwNrVIGrdAGtvjCjZ4JTzhpy9ML5zBaSAKFQ/sK+VlTZNzZ+zJzY+gf8d+x9m9JiBmT1ntugY/D1RD+tXF+s/z95saNGe4gCwfft27Ntn+0PcvXt3pKe77qrO3377DQcOHMB//vOfCz62f//+qK6uxvHjx5VR0/X5+Pg02jA3Go2t/8GxVAAA9L7B0Lv4h7Al9VosFuh0Ouj1eoebq/VVW6tRUW37fAONgXYdTx7HLNcgCrXrDg8Pxz//+c8WfazatbdG37590bVrV+FqF/lr7uza9Xo9dDpdo3+vnB3Urcml5nJBvm19PlQDlSW24wVFA274HxWn1G2ng0UHAQApESnw8W78ojR7ubNuZxO1dtbtXqy7+edwJldlg1PqrK4ELLYXz4wB4S7PBXf+3JmtZhw4dwAAcHHMxa16Xv6+uJ+otbNu93JX3VrKBaDpbACc/DUpPwcA8PLA8wYAOFpwFBWWCgQaA9E5onOLLqiVifo7BLB2NYhaN8Da6x/PmVyVDU75vK2219513oFuy113/5ztLbBtxdfa8waAvydqYv3qYv32Z4PDTfHc3FzceOONWL9+PUJDQwHYVrEOHToUK1asQFRUlKOHvKB3330Xl1xyCXr16nXBx+7cuRN6vR7R0dFOr8MuVaW2W+8AdZ7fjcrMZQAAHy8fGA3i/sIRkdjUyKUWqSg6/2/fC0/WEE1GfgYAIDUiVeVKiIgEyYaK8/vQwidIvTpc4EjhEVRaKhFkDEJisGeNhSciMQmRC7WZ8m23/hHq1uEie/L2AAC6R3RvVUOciKg1hMgGudfgE6huHS5SWlWKY0XHANgygYjI1Rz+P897770XJSUl2Lt3LwoKClBQUIA9e/aguLgY9913n0PHKi0txc6dO7Fz504AwLFjx7Bz505kZmYqjykuLsbnn3+Ov//97w0+fuPGjXj55Zexa9cuHD16FMuWLcOsWbPwf//3fwgLU2l/6ypboxjenhlUtclN8QCj518AQETa5cxccqma1R7wCQYMLR7Uoln78m1XVbMpTkRaIEQ2VNY0xb2DAL1B3VqcbHfebgBAamQqmx1EpAlC5EJtnt4Uz69pikeyAUJE6hEiGyrlBXiedRGtLCM/AxIkxAfEI8LPMzOPiLTF4VflV61ahTVr1qBbt27Kfampqfj3v/+NkSNHOnSsbdu2YejQocrbs2fPBgBMnToVH3zwAQBgxYoVkCQJkydPbvDxPj4+WLFiBebPn4/Kykp06NABs2bNUo6jikoxVoo7Yyt5uSkeaPT8CwCIqGWc8bfmQpyZSy4lN8X9QlUtwxUsVosyJpdNcSLSAiGyQZ4g4qv+PojOtjfPNgKxR0QPlSshIrIRIhdklurzGeGpTfGaleI9IpkTRKQeIbLBw1eKyxfT8iIpInIXh5viVqu10dnsRqNR2YvWXkOGDLlgw+T222/H7bff3uj7evfujU2bNjn0nC6nrBTXZlNc/t6ZTCb4+fm1+DhVlipUWaoAAP5e/k6pjYg8j8lkAuD8/Z5qc2YuuZTSFFdpkokLHSs6hvLqcvh7+SMpOEntcoiIxMgGpSnueVtqyM2OtMg0lSshIrIRIhdkFYUAal4r88Bzh4rqChw6dwgAc4KI1CVENigL8DyzKb4333YxLfOAiNzF4ab4lVdeifvvvx+ffvop4uPjAQCnT5/GrFmzMGzYMKcXKJwqbQeVwWBAaGgocnNzAQD+/v7Q6XQOH6eosghWsxV+Xn4wV5lhhtmuj7NaraiqqkJFRQX0enFGKYpaN8Da1SBq3YDzapckCSaTCbm5uQgNDYXB4LqxsMLkUkWh7dYDX9jKKLDtJ54SnsIxuUSkCUJkgzw+3cezVoqXV5fjcOFhAFzxQUTaIUQuyOTR6b6hHrnt0v6C/bBIFkT4RiDGP0btcoioDRMiG6pKbLcevlKck0OIyF0c/r/r1157DVdffTWSkpKQkJAAADh58iR69OiBTz75xOkFCkeAPcVjY2MBQGmMt8S5inMory5HkHcQzN72NcQBW6OsvLwcfn5+LWrGq0XUugHWrgZR6wacX3toaKjyN8dVhMkleaW4b6iqZbgC9xMnIq0RIhsqapriHjY+XW52RPpFstlBRJohRC7IPH0/8VrTREQ7XyYizyJENnjwSvG88jxkl2VDBx1fTyIit3G4KZ6QkIAdO3ZgzZo12L9/PwCgW7duGD58uNOLE1KV9vcU1+l0iIuLQ3R0NMxm+xvaMqtkxbzv56G4qhjPXv4sOkR2sPtjzWYzfv31VwwePNil45SdTdS6AdauBlHrBpxbu9FodOkKcZkwueTB49Mz8m0rxXkSQ0RaIUQ2eOhKcWWf2IgebHYQkWYIkQsyT2+K59tygtNEiEhtQmSDxqfStoZ83tAptBMCjNrtpRCRZ3G4Kf7RRx9h0qR
JGDFiBEaMGKHcX1VVhRUrVuCWW25xaoHC0fie4rUZDIYWNaz2F+zHgdID8PPyQ8+4njAa7G+cGQwGVFdXw9fXV6hmoah1A6xdDaLWDYhZuzC55KFNcYvVgn0FXClORNoiRDZ46Epx+cUtNjuISEuEyAWZpzfFa60UJyJSkxDZIPcaPHB8unLeEMHzBiJyH4c3/rz11ltRVFTU4P6SkhLceuutTilKaB589ZZs05lNAIA+MX0caogTEbmCMLlUXmi79bCm+IniEyivLoeflx+SgpPULoeICIAg2eChK8X35u8FwGYHEWmLELkg8+CmeFFlEU4UnwDAJggRqU+IbKis2VPcA3sNvEiKiNTgcFNckqRGx+CdOnUKISEhTilKaB589ZZsU5atKT4gboDKlRARCZRLykrxUFXLcLaMAtvo9K5hXWHQu35cPhGRPYTIhoqaF+B8NVKPE7DZQURaJUQuyEwFtlv/cHXrcAH5wql2ge0Q6huqbjFE1OYJkQ3yAjyfIHXrcDJJkpTtNHpE9lC5GiJqS+wen56eng6dTgedTodhw4bBy+v8h1osFhw7dgyjR492SZFCEWh8ektUWaqwPWc7AGBAPJviRKQe4XLJQ8encz9xItISobJBaYp7zkpxNjuISGuEygWZB68U35vHaSJEpD6hsqHSM6fSnio5haLKIhj1RnQJ66J2OUTUhtjdFJ8wYQIAYOfOnRg1ahQCA8//Ifb29kZSUhImTpzo9AKFYrUAZpPt3x4WVLJdZ3ehwlKBcN9wJIcmq10OEbVhwuUSm+JERC4nVDYo49M1sgrFCeRmB1d7EJFWCJULMqUp7nkrxXfn7QYAdI/kNBEiUo9Q2aCsFPesXoOcBynhKdyelYjcyu6m+BNPPAEASEpKwqRJk+Dr6+uyooQlrxIHPHal+MYzGwHYRqc3Nl6GiMhdhMulikLbrQc1xa2SFfsL9gMAukV0U7kaIiLBsqGipinuQSvF5X0B2RQnIq0QKhdkXClORORSQmWDh64U5+h0IlKLXU3x2vtrTJ061aUFCU1uiuv0gJeGw7QVNmdtBsD9xIlIXcLlkiSdXynuQeNkM4szUWYug6/BFx1DOqpdDhG1ccJlg7JS3IOa4nxxi4g0RLhckHloUzynLAe55bnQ6/RICU9RuxwiaqOEy4aqEtutpzXFeTEtEalEb8+DunfvjhUrVqCqqqrZxx06dAgzZ87EokWLnFKccJT9xIMAD1xFXVxVrLzQxaY4EalJuFyqKgWs1bZ/e9BKcXl0epfwLvDS2z18hojIJYTLBg9bKZ5rykWuydbs6BbO6SFEpD7hckFmKrDdelhTXH49qXNoZ/gb/VWuhojaKqGyQZLO9xs8aHx6tbUa+/L3AWBTnIjcz65XsF999VU8/PDDuOuuuzBixAj06dMH8fHx8PX1xblz55CRkYHff/8de/fuxT333IOZM2e6um5tkvf48NDR6Vuzt8IqWZEUnIS4wDi1yyGiNky4XJJXiRt8AKOfurU4kbKfeDj3Eyci9QmVDZIEVBTZ/u3rGXuKy6s9OoZ0ZLODiDRBqFyQWcznJ4l4WFNcHp3OBggRqUmobKiuPL/AwoNWih8pPIIKSwUCjYFICk5SuxwiamPsaooPGzYM27Ztw++//47//Oc/WLZsGU6cOIHy8nJERkYiPT0dt9xyC2666SaEhXnOCjiHeXhTfNOZTQCA/nH9Va6EiNo64XKpvNB26xfmUZNEMgpqmuIRbIoTkfqEyobqCsBqtv3bQ8ancwQiEWmNULkgk1eJ6/Qec9GUbHfebgBA94juKldCRG2ZUNkg9xoAj2qK184Dvc6uQcZERE7j0KzTyy67DJdddpmrahGfMj7dM5vim7Nt+4kPjBuociVERDbC5JK8UtwvVNUynMkqWZVxV2yKE5GWCJEN8uh06DzmBa69+bYVgGmRaSpXQkRUlxC5IJP3E/cLA/QGdWtxIqtkZU4QkaYIkQ2VNfuJGwMAvec0j3kxLRGpyXP+mmqBfPWWT5C6dbhAdlk2jhUdg16nR5/YPmqXQ0QkFqUprpEVKE5wquQUSs2l8NZ7o2NoR7XLISISizwa1yfYI17gkiRJeXGreyRXABIRtZjcFPew0emZxZkoqSqBj8EHncM6q10OEZEYlF6DZ1xEK2NTnIjUJP4rMFriwSvFN2fZVol3j+iOEB/PGuFFRORyHtgUl/cT7xreFUa9UeVqiIgEI68U9/WM0eknS06iuKoYRr0RXUK7qF0OEZG4PLQpviff1gBJCU/huQMRkb0q5a1aPacpXl5djsOFhwGwKU5E6mBT3Jk8uCm+Kcu2n/iAuAEqV0JEJKCKQtutBzbFu4V3U7kSIiIBVRbZbj1sP/GU8BQYDWx2EBG1mIc2xffm2UanswFCROQAeaW4B/Ua9hfsh0WyINIvEjH+MWqXQ0RtEJvizlTpeUEF2MYhsilORNQK8kpx31BVy3CmjAJbU5z7iRMRtUBFTVPc1zMmMMkrALtHcHQ6EVGrmApst/7h6tbhZLvzdgNgThAROcQDt2qtPTpdp9OpXA0RtUVsijtTleeNNAGAI4VHkFeeB1+DL3pF91K7HCIi8XjY+HRJkrAvfx8ANsWJiFrEw8anyysA06LSVK6EiEhwHrhS3Gw1Y3/BfgBAWiRzgojIbh44Pl2+SIp5QERqcbgpbjAYkJub2+D+/Px8GAwGpxQlLGV8uucEFXB+dHrvmN7wMfioXA0RUV1C5JLSFA9VtQxnOVV6Stk7tnNoZ7XLISJqQPPZUFnTFPeA8enV1mplS40eERyLS0TapPlckHlgU/zwucOotFQiyBiExOBEtcshIlJoPhuUleKe02tQttPgeQMRqcThprgkSY3eX1lZCW9v71YXJDQP3VOco9OJSMuEyKXyQtuth6wUl5sfyWHJ3DuWiDRJ89ngQSvFjxQeQYWlAgHGACSFJKldDhFRozSfCzIPbIorW2xEdodex4GVRKQdms8GD1spXlRZhMySTAC2TCAiUoOXvQ9csmQJAECn0+Gdd95BYOD5P8YWiwW//vorUlJSnF+hSKo8b09xs9WMrdlbAbApTkTaIlQuKU3xUDWrcBq5Kc7R6USkNcJkgwetFN+bb1vtkRqRymYHEWmOMLkgk5vifp6zp7iyKjCSqwKJSBuEyYaqEtuth+wpLu8nnhiUiBCfEJWrIaK2yu6m+EsvvQTAdgXVm2++WWeEiLe3N5KSkvDmm286v0KReOCe4nvy9sBUbUKoTyi6hndVuxwiIoVQueRhe4pzP3Ei0iphssGDVorLL26x2UFEWiRMLshMBbZbD1opLu8fy5wgIq0QJhs8bKU4zxuISAvsboofO3YMADB06FB8+eWXCAvzjBf2ncoDx6dvOmMbnd4/rj9XfhCRpgiVSx7UFJckCRkFXClORNokTDZUFNlufcVfIaG8uMV9AYlIg4TJBZkyPt0zVoqbzCYcKTwCgDlBRNohTDZ42FRaNsWJSAvsborL1q1b54o6PIPcFPfxjKu3AO4nTkTap/lcqq4CzDX54AFN8TNlZ1
BUWQQvvReSQ5PVLoeIqFGazwYPGZ9eaanEoXOHAPDFLSLSNs3nAgCYK86fN3jISvH9BfthkSyI8otCTECM2uUQEdWh+WzwoF6DJEnK5JC0yDSVqyGitszhpvhtt93W7Pvfe++9FhcjPA8bn15mLsNfZ/8CwKY4EWmX5nOporDmHzrAA/ZMkvcTTw5NhrfBW+VqiIgap/1s8IyV4vsL9qNaqka4bzjiAuLULoeIqEmazwUAKK8Zna4zCJ8PMq4KJCIt03w2VNbsKe4t/p7iOaYc5Ffkw6AzcItWIlKVw03xc+fO1XnbbDZjz549KCwsxJVXXum0woTkYePTt+dsR7VUjXaB7dAuqJ3a5RARNUrzuSSPTvcNAfTib0PB/cSJSASazwYPWSkuNzu6R3SHTqdTuRoioqZpPheAWqPTIwAP+ZvKpjgRaZnms0FegOcBK8XlPEgOS4afl5/K1RBRW+ZwU/yrr75qcJ/VasXMmTPRqVMnpxQlrErP2udj45mNAIAB8VwlTkTapflc8qD9xIHzK8XZFCciLdN8NlTUNMV9xW6K783bC4AjEIlI+zSfC0DdpriH2JPPpjgRaZfms6HSc6bSyqPTmQdEpDanLFnT6/WYPXs2XnrpJWccTkxWC1Bdbvu3BwQVwP3EiUhcmsql8kLbrQc0xSVJYlOciISlmWyQJI9ZKS6/uNU9srvKlRAROU4zuSDzsKZ4UWURTpacBGCbKEJEJAJNZYMHrhTvEcGmOBGpy2lzXI8cOYLq6mqHPubXX3/F+PHjER8fD51Oh6+//rrO+6dNmwadTlfnv9GjR9d5TEFBAW666SYEBwcjNDQU06dPR2lpaWs/HcfJo9MBj2iK55Xn4XDhYQBAv9h+KldDROS4luSSSygrxUNVLcMZssuyca7yHLx0XkgOS1a7HCIih2kiG8wmwFpTg8B7xpZUleB48XEAXPFBROLSRC7ITDV7ivuHq1uHk8gNkPbB7RHiI27eEVHbo5lsUFaKi72nuFWyYm++bcIUzxuISG0Oj0+fPXt2nbclSUJWVha+++47TJ061aFjlZWVoVevXrjttttw3XXXNfqY0aNH4/3331fe9vHxqfP+m266CVlZWVi9ejXMZjNuvfVW3H777Vi+fLlDtbSa3BTXGQAvn+YfKwB5lXi38G4I8xV/dSMReS5n5pJLeND49IwC2yrxTqGd4GMQP+uIyHNpOhvk0ek6g9DbLsmTQ+ID4hHu6xkNHCLyXJrOBZmHrRSXm+JcJU5EWqXpbJAkoKrE9m+BzxkA4HjRcZSZy+Br8EWnUA2MpSeiNs3hpviff/5Z5229Xo+oqCi8+OKLuO222xw61pgxYzBmzJhmH+Pj44PY2NhG37dv3z6sWrUKW7duRZ8+fQAAr776KsaOHYsXXngB8fHxDtXTKnJT3DsQ0Onc97wusukMR6cTkRicmUsu4UlNcY5OJyJBaDoblNHpQUKfNygjELnag4gEoOlckHloU5w5QURapelsqK4AJKvt34KPT9+Tb8uD1IhUeOkdbkcRETmVw3+F1q1b54o6mrR+/XpER0cjLCwMV155JZ5++mlERNhOEDZu3IjQ0FClIQ4Aw4cPh16vx+bNm3Httdc2eszKykpUVlYqbxcX216YMpvNMJvNLSvUdA5GAJK3P6pbegw7yTW2uNYLkCRJWSneJ7qPU5/H1bW7iqh1A6xdDaLWDbi3dmc9hzNyqblcqH3bEnpTAQwALN4hsLrpZ8JV38e9Z23jrrqGdnXJzwh/d9yPdbsX67b/uVrL1dnQmjp1ZQXwAiD5BLv8vAFw3fdv91nbfuLdwroxE2oRtW5A3NpZt3u5u24t5QLQdDYAra/VUHoWegAW31DhzxskScLuvJqcCGVO1Mfa3U/UugHW3txxW8vV2dCq84YyW68BAMw6b0DgfsOunF0AXHfeAPD3RE2sX12sv+GxLkQnSZLUkifIzc3FgQMHAABdu3ZFdHR0Sw5zvhCdDl999RUmTJig3LdixQr4+/ujQ4cOOHLkCB577DEEBgZi48aNMBgMWLhwIT788EOlDll0dDQWLFiAmTNnNvpc8+fPx4IFCxrcv3z5cvj7+7eo/oiS/bjs8EKU+MTh59TnWnQMrThrOYtXSl6BAQb8M+Sf8NZ5q10SEXkgk8mEKVOmoKioCMHBwa0+XmtyyRW5IOt9/A0knNuIPRdNxpHo5qejaJkkSVhUvAhlUhnuCLwDCV4JapdERB6oLWRDVPFfGHTkBRT5JWJ9ytMtPo7a/lX0LxRJRbgt4DZ0NHZUuxwi8lBaygXAtecNAw8/h+iSvdje/g6cCr+0VcdSW6G1EC8UvwA99JgbMpevKxGRU7WFbPCvzMGIjAdRrffFd72WtugYWvFmyZs4ZTmFG/xvQE/vnmqXQ0Qeyt5scLgpXlxcjLvvvhuffvoprFbbCA+DwYBJkybh3//+N0JCQlpUcGNN8fqOHj2KTp06Yc2aNRg2bFiLm+KNXb2VkJCAvLy8Fgep7tBP8PpsCqxxF8Ny25oWHcNeZrMZq1evxogRI2A0Gi/8AQ76z8H/4Lltz6FPTB8sHebc0HV17a4iat0Aa1eDqHUD7q29uLgYkZGRrT6JcUYuNZULWVlZ2Lx5c6u+HoZPJ0F/dC2qr3oVUq/JLTqGo1zxfcw15WL016Nh0Bnw299+g6+Xr1OOWxt/d9yPdbsX674wEbKhNecMAKDL+BpeX/0d1sRBsNz8bYuPYy9XfP/yy/Mx4qsR0EGHX//2KwKMzt/nkL8v7idq7azbvdxdt5ZyAWg6G5YvX45rrrmmVV8Tr3eGQpezG9WTVkDqPLzFx3GEq76fa0+uxYO/PYiuYV3x6ZhPnXbc2kT9HQJYuxpErRtg7Y0RJRtadd6QvRvGd4dCCoxB9f17W3YMB7jqe2W2mHHZ55fBbDXj26u/RbvAdk47dp3n4e+Jali/ulj/efZmg8Pj02fMmIE///wT3333HQYOHAjANsb8/vvvxx133IEVK1a0vOoL6NixIyIjI3H48GEMGzYMsbGxyM3NrfOY6upqFBQUNLkPOWDbp9zHx6fB/UajseVfeGsFAEDvEwS9m374WlVvM7bmbAUADIof5LJfJFfV7mqi1g2wdjWIWjfgntqddXxn5FJzuSDftrjeykIAgFdgJODmnwdnfh8PFh0EAHQM7YggvyCnHLMp/N1xP9btXqy7+edwBldnQ6vqNJcCAPR+oW47bwCc+/07kG27KLlDSAeE+oc65ZhN4e+L+4laO+t2L3fVraVcAJrOBrnWVtVbXgAA8AqOFvq8AQD2n9sPwLafuDv+30HE3yGAtatB1LoB1l7/eM7g6mxwRq9B5xPk1u+7s79XB4sOwmw1I9QnFEmhSdDpdE47dmP4e6Ie1q8u1m9/NjjcFF+5ciV+/PFHXHbZZcp9o0aNwttvv43Ro0c7ejiHnDp1Cvn5+YiLiwMADBw4EIWFhdi+fTsuueQSAMDPP/8Mq9WK/v37u7SWBqrKbLfezl8l4U7V1mpszbY1xQfEDVC5GiKiC1Mzl+xSXmi79QtTt
YzWyijIAACkhqeqXAkR0YVpOhsqa/af9Wn9qEe17MnfA8DW7CAiEoGmcwEAJAkw5dv+7R+hbi1OsCfPlhNpkWkqV0JE1DRNZ0Ol7UJaeAeqW0cr7c7bDQDoHtnd5Q1xIiJ7ONwUj4iIaHR0SEhICMLCHHvBv7S0FIcPH1bePnbsGHbu3Inw8HCEh4djwYIFmDhxImJjY3HkyBE89NBD6Ny5M0aNGgUA6NatG0aPHo0ZM2bgzTffhNlsxj333IMbb7wR8fHxjn5qraMEldhN8Yz8DJSYSxBkDEJqBBsfRKR9zswllyg/Z7v1C1W1jNbKyLc1xbtFdFO5EiKiC9N0NlTUNMV9xW2Kyy9usSlORKLQdC4AgNkEVNtWBYreFLdKVuzNt436ZU4QkZZpOhuqSmy3HtIU7xHBPCAibdA7+gFz587F7NmzkZ2drdyXnZ2NBx98EPPmzXPoWNu2bUN6ejrS09MBALNnz0Z6ejoef/xxGAwG/PXXX7j66qvRpUsXTJ8+HZdccgl+++23OuNIli1bhpSUFAwbNgxjx47FZZddhqVLnbsPtl08ZKX4pqxNAIB+cf1g0BtUroaI6MKcmUtOZ7UCFYW2fwu+Unxf/j4AQPeI7ipXQkR0YZrOBsFXikuShL15Nc0OvrhFRILQdC4AgMk2Oh0Gb+EbIMeLj6PUXApfgy86hXZSuxwioiZpOhvkXoOP2JkgnzdwcggRaYXDK8XfeOMNHD58GImJiUhMTAQAZGZmwsfHB2fPnsVbb72lPHbHjh3NHmvIkCGQJKnJ9//4448XrCc8PBzLly+3s3oXqpJXirt2n1VX25y1GQBHpxOROJyZS05XWQxIVtu/fUPd+9xOdNZ0FmfLz0Kv06NreFe1yyEiuiBNZ4PgK8VPl55GYWUhvPRezAQiEoamcwGoOzpd8PGycgOkW0Q3eOkdftmRiMhtNJ0NHjA+vcxchqNFRwHYxqcTEWmBw/93OmHCBBeU4QE8YKV4eXU5/sz9EwCb4kQkDk3nkrxK3MsPMPqqWkpryKPTO4Z0hJ+Xn8rVEBFdmKazQV4p7ttwVKMI5P3Eu4R1gbfBW+VqiIjso+lcADxqP3Fl/1hOmCIijdN0NsgL8AReKZ6RnwEJEuIC4hDpF6l2OUREAFrQFH/iiSdcUYf4PKAp/mfOnzBbzYgNiEX74PZql0NEZBdN55Kyn7jYo9OV/cTDuZ84EYlB09lQUWS7FXR8OkcgEpGINJ0LwPnx6f7h6tbhBMwJIhKFprOhUvw9xZX9xCO55RIRaUeL5xhVVVUhNzcXVqu1zv3yqJE2p0oOKnGb4vJ+4gPiBkAn+LguImp7NJlLntIUL7A1xVMjUlWuhIjIMZrMBsHHp3MFIBGJTJO5AHjMSnGzxYz9BfsBsAlCROLQZDYoK8XF3ap1T55twhTzgIi0xOGm+MGDBzF9+nT88ccfde6XJAk6nQ4Wi8VpxQlFWSku7tVbtZviRESi0HQueUpTPJ9NcSISi6azoVJeKS7e+HSL1aJkAl/cIiKRaDoXAI9pih8sPIgqaxWCvYOREJSgdjlERM3SdDZ4wJ7iclOck0OISEscborfeuut8PLywsqVKxEXF8cVxTK5KS7oPh/nKs5hX8E+AED/uP4qV0NEZD9N51J5oe3WL1TNKlolrzwPuaZc6KBDSniK2uUQEdlF09kg8ErxY0XHUF5dDj8vP3QM6ah2OUREdtN0LgAe0xSXR6f3iOyhva8xEVE9ms4GeaW4oFNp88rzkFWWBR10XGBBRJricFN8586d2L59O1JS+MJ4HYLvKb45ezMAIDksGZF+kSpXQ0RkP03nkrJSPFTVMlpDXhGYFJIEf6O/ytUQEdlHs9kgSef3BxRwT/E9+bbVHqkRqTDoDSpXQ0RkP83mgsxDmuLcYoOIRKLpbBB8fLp8kVTHkI4IMIrZLyEiz6R39ANSU1ORl5fnilrEJr+4JehIk01nODqdiMSk6VzygPHp+/JtU0R4ZS8RiUSz2VBVBkg1Yxh9xRufruwLGMHR6UQkFs3mgsxDmuIclUtEItF0Ngg+Pl25SCqSF0kRkbbY1RQvLi5W/nvuuefw0EMPYf369cjPz6/zvuLiYlfXq12CrxTnfuJEJBJhckkZny5uU1zZTzycTXEi0jYhsqGiZj9xvRdg9FOvjhZSmuLcT5yIBCBELshMBbZb/3B162gFk9mEo0VHATAniEi7hMkGZaW4mE1xecIUL5IiIq2xa3x6aGhonT01JEnCsGHD6jxGkiTodDpYLBbnVigKgZviJ0tO4nTpaXjpvNAnpo/a5RARXZAwuVRRaLv1DVWvhlbKKKhpinOlOBFpnBDZUFnz4ppPMKClPQvtUGWpwoFzBwBwxQcRiUGIXJDJK8X9xG2KZ+RnwCpZEeMfgyj/KLXLISJqlDDZIPBKcUmSeDEtEWmWXU3xdevWuboOsVktQHW57d/e4u3zIa8S7xnVk/vFEpEQhMklwcenF1QUILssGwCQEq7BPbaIiGoRIhsqaprivuLtJ37w3EFUW6sR6hOKdoHt1C6HiOiChMgFAJAkjxifvjfftn8sGyBEpGXCZIPAe4qfKj2FosoiGPVGdAnronY5RER12NUUv+KKK1xdh9jkVeKAkCvFlf3E4zk6nYjEIEwuCd4Ul/cTTwpOQqCAVycTUdsiRDbUXikumNr7AuoEW+VORG2TELkAAJUlgNVs+7fATXE5J9gUJyItEyIbJOl8U1zA12LkVeIp4SnwNnirXA0RUV12NcVr++uvvxq9X6fTwdfXF4mJifDx8Wl1YUKRQ0pnALzE+tytkhWbszcDAAbGDVS5GiIix2k6lwRvisv7iXeL6KZyJUREjtFsNsh7ivuGuP+5W0kZgRjBZgcRiUezuQCcXyXu5Qd4izu9j6NyiUg0ms0GswmQrLZ/C7gAT86D7hHccomItMfhpvjFF1/c7MoAo9GISZMm4a233oKvr2+rihOGsp94oHB7A+4v2I+iyiIEGAO4NyARCUnTuVReaLv1C3Xv8zqJ3BTniQwRiUaz2SCvFBewKb43j2NxiUhcms0FACgvsN0KvEq8oKIAp0tPAwBSI1JVroaIyD6azQZlKq1O6KZ4WlSaypUQETWkd/QDvvrqKyQnJ2Pp0qXYuXMndu7ciaVLl6Jr165Yvnw53n33Xfz888+YO3euK+rVJmWciXghJe8n3jemL4x6o8rVEBE5TrO5ZC4Hqstt/xZ8pThf2CIi0Wg2G+SV4oKNTy8zl+Fo0VEAbIoTkZg0mwsAYJKb4uHuf24nkS+cSgpOQrC3WBlHRG2XZrOhssR2K+ACvGprtfJaEidMEZEWObxS/JlnnsErr7yCUaNGKfelpaWhXbt2mDdvHrZs2YKAgAD84x//wAsvvODUYjVLvnrLR7w9PuT9xPvH9Ve5EiKiltFsLsmrxHUG4ZofAFBYUYgzZWcA2PaBIiISiWazoUJeKS5WLmTkZ0CChNiAWET6RapdDhGRwzSbC8D58ekCrxTfk8/R6UQkHs1mg7wAT8Bew5HCI6iwVCDAGICkkCS1
[... remainder of base64-encoded PNG data for this figure output omitted (throughput vs. TPOT plot; see the plotting cell source below) ...]",
+      "text/plain": [
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create a list to store the throughput and tpot data\n", + "throughput_tpot_data = []\n", + "\n", + "# Iterate over the models, batch sizes, and arrival rates to calculate throughput and tpot\n", + "for ssm in small_model_names:\n", + " for batch_size in batch_sizes:\n", + " for arrival_rate in arrival_rates:\n", + " model_name = ssm.replace(\"/\", \"-\")\n", + " filepath = f\"/usr/FlexFlow/inference/output/specinfer_llm_meta-llama-Llama-3.1-70B-Instruct_ssm_{model_name}_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n", + " if os.path.exists(filepath):\n", + " throughput = get_throughput(filepath)\n", + " tpot = get_tpot(filepath)\n", + " throughput_tpot_data.append({\n", + " 'Model': model_name,\n", + " 'Batch Size': batch_size,\n", + " 'Arrival Rate': arrival_rate,\n", + " 'Throughput': throughput,\n", + " 'TPOT': tpot\n", + " })\n", + "\n", + "# add incremental decoding entry\n", + "for batch_size in batch_sizes:\n", + " for arrival_rate in arrival_rates:\n", + " model_name = ssm.replace(\"/\", \"-\")\n", + " filepath = f\"/usr/FlexFlow/inference/output/incr_dec_llm_meta-llama-Llama-3.1-70B-Instruct_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n", + " if os.path.exists(filepath):\n", + " throughput = get_throughput(filepath)\n", + " tpot = get_tpot(filepath)\n", + " throughput_tpot_data.append({\n", + " 'Model': \"Incr Dec (baseline)\",\n", + " 'Batch Size': batch_size,\n", + " 'Arrival Rate': arrival_rate,\n", + " 'Throughput': throughput,\n", + " 'TPOT': tpot\n", + " })\n", + "\n", + "# Convert the list to a DataFrame\n", + "throughput_tpot_df = pd.DataFrame(throughput_tpot_data)\n", + "\n", + "# Plot the data\n", + "fig, axes = plt.subplots(nrows=1, ncols=len(arrival_rates), figsize=(20, 5), sharey=True)\n", + "\n", + "for i, arrival_rate in enumerate(arrival_rates):\n", + " ax = axes[i]\n", + " for model_name in throughput_tpot_df['Model'].unique():\n", + " model_data = throughput_tpot_df[(throughput_tpot_df['Model'] == model_name) & (throughput_tpot_df['Arrival Rate'] == arrival_rate)]\n", + " ax.plot(model_data['TPOT'], model_data['Throughput'], marker='o', label=model_name)\n", + " ax.set_title(f'Arrival Rate: {arrival_rate} {\"requests/sec\" if arrival_rate != \"offline\" else \"\"}')\n", + " ax.set_xlabel('TPOT (ms/token)')\n", + " ax.set_ylabel('Output Throughput (tokens/sec)')\n", + " ax.grid(True)\n", + " if i == 0:\n", + " ax.legend(title='Model')\n", + "\n", + "plt.suptitle('Throughput vs TPOT for Different Arrival Rates\\nLLM: LLAMA-3.1-70B-Instruct\\nBatch Sizes: 4, 8')\n", + "plt.tight_layout(rect=[0, 0, 1, 0.96])\n", + "\n", + "# Save the plot as a PDF\n", + "plt.savefig('/usr/FlexFlow/benchmarking/throughput_vs_tpot.pdf')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. 
Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return ttft.mean()[1] / 1000\n", + "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return ttft.mean()[1] / 1000\n", + "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return ttft.mean()[1] / 1000\n", + "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return ttft.mean()[1] / 1000\n", + "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. 
Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return ttft.mean()[1] / 1000\n", + "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return ttft.mean()[1] / 1000\n", + "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return ttft.mean()[1] / 1000\n", + "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
To access a value by position, use `ser.iloc[pos]`\n", + " return ttft.mean()[1] / 1000\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Model Batch Size Arrival Rate TTFT\n", + "0 Zhuominc-Llama-3-330M 4 offline 236.037453\n", + "1 Zhuominc-Llama-3-330M 4 1 239.494513\n", + "2 Zhuominc-Llama-3-330M 4 2 236.035863\n", + "3 Zhuominc-Llama-3-330M 4 4 237.153932\n", + "4 Zhuominc-Llama-3-330M 4 8 237.309231\n" + ] + }, + { + "data": { + "image/png": "<base64-encoded Matplotlib 3.9.2 PNG figure omitted>"
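Note on the pandas warnings captured above: they point at the notebook's TTFT helper (the `ttft = group.apply(...)` line and the `return ttft.mean()[1] / 1000` line). A minimal warning-free sketch is shown below; it assumes the per-step log is grouped by a request identifier column (named `request_guid` here purely as an illustration, since the actual grouping key and the reason for the positional `[1]` index are not visible in the captured output), and it keeps the original `/ 1000` scaling.

    import pandas as pd

    def mean_ttft(df: pd.DataFrame) -> float:
        # Group by the request identifier; "request_guid" is an assumed column name.
        grouped = df.groupby("request_guid")
        # Per request: timestamp of the first decode step (request_step_idx == 0)
        # minus the arrival timestamp (request_step_idx == -1).
        # include_groups=False (pandas >= 2.2) silences the DataFrameGroupBy.apply
        # deprecation warning about operating on the grouping columns.
        ttft = grouped.apply(
            lambda x: x.loc[x["request_step_idx"] == 0, "timestamp"].iloc[0]
            - x.loc[x["request_step_idx"] == -1, "timestamp"].iloc[0],
            include_groups=False,
        )
        # ttft is a plain Series of per-request values, so its mean is a scalar and
        # no positional indexing (the old ttft.mean()[1]) is needed.
        return ttft.mean() / 1000

With `include_groups=False` the lambda only receives the non-grouping columns, which both removes the deprecation warning and avoids redundant work per group.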
MDBQ06dPl5+fn+rUqaM///xTo0aNUmBgoCV4le4t82DE/SNwBQAAAAAAmauccQFQZvD19dVzzz2nFStWWF4e9aTGjx+vwoULa/LkyTp16pTc3Nz03HPPWV7OU6xYMY0dO1YjRoxQr1691L17d4WFhSkiIkKDBg1S1apVVaFCBX366adP/Kj6ffdDurp166pQoUIaPny4YmJiMtSndO+lU6dOnVLTpk3l6Oiovn37qnXr1rp+/fpjjw0PD1d4eLjVtvHjx1tmy97Xr18/HThwQB06dJDJZFKnTp3Uv39/rVu3LkO1v/baaxo6dKgGDhyo+Ph4tWzZUqNGjVJoaGiqx928eVP9+/fXX3/9JQcHB1WsWFFLly5Vhw4dLG2io6OTrTHr5+dn+f2+ffsUHh4ub2/vVGcaf/bZZxo1apT69++vS5cuycvLS/369dPo0aOt2s2fP1/z58+XJBUoUEDVqlXT2rVrVaFChTTejcdzdHTUf//7Xy1atEj//POPihYtqgEDBlj+nLRr107ff/+9GjRooGvXrmnhwoXq2bNnin25u7tr6dKlGjZsmObPn6+GDRsqNDRUffv2tbSpX7++vvnmG40fP15TpkyRi4uLXn75Zcv+cePGqV+/fpbg+/7sVJPJlOq5R44cKZPJpJEjR+rvv/9W4cKFLcH2fbdv39YPP/yg9evXZ/CuSSbzw4sn5EIxMTFydXXV9evX5eLiktXl5AoJCQlau3atKl+rLFvZPrZ96bp/pu8Ez/g3c+QOjHPkBoxz5AaMc+QGjPOnLyf+HHr79m2dPn1apUqVeuQLqJ5la9as0bBhw3TkyJFkjx4DeHadPn1a5cuX19GjR1WuXLkn7mfu3LlauXKlfvrppxT3p+fvOGa4AgAAAACAXK9ly5Y6ceKE/v77b5UoUSKrywGQRmvXrlXfvn0zFLZK914e9tlnnxlSE4ErAAAAAACApCFDhmR1CQDSacCAAYb089ZbbxnSjyQxRx4AAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAADI9f755x95eHjozJkzWV1Kmm3evFkmk0nXrl2TJIWFhcnNzS1La3pSZ86ckclk0sGDB7O6FORCd+7ckY+Pj/bu3WtIf3kM6QUAAAAAAOARmv+276mda121mk903MSJE9WqVSv5+PgYW9BDNm/erAYNGujq1avZNhx9UvXr11eNGjU0c+bMrC4l3b7//ntNmjRJf/75pxISElSuXDm999576tatW6rHzJ07VwcPHlR8fLyqVKmi0NBQNW3a9LHn+uKLL7Rv3z5duXJFBw4cUI0aNaza+Pj46OzZs5IkGxsbeXp6qnnz5po6daoKFCjwyL7DwsI0ZMgQS0ifUT179tS1a9f0ww8/GNLf45hMJq1cuVKtW7d+4j6mTJmikJAQDR482DIW8+bNq+DgYA0fPlyRkZEZrpMZrgAAAAAAIFeLi4vTggUL1Lt376wuBc8od3d3ffjhh9q1a5d+++039erVS7169dKGDRseeczWrVvVuHFjrV27Vvv27VODBg0UGBioAwcOpHqumzdv6sUXX9RHH32Uartx48YpOjpaUVFRWrZsmbZu3apBgwY90fVltoSEhKwuQZK0Z88e/etf/1K1atWS7evSpYu2b9+u33//PcPnIXAFAAAAAAC52tq1a2Vvb68XXnjBsu3+4/obNmyQn5+fHBwc9Morr+jSpUtat26dKlWqJBcXF3Xu3FlxcXGW45KSkjR58mSVKlVKDg4Oql69ur799ltJ9x6bb9CggSSpQIECMplM6tmzpyRp/fr1evHFF+Xm5qaCBQvq1Vdf1cmTJzN0XSdPnlSrVq3k6ekpJycn1a5dWz///LNVGx8fH02YMEHdu3eXk5OTvL29tWrVKl2+fFmtWrWSk5OTqlWrZvWo9T///KNOnTqpWLFicnR0lK+vr77++usM1fqwxMRE9e7d23IfK1SooFmzZlm16dmzp1q3bq1JkybJ09NTbm5uGjdunO7evathw4bJ3d1dxYsX18KFC62OGz58uMqXLy9HR0eVLl1ao0aNemwgWL9+fbVp00aVKlVSmTJlNHjwYFWrVk3bt29/5DEzZ87U+++/r9q1a6tcuXKaNGmSypUrpx9//DHVc3Xr1k2jR49Wo0aNUm3n7OysIkWKqFixYmrQoIF69Oih/fv3p3rMw0JDQ1WjRg0tWbJEPj4+cnV1VceOHXXjxg1Lm2+//Va+vr5ycHBQwYIF1ahRI928eVOhoaFatGiR/v3vf8tkMslkMmnz5s2W5SGWL1+uevXqKV++fFq2bJnlXA/fo4dnlX/11VeqUqWK7O3tVbRoUQ0cOFCSLO3atGkjk8mU7tnosbGx6tKli+bPn5/iLOACBQooICBAERER6eo3JQSuAAAAAAAgV9u2bZtq1kx5KYLQ0FDNnj1bO3fu1Llz5/TGG29o5syZCg8P15o1a/TTTz/ps88+s7SfPHmyFi9erC+++EK///67hg4dqq5du2rLli0qUaKEvvvuO0nS8ePHFR0dbQkRb968qaCgIO3du1eRkZGysbFRmzZtlJSU9MTXFRsbqxYtWigyMlIHDhxQs2bNFBgYqKioKKt2M2bMUEBAgA4cOKCWLVuqW7du6t69u7p27ar9+/erTJky6t69u8xmsyTp9u3bqlmzptasWaMjR46ob9++6tatm3799dcnrvVhSUlJKl68uL755hsdPXpUo0eP1gcffKAVK1ZYtfvll190/vx5bd26VdOnT9eYMWP06quvqkCBAtq9e7fefvtt9evXT3/99ZflGGdnZ4WFheno0aOaNWuW5s+frxkzZqS5NrPZrMjISB0/flwvv/xyuq7pxo0bcnd3T/MxafX333/rxx9/VJ06ddJ97MmTJ/XDDz9o9erVWr16tbZs2aIpU6ZIkqKjo9WpUye9+eabOnbsmDZv3qy2bdvKbDYrODhYb7zxhpo1a6bo6GhFR0erbt26ln5HjBihwYMH69ixY49dRuG+uXPnasCAAerbt68OHz6sVatWqWzZspLuzU6VpIULFyo6OtryOa0GDBigli1bphpkP//889q2bVu6+k0Ja7gCAAAAAIBc7ezZs/Ly8kpx34QJExQQECBJ6t27t0JCQnTy5EmVLl1aktS+fXtt2rRJw4cPV3x8vCZNmqSff/5Z/v7+kqTSpUtr+/bt+te//qV69epZwjYPDw+rNVzbtWtndd6vvvpKhQsX1tGjR1W1atUnuq7q1aurevXqls/jx4/XypUrtWrVKsusQUlq0aKF+vXrJ0kaPXq05s6dq9q1a+v111+XdG9GqL+/vy5evGiZURkcHGw5/t1339WGDRu0YsUKPf/8809U68Ps7Ow0duxYy+dSpUpp165dWrFihd544w3Ldnd3d3366aeysbFRhQoV9PHHHysuLk4ffPCBJCkkJERTpkzR9u3b1bFjR0nSyJEjLcf7+PgoODhYERERev/991O
t6fr16ypWrJji4+Nla2urOXPmqHHjxmm+pqlTpyo2Ntaq/owYPny4Ro4cqcTERN2+fVt16tTR9OnT091PUlKSwsLC5OzsLOneDNvIyEhNnDhR0dHRunv3rtq2bStvb29Jkq+vr+VYBwcHxcfHq0iRIsn6HTJkiNq2bZuuWiZMmKD33ntPgwcPtmyrXbu2JKlw4cKSJDc3txTPl5qIiAjt37//sSGtl5eXZW3cjGCGKwAAAAAAyNVu3bqlfPnypbjvwbUePT09LY+hP7jt0qVLkqQ///xTcXFxaty4sZycnCy/Fi9e/NjlAU6cOKFOnTqpdOnScnFxsTwufX82avPmzS39ValSJU3XFRsbq+DgYFWqVElubm5ycnLSsWPHks1wffgaJetQ7f62+9eZmJio8ePHy9fXV+7u7nJyctKGDRss/S5btszq+p90xuDnn3+umjVrqnDhwnJyctK8efOS1V6lShXZ2PxfvOXp6WlVu62trQoWLGipXZKWL1+ugIAAFSlSRE5OTho5cqSl36ioKKvaJ02aZDnO2dlZBw8e1J49ezRx4kQFBQVp8+bNabqW8PBwjR07VitWrJCHh4ekjN+nYcOG6eDBg/rtt98sL3pq2bKlEhMTJcmq77fffvuR/fj4+FjCVkkqWrSo5X5Vr15dDRs2lK+vr15//XXNnz9fV69eTVN9tWrVStf1XLp0SefPn1fDhg3TddzjnDt3ToMHD9ayZcse+ef8PgcHB6slQp4UM1wBAAAAAECuVqhQoUeGSHZ2dpbfm0wmq8/3t91/7D82NlaStGbNGhUrVsyqnb29fao1BAYGytvbW/Pnz5eXl5eSkpJUtWpV3blzR5L05Zdf6tatW8lqSk1wcLA2btyoqVOnqmzZsnJwcFD79u0tfT7qGh+17f51fvLJJ5o1a5ZmzpwpX19f5c+fX0OGDLH0+9prr1k92v7wvUiLiIgIBQcHa9q0afL395ezs7M++eQT7d69+5G13681ta/Rrl271KVLF40dO1ZNmzaVq6urIiIiNG3aNEn3ZjgePHjQcuyDj//b2NhYHm+vUaOGjh07psmTJ6t+/fqPvZa33npL33zzjdXj7Bm9T4UKFbLUU65cOc2cOVP+/v7atGmTGjVqZHUdLi4uj+wntftla2urjRs3aufOnZblMz788EPt3r1bpUqVSrW+/PnzW322sbGxLEtx34Nr5zo4OKTa35Pat2+fLl26pOeee86yLTExUVu3btXs2bMtM5Yl6cqVK5aZtBlB4AoAAAAAAHI1Pz8/LV26NMP9VK5cWfb29oqKilK9evVSbJM3b15JssxClO69hOr48eOaP3++XnrpJUlK9jKmJwktd+zYoZ49e6pNmzaS7gXCZ86cSXc/KfXbqlUrde3aVdK9IPaPP/5Q5cqVJd2bCfrgjMknPUfdunXVv39/y7aMvkRMknbu3Clvb299+OGHlm0PPkKeJ08eS4j5OElJSYqPj0+1zddff60333xTERERatmypdU+I+7Tg+6HhveD+bRex+OYTCYFBAQoICBAo0ePlre3t1auXKmgoCDlzZvXaiynpnDhwrpw4YLMZrMlxH8wFHZ2dpaPj48iIyMtL5d7mJ2dXZrPd1/Dhg11+PBhq229evVSxYoVNXz4cMt9k6QjR47Iz88vXf2nhMAVAAAAAADkak2bNlVISIiuXr2a4tvL08rZ2VnBwcEaOnSokpKS9OKLL+r69evasWOHXFxc1KNHD3l7e8tkMmn16tVq0aKFHBwcVKBAARUsWFDz5s1T0aJFFRUVpREjRmT4usqVK6fvv/9egYGBMplMGjVqVIZewvVgv99++6127typAgUKaPr06bp48aIlcE3N5cuXrUI26d4j7CmdY/HixdqwYYNKlSqlJUuWaM+ePY+dVZmW2qOiohQREaHatWtrzZo1Wrly5WOPmzx5smrVqqUyZcooPj5ea9eu1ZIlSzR37lxLm5CQEP39999avHixpHvLCPTo0UOzZs1SnTp1dOHCBUn3ZnK6uro+8lxXrlxRVFSUzp8/L+neC9YkqUiRIlZrl964ccMSYJ47d07vv/++ChcubPXiqozavXu3IiMj1aRJE3l4eGj37t26fPmyKlWqJOnecgQbNmzQ8ePHVbBgwVSvq379+rp8+bI+/vhjtW/fXuvXr9e6deusZt+Ghobq7bffloeHh5o3b64bN25ox44devfddy3ni4yMVEBAgOzt7dP059XZ2TnZOsj58+dXwYIFk23ftm2bxo8fn+b78ygErgAAAAAAIFOtq1Yzq0tIla+vr5577jmtWLHC8vKoJzV+/HgVLlxYkydP1qlTp+Tm5qbnnnvO8hKnYsWKaezYsRoxYoR69eql7t27KywsTBERERo0aJCqVq2qChUq6NNPP33so+qPM336dL355puqW7euChUqpOHDhysmJiZDfUr3Xjp16tQpNW3aVI6Ojurbt69at26t69evP/bY8PBwhYeHW20bP368Zbbsff369dOBAwfUoUMHmUwmderUSf3799e6desyVPtrr72moUOHauDAgYqPj1fLli01atQohYaGpnrczZs31b9/f/31119ycHBQxYoVtXTpUnXo0MHSJjo62mqN2Xnz5unu3bsaMGCABgwYYNneo0cPhYWFPfJcq1atUq9evSyf77/sa8yYMVZ1jh49WqNHj5Z0b/Zo7dq19dNPP6lgwYJpuRVp4uLioq1bt2rmzJmKiYmRt7e3pk2bpubNm0uS+vTpo82bN6tWrVqKjY3Vpk2bLOsPP6xSpUqaM2eOJk2apPHjx6tdu3YKDg7WvHnzLG169Oih27dva8aMGQoODlahQoXUvn17y/5p06YpKChI8+fPV7FixXTmzBmdOXNGpUqV0qZNmzL0Z2bXrl26fv261fmelMn88OIJuVBMTIxcXV11/fr1VNe0gHESEhK0du1aVb5WWbayfWz70nX/TN8JyjV5wsoA4zDOkRswzpEbMM6RGzDOn76c+HPo7du3dfr0aZUqVeqxL6Z5Fq1Zs0bDhg3TkSNHrF7CBODZtmnTJrVt21anTp3K0Az1Dh06qHr16pZ/HHlYev6OY4YrAAAAAADI9Vq2bKkTJ07o77//VokSJbK6HABptHbtWn3wwQcZClvv3LkjX19fDR061JCaCFwBAAAAAAAkDRkyJKtLAJBOn3zySYb7yJs3r0aOHGlANfcwRx4AAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAADI9f755x95eHjozJkzWV1Kmm3evFkmk0nXrl2TJIWFhcnNzS1La3pSZ86ckclk0sGDB7O6FORSL7zwgr777jtD+spjSC8AAAAAAACPkDD2vad2Lrsx057ouIkTJ6pVq1by8fExtqCHbN68WQ0aNNDVq1ezbTj6pOrXr68aNWpo5syZWV1Kun3//feaNGmS/vzzTyUkJKhcuXJ677331K1bt1SPmTt3rg4ePKj4+H
hVqVJFoaGhatq06WPP9cUXX2jfvn26cuWKDhw4oBo1ali18fHx0dmzZyVJNjY28vT0VPPmzTV16lQVKFDgkX2HhYVpyJAhlpA+o3r27Klr167phx9+MKS/xzGZTFq5cqVat26druMSExMVGhqqpUuX6sKFC/Ly8lLPnj01cuRImUwmSdLIkSM1dOhQtWnTRjY2GZujygxXAAAAAACQq8XFxWnBggXq3bt3VpeCZ5S7u7s+/PBD7dq1S7/99pt69eqlXr16acOGDY88ZuvWrWrcuLHWrl2rffv2qUGDBgoMDNSBAwdSPdfNmzf14osv6qOPPkq13bhx4xQdHa2oqCgtW7ZMW7du1aBBg57o+jJbQkJClp7/o48+0ty5czV79mwdO3ZMH330kT7++GN99tlnljbNmzfXjRs3tG7dugyfj8AVAAAAAADkamvXrpW9vb1eeOEFy7b7j+tv2LBBfn5+cnBw0CuvvKJLly5p3bp1qlSpklxcXNS5c2fFxcVZjktKStLkyZNVqlQpOTg4qHr16vr2228l3XtsvkGDBpKkAgUKyGQyqWfPnpKk9evX68UXX5Sbm5sKFiyoV199VSdPnszQdZ08eVKtWrWSp6ennJycVLt2bf38889WbXx8fDRhwgR1795dTk5O8vb21qpVq3T58mW1atVKTk5Oqlatmvbu3Ws55p9//lGnTp1UrFgxOTo6ytfXV19//XWGan1YYmKievfubbmPFSpU0KxZs6za9OzZU61bt9akSZPk6ekpNzc3jRs3Tnfv3tWwYcPk7u6u4sWLa+HChVbHDR8+XOXLl5ejo6NKly6tUaNGPTYQrF+/vtq0aaNKlSqpTJkyGjx4sKpVq6bt27c/8piZM2fq/fffV+3atVWuXDlNmjRJ5cqV048//pjqubp166bRo0erUaNGqbZzdnZWkSJFVKxYMTVo0EA9evTQ/v37Uz3mYaGhoapRo4aWLFkiHx8fubq6qmPHjrpx44alzbfffitfX185ODioYMGCatSokW7evKnQ0FAtWrRI//73v2UymWQymbR582bL8hDLly9XvXr1lC9fPi1btsxyrofv0cOzyr/66itVqVJF9vb2Klq0qAYOHChJlnZt2rSRyWRK12z0nTt3qlWrVmrZsqV8fHzUvn17NWnSRL/++qulja2trVq0aKGIiIh03cOUELgCAAAAAIBcbdu2bapZs2aK+0JDQzV79mzt3LlT586d0xtvvKGZM2cqPDxca9as0U8//WQ1S27y5MlavHixvvjiC/3+++8aOnSounbtqi1btqhEiRKWNSKPHz+u6OhoS4h48+ZNBQUFae/evYqMjJSNjY3atGmjpKSkJ76u2NhYtWjRQpGRkTpw4ICaNWumwMBARUVFWbWbMWOGAgICdODAAbVs2VLdunVT9+7d1bVrV+3fv19lypRR9+7dZTabJUm3b99WzZo1tWbNGh05ckR9+/ZVt27drMKrjEpKSlLx4sX1zTff6OjRoxo9erQ++OADrVixwqrdL7/8ovPnz2vr1q2aPn26xowZo1dffVUFChTQ7t279fbbb6tfv37666+/LMc4OzsrLCxMR48e1axZszR//nzNmDEjzbWZzWZFRkbq+PHjevnll9N1TTdu3JC7u3uaj0mrv//+Wz/++KPq1KmT7mNPnjypH374QatXr9bq1au1ZcsWTZkyRZIUHR2tTp066c0339SxY8e0efNmtW3bVmazWcHBwXrjjTfUrFkzRUdHKzo6WnXr1rX0O2LECA0ePFjHjh177DIK982dO1cDBgxQ3759dfjwYa1atUply5aVJO3Zs0eStHDhQkVHR1s+p0XdunUVGRmpP/74Q5J06NAhbd++Xc2bN7dq9/zzz2vbtm1p7vdRWMMVAADkGOlZH+5J13cDAAA5z9mzZ+Xl5ZXivgkTJiggIECS1Lt3b4WEhOjkyZMqXbq0JKl9+/batGmThg8frvj4eE2aNEk///yz/P39JUmlS5fW9u3b9a9//Uv16tWzhG0eHh5Wa7i2a9fO6rxfffWVChcurKNHj6pq1apPdF3Vq1dX9erVLZ/Hjx+vlStXatWqVZZZg5LUokUL9evXT5I0evRozZ07V7Vr19brr78u6d6MUH9/f128eNEyozI4ONhy/LvvvqsNGzZoxYoVev7555+o1ofZ2dlp7Nixls+lSpXSrl27tGLFCr3xxhuW7e7u7vr0009lY2OjChUq6OOPP1ZcXJw++OADSVJISIimTJmi7du3q2PHjpLurdV5n4+Pj4KDgxUREaH3338/1ZquX7+uYsWKKT4+Xra2tpozZ44aN26c5muaOnWqYmNjrerPiOHDh2vkyJFKTEzU7du3VadOHU2fPj3d/SQlJSksLEzOzs6S7s2wjYyM1MSJExUdHa27d++qbdu28vb2liT5+vpajnVwcFB8fLyKFCmSrN8hQ4aobdu26aplwoQJeu+99zR48GDLttq1a0uSChcuLElyc3NL8XypGTFihGJiYlSxYkXZ2toqMTFREydOVJcuXazaeXl56dy5c0pKSsrQOq7McAUAAAAAALnarVu3lC9fvhT3VatWzfJ7T09Py2PoD267dOmSJOnPP/9UXFycGjduLCcnJ8uvxYsXP3Z5gBMnTqhTp04qXbq0XFxcLI9L35+N2rx5c0t/VapUSdN1xcbGKjg4WJUqVZKbm5ucnJx07NixZDNcH75GyTpUu7/t/nUmJiZq/Pjx8vX1lbu7u5ycnLRhwwZLv8uWLbO6/iedMfj555+rZs2aKly4sJycnDRv3rxktVepUsUqGPP09LSq3dbWVgULFrTULknLly9XQECAihQpIicnJ40cOdLSb1RUlFXtkyZNshzn7OysgwcPas+ePZo4caKCgoK0efPmNF1LeHi4xo4dqxUrVsjDw0NSxu/TsGHDdPDgQf3222+KjIyUJLVs2VKJiYmSZNX322+//ch+fHx8LGGrJBUtWtRyv6pXr66GDRvK19dXr7/+uubPn6+rV6+mqb5atWql63ouXbqk8+fPq2HDhuk6Li1WrFihZcuWKTw8XPv379eiRYs0depULVq0yKqdg4ODkpKSFB8fn6HzMcMVAAAAAADkaoUKFXpkiGRnZ2f5vclksvp8f9v9x/5jY2MlSWvWrFGxYsWs2tnb26daQ2BgoLy9vTV//nx5eXkpKSlJVatW1Z07dyRJX375pW7dupWsptQEBwdr48aNmjp1qsqWLSsHBwe1b9/e0uejrvFR2+5f5yeffKJZs2Zp5syZ8vX1Vf78+TVkyBBLv6+99prVo+0P34u0iIiIUHBwsKZNmyZ/f385Ozvrk08+0e7dux9Z+/1aU/sa7dq1S126dNHYsWPVtGlTubq6KiIiQtOm3Xv6ycvLSwcPHrQc++Dj/zY2NpbH22vUqKFjx45p8uTJql+//mOv5a233tI333xjtS5rRu9ToUKFLPWUK1dOM2fOlL+/vzZt2qRGjRpZXYeLi8sj+0ntftna2mrjxo3auXOnZfmMDz/8ULt371apUqVSrS9//vxWn21sbCzLU
tz34Nq5Dg4OqfaXEcOGDdOIESMss5x9fX119uxZTZ48WT169LC0u3LlivLnz5/hWghcAQAAgGyEpTMAwHh+fn5aunRphvupXLmy7O3tFRUVpXr16qXYJm/evJJkmYUo3XsJ1fHjxzV//ny99NJLkpTsZUxPElru2LFDPXv2VJs2bSTdC4TPnDmT7n5S6rdVq1bq2rWrpHtB7B9//KHKlStLujcT9MEZk096jrp166p///6WbRl9iZh07+VJ3t7e+vDDDy3bzp49a/l9njx5LCHm46RlJuTXX3+tN998UxEREWrZsqXVPiPu04NsbW0lyRLMp/U6HsdkMikgIEABAQEaPXq0vL29tXLlSgUFBSlv3rxWYzk1hQsX1oULF2Q2my0h/oOhsLOzs3x8fBQZGWl5udzD7Ozs0ny+B8XFxSVbIsDW1jbZGslHjhyRn59fuvt/GIErAAAAAADI1Zo2baqQkBBdvXpVBQoUeOJ+nJ2dFRwcrKFDhyopKUkvvviirl+/rh07dsjFxUU9evSQt7e3TCaTVq9erRYtWsjBwUEFChRQwYIFNW/ePBUtWlRRUVEaMWJEhq+rXLly+v777xUYGCiTyaRRo0Zl6CVcD/b77bffaufOnSpQoICmT5+uixcvWgLX1Fy+fNkqZJPuPcKe0jkWL16sDRs2qFSpUlqyZIn27Nnz2FmVaak9KipKERERql27ttasWaOVK1c+9rjJkyerVq1aKlOmjOLj47V27VotWbJEc+fOtbQJCQnR33//rcWLF0u6t4xAjx49NGvWLNWpU0cXLlyQdG8mp6ur6yPPdeXKFUVFRen8+fOS7r1gTZKKFClitXbpjRs3LAHmuXPn9P7776tw4cJWL67KqN27dysyMlJNmjSRh4eHdu/ercuXL6tSpUqS7i1HsGHDBh0/flwFCxZM9brq16+vy5cv6+OPP1b79u21fv16rVu3zmr2bWhoqN5++215eHioefPmunHjhnbs2KF3333Xcr7IyEgFBATI3t4+zX9eAwMDNXHiRJUsWVJVqlTRgQMHNH36dL355ptW7bZt26YmTZqk9zYlQ+AKAAAAAAAy1bM+497X11fPPfecVqxYYXl51JMaP368ChcurMmTJ+vUqVNyc3PTc889Z3mJU7FixTR27FiNGDFCvXr1Uvfu3RUWFqaIiAgNGjRIVatWVYUKFfTpp58+9lH1x7kfKNWtW1eFChXS8OHDFRMTk6E+pXsvnTp16pSaNm0qR0dH9e3bV61bt9b169cfe2x4eLjCw8Otto0fP94yW/a+fv366cCBA+rQoYNMJpM6deqk/v37a926dRmq/bXXXtPQoUM1cOBAxcfHq2XLlho1apRCQ0NTPe7mzZvq37+//vrrLzk4OKhixYpaunSpOnToYGkTHR1ttcbsvHnzdPfuXQ0YMEADBgywbO/Ro4fCwsIeea5Vq1apV69els/3H4MfM2aMVZ2jR4/W6NGjJd2bPVq7dm399NNPKliwYFpuRZq4uLho69atmjlzpmJiYuTt7a1p06apefPmkqQ+ffpo8+bNqlWrlmJjY7Vp0ybL+sMPq1SpkubMmaNJkyZp/PjxateunYKDgzVv3jxLmx49euj27duaMWOGgoODVahQIbVv396yf9q0aQoKCtL8+fNVrFgxnTlzRmfOnFGpUqW0adOmR/6Z+eyzzzRq1Cj1799fly5dkpeXl/r162e5f5L0999/a+fOnYbMdjeZH148IReKiYmRq6urrl+/nuqaFjBOQkKC1q5dq8rXKstWto9tX7run+k7QbmM/2sEkFGMc+QGz9o451FrZAbGOXKDZ22c5wY58efQ27dv6/Tp0ypVqtQjX0D1LFuzZo2GDRumI0eOZOjt5ACerk2bNqlt27Y6depUhmaoDx8+XFevXrUKgB+Unr/jmOEKAAAAAAByvZYtW+rEiRP6+++/VaJEiawuB0AarV27Vh988EGGwlZJ8vDwUFBQkCE1EbgCAAAAAABIGjJkSFaXACCdPvnkE0P6ee+9tD9F9DjMkQcAAAAAAAAAgzDDFQAAAADwTGGtYgBAdsYMVwAAAAAAYBjezQ0gJ0rP320ErgAAAAAAIMPs7OwkSXFxcVlcCQAY786dO5IkW1vbx7ZlSQEAyKaa/7YvXe3XVauZSZUAmSe943xVJtUBAAAez9bWVm5ubrp06ZIkydHRUSaTKYurAoCMS0pK0uXLl+Xo6Kg8eR4fpxK4AkAuwVpoAAAAyGxFihSRJEvoCgA5hY2NjUqWLJmmf0gicAUAAAAAAIYwmUwqWrSoPDw8lJCQkNXlAIBh8ubNKxubtK3OSuAKiJl/AAAAAGAkW1vbNK1zCAA5ES/NAgAAAAAAAACDMMMVAAAAyEK8HA4AACBnYYYrAAAAAAAAABiEwBUAAAAAAAAADMKSAgAAAACATMXSGQCA3IQZrgAAAAAAAABgEAJXAAAAAAAAADAISwogR+KRJQAAAAAAAGQFZrgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIFkauM6dO1fVqlWTi4uLXFxc5O/vr3Xr1ln23759WwMGDFDBggXl5OSkdu3a6eLFi1Z9REVFqWXLlnJ0dJSHh4eGDRumu3fvPu1LAQAAAAAAAICsDVyLFy+uKVOmaN++fdq7d69eeeUVtWrVSr///rskaejQofrxxx/1zTffaMuWLTp//rzatm1rOT4xMVEtW7bUnTt3tHPnTi1atEhhYWEaPXp0Vl0SAAAAAAAAgFwsT1aePDAw0OrzxIkTNXfuXP3nP/9R8eLFtWDBAoWHh+uVV16RJC1cuFCVKlXSf/7zH73wwgv66aefdPToUf3888/y9PRUjRo1NH78eA0fPlyhoaHKmzdvVlwWAAAAAAAAgFwqSwPXByUmJuqbb77RzZs35e/vr3379ikhIUGNGjWytKlYsaJKliypXbt26YUXXtCuXbvk6+srT09PS5umTZvqnXfe0e+//y4/P78UzxUfH6/4+HjL55iYGElSQkKCEhISMukK8aD79zlRiWlrn2hOV/92SUnpq8fGNh2NGSNIG8Y5cgPGOXIDxjlyA8b508fPngCQc5nMZnP6vlMa7PDhw/L399ft27fl5OSk8PBwtWjRQuHh4erVq5dVMCpJzz//vBo0aKCPPvpIffv21dmzZ7VhwwbL/ri4OOXPn19r165V8+bNUzxnaGioxo4dm2x7eHi4HB0djb1AAAAAAAAeEhcXp86dO+v69etycXHJ6nIAAAbK8hmuFSpU0MGDB3X9+nV9++236tGjh7Zs2ZKp5wwJCVFQUJDlc0xMjEqUKKEmTZrwje4pSUhI0MaNG1XhWgXZ6vH/eu1T51S6+m932z1d7SN+/CbNbe1GTExX38i9GOfIDRjnyA0Y58gNGOdP3/0nLQEAOU+WB6558+ZV2bJlJUk1a9bUnj17NGvW
LHXo0EF37tzRtWvX5ObmZml/8eJFFSlSRJJUpEgR/frrr1b9Xbx40bLvUezt7WVvb59su52dnezs7DJ6SUgH2///3+PY2ZrS1W+CTfreB2eXlLZHpyQxRpBujHPkBoxz5AaMc+QGjPOnJ7vWDQB4vPR913sKkpKSFB8fr5o1a8rOzk6RkZGWfcePH1dUVJT8/f0lSf7+/jp8+LAuXbpkabNx40a5uLiocuXKT712AAAAAAAAALlbls5wDQkJUfPmzVWyZEnduHFD4eHh2rx5szZs2CBXV1f17t1bQUFBcnd3l4uLi9599135+/vrhRdekCQ1adJElStXVrdu3fTxxx/rwoULGjlypAYMGJDiDFYAAAAAAAAAyExZGrheunRJ3bt3V3R0tFxdXVWtWjVt2LBBjRs3liTNmDFDNjY2ateuneLj49W0aVPNmTPHcrytra1Wr16td955R/7+/sqfP7969OihcePGZdUlAQAAAAAAAMjFsjRwXbBgQar78+XLp88//1yff/75I9t4e3tr7dq1RpcGAAAAAAAAAOn2zK3hCgAAAAAAAADZFYErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIPkSe8BSUlJ2rJli7Zt26azZ88qLi5OhQsXlp+fnxo1aqQSJUpkRp0AAAAAAAAA8MxL8wzXW7duacKECSpRooRatGihdevW6dq1a7K1tdWff/6pMWPGqFSpUmrRooX+85//ZGbNAAAAAAAAAPBMSvMM1/Lly8vf31/z589X48aNZWdnl6zN2bNnFR4ero4dO+rDDz9Unz59DC0WAAAAAAAAAJ5laQ5cf/rpJ1WqVCnVNt7e3goJCVFwcLCioqIyXBwAAAAAAAAAZCdpXlLgcWHrg+zs7FSmTJknKggAAAAAAAAAsqs0B64PWr9+vbZv3275/Pnnn6tGjRrq3Lmzrl69alhxAAAAAAAAAJCdPFHgOmzYMMXExEiSDh8+rPfee08tWrTQ6dOnFRQUZGiBAAAAAAAAAJBdpHkN1wedPn1alStXliR99913evXVVzVp0iTt379fLVq0MLRAAAAAAAAAAMgunmiGa968eRUXFydJ+vnnn9WkSRNJkru7u2XmKwAAAAAAAADkNk80w/XFF19UUFCQAgIC9Ouvv2r58uWSpD/++EPFixc3tEAAAAAAAAAAyC6eaIbr7NmzlSdPHn377beaO3euihUrJklat26dmjVrZmiBAAAAAAAAAJBdPNEM15IlS2r16tXJts+YMSPDBQEAAAAAAABAdvVEget9ly5d0qVLl5SUlGS1vVq1ahkqCgAAAAAAAACyoycKXPft26cePXro2LFjMpvNkiSTySSz2SyTyaTExERDiwQAAAAAAACA7OCJAtc333xT5cuX14IFC+Tp6SmTyWR0XQAAAAAAAACQ7TxR4Hrq1Cl99913Klu2rNH1AAAAAAAAAEC2ZfMkBzVs2FCHDh0yuhYAAAAAAAAAyNaeaIbrl19+qR49eujIkSOqWrWq7OzsrPa/9tprhhQHAAAAAAAAANnJEwWuu3bt0o4dO7Ru3bpk+3hpFgAAAAAAAIDc6okC13fffVddu3bVqFGj5OnpaXRNAAAAAICM+m86Xm6c6CDp60wrBQCA3OSJAtd//vlHQ4cOJWwFAABA9kQQBQAAgEzyRC/Natu2rTZt2mR0LQAAAAAAAACQrT3RDNfy5csrJCRE27dvl6+vb7KXZg0aNMiQ4gDgmcOMKOQGjHMAAAAAeGJPFLh++eWXcnJy0pYtW7RlyxarfSaTicA1t+IHdAAAAAAAAORyTxS4nj592ug6AAAAAAAAACDbe6I1XAEAAAAAAAAAyaU5cJ0yZYpu3bqVpra7d+/WmjVrnrgoAAAAAAAAAMiO0rykwNGjR1WyZEm9/vrrCgwMVK1atVS4cGFJ0t27d3X06FFt375dS5cu1fnz57V48eJMKxpPx/WxY9PV3rVDJhUCAAAAAAAAZBNpDlwXL16sQ4cOafbs2ercubNiYmJka2sre3t7xcXFSZL8/Pz01ltvqWfPnsqXL1+mFQ0AAAAAAAAAz6J0vTSrevXqmj9/vv71r3/pt99+09mzZ3Xr1i0VKlRINWrUUKFChTKrTgAAAOCReDIHAAAAz4p0Ba732djYqEaNGqpRo4bB5QAAAKMRRAEAAADA0/NEgSsA5BQEUQAAILvg/1sAAMgebLK6AAAAAAAAAADIKQhcAQAAAAAAAMAgBK4AAAAAAAAAYJB0Ba62tra6dOlSZtUCAAAAAAAAANlaugJXs9mcWXUAAAAAAAAAQLbHkgIAAAAAAAAAYJA86T3gyy+/lJOTU6ptBg0a9MQFAQAAAAAAAEB2le7A9YsvvpCtre0j95tMJgJXAAAAAAAAALlSugPXvXv3ysPDIzNqAQAAAAAAAIBsLV1ruJpMpsyqAwAAAAAAAACyvXQFrmazObPqAAAAAAAAAIBsL12B65gxYx77wiwAAAAAAAAAyK3SFbguWLBAt27dsnyePXu2YmJiDC8KAAAAAAAAALKjdAWuf/31lxITEy2fP/jgA/3vf/8zvCgAAAAAAAAAyI7SFbg+jDVdAQAAAAAAAOD/ZChwBQAAAAAAAAD8nzzpPeDLL7+0vDjr7t27CgsLU6FChazaDBo0yJjqAAAAAAAAACAbSVfgWrJkSc2fP9/yuUiRIlqyZIlVG5PJROAKAAAAAAAAIFdKV+B65syZTCoDAAAAAAAAALK/dK3h+sorr+jatWuZVAoAAAAAAAAAZG/pClw3b96sO3fuZFYtAAAAAAAAAJCtpStwBQAAAAAAAAA8WrrWcJWko0eP6sKFC6m2qVat2hMXBAAAAAAAAADZVboD14YNG8psNifbbjKZZDabZTKZlJiYaEhxAAAAAAAAAJCdpDtw3b17twoXLpwZtQAAAAAAAABAtpbuwLVkyZLy8PDIjFo
AAAAAAAAAIFvjpVkAAAAAAAAAYJB0Ba716tXTnTt3MqsWAAAAAAAAAMjW0hW4bt26VXnz5s2sWgAAAAAAAAAgW0tX4Go2mzOrDgAAAAAAAADI9tK9hqvJZMqMOgAAAAAAAAAg28uT3gPKly//2ND1ypUrT1wQAAAAAAAAAGRX6Q5cx44dK1dX18yoBQAAAAAAAACytXQHrh07dpSHh0dm1AIAAAAAAAAA2Vq61nBl/VYAAAAAAAAAeLR0Ba5mszmz6gAAAAAAAACAbC9dSwokJSVlVh0AAAAAAAAAkO2la4YrAAAAAAAAAODRCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMkqWB6+TJk1W7dm05OzvLw8NDrVu31vHjx63a3L59WwMGDFDBggXl5OSkdu3a6eLFi1ZtoqKi1LJlSzk6OsrDw0PDhg3T3bt3n+alAAAAAAAAAEDWBq5btmzRgAED9J///EcbN25UQkKCmjRpops3b1raDB06VD/++KO++eYbbdmyRefPn1fbtm0t+xMTE9WyZUvduXNHO3fu1KJFixQWFqbRo0dnxSUBAAAAAAAAyMXyZOXJ169fb/U5LCxMHh4e2rdvn15++WVdv35dCxYsUHh4uF555RVJ0sKFC1WpUiX95z//0QsvvKCffvpJR48e1c8//yxPT0/VqFFD48eP1/DhwxUaGqq8efMmO298fLzi4+Mtn2NiYiRJCQkJSkhIyMQrzl7u2qQvj09IdEh726R7bROVmMa+zemqxS4pKV3tE2xs09GYMZKTMM4f1ZhxnpMwzh/VmHGekzDOH9WYcZ6TMM4f1Th7jnN+9gSAnMtkNpvT950yE/35558qV66cDh8+rKpVq+qXX35Rw4YNdfXqVbm5uVnaeXt7a8iQIRo6dKhGjx6tVatW6eDBg5b9p0+fVunSpbV//375+fklO09oaKjGjh2bbHt4eLgcHR0z49IAAAAAALCIi4tT586ddf36dbm4uGR1OQAAA2XpDNcHJSUlaciQIQoICFDVqlUlSRcuXFDevHmtwlZJ8vT01IULFyxtPD09k+2/vy8lISEhCgoKsnyOiYlRiRIl1KRJE77RPSBmypR0tXdpOznNbROSHLTx1FeqcK2CbPX4f732qXMqXbW0u+2ervYRP36T5rZ2Iyamq2882xjnKWOc5yyM85QxznMWxnnKGOc5C+M8Zdl1nN9/0hIAkPM8M4HrgAEDdOTIEW3fvj3Tz2Vvby97e/tk2+3s7GRnZ5fp588u8qTzsR8721vpPoft///v8X2b0tVvQjoft7JLStujU5IYIzkM4/wRbRnnOQrj/BFtGec5CuP8EW0Z5zkK4/wRbbPpOM+udQMAHi9LX5p138CBA7V69Wpt2rRJxYsXt2wvUqSI7ty5o2vXrlm1v3jxoooUKWJpc/HixWT77+8DAAAAAAAAgKclSwNXs9msgQMHauXKlfrll19UqlQpq/01a9aUnZ2dIiMjLduOHz+uqKgo+fv7S5L8/f11+PBhXbp0ydJm48aNcnFxUeXKlZ/OhQAAAAAAAACAsnhJgQEDBig8PFz//ve/5ezsbFlz1dXVVQ4ODnJ1dVXv3r0VFBQkd3d3ubi46N1335W/v79eeOEFSVKTJk1UuXJldevWTR9//LEuXLigkSNHasCAASkuGwAAAAAAAAAAmSVLA9e5c+dKkurXr2+1feHCherZs6ckacaMGbKxsVG7du0UHx+vpk2bas6cOZa2tra2Wr16td555x35+/srf/786tGjh8aNG/e0LgMAAAAAAAAAJGVx4Go2mx/bJl++fPr888/1+eefP7KNt7e31q5da2RpAAAAAAAAAJBuz8RLswAAAAAAAAAgJyBwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACD5MnqAgAAAAAgt5py4H9pbvtOJtYBAACMQ+AKIMfhBxfkBoxzAAAAAHg2saQAAAAAAAAAABiEGa4AAAB4JjGTGwAAANkRgWsuww8uAAAAAAAAQOZhSQEAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABuGlWQCeebOuzkrnEV0ypQ4AAAAAAIDHIXDN5giiAAAAAAAAgGcHSwoAAAAAAAAAgEGY4QoAwDOAJxYAIGfg73MAAEDgCgAAgKeCIAoAAAC5AUsKAAAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAA
AAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAbJ0sB169atCgwMlJeXl0wmk3744Qer/WazWaNHj1bRokXl4OCgRo0a6cSJE1Ztrly5oi5dusjFxUVubm7q3bu3YmNjn+JVAAAAAAAAAMA9WRq43rx5U9WrV9fnn3+e4v6PP/5Yn376qb744gvt3r1b+fPnV9OmTXX79m1Lmy5duuj333/Xxo0btXr1am3dulV9+/Z9WpcAAAAAAAAAABZ5svLkzZs3V/PmzVPcZzabNXPmTI0cOVKtWrWSJC1evFienp764Ycf1LFjRx07dkzr16/Xnj17VKtWLUnSZ599phYtWmjq1Kny8vJ6atcCAAAAAAAAAFkauKbm9OnTunDhgho1amTZ5urqqjp16mjXrl3q2LGjdu3aJTc3N0vYKkmNGjWSjY2Ndu/erTZt2qTYd3x8vOLj4y2fY2JiJEkJCQlKSEjIpCvKHKa7pnS1t0m6m+a2d23SNwE6IdEh7W2T7rVNVGIa+zanqxa7pKR0tU+wsU1H4+w1RnICxnnKGOc5C+M8ZYzznIVxnjLGec7COE8Z4zy57PazJwAg7Uxmszl93ykziclk0sqVK9W6dWtJ0s6dOxUQEKDz58+raNGilnZvvPGGTCaTli9frkmTJmnRokU6fvy4VV8eHh4aO3as3nnnnRTPFRoaqrFjxybbHh4eLkdHR+MuCgAAAACAFMTFxalz5866fv26XFxcsrocAICBntkZrpkpJCREQUFBls8xMTEqUaKEmjRpku2+0c29Njdd7W9HvZHmtr3Xzk9X3y5tJ6e5bUKSgzae+koVrlWQrR7/r9c+dU6lq5Z2t93T1T7ix2/S3NZuxMR09Y2MY5ynjHGeszDOU8Y4z1kY5yljnOcsjPOUMc6Tu/+kJQAg53lmA9ciRYpIki5evGg1w/XixYuqUaOGpc2lS5esjrt7966uXLliOT4l9vb2sre3T7bdzs5OdnZ2BlT/9JjzpG+CcpJN2r/kedL52I+d7a10tZck2///3+P7Tt+jWQnpfNzKLiltj05JynZjJCdgnKeMcZ6zMM5TxjjPWRjnKWOc5yyM85QxzpPLrnUDAB4vfd/1nqJSpUqpSJEiioyMtGyLiYnR7t275e/vL0ny9/fXtWvXtG/fPkubX375RUlJSapTp85TrxkAAAAAAABA7palM1xjY2P1559/Wj6fPn1aBw8elLu7u0qWLKkhQ4ZowoQJKleunEqVKqVRo0bJy8vLss5rpUqV1KxZM/Xp00dffPGFEhISNHDgQHXs2FFeXl5ZdFUAAAAAAAAAcqssDVz37t2rBg0aWD7fX1e1R48eCgsL0/vvv6+bN2+qb9++unbtml588UWtX79e+fLlsxyzbNkyDRw4UA0bNpSNjY3atWunTz/99KlfCwAAAAAAAABkaeBav359mc2PXuPIZDJp3LhxGjdu3CPbuLu7Kzw8PDPKAwAAAAAAAIB0eWbXcAUAAAAAAACA7IbAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADBIjglcP//8c/n4+ChfvnyqU6eOfv3116wuCQAAAAAAAEAukyMC1+XLlysoKEhjxozR/v37Vb16dTVt2lSXLl3K6tIAAAAAAAAA5CJ5sroAI0yfPl19+vRRr169JElffPGF1qxZo6+++kojRoxI1j4+Pl7x8fGWz9evX5ckXblyRQkJCU+naIPEX49/fKMH3Im5mua2V+/cSVffd6/nS3PbhKR8iouL0/Vb1
2Ur28e2/+d6bLpq0W27dDX/587dNLe1++ef9NWCDGOcPwLjPEdhnD8C4zxHYZw/AuM8R2GcPwLjPJkbN25IksxmcxZXAgAwmsmczf92v3PnjhwdHfXtt9+qdevWlu09evTQtWvX9O9//zvZMaGhoRo7duxTrBIAAAAAgOTOnTun4sWLZ3UZAAADZfsZrv/73/+UmJgoT09Pq+2enp7673//m+IxISEhCgoKsnxOSkrSlStXVLBgQZlMpkytF/fExMSoRIkSOnfunFxcXLK6HCBTMM6RGzDOkRswzpEbMM6fPrPZrBs3bsjLyyurSwEAGCzbB65Pwt7eXvb29lbb3NzcsqaYXM7FxYX/oUOOxzhHbsA4R27AOEduwDh/ulxdXbO6BABAJsj2L80qVKiQbG1tdfHiRavtFy9eVJEiRbKoKgAAAAAAAAC5UbYPXPPmzauaNWsqMjLSsi0pKUmRkZHy9/fPwsoAAAAAAAAA5DY5YkmBoKAg9ejRQ7Vq1dLzzz+vmTNn6ubNm+rVq1dWl4ZHsLe315gxY5It7QDkJIxz5AaMc+QGjHPkBoxzAACMYzKbzeasLsIIs2fP1ieffKILFy6oRo0a+vTTT1WnTp2sLgsAAAAAAABALpJjAlcAAAAAAAAAyGrZfg1XAAAAAAAAAHhWELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAPDEePcmAAAAAFgjcAUAAE/M3t5ex44dy+oyAABPYNu2beratav8/f31999/S5KWLFmi7du3Z3FlAABkb3myugDg3LlzGjNmjL766qusLgXIkFu3bmnfvn1yd3dX5cqVrfbdvn1bK1asUPfu3bOoOiBjgoKCUtyemJioKVOmqGDBgpKk6dOnP82ygEwxe/Zs/frrr2rRooU6duyoJUuWaPLkyUpKSlLbtm01btw45cnD/0Yje/vuu+/UrVs3denSRQcOHFB8fLwk6fr165o0aZLWrl2bxRUCAJB9mcw8C4gsdujQIT333HNKTEzM6lKAJ/bHH3+oSZMmioqKkslk0osvvqiIiAgVLVpUknTx4kV5eXkxzpFt2djYqHr16nJzc7PavmXLFtWqVUv58+eXyWTSL7/8kjUFAgaZMGGCPv74YzVp0kQ7duzQkCFD9Mknn2jo0KGysbHRjBkz9M4772js2LFZXSqQIX5+fho6dKi6d+8uZ2dnHTp0SKVLl9aBAwfUvHlzXbhwIatLBAAg2+Kf5pHpVq1aler+U6dOPaVKgMwzfPhwVa1aVXv37tW1a9c0ZMgQBQQEaPPmzSpZsmRWlwdk2KRJkzRv3jxNmzZNr7zyimW7nZ2dwsLCks3qBrKrsLAwhYWFqW3btjp06JBq1qypRYsWqUuXLpKkihUr6v333ydwRbZ3/Phxvfzyy8m2u7q66tq1a0+/IAAAchACV2S61q1by2QypfpiFZPJ9BQrAoy3c+dO/fzzzypUqJAKFSqkH3/8Uf3799dLL72kTZs2KX/+/FldIpAhI0aMUMOGDdW1a1cFBgZq8uTJsrOzy+qyAMOdP39etWrVkiRVr15dNjY2qlGjhmX/c889p/Pnz2dRdYBxihQpoj///FM+Pj5W27dv367SpUtnTVEAAOQQvDQLma5o0aL6/vvvlZSUlOKv/fv3Z3WJQIbdunXLaj0/k8mkuXPnKjAwUPXq1dMff/yRhdUBxqhdu7b27duny5cvq1atWjpy5Aj/YIYcp0iRIjp69Kgk6cSJE0pMTLR8lqTff/9dHh4eWVUeYJg+ffpo8ODB2r17t0wmk86fP69ly5YpODhY77zzTlaXBwBAtsYMV2S6mjVrat++fWrVqlWK+x83+xXIDipWrKi9e/eqUqVKVttnz54tSXrttdeyoizAcE5OTlq0aJEiIiLUqFEj1iVGjtOlSxd1795drVq1UmRkpN5//30FBwfrn3/+kclk0sSJE9W+ffusLhPIsBEjRigpKUkNGzZUXFycXn75Zdnb2ys4OFjvvvtuVpcHAEC2xkuzkOm2bdummzdvqlmzZinuv3nzpvbu3at69eo95coA40yePFnbtm175Bt9+/fvry+++EJJSUlPuTIg8/z111/at2+fGjVqxLIZyDGSkpI0ZcoU7dq1S3Xr1tWIESO0fPlyvf/++4qLi1NgYKBmz57NmEeOcefOHf3555+KjY1V5cqV5eTklNUlAQCQ7RG4AgAAAAAAAIBBWFIAAAAAAHKZmzdvasqUKYqMjNSlS5eSPYVz6tSpLKoMAIDsj8AVAAAAAHKZt956S1u2bFG3bt1UtGhRXoIIAICBWFIAAAAAAHIZNzc3rVmzRgEBAVldCgAAOY5NVhcAAAAAAHi6ChQoIHd396wuAwCAHInAFQAAAABymfHjx2v06NGKi4vL6lIAAMhxWFIAAAAAAHIZPz8/nTx5UmazWT4+PrKzs7Pav3///iyqDACA7I+XZgEAAABALtO6deusLgEAgByLGa4AAAAAAAAAYBDWcAUAAAAAAAAAg7CkAAAAAADkAu7u7vrjjz9UqFAhFShQQCaT6ZFtr1y58hQrAwAgZyFwBQAAAIBcYMaMGXJ2dpYkzZw5M2uLAQAgB2MNVwAAAAAAAAAwCDNcAQAAACAXiImJSXNbFxeXTKwEAICcjRmuAAAAAJAL2NjYpLpuqySZzWaZTCYlJiY+paoAAMh5mOEKAAAAALnApk2bsroEAAByBWa4AgAAAEAu0LZtW4WFhcnFxUWLFy9Whw4dZG9vn9VlAQCQ4xC4AgAAAEAukDdvXp09e1ZFixaVra2toqOj5eHhkdVlAQCQ47CkAAAAAADkAhUrVlRISIgaNGggs9msFStWPPLlWN27d3/K1QEAkHMwwxUAAAAAcoEdO3bovffe08mTJ3XlyhU5Ozun+BItk8mkK1euZEGFAADkDASuAAAAAJDL2NjY6MKFCywpAABAJrDJ6gIAAAAAAJmvbdu2iomJkSQtXLhQzs7OWVwRAAA5EzNcAQAAACAX4KVZAAA8Hbw0CwAAAAByAV6aBQDA08EMVwAAAADIBXbu3KmgoCBemgUAQCYjcAUAAACAXMbGxkbR0dHy9PTM6lIAAMhxCFwBAAAAIJc5e/asXFxc9NVXX+nYsWOSpCpVqqh3796PXGYAAACkDYErAAAAAOQye/fuVdOmTeXg4KDnn39ekrRnzx7dunVLGzZsUM2aNbO4QgAAsi8CVwAAAADIZV566SWVLVtW8+fPV548996lfPfuXb311ls6deqUtm7dmsUVAgCQfRG4AgAAAEAu4+DgoAMHDqhixYpW248ePapatWopLi4uiyoDACD7s8nqAgAAAAAAT5eLi4uioqKSbT937pycnZ2zoCIAAHIOAlcAAAAAyGU6dOig3r17a/ny5Tp37pzOnTuniIgIvfXWW+rUqVNWlwcAQLaWJ6sLAAAAAAA8XVOnTpXJZFL37t119+5dSZKdnZ3eeecdTZkyJYurAwAge2MNVwAAAADIpeLi4nTy5ElJUpkyZeTo6JjFFQEAkP0RuAIAAAAAAACA
QVjDFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAEgDk8mkH374wbD+Nm/eLJPJpGvXrhnW59Nw584dlS1bVjt37szqUnKF9evXq0aNGkpKSsrqUgAAAACkEYErACDH2rVrl2xtbdWyZcsM9xUdHa3mzZsbUFXa+fj4yGQyyWQyydHRUb6+vvryyy/T3Y+RYfEXX3yhUqVKqW7duob0l9V8fHw0c+bMdB939uxZOTg4KDY21viiHtCsWTPZ2dlp2bJlmXoeAAAAAMYhcAUA5FgLFizQu+++q61bt+r8+fOptjWbzbp7926y7Xfu3JEkFSlSRPb29plSZ2rGjRun6OhoHTlyRF27dlWfPn20bt26p16HdO8ezZ49W7179061XUJCwlOqKOv8+9//VoMGDeTk5JTp5+rZs6c+/fTTTD8PAAAAAGMQuAIAcqTY2FgtX75c77zzjlq2bKmwsDCr/fcf6V+3bp1q1qwpe3t7bd++XfXr19fAgQM1ZMgQFSpUSE2bNpVkPUu0bt26Gj58uFV/ly9flp2dnbZu3SpJWrJkiWrVqiVnZ2cVKVJEnTt31qVLl9J9HfePL126tIYPHy53d3dt3LjRsn/Pnj1q3LixChUqJFdXV9WrV0/79++37Pfx8ZEktWnTRiaTyfJZuhcaPvfcc8qXL59Kly6tsWPHphg637dv3z6dPHnSasbwmTNnZDKZtHz5ctWrV0/58uWzzMb88ssvValSJeXLl08VK1bUnDlzrPr79ddf5efnp3z58qlWrVpauXKlTCaTDh48KEkKCwuTm5ub1TE//PCDTCaT1bbUrsNsNis0NFQlS5aUvb29vLy8NGjQIElS/fr1dfbsWQ0dOtQyk1i6N3s1MDBQBQoUUP78+VWlShWtXbs22Tlfe+01SffG0vPPP6/8+fPLzc1NAQEBOnv2bJrv87Vr19SvXz95enoqX758qlq1qlavXm3ZHxgYqL179+rkyZOP/NoAAAAAeHbkyeoCAADIDCtWrFDFihVVoUIFde3aVUOGDFFISEiysG7EiBGaOnWqSpcurQIFCkiSFi1apHfeeUc7duxIse8uXbro448/1pQpUyz9LV++XF5eXnrppZck3ZvlOX78eFWoUEGXLl1SUFCQevbsmSy4S6ukpCStXLlSV69eVd68eS3bb9y4oR49euizzz6T2WzWtGnT1KJFC504cULOzs7as2ePPDw8tHDhQjVr1ky2traSpG3btql79+769NNP9dJLL+nkyZPq27evJGnMmDEp1rBt2zaVL19ezs7OyfaNGDFC06ZNswSoy5Yt0+jRozV79mz5+fnpwIED6tOnj/Lnz68ePXooNjZWr776qho3bqylS5fq9OnTGjx4cLrvy+Ou47vvvtOMGTMUERGhKlWq6MKFCzp06JAk6fvvv1f16tXVt29f9enTx9LngAEDdOfOHW3dulX58+fX0aNHrWayXrt2Tdu3b9eSJUt09+5dtW7dWn369NHXX3+tO3fu6Ndff7WMi8fVl5SUpObNm+vGjRtaunSpypQpo6NHj1q+TpJUsmRJeXp6atu2bSpTpky67xEAAACAp8wMAEAOVLduXfPMmTPNZrPZnJCQYC5UqJB506ZNlv2bNm0ySzL/8MMPVsfVq1fP7Ofnl6w/SeaVK1eazWaz+dKlS+Y8efKYt27datnv7+9vHj58+CPr2bNnj1mS+caNG1bnv3r16iOP8fb2NufNm9ecP39+c548ecySzO7u7uYTJ0488pjExESzs7Oz+ccff0yx9vsaNmxonjRpktW2JUuWmIsWLfrIvgcPHmx+5ZVXrLadPn3aLMlyr+8rU6aMOTw83Grb+PHjzf7+/maz2Wz+17/+ZS5YsKD51q1blv1z5841SzIfOHDAbDabzQsXLjS7urpa9bFy5Urzg//78rjrmDZtmrl8+fLmO3fupHhN3t7e5hkzZlht8/X1NYeGhqbY3mw2m5ctW2auVauW2Ww2m//55x+zJPPmzZtTbPu4+jZs2GC2sbExHz9+/JHnM5vNZj8/v1RrAgAAAPDsYEkBAECOc/z4cf3666/q1KmTJClPnjzq0KGDFixYkKxtrVq1km2rWbNmqv0XLlxYTZo0sTw6f/r0ae3atUtdunSxtNm3b58CAwNVsmRJOTs7q169epKkqKiodF3LsGHDdPDgQf3yyy+qU6eOZsyYobJly1r2X7x4UX369FG5cuXk6uoqFxcXxcbGPvY8hw4d0rhx4+Tk5GT51adPH0VHRysuLi7FY27duqV8+fKluO/B+3jz5k2dPHlSvXv3tup/woQJlsfijx07pmrVqln15+/vn+b7ktbreP3113Xr1i2VLl1affr00cqVK1NdNkGSBg0apAkTJiggIEBjxozRb7/9ZrX/weUE3N3d1bNnTzVt2lSBgYGaNWuWoqOj01zfwYMHVbx4cZUvXz7VmhwcHB75dQEAAADwbCFwBQDkOAsWLNDdu3fl5eWlPHnyKE+ePJo7d66+++47Xb9+3apt/vz5kx2f0raHdenSRd9++60SEhIUHh4uX19f+fr6SroXODZt2lQuLi5atmyZ9uzZo5UrV0r6v5dwpVWhQoVUtmxZvfTSS/rmm280aNAgHT161LK/R48eOnjwoGbNmqWdO3fq4MGDKliw4GPPExsbq7Fjx+rgwYOWX4cPH9aJEyceGaoWKlRIV69eTXHfg/csNjZWkjR//nyr/o8cOaL//Oc/ab52Gxsbmc1mq20Pv5DrcddRokQJHT9+XHPmzJGDg4P69++vl19+OdUXe7311ls6deqUunXrpsOHD6tWrVr67LPPJN37+q1fv94SuErSwoULtWvXLtWtW1fLly9X+fLlLdf5uPocHBzSdC+uXLmiwoULp6ktAAAAgKxF4AoAyFHu3r2rxYsXa9q0aVYh16FDh+Tl5aWvv/7akPO0atVKt2/f1vr16xUeHm41u/W///2v/vnnH02ZMkUvvfSSKlas+EQvzHpYiRIl1KFDB4WEhFi27dixQ4MGDVKLFi1UpUoV2dvb63//+5/VcXZ2dkpMTLTa9txzz+n48eMqW7Zssl82Nin/74Gfn5/++9//JgtBH+bp6SkvLy+dOnUqWd+lSpWSJFWqVEm//fabbt++bTnu4TC2cOHCunHjhm7evGnZdv+FWum5DgcHBwUGBurTTz/V5s2btWvXLh0+fFiSlDdv3mT3Rrp3r99++219//33eu+99zR//nxJ916QVaBAAVWvXj3ZvQkJCdHOnTtVtWpVhYeHp6m+atWq6a+//tIff/zxyPt5+/ZtnTx5Un5+fqnedwAAAADPBl6aBQDIUVavXq2rV6+qd+/ecnV1tdrXrl07LViwQG+//XaGz5M/f361bt1ao0aN0rFjxyzLF0j3XnKUN29effbZZ3r77bd15MgRjR8/PsPnlKTBgweratWq2rt3r2rVqqVy5cppyZIlqlWrlmJiYjRs2LBksyZ9fHwUGRmpgIAA2dvbq0CBAho9erReffVVlSxZUu3bt5eNjY0
OHTqkI0eOaMKECSmeu0GDBoqNjdXvv/+uqlWrplrn2LFjNWjQILm6uqpZs2aKj4/X3r17dfXqVQUFBalz58768MMP1adPH4WEhOjMmTOaOnWqVR916tSRo6OjPvjgAw0aNEi7d+9WWFiYVZvHXUdYWJgSExMtfS1dulQODg7y9va23JutW7eqY8eOsre3V6FChTRkyBA1b95c5cuX19WrV7Vp0yZVqlRJkrRq1Sqr2a2nT5/WvHnz9Nprr8nLy0vHjx/XiRMn1L179zTVV69ePb388stq166dpk+frrJly+q///2vTCaTmjVrJuleEG1vb/9ESy4AAAAAePqY4QoAyFEWLFigRo0aJQtbpXuB6969e5OtyfmkunTpokOHDumll15SyZIlLdsLFy6ssLAwffPNN6pcubKmTJmSLEx8UpUrV1aTJk00evRoSfeu9+rVq3ruuefUrVs3DRo0SB4eHlbHTJs2TRs3blSJEiUssySbNm2q1atX66efflLt2rX1wgsvaMaMGZYgMiUFCxZUmzZtLGvXpuatt97Sl19+qYULF8rX11f16tVTWFiYZYark5OTfvzxRx0+fFh+fn768MMP9dFHH1n14e7urqVLl2rt2rXy9fXV119/rdDQUKs2j7sONzc3zZ8/XwEBAapWrZp+/vln/fjjjypYsKAkady4cTpz5ozKlCljeWQ/MTFRAwYMUKVKldSsWTOVL19ec+bMkZQ8cHV0dNR///tftWvXTuXLl1ffvn01YMAA9evXL833+bvvvlPt2rXVqVMnVa5cWe+//77VrNuvv/5aXbp0kaOj42PvOwAAAICsZzI/7rlAAACA/++3335T48aNdfLkSTk5ORna95kzZ1SqVCkdOHBANWrUMLRvI+zfv1+vvPKKLl++LDs7u6dyzv/973+qUKGC9u7dawmrAQAAADzbmOEKAADSrFq1avroo490+vTprC7lqbt7964+++yzpxa2SvdC6Dlz5hC2AgAAANkIM1wBAMAz4Vmf4QoAAAAAaUHgCgAAAAAAAAAGYUkBAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgkP8H5wq5TazU+vsAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create a list to store the TTFT data\n", + "ttft_data = []\n", + "\n", + "# Iterate over the models, batch sizes, and arrival rates to calculate TTFT\n", + "for ssm in small_model_names:\n", + " for batch_size in batch_sizes:\n", + " for arrival_rate in arrival_rates:\n", + " model_name = ssm.replace(\"/\", \"-\")\n", + " filepath = f\"/usr/FlexFlow/inference/output/specinfer_llm_meta-llama-Llama-3.1-70B-Instruct_ssm_{model_name}_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n", + " if os.path.exists(filepath):\n", + " ttft = get_ttft(filepath)\n", + " ttft_data.append({\n", + " 'Model': model_name,\n", + " 'Batch Size': batch_size,\n", + " 'Arrival Rate': arrival_rate,\n", + " 'TTFT': ttft\n", + " })\n", + "# add incremental decoding entry\n", + "for batch_size in batch_sizes:\n", + " for arrival_rate in arrival_rates:\n", + " model_name = ssm.replace(\"/\", \"-\")\n", + " filepath = f\"/usr/FlexFlow/inference/output/incr_dec_llm_meta-llama-Llama-3.1-70B-Instruct_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n", + " if os.path.exists(filepath):\n", + " ttft = get_ttft(filepath)\n", + " ttft_data.append({\n", + " 'Model': \"Incr Dec (baseline)\",\n", + " 'Batch Size': batch_size,\n", + " 'Arrival Rate': arrival_rate,\n", + " 'TTFT': ttft\n", + " })\n", + "\n", + "# Convert the list to a DataFrame\n", + "ttft_df = pd.DataFrame(ttft_data)\n", + "print(ttft_df.head())\n", + "\n", + "# Pivot the dataframe to have models and batch sizes as columns\n", + "pivot_df = ttft_df.pivot_table(index='Arrival Rate', columns=['Model', 'Batch Size'], values='TTFT')\n", + "\n", + "# Plot the data\n", + "fig, ax = plt.subplots(figsize=(12, 8))\n", + "\n", + "colors = ['lightgreen', 'skyblue', 'lightcoral', 'gold', 'plum', 'peachpuff', 'mediumturquoise', 'salmon']\n", + "pivot_df.plot(kind='bar', ax=ax, color=colors)\n", + "\n", + "ax.set_title('TTFT vs Arrival Rate for Different Models and Batch Sizes\\nLLM: LLAMA-3.1-70B-Instruct')\n", + "ax.set_xlabel('Arrival Rate (requests/sec)')\n", + "ax.set_ylabel('TTFT (ms)')\n", + "ax.grid(True)\n", + "ax.legend(title='Model and Batch Size', bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "\n", + "# Save the plot as a PDF\n", + "plt.savefig('/usr/FlexFlow/benchmarking/ttft_vs_arrival_rate.pdf', bbox_inches='tight')\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return queueing_time.mean()[1] / 1000000\n", + "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n", + "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                   Model Batch Size Arrival Rate  Queueing Time\n",
+      "0  Zhuominc-Llama-3-330M          4      offline     376.053818\n",
+      "1  Zhuominc-Llama-3-330M          4            1     319.585296\n",
+      "2  Zhuominc-Llama-3-330M          4            2     346.747481\n",
+      "3  Zhuominc-Llama-3-330M          4            4     360.138720\n",
+      "4  Zhuominc-Llama-3-330M          4            8     368.694877\n"
+     ]
+    },
+    {
+     "data": {
+      "image/png": "[... base64-encoded matplotlib PNG figure data omitted ...]
27ZNnp6emjFjhs6dO2fxKP7t/P3331lCtuweT69cubIWL16stWvXKjg4WEuWLNHOnTsVHByc59rj4uIUHR2thg0bavXq1VqxYsUdj5syZYoaNGigihUrKiUlRTExMVqyZInmzfv/P2gbPXq0Tp8+rcWLF5u3ZV5rUlKS+dqLFy+e473q0KGDJk2apPLly6tGjRras2ePZsyYkeXFU8nJyeYnvM+dO6eIiAiVKFFCrVu3tuWW5Oj48eOaP3++OnbsqICAAB06dEiHDx9W3759Jd1YA/f48ePau3evypYtK1dX19vO5m7UqJGcnZ31xhtv6OWXX9aOHTsUGRlp0Wb8+PFq0aKFKlasqO7du+v69euKiYnRyJEjzefbvHmzunfvLkdHxzsuESBJJUqUyLIOcub6ybdu37JliyH3r8AD11vl9NOBzEGa+Y0YEhKiSZMm6fz58/L19ZV04xEYNzc3q77JAQAAAABA/qsQWuHOjQpQrVq19OCDD2rZsmUWa4/mRkREhHx8fDRlyhQdO3ZMHh4eevDBB83rgJYpU0YTJkzQqFGj1L9/f/Xt21eRkZGKjo7Wyy+/rJo1a6pKlSp67733cv2oeqbMkK5x48YqVaqURo4cacjs6jFjxujYsWNq06aNnJ2dNXDgQHXu3FkJCQl3PDYqKkpRUVEW2yIiIsyzZTMNGjRIe/bs0TPPPCOTyaQePXpo8ODBt12H1FodO3bU8OHDNXToUKWkpKh9+/YaO3aswsPDczzuypUrGjx4sE6dOiUnJydVrVpVn332mZ555hlzm/j4+Czr4948W3LXrl2KiopSYGBgjjON33//fY0dO1aDBw/W+fPnFRAQoEGDBmncuHEW7RYsWKAFCxZIkjw9PVW7dm3FxMSoSpUqVt6NO3N2dtYff/yhRYsW6Z9//lHp0qU1ZMgQ8/dJt27dtHz5cjVv3lyXLl3SwoUL1a9fv2z78vLy0meffabXXntNCxYsUIsWLRQeHm7xhHuzZs305ZdfKiIiQlOnTpWbm5seeeQR8/6JEydq0KBB5uA7c4kCk8mU47mtcfr0aW3bts2Q2e6mjFsXT7iLRo8erbZt26p8+fK6fPmyoqKi9Pbbb2vt2rWqUKGCoqKi1K5dO3l7e2vfvn0aPny4ypYtq02bNkm6Mf2+bt26CggI0LRp03T27Fn16dNHzz33nCZPnmx1HYmJiXJ3d1dCQkKhW1Jg9sXZNrV/xfOVfKrENqmpqYqJiVG7du34yS7uiHGOoiA/x3lCNuuW58R9/Hir2zLOYQvGOYoCxjmsVZj/HXo7V69e1fHjxxUcHHzbF1Ddy1avXq3XXntN+/fvz9PbyQHcXcePH9cDDzygAwcOqHLlyrnuZ+TIkbp48aLmz5+f7X5b/owr0Bmu58+fV9++fRUfHy93d3fVrl1ba9euVatWrfTXX39p/fr1mjVrlq5cuaJy5cqpW7duGjNmjPl4e3t7rVq1Si+++KJCQkJUsmRJhYaGauLEiQV4VQAKk/z8hwsAAACAwqN9+/Y6fPiwTp8+bbGmI4B7W0xMjAYOHJinsFWSfH19Ld75lBcFGrh+8sknt91Xrlw580zWnAQGBiomF2/fBAAAAAAAuNmwYcMKugQANhoyZIgh/bz66quG9CNJzJEHAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAirx//vlHvr6+OnHiREGXYrWNGzfKZDLp0qVLkqTIyEh5eHgUaE25deLECZlMJu3du7egS0ER1b17d02fPt2QvghcAQAAAABA/jr8/d37lUuTJk1Sp06dFBQUZNx1Z+PWkLQoadasmYYNG1bQZeTK8uXL1aBBA3l4eKhkyZKqW7eulixZkuMx8fHx6tmzpx544AHZ2dlZfe1paWkaO3asgoOD5eTkpIoVKyoiIkIZGRnmNs2aNZPJZDL/8vPz01NPPaWTJ0/m2LfR4y88PFx169Y1pC9rBAUFadasWbk6dtasWapSpYqcnJxUrlw5DR8+XFevXjXvHzNmjCZNmqSEhIQ810ngCgAAAAAAirTk5GR98sknGjBgQEGXgnuUl5eX3nzzTW3fvl379u1T//791b9/f61du/a2x6SkpMjHx0djxoxRnTp1rD7X22+/rXnz5umDDz7QwYMH9fbbb2vatGl6//33Ldo9//zzio+P15kzZ/Tf//5Xf/31l3r37p3ra8xPqampBXr+qKgojRo1SuPHj9fBgwf1ySef6IsvvtAbb7xhblOzZk1VrFhRn332WZ7PR+AKAAAAAACKtJiYGDk6Ourhhx82b8ucCbh27VrVq1dPTk5Oeuyxx3T+/HmtWbNG1apVk5ubm3r27Knk5GTzcenp6ZoyZYp5dmKdOnX01VdfSbrx2Hzz5s0lSZ6enjKZTOrXr58k6bvvvlPTpk3l4eEhb29vPfHEEzp69Gieruvo0aPq1KmT/Pz85OLiooYNG2r9+vUWbYKCgvTWW2+pb9++cnFxUWBgoFauXKm///5bnTp1kouLi2rXrq1ffvnFfMw///yjHj16qEyZMnJ2dlatWrX0+eef56nWW6WlpWnAgAHm+1ilShXNnj3bok2/fv3UuXNnTZ48WX5+fvLw8NDEiRN1/fp1vfbaa/Ly8lLZsmW1cOFCi+NGjhypBx54QM7OzqpQoYLGjh17x0CwWbNm6tKli6pVq6aKFSvqlVdeUe3atbV169bbHhMUFKTZs2erb9++cnd3t/rat23bpk6dOql9+/YKCgrSk08+qdatW+vnn3+2aOfs7Cx/f3+VLl1aDz/8sIYOHardu3dbfR7p/y9DsXbtWlWrVk0uLi56/PHHFR8fb26zceNGPfTQQypZsqQ8PDzUpEkTnTx5UpGRkZowYYJ+/fVX80zbyMhISZLJZNK8efPUsWNHlSxZUpMmTcp2yYtvvvlGJpPJYtu3336rhg0bqkSJEipVqpS6dOki6cbX4OTJkxo+fLj5fNbatm2bmjRpop49eyooKEitW7dWjx49stzTDh06KDo62oY7mD0CVwAAAAAAUKRt2bJF9evXz3ZfeHi4PvjgA23btk1//fWXnn76ac2aNUtRUVFavXq1vv/+e4uZh1OmTNHixYv14Ycf6vfff9fw4cPVu3dvbdq0SeXKldPXX38tSTp06JDi4+PNIeKVK1c0YsQI/fLLL4qNjZWdnZ26dOmi9PT0XF9XUlKS2rVrp9jYWO3Zs0ePP/64OnTooLi4OIt2M2fOVJMmTbRnzx61b99effr0Ud++fdW7d2/t3r1bFStWVN++fc2PtF+9elX169fX6tWrtX//fg0cOFB9+vTJEl7lRXp6usqWLasvv/xSBw4c0Lhx4/TGG29o2bJlFu1++OEHnTlzRps3b9aMGTM0fvx4PfHEE/L09NSOHTv0wgsvaNCgQTp16pT5GFdXV0VGRurAgQOaPXu2FixYoJkzZ1pdW0ZGhmJjY3Xo0CE98sgjhl1zpsaNGys2NlZ
//vmnJOnXX3/V1q1b1bZt29sec+HCBS1btkyNGjWy+XzJycl69913tWTJEm3evFlxcXEKCwuTJF2/fl2dO3fWo48+qn379mn79u0aOHCgTCaTnnnmGb366quqUaOG4uPjFR8fr2eeecbcb3h4uLp06aLffvtNzz77rFW1rF69Wl26dFG7du20Z88excbG6qGHHpJ0Y1mHsmXLauLEiebzWatx48batWuXeYweO3ZMMTExateunUW7hx56SD///LNSUlKs7js7xfJ0NAAAAAAAQCF38uRJBQQEZLvvrbfeUpMmTSRJAwYM0OjRo3X06FFVqFBBkvTkk09qw4YNGjlypFJSUjR58mStX79eISEhkqQKFSpo69at+uijj/Too4/Ky8tLkuTr62sx269bt24W5/3000/l4+OjAwcOqGbNmrm6rjp16lg8yh4REaEVK1Zo5cqVGjp0qHl7u3btNGjQIEnSuHHjNG/ePDVs2FBPPfWUpBszQkNCQnTu3Dn5+/urTJky5kBOkl566SWtXbtWy5YtM4djeeXg4KAJEyaYPwcHB2v79u1atmyZnn76afN2Ly8vvffee7Kzs1OVKlU0bdo0JScnmx8VHz16tKZOnaqtW7eqe/fukm6s1ZkpKChIYWFhio6O1uuvv55jTQkJCSpTpoxSUlJkb2+vuXPnqlWrVoZc781GjRqlxMREVa1aVfb29kpLS9OkSZPUq1cvi3Zz587Vxx9/rIyMDCUnJ+uBBx7IcYmD20lNTdWHH36oihUrSpKGDh2qiRMnSpISExOVkJCgJ554wry/WrVq5mNdXFxUrFgx+fv7Z+m3Z8+e6t+/v021TJo0Sd27d7f42meOYS8vL9nb28vV1TXb8+WkZ8+e+t///qemTZsqIyND169f1wsvvGCxpIAkBQQE6Nq1azp79qwCAwNtOsfNmOEKAAAAAACKtH///VclSpTIdl/t2rXNv/fz8zM/hn7ztvPnz0uSjhw5ouTkZLVq1UouLi7mX4sXL77j8gCHDx9Wjx49VKFCBbm5uZlf3pU5G7Vt27bm/mrUqGHVdSUlJSksLEzVqlWTh4eHXFxcdPDgwSwzXG+9RkmqVatWlm2Z15mWlqaIiAjVqlVLXl5ecnFx0dq1a839Ll261OL6t2zZYlW9t5ozZ47q168vHx8fubi4aP78+Vlqr1Gjhuzs/n+85efnZ1G7vb29vL29zbVL0hdffKEmTZrI399fLi4uGjNmjLnfuLg4i9onT55sPs7V1VV79+7Vzp07NWnSJI0YMUIbN27M1bVJN2ZW33yupUuXSpKWLVumpUuXKioqSrt379aiRYv07rvvatGiRRbH9+rVS3v37jXPgK1UqZJat26ty5cvm+9NZt85zY51dnY2h6mSVLp0afP98vLyUr9+/dSmTRt16NBBs2fPtnpmaYMGDWy6H5K0d+9etWjRwubj7mTjxo2aPHmy5s6dq927d2v58uVavXq1IiIiLNo5OTlJksUyIbnBDFcAAAAAAFCklSpVShcvXsx2n4ODg/n3JpPJ4nPmtszH/pOSkiTdeCy6TJkyFu0cHR1zrKFDhw4KDAzUggULFBAQoPT0dNWsWVPXrl2TJH388cf6999/s9SUk7CwMK1bt07vvvuuKlWqJCcnJz355JPmPm93jbfblnmd77zzjmbPnq1Zs2apVq1aKlmypIYNG2but2PHjhaPtt96L6wRHR2tsLAwTZ8+XSEhIXJ1ddU777yjHTt23Lb2zFpz+hpt375dvXr10oQJE9SmTRu5u7srOjpa06dPl3RjhuPevXvNx2bOSJYkOzs7VapUSZJUt25dHTx4UFOmTFGzZs1svj7pRiB587kyg+3XXntNo0aNMs/IrVWrlk6ePKkpU6YoNDTU3N7d3d1cT6VKlfTJJ5+odOnS+uKLL/Tcc88pJibGvDZtZpCYnezuV+byEZK0cOFCvfzyy/ruu+/0xRdfaMyYMVq3bp3FmsfZKVmypMVnOzs7i36lrC/TyqnOvBg7dqz69Omj5557TtKNe3rlyhUNHDhQb775pjm0v3DhgiTJx8cnT+cjcAUAAAAAAEVavXr1DHkzefXq1eXo6Ki4uDg9+uij2bYpXry4pBuzRDP9888/OnTokBYsWKD//Oc/kpTlZUy5CS1//PFH9evXz/zSoaSkJJ04ccLmfrLrt1OnTurdu7ekG0Hsn3/+qerVq0u6MRPU1dU1z+do3LixBg8ebN6W15eISTdenhQYGKg333zTvO3kyZPm3xcrVswcYt5Jenp6ntb6dHJyyvZcycnJFrN2pRszde+0nq+9vb0kmYP5vDwSf6t69eqpXr16Gj16tEJCQhQVFaWHH35YxYsXtxjLOfHx8dHly5d15coVcxh7c+As3ZhtHRsbe9ulCGw5381ud08lWYTA+/fvV9myZVWqVCmbz3EzAlcAAAAAAFCktWnTRqNHj9bFixfl6emZ635cXV0VFham4cOHKz09XU2bNlVCQoJ+/PFHubm5KTQ0VIGBgTKZTFq1apXatWsnJycneXp6ytvbW/Pnz1fp0qUVFxenUaNG5fm6KleurOXLl6tDhw4ymUwaO3Zsnl7CdXO/X331lbZt2yZPT0/NmDFD586dMweuOfn777+zhGylS5fO9hyLFy/W2rVrFRwcrCVLlmjnzp0KDg7Oc+1xcXGKjo5Ww4YNtXr1aq1YseKOx02ZMkUNGjRQxYoVlZKSopiYGC1ZskTz5s0ztxk9erROnz6txYsXm7dlXmtSUpL52osXL57jverQoYMmTZqk8uXLq0aNGtqzZ49mzJiR5cVTycnJOnv2rCTp3LlzioiIUIkSJdS6dWtbbkmOjh8/rvnz56tjx44KCAjQoUOHdPjwYfXt21fSjTVwjx8/rr1796ps2bJydXW97WzuRo0aydnZWW+88YZefvll7dixQ5GRkRZtxo8frxYtWqhixYrq3r27rl+/rpiYGI0cOdJ8vs2bN6t79+5ydHS0Ohjt0KGDZsyYoXr16qlRo0Y6cuSIxo4dqw4dOpiDV+nGMg9G3D8CVwAAAAAAkL8qGxcA5YdatWrpwQcf1LJly8wvj8qtiIgI+fj4aMqUKTp27Jg8PDz04IMPml/OU6ZMGU2YMEGjRo1S//791bdvX0VGRio6Olovv/yyatasqSpVqui9997L9aPqmTJDusaNG6tUqVIaOXKkEhMT89SndOOlU8eOHVObNm3k7OysgQMHqnPnzkpISLjjsVFRUYqKirLYFhERYZ4tm2nQoEHas2ePnnnmGZlMJvXo0UODBw/WmjVr8lR7x44dNXz4cA0dOlQpKSlq3769xo4dq/Dw8ByPu3LligYPHqxTp07JyclJVatW1WeffaZnnnnG3CY+Pj7LGrP16tUz/37Xrl2KiopSYGBgjjON33//fY0dO1aDBw/W+fPnFRAQoEGDBmncuHEW7RYsWKAFCxZIkjw9PVW7dm3FxMSoSpUqVt6NO3N2dtYff/yhRYsW6Z9//lHp0qU1ZMgQ8/dJt27dtHz5cjVv3lyXLl3SwoUL1a9fv2z78v
Ly0meffabXXntNCxYsUIsWLRQeHq6BAwea2zRr1kxffvmlIiIiNHXqVLm5uemRRx4x7584caIGDRpkDr4zZ6eaTKYczz1mzBiZTCaNGTNGp0+flo+PjznYznT16lV98803+u677/J41yRTxq2LJxRBiYmJcnd3V0JCgtzc3Aq6HJvMvjjbpvaveL6ST5XYJjU1VTExMWrXrp3Va8+g6MrPcZ5w05sPreE+frzVbRnnsAXjHEUB4xxFAeMc1irM/w69natXr+r48eMKDg6+7Quo7mWrV6/Wa6+9pv3792d59BjAvev48eN64IEHdODAAVWuXDnX/cybN08rVqzQ999/n+1+W/6MY4YrAAAAAAAo8tq3b6/Dhw/r9OnTKleuXEGXA8BKMTExGjhwYJ7CVunGy8Pef/99Q2oicAUAAAAAAJA0bNiwgi4BgI2GDBliSD/PPfecIf1IEnPkAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAgCLvn3/+ka+vr06cOFHQpVht48aNMplMunTpkiQpMjJSHh4eBVpTbp04cUImk0l79+4t6FJQBF27dk1BQUH65ZdfDOmvmCG9AAAAAAAA3Ebbfbvu2rnW1K6fq+MmTZqkTp06KSgoyNiCbrFx40Y1b95cFy9eLLThaG41a9ZMdevW1axZswq6FJstX75ckydP1pEjR5SamqrKlSvr1VdfVZ8+fXI8Zt68edq7d69SUlJUo0YNhYeHq02bNnc814cffqhdu3bpwoUL2rNnj+rWrWvRJigoSCdPnpQk2dnZyc/PT23bttW7774rT0/P2/YdGRmpYcOGmUP6vOrXr58uXbqkb775xpD+7sRkMmnFihXq3LlzrvuYOnWqRo8erVdeecU8FosXL66wsDCNHDlSsbGxea6TGa4AAAAAAKBIS05O1ieffKIBAwYUdCm4R3l5eenNN9/U9u3btW/fPvXv31/9+/fX2rVrb3vM5s2b1apVK8XExGjXrl1q3ry5OnTooD179uR4ritXrqhp06Z6++23c2w3ceJExcfHKy4uTkuXLtXmzZv18ssv5+r68ltqampBlyBJ2rlzpz766CPVrl07y75evXpp69at+v333/N8HgJXAAAAAABQpMXExMjR0VEPP/yweVvm4/pr165VvXr15OTkpMcee0znz5/XmjVrVK1aNbm5ualnz55KTk42H5eenq4pU6YoODhYTk5OqlOnjr766itJNx6bb968uSTJ09NTJpNJ/fr1kyR99913atq0qTw8POTt7a0nnnhCR48ezdN1HT16VJ06dZKfn59cXFzUsGFDrV+/3qJNUFCQ3nrrLfXt21cuLi4KDAzUypUr9ffff6tTp05ycXFR7dq1LR61/ueff9SjRw+VKVNGzs7OqlWrlj7//PM81XqrtLQ0DRgwwHwfq1SpotmzZ1u06devnzp37qzJkyfLz89PHh4emjhxoq5fv67XXntNXl5eKlu2rBYuXGhx3MiRI/XAAw/I2dlZFSpU0NixY+8YCDZr1kxdunRRtWrVVLFiRb3yyiuqXbu2tm7dettjZs2apddff10NGzZU5cqVNXnyZFWuXFnffvttjufq06ePxo0bp5YtW+bYztXVVf7+/ipTpoyaN2+u0NBQ7d69O8djbhUeHq66detqyZIlCgoKkru7u7p3767Lly+b23z11VeqVauWnJyc5O3trZYtW+rKlSsKDw/XokWL9N///lcmk0kmk0kbN240Lw/xxRdf6NFHH1WJEiW0dOlS87luvUe3zir/9NNPVaNGDTk6Oqp06dIaOnSoJJnbdenSRSaTyebZ6ElJSerVq5cWLFiQ7SxgT09PNWnSRNHR0Tb1mx0CVwAAAAAAUKRt2bJF9etnvxRBeHi4PvjgA23btk1//fWXnn76ac2aNUtRUVFavXq1vv/+e73//vvm9lOmTNHixYv14Ycf6vfff9fw4cPVu3dvbdq0SeXKldPXX38tSTp06JDi4+PNIeKVK1c0YsQI/fLLL4qNjZWdnZ26dOmi9PT0XF9XUlKS2rVrp9jYWO3Zs0ePP/64OnTooLi4OIt2M2fOVJMmTbRnzx61b99effr0Ud++fdW7d2/t3r1bFStWVN++fZWRkSFJunr1qurXr6/Vq1dr//79GjhwoPr06aOff/4517XeKj09XWXLltWXX36pAwcOaNy4cXrjjTe0bNkyi3Y//PCDzpw5o82bN2vGjBkaP368nnjiCXl6emrHjh164YUXNGjQIJ06dcp8jKurqyIjI3XgwAHNnj1bCxYs0MyZM62uLSMjQ7GxsTp06JAeeeQRm67p8uXL8vLysvoYa50+fVrffvutGjVqZPOxR48e1TfffKNVq1Zp1apV2rRpk6ZOnSpJio+PV48ePfTss8/q4MGD2rhxo7p27aqMjAyFhYXp6aef1uOPP674+HjFx8ercePG5n5HjRqlV155RQcPHrzjMgqZ5s2bpyFDhmjgwIH67bfftHLlSlWqVEnSjdmpkrRw4ULFx8ebP1tryJAhat++fY5B9kMPPaQtW7bY1G92WMMVAAAAAAAUaSdPnlRAQEC2+9566y01adJEkjRgwACNHj1aR48eVYUKFSRJTz75pDZs2KCRI0cqJSVFkydP1vr16xUSEiJJqlChgrZu3aqPPvpIjz76qDls8/X1tVjDtVu3bhbn/fTTT+Xj46MDBw6oZs2aubquOnXqqE6dOubPERERWrFihVauXGmeNShJ7dq106BBgyRJ48aN07x589SwYUM99dRTkm7MCA0JCdG5c+fMMyrDwsLMx7/00ktau3atli1bpoceeihXtd7KwcFBEyZMMH8ODg7W9u3btWzZMj399NPm7V5eXnrvvfdkZ2enKlWqaNq0aUpOTtYbb7whSRo9erSmTp2qrVu3qnv37pKkMWPGmI8PCgpSWFiYoqOj9frrr+dYU0JCgsqUKaOUlBTZ29tr7ty5atWqldXX9O677yopKcmi/rwYOXKkxowZo7S0NF29elWNGjXSjBkzbO4nPT1dkZGRcnV1lXRjhm1sbKwmTZqk+Ph4Xb9+XV27dlVgYKAkqVatWuZjnZyclJKSIn9//yz9Dhs2TF27drWplrfeekuvvvqqXnnlFfO2hg0bSpJ8fHwkSR4eHtmeLyfR0dHavXv3HUPagIAA89q4ecEMVwAAAAAAUKT9+++/KlGiRLb7bl7r0c/Pz/wY+s3bzp8/L0k6cuSIkpOT1apVK7m4uJh/LV68+I7LAxw+fFg9evRQhQoV5ObmZn5cOnM2atu2bc391ahRw6rrSkpKUlhYmKpVqyYPDw+5uLjo4MGDWWa43nqNkmWolrkt8zrT0tIUERGhWrVqycvLSy4uLlq7dq2536VLl1pcf25nDM6ZM0f169eXj4+PXFxcNH/+/Cy116hRQ3Z2/z/e8vPzs6jd3t5e3t7e5tol6YsvvlCTJk3k7+8vFxcXjRkzxtxvXFycR
e2TJ082H+fq6qq9e/dq586dmjRpkkaMGKGNGzdadS1RUVGaMGGCli1bJl9fX0l5v0+vvfaa9u7dq3379plf9NS+fXulpaVJkkXfL7zwwm37CQoKMoetklS6dGnz/apTp45atGihWrVq6amnntKCBQt08eJFq+pr0KCBTddz/vx5nTlzRi1atLDpuDv566+/9Morr2jp0qW3/T7P5OTkZLFESG4xwxUAAAAAABRppUqVum2I5ODgYP69yWSy+Jy5LfOx/6SkJEnS6tWrVaZMGYt2jo6OOdbQoUMHBQYGasGCBQoICFB6erpq1qypa9euSZI+/vhj/fvvv1lqyklYWJjWrVund999V5UqVZKTk5OefPJJc5+3u8bbbcu8znfeeUezZ8/WrFmzVKtWLZUsWVLDhg0z99uxY0eLR9tvvRfWiI6OVlhYmKZPn66QkBC5urrqnXfe0Y4dO25be2atOX2Ntm/frl69emnChAlq06aN3N3dFR0drenTp0u6McNx79695mNvfvzfzs7O/Hh73bp1dfDgQU2ZMkXNmjW747U899xz+vLLLy0eZ8/rfSpVqpS5nsqVK2vWrFkKCQnRhg0b1LJlS4vrcHNzu20/Od0ve3t7rVu3Ttu2bTMvn/Hmm29qx44dCg4OzrG+kiVLWny2s7MzL0uR6ea1c52cnHLsL7d27dql8+fP68EHHzRvS0tL0+bNm/XBBx+YZyxL0oULF8wzafOCwBW3lXDT1H1ruI8fn0+VAAAAAACQf+rVq6fPPvssz/1Ur15djo6OiouL06OPPpptm+LFi0uSeRaidOMlVIcOHdKCBQv0n//8R5KyvIwpN6Hljz/+qH79+qlLly6SbgTCJ06csLmf7Prt1KmTevfuLelGEPvnn3+qevXqkm7MBL15xmRuz9G4cWMNHjzYvC2vLxGTpG3btikwMFBvvvmmedvNj5AXK1bMHGLeSXp6ulJSUnJs8/nnn+vZZ59VdHS02rdvb7HPiPt0s8zQMDOYt/Y67sRkMqlJkyZq0qSJxo0bp8DAQK1YsUIjRoxQ8eLFLcZyTnx8fHT27FllZGSYQ/ybQ2FXV1cFBQUpNjbW/HK5Wzk4OFh9vkwtWrTQb7/9ZrGtf//+qlq1qkaOHGm+b5K0f/9+1atXz6b+s0PgCgAAAAAAirQ2bdpo9OjRunjxYrZvL7eWq6urwsLCNHz4cKWnp6tp06ZKSEjQjz/+KDc3N4WGhiowMFAmk0mrVq1Su3bt5OTkJE9PT3l7e2v+/PkqXbq04uLiNGrUqDxfV+XKlbV8+XJ16NBBJpNJY8eOzdNLuG7u96uvvtK2bdvk6empGTNm6Ny5c+bANSd///23Rcgm3XiEPbtzLF68WGvXrlVwcLCWLFminTt33nFWpTW1x8XFKTo6Wg0bNtTq1au1YsWKOx43ZcoUNWjQQBUrVlRKSopiYmK0ZMkSzZs3z9xm9OjROn36tBYvXizpxjICoaGhmj17tho1aqSzZ89KujGT093d/bbnunDhguLi4nTmzBlJN16wJkn+/v4Wa5devnzZHGD+9ddfev311+Xj42Px4qq82rFjh2JjY9W6dWv5+vpqx44d+vvvv1WtWjVJN5YjWLt2rQ4dOiRvb+8cr6tZs2b6+++/NW3aND355JP67rvvtGbNGovZt+Hh4XrhhRfk6+urtm3b6vLly/rxxx/10ksvmc8XGxurJk2ayNHR0arvV1dX1yzrIJcsWVLe3t5Ztm/ZskURERFW35/bIXAFAAAAAAD5ak3t+gVdQo5q1aqlBx98UMuWLTO/PCq3IiIi5OPjoylTpujYsWPy8PDQgw8+aH6JU5kyZTRhwgSNGjVK/fv3V9++fRUZGano6Gi9/PLLqlmzpqpUqaL33nvvjo+q38mMGTP07LPPqnHjxipVqpRGjhypxMTEPPUp3Xjp1LFjx9SmTRs5Oztr4MCB6ty5sxISEu54bFRUlKKioiy2RUREmGfLZho0aJD27NmjZ555RiaTST169NDgwYO1Zs2aPNXesWNHDR8+XEOHDlVKSorat2+vsWPHKjw8PMfjrly5osGDB+vUqVNycnJS1apV9dlnn+mZZ54xt4mPj7dYY3b+/Pm6fv26hgwZoiFDhpi3h4aGKjIy8rbnWrlypfr372/+nPmyr/Hjx1vUOW7cOI0bN07SjdmjDRs21Pfffy9vb29rboVV3NzctHnzZs2aNUuJiYkKDAzU9OnT1bZtW0nS888/r40bN6pBgwZKSkrShg0bzOsP36patWqaO3euJk+erIiICHXr1k1hYWGaP3++uU1oaKiuXr2qmTNnKiwsTKVKldKTTz5p3j99+nSNGDFCCxYsUJkyZXTixAmdOHFCwcHB2rBhQ56+Z7Zv366EhASL8+WWKePWxROKoMTERLm7uyshISHHNS3uRbMvzrap/Suer9y50f/JzyUFUlNTFRMTo3bt2lm99gyKLsY5igLGOYoCxjmKAsY5rFWY/x16O1evXtXx48cVHBx8xxfT3ItWr16t1157Tfv377d4CROAe9uGDRvUtWtXHTt2LE8z1J955hnVqVPH/MORW9nyZxwzXAEAAAAAQJHXvn17HT58WKdPn1a5cuUKuhwAVoqJidEbb7yRp7D12rVrqlWrloYPH25ITQSuAAAAAAAAkoYNG1bQJQCw0TvvvJPnPooXL64xY8YYUM0NzJEHAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAirx//vlHvr6+OnHiREGXYrWNGzfKZDLp0qVLkqTIyEh5eHgUaE25deLECZlMJu3du7egS0ER9fDDD+vrr782pK9ihvQCAAAAAABwG6kTXr1r53IYPz1Xx02aNEmdOnVSUFCQsQXdYuPGjWrevLkuXrxYaMPR3GrWrJnq1q2rWbNmFXQpNlu+fLkmT56sI0eOKDU1VZUrV9arr76qPn365HjMvHnztHfvXqWkpKhGjRoKDw9XmzZt7niuDz/8ULt27dKFCxe0Z88e1a1b16JNUFCQTp48KUmys7OTn5+f2rZtq3fffVeenp637TsyMlLDhg0zh/R51a9fP126dEnffPONIf3diclk0ooVK9S5c2ebjktLS1N4eLg+++wznT17VgEBAerXr5/GjBkjk8kkSRozZoyGDx+uLl26yM4ub3NUmeEKAAAAAACKtOTkZH3yyScaMGBAQZeCe5SXl5fefPNNbd++Xfv27VP//v3Vv39/rV279rbHbN68Wa1atVJMTIx27dql5s2bq0OHDtqzZ0+O57py5YqaNm2qt99+O8d2EydOVHx8vOLi4rR06VJt3rxZL7/8cq6uL7+lpqYW6PnffvttzZs3Tx988IEOHjyot99+W9OmTdP7779vbtO2bVtdvnxZa9asyfP5CFwBAAAAAECRFhMTI0dHRz388MPmbZmP669du1b16tWTk5OTHnvsMZ0/f15r1qxRtWrV5Obmpp49eyo5Odl8XHp6
uqZMmaLg4GA5OTmpTp06+uqrryTdeGy+efPmkiRPT0+ZTCb169dPkvTdd9+padOm8vDwkLe3t5544gkdPXo0T9d19OhRderUSX5+fnJxcVHDhg21fv16izZBQUF666231LdvX7m4uCgwMFArV67U33//rU6dOsnFxUW1a9fWL7/8Yj7mn3/+UY8ePVSmTBk5OzurVq1a+vzzz/NU663S0tI0YMAA832sUqWKZs+ebdGmX79+6ty5syZPniw/Pz95eHho4sSJun79ul577TV5eXmpbNmyWrhwocVxI0eO1AMPPCBnZ2dVqFBBY8eOvWMg2KxZM3Xp0kXVqlVTxYoV9corr6h27draunXrbY+ZNWuWXn/9dTVs2FCVK1fW5MmTVblyZX377bc5nqtPnz4aN26cWrZsmWM7V1dX+fv7q0yZMmrevLlCQ0O1e/fuHI+5VXh4uOrWraslS5YoKChI7u7u6t69uy5fvmxu89VXX6lWrVpycnKSt7e3WrZsqStXrig8PFyLFi3Sf//7X5lMJplMJm3cuNG8PMQXX3yhRx99VCVKlNDSpUvN57r1Ht06q/zTTz9VjRo15OjoqNKlS2vo0KGSZG7XpUsXmUwmm2ajb9u2TZ06dVL79u0VFBSkJ598Uq1bt9bPP/9sbmNvb6927dopOjrapnuYHQJXAAAAAABQpG3ZskX169fPdl94eLg++OADbdu2TX/99ZeefvppzZo1S1FRUVq9erW+//57i1lyU6ZM0eLFi/Xhhx/q999/1/Dhw9W7d29t2rRJ5cqVM68ReejQIcXHx5tDxCtXrmjEiBH65ZdfFBsbKzs7O3Xp0kXp6em5vq6kpCS1a9dOsbGx2rNnjx5//HF16NBBcXFxFu1mzpypJk2aaM+ePWrfvr369Omjvn37qnfv3tq9e7cqVqyovn37KiMjQ5J09epV1a9fX6tXr9b+/fs1cOBA9enTxyK8yqv09HSVLVtWX375pQ4cOKBx48bpjTfe0LJlyyza/fDDDzpz5ow2b96sGTNmaPz48XriiSfk6empHTt26IUXXtCgQYN06tQp8zGurq6KjIzUgQMHNHv2bC1YsEAzZ860uraMjAzFxsbq0KFDeuSRR2y6psuXL8vLy8vqY6x1+vRpffvtt2rUqJHNxx49elTffPONVq1apVWrVmnTpk2aOnWqJCk+Pl49evTQs88+q4MHD2rjxo3q2rWrMjIyFBYWpqefflqPP/644uPjFR8fr8aNG5v7HTVqlF555RUdPHjwjssoZJo3b56GDBmigQMH6rffftPKlStVqVIlSdLOnTslSQsXLlR8fLz5szUaN26s2NhY/fnnn5KkX3/9VVu3blXbtm0t2j300EPasmWL1f3eDmu4AgAAAACAIu3kyZMKCAjIdt9bb72lJk2aSJIGDBig0aNH6+jRo6pQoYIk6cknn9SGDRs0cuRIpaSkaPLkyVq/fr1CQkIkSRUqVNDWrVv10Ucf6dFHHzWHbb6+vhZruHbr1s3ivJ9++ql8fHx04MAB1axZM1fXVadOHdWpU8f8OSIiQitWrNDKlSvNswYlqV27dho0aJAkady4cZo3b54aNmyop556StKNGaEhISE6d+6ceUZlWFiY+fiXXnpJa9eu1bJly/TQQw/lqtZbOTg4aMKECebPwcHB2r59u5YtW6ann37avN3Ly0vvvfee7OzsVKVKFU2bNk3Jycl64403JEmjR4/W1KlTtXXrVnXv3l3SjbU6MwUFBSksLEzR0dF6/fXXc6wpISFBZcqUUUpKiuzt7TV37ly1atXK6mt69913lZSUZFF/XowcOVJjxoxRWlqarl69qkaNGmnGjBk295Oenq7IyEi5urpKujHDNjY2VpMmTVJ8fLyuX7+url27KjAwUJJUq1Yt87FOTk5KSUmRv79/ln6HDRumrl272lTLW2+9pVdffVWvvPKKeVvDhg0lST4+PpIkDw+PbM+Xk1GjRikxMVFVq1aVvb290tLSNGnSJPXq1cuiXUBAgP766y+lp6fnaR1XZrgCAAAAAIAi7d9//1WJEiWy3Ve7dm3z7/38/MyPod+87fz585KkI0eOKDk5Wa1atZKLi4v51+LFi++4PMDhw4fVo0cPVahQQW5ububHpTNno7Zt29bcX40aNay6rqSkJIWFhalatWry8PCQi4uLDh48mGWG663XKFmGapnbMq8zLS1NERERqlWrlry8vOTi4qK1a9ea+126dKnF9ed2xuCcOXNUv359+fj4yMXFRfPnz89Se40aNSyCMT8/P4va7e3t5e3tba5dkr744gs1adJE/v7+cnFx0ZgxY8z9xsXFWdQ+efJk83Gurq7au3evdu7cqUmTJmnEiBHauHGjVdcSFRWlCRMmaNmyZfL19ZWU9/v02muvae/evdq3b59iY2MlSe3bt1daWpokWfT9wgsv3LafoKAgc9gqSaVLlzbfrzp16qhFixaqVauWnnrqKS1YsEAXL160qr4GDRrYdD3nz5/XmTNn1KJFC5uOs8ayZcu0dOlSRUVFaffu3Vq0aJHeffddLVq0yKKdk5OT0tPTlZKSkqfzMcMVAAAAAAAUaaVKlbptiOTg4GD+vclksvicuS3zsf+kpCRJ0urVq1WmTBmLdo6OjjnW0KFDBwUGBmrBggUKCAhQenq6atasqWvXrkmSPv74Y/37779ZaspJWFiY1q1bp3fffVeVKlWSk5OTnnzySXOft7vG223LvM533nlHs2fP1qxZs1SrVi2VLFlSw4YNM/fbsWNHi0fbb70X1oiOjlZYWJimT5+ukJAQubq66p133tGOHTtuW3tmrTl9jbZv365evXppwoQJatOmjdzd3RUdHa3p06dLujHDce/eveZjb378387Ozvx4e926dXXw4EFNmTJFzZo1u+O1PPfcc/ryyy8t1mXN630qVaqUuZ7KlStr1qxZCgkJ0YYNG9SyZUuL63Bzc7ttPzndL3t7e61bt07btm0zL5/x5ptvaseOHQoODs6xvpIlS1p8trOzMy9LkenmtXOdnJxy7C8vXnvtNY0aNco8y7lWrVo6efKkpkyZotDQUHO7CxcuqGTJknmuhcAVAAAAAAAUafXq1dNnn32W536qV68uR0dHxcXF6dFHH822TfHixSXJPAtRuvESqkOHDmnBggX6z3/+I0lZXsaUm9Dyxx9/VL9+/dSlSxdJNwLhEydO2NxPdv126tRJvXv3lnQjiP3zzz9VvXp1STdmgt48YzK352jcuLEGDx5s3pbXl4hJN16eFBgYqDfffNO87eTJk+bfFytWzBxi3ok1MyE///xzPfvss4qOjlb79u0t9hlxn25mb28vSeZg3trruBOTyaQmTZqoSZMmGjdunAIDA7VixQqNGDFCxYsXtxjLOfHx8dHZs2eVkZFhDvFvDoVdXV0VFBSk2NhY88vlbuXg4GD1+W6WnJycZYkAe3v7LGsk79+/X/Xq1bO5/1sRuAIAAAAAgCKtTZs2Gj16tC5evChPT89c9+Pq6qqwsDANHz5c6enpatq0qRISEvTjjz/Kzc1
NoaGhCgwMlMlk0qpVq9SuXTs5OTnJ09NT3t7emj9/vkqXLq24uDiNGjUqz9dVuXJlLV++XB06dJDJZNLYsWPz9BKum/v96quvtG3bNnl6emrGjBk6d+6cOXDNyd9//20Rskk3HmHP7hyLFy/W2rVrFRwcrCVLlmjnzp13nFVpTe1xcXGKjo5Ww4YNtXr1aq1YseKOx02ZMkUNGjRQxYoVlZKSopiYGC1ZskTz5s0ztxk9erROnz6txYsXS7qxjEBoaKhmz56tRo0a6ezZs5JuzOR0d3e/7bkuXLiguLg4nTlzRtKNF6xJkr+/v8XapZcvXzYHmH/99Zdef/11+fj4WLy4Kq927Nih2NhYtW7dWr6+vtqxY4f+/vtvVatWTdKN5QjWrl2rQ4cOydvbO8fratasmf7++29NmzZNTz75pL777jutWbPGYvZteHi4XnjhBfn6+qpt27a6fPmyfvzxR7300kvm88XGxqpJkyZydHS0+vu1Q4cOmjRpksqXL68aNWpoz549mjFjhp599lmLdlu2bFHr1q1tvU1ZELgCAAAAAIB85TB+ekGXkKNatWrpwQcf1LJly8wvj8qtiIgI+fj4aMqUKTp27Jg8PDz04IMPml/iVKZMGU2YMEGjRo1S//791bdvX0VGRio6Olovv/yyatasqSpVqui9996746Pqd5IZKDVu3FilSpXSyJEjlZiYmKc+pRsvnTp27JjatGkjZ2dnDRw4UJ07d1ZCQsIdj42KilJUVJTFtoiICPNs2UyDBg3Snj179Mwzz8hkMqlHjx4aPHiw1qxZk6faO3bsqOHDh2vo0KFKSUlR+/btNXbsWIWHh+d43JUrVzR48GCdOnVKTk5Oqlq1qj777DM988wz5jbx8fEWa8zOnz9f169f15AhQzRkyBDz9tDQUEVGRt72XCtXrlT//v3NnzMfgx8/frxFnePGjdO4ceMk3Zg92rBhQ33//ffy9va25lZYxc3NTZs3b9asWbOUmJiowMBATZ8+XW3btpUkPf/889q4caMaNGigpKQkbdiwwbz+8K2qVaumuXPnavLkyYqIiFC3bt0UFham+fPnm9uEhobq6tWrmjlzpsLCwlSqVCk9+eST5v3Tp0/XiBEjtGDBApUpU0YnTpzQiRMnFBwcrA0bNtz2e+b999/X2LFjNXjwYJ0/f14BAQEaNGiQ+f5J0unTp7Vt2zZDZrubMm5dPOEumjdvnubNm2eezl6jRg2NGzfO/EW7evWqXn31VUVHRyslJUVt2rTR3LlzzYs1SzcWM37xxRe1YcMGubi4KDQ0VFOmTFGxYtZnyYmJiXJ3d1dCQkKOa1rci2ZfnG1T+1c8X7lzo/+TcNPbAK3hPn681W1TU1MVExOjdu3aWb32DIouxjmKAsY5igLGOYoCxjmsVZj/HXo7V69e1fHjxxUcHHzbF1Ddy1avXq3XXntN+/fvz9PbyQHcXRs2bFDXrl117NixPM1QHzlypC5evGgRAN/Mlj/jCvRPkLJly2rq1KnatWuXfvnlFz322GPq1KmTfv/9d0nS8OHD9e233+rLL7/Upk2bdObMGXXt2tV8fFpamtq3b69r165p27ZtWrRokSIjIy3SaQAAAAAAgDtp3769Bg4cqNOnTxd0KQBsEBMTozfeeCNPYask+fr6KiIiwpCaCnRJgQ4dOlh8njRpkubNm6effvpJZcuW1SeffKKoqCg99thjkqSFCxeqWrVq+umnn/Twww/r+++/14EDB7R+/Xr5+fmpbt26ioiI0MiRIxUeHm5eiPpWKSkpFosaZ06nT01NtXg7WmFgum6yqb0t13fdxp/o2dJ3ZtvCdr9RMBjnKAoY5ygKGOcoChjnsBb3+t40bNiwgi4BgI3eeecdQ/p59dVXDelHKuAlBW6WlpamL7/8UqGhodqzZ4/Onj2rFi1a6OLFi/Lw8DC3CwwM1LBhwzR8+HCNGzdOK1eutFhs+fjx46pQoYJ2795927eKhYeHa0I2j+NERUXJ2dnZ6EsDAAAAAMBCcnKyevbsyZICAFBI2PJnXIG/NOu3335TSEiIrl69KhcXF61YsULVq1fX3r17Vbx4cYuwVZL8/PzMb3U7e/asxXqumfsz993O6NGjNWLECPPnxMRElStXTq1bty50f9HNuzTvzo1u8qLHi1a3TZw61aa+3Wx4g2JqaqrWrVunVq1asUYU7ohxjqKAcY6igHGOooBxDmsZ8eIiAMC9qcAD1ypVqmjv3r1KSEjQV199pdDQUG3atClfz+no6ChHR8cs2x0cHArd/1xkFLNtgrIt11csPT3f+r75mMJ2z3H3Mc5RFDDOURQwzlEUMM5hrfv5Pt8jD9ICgKFs+bOtwF+7V7x4cVWqVEn169fXlClTVKdOHc2ePVv+/v66du2aLl26ZNH+3Llz8vf3lyT5+/vr3LlzWfZn7gMAAAAAAHdHZoicnJxcwJUAgPGuXbsmSbK3t79j2wKf4Xqr9PR0paSkqH79+nJwcFBsbKy6desmSTp06JDi4uIUEhIiSQoJCdGkSZN0/vx5+fr6SpLWrVsnNzc3Va9evcCuAQAAAACAosbe3l4eHh46f/68JMnZ2Vkmk20vkgOAe1F6err+/vtvOTs7q1ixO8epBRq4jh49Wm3btlX58uV1+fJlRUVFaePGjVq7dq3c3d01YMAAjRgxQl5eXnJzc9NLL72kkJAQPfzww5Kk1q1bq3r16urTp4+mTZums2fPasyYMRoyZEi2SwYAAAAAAID8k/m0aWboCgD3Czs7O5UvX96qHyQVaOB6/vx59e3bV/Hx8XJ3d1ft2rW1du1atWrVSpI0c+ZM2dnZqVu3bkpJSVGbNm00d+5c8/H29vZatWqVXnzxRYWEhKhkyZIKDQ3VxIkTC+qSAAAAAAAoskwmk0qXLi1fX1+lpqYWdDkAYJjixYvLzs661VkLNHD95JNPctxfokQJzZkzR3PmzLltm8DAQMXExBhdGgAAAAAAyCV7e3ur1jkEgPtRgb80CwAAAAAAAADuFwSuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYpJitBxw/flxbtmzRyZMnlZ
ycLB8fH9WrV08hISEqUaJEftQIAAAAAAAAAIWC1YHr0qVLNXv2bP3yyy/y8/NTQECAnJycdOHCBR09elQlSpRQr169NHLkSAUGBuZnzQAAAAAAAABwT7IqcK1Xr56KFy+ufv366euvv1a5cuUs9qekpGj79u2Kjo5WgwYNNHfuXD311FP5UjAAAAAAAAAA3KusClynTp2qNm3a3Ha/o6OjmjVrpmbNmmnSpEk6ceKEUfUBAAAAAAAAQKFhVeCaU9h6K29vb3l7e+e6IAAAAAAAAAAorOxsPSAmJkZr167Nsn3t2rVas2aNIUUBAAAAAAAAQGFkc+A6atQopaWlZdmekZGhUaNGGVIUAAAAAAAAABRGNgeuhw8fVvXq1bNsr1q1qo4cOWJIUQAAAAAAAABQGNkcuLq7u+vYsWNZth85ckQlS5Y0pCgAAAAAAAAAKIxsDlw7deqkYcOG6ejRo+ZtR44c0auvvqqOHTsaWhwAAAAAAAAAFCY2B67Tpk1TyZIlVbVqVQUHBys4OFjVqlWTt7e33n333fyoEQAAAAAAAAAKhWK2HuDu7q5t27Zp3bp1+vXXX+Xk5KTatWvrkUceyY/6AAAAAAAAAKDQsDlwlSSTyaTWrVvrkUcekaOjo0wmk9F1AQAAAAAAAEChY/OSAunp6YqIiFCZMmXk4uKi48ePS5LGjh2rTz75xPACAQAAAAAAAKCwsDlwfeuttxQZGalp06apePHi5u01a9bUxx9/bGhxAAAAAAAAAFCY2By4Ll68WPPnz1evXr1kb29v3l6nTh398ccfhhYHAAAAAAAAAIWJzYHr6dOnValSpSzb09PTlZqaakhRAAAAAAAAAFAY2Ry4Vq9eXVu2bMmy/auvvlK9evUMKQoAAAAAAAAACqNith4wbtw4hYaG6vTp00pPT9fy5ct16NAhLV68WKtWrcqPGgEAAAAAAACgULB5hmunTp307bffav369SpZsqTGjRungwcP6ttvv1WrVq3yo0YAAAAAAAAAKBRsnuEqSf/5z3+0bt06o2sBAAAAAAAAgELN5hmuf/31l06dOmX+/PPPP2vYsGGaP3++oYUBAAAAAAAAQGFjc+Das2dPbdiwQZJ09uxZtWzZUj///LPefPNNTZw40fACAQAAAAAAAKCwsDlw3b9/vx566CFJ0rJly1SrVi1t27ZNS5cuVWRkpE19TZkyRQ0bNpSrq6t8fX3VuXNnHTp0yKJNs2bNZDKZLH698MILFm3i4uLUvn17OTs7y9fXV6+99pquX79u66UBAAAAAAAAQJ7YvIZramqqHB0dJUnr169Xx44dJUlVq1ZVfHy8TX1t2rRJQ4YMUcOGDXX9+nW98cYbat26tQ4cOKCSJUua2z3//PMWs2ednZ3Nv09LS1P79u3l7++vbdu2KT4+Xn379pWDg4MmT55s6+UBAAAAAAAAQK7ZHLjWqFFDH374odq3b69169YpIiJCknTmzBl5e3vb1Nd3331n8TkyMlK+vr7atWuXHnnkEfN2Z2dn+fv7Z9vH999/rwMHDmj9+vXy8/NT3bp1FRERoZEjRyo8PFzFixfPckxKSopSUlLMnxMTEyXdCJNTU1NtuoaCZrpusqm9Ldd33c62CdC29J3ZtrDdbxQMxjmKAsY5igLGOYoCxjmsxb0GgPuXKSMjI8OWAzZu3KguXbooMTFRoaGh+vTTTyVJb7zxhv744w8tX74818UcOXJElStX1m+//aaaNWtKurGkwO+//66MjAz5+/urQ4cOGjt2rHmW67hx47Ry5Urt3bvX3M/x48dVoUIF7d69W/Xq1ctynvDwcE2YMCHL9qioKIvZswAAAAAA5Ifk5GT17NlTCQkJcnNzK+hyAAAGsjlwlW48xp+YmChPT0/zthMnTpjXUM2N9PR0dezYUZcuXdLWrVvN2+fPn6/AwEAFBARo3759GjlypB566CFzsDtw4ECdPHlSa9euNR+TnJyskiVLKiYmRm3bts1yruxmuJYrV07/+9//Ct1fdPMuzbOp/YseL1rdNnHqVJv6dhs1yuq2qampWrdunVq1aiUHBwebzoOih3GOooBxjqKAcY6igHEOayUmJqpUqVIErgBwH7J5SQFJsre3twhbJSkoKChPhQwZMkT79++3CFulG4Fqplq1aql06dJq0aKFjh49qooVK+bqXI6OjuZ1aG/m4OBQ6P7nIqOYbXm5LddXLD093/q++ZjCds9x9zHOURQwzlEUMM5RFDDOYS3uMwDcv6xaBOjxxx/XTz/9dMd2ly9f1ttvv605c+bYVMTQoUO1atUqbdiwQWXLls2xbaNGjSTdWH5Akvz9/XXu3DmLNpmfb7fuKwAAAAAAAADkB6tmuD711FPq1q2b3N3d1aFDBzVo0EABAQEqUaKELl68qAMHDmjr1q2KiYlR+/bt9c4771h18oyMDL300ktasWKFNm7cqODg4Dsek7lWa+nSpSVJISEhmjRpks6fP29ezmDdunVyc3NT9erVraoDAAAAAAAAAIxgVeA6YMAA9e7dW19++aW++OILzZ8/XwkJCZIkk8mk6tWrq02bNtq5c6eqVatm9cmHDBmiqKgo/fe//5Wrq6vOnj0rSXJ3d5eTk5OOHj2qqKgotWvXTt7e3tq3b5+GDx+uRx55RLVr15YktW7dWtWrV1efPn00bdo0nT17VmPGjNGQIUOyXTYAAAAAAAAAAPKL1Wu4Ojo6qnfv3urdu7ckKSEhQf/++6+8vb1zvfbMvHk3FpRv1qyZxfaFCxeqX79+Kl68uNavX69Zs2bpypUrKleunLp166YxY8aY29rb22vVqlV68cUXFRISopIlSyo0NFQTJ07MVU0AAAAAAAAAkFu5emmWdGMWqru7e55OnpGR84Ly5cqV06ZNm+7YT2BgoGJiYvJUCwAAAAAAAADklVUvzQIAAAAAAAAA3BmBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABslV4Hrp0iV9/PHHGj16tC5cuCBJ2r17t06fPm1ocQAAAAAAAABQmBSz9YB9+/apZcuWcnd314kTJ/T888/Ly8tLy5cvV1xcnBYvXpwfdQIAAAAAAADAPc/mGa4jRoxQv379dPjwYZUoUcK8vV27dtq8ebOhxQEAAAAAAABAYWJz4Lpz504NGjQoy/YyZcro7NmzhhQFAAAAAAAAAIWRzYGro6OjEhMTs2z/888/5ePjY0hRAAAAAAAAAFAY2Ry4duzYURMnTlRqaqokyWQyKS4uTiNHjlS3bt0MLxAAAAAAAAAACgubA9fp06crKSlJvr6++vfff/Xoo4+qUqVKcnV11aRJk/KjRgAAAAAAAAAoFIrZeoC7u7vWrVunrVu3at++fUpKStKDDz6oli1b5kd9AAAAAAAAAFBo2By4ZmratKmaNm1qZC0AAAAAAAAAUKjlKnDduXOnNmzYoPPnzys9Pd1i34wZMwwpDAAAAAAAAAAKG5sD18mTJ2vMmDGqUqWK/Pz8ZDKZzPtu/j0AAAAAoHA5EXVC9rK/Y7sKoRXuQjUAABRONgeus
2fP1qeffqp+/frlQzkAAAAAAAAAUHjZ2XyAnZ2aNGmSH7UAAAAAAAAAQKFmc+A6fPhwzZkzJz9qAQAAAAAAAIBCzeYlBcLCwtS+fXtVrFhR1atXl4ODg8X+5cuXG1YcAAAAAAAAABQmNgeuL7/8sjZs2KDmzZvL29ubF2UBAAAAAAAAwP+xOXBdtGiRvv76a7Vv3z4/6gEAAAAAAACAQsvmNVy9vLxUsWLF/KgFAAAAAAAAAAo1mwPX8PBwjR8/XsnJyflRDwAAAAAAAAAUWjYvKfDee+/p6NGj8vPzU1BQUJaXZu3evduw4gAAAIB7xYmoE7KX/R3bVQitcBeqAQAAwL3K5sC1c+fO+VAGAAAAAAAAABR+Ngeu48ePz486AAAAAAAAAKDQs3kNVwAAAAAAAABA9qya4erl5aU///xTpUqVkqenp0wm023bXrhwwbDiAAAAAAAAAKAwsSpwnTlzplxdXSVJs2bNys96AAAAAAAAAKDQsipwDQ0N1WOPPably5crNDQ0v2sCAAAAAAAAgELJ6jVcN27cqGvXruVnLQAAAAAAAABQqFk1wxUAACAnJ6JOyF72d2xXIbTCXagGAAAAAAqOTYHrgQMHdPbs2Rzb1K5dO08FAQAAAAAAAEBhZVPg2qJFC2VkZGTZbjKZlJGRIZPJpLS0NMOKAwAAAAAAAIDCxKbAdceOHfLx8cmvWgAAAAAAAACgULMpcC1fvrx8fX3zqxYAAAAAAAAAKNTsCroAAAAAAAAAALhfWB24PvrooypevHh+1gIAAAAAAAAAhZrVSwps2LAhP+sAAAAAAAAAgEKPJQUAAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGMTql2ZlGjFiRLbbTSaTSpQooUqVKqlTp07y8vLKc3EAAAAAAAAAUJjYHLju2bNHu3fvVlpamqpUqSJJ+vPPP2Vvb6+qVatq7ty5evXVV7V161ZVr17d8IIBAAAAAAAA4F5l85ICnTp1UsuWLXXmzBnt2rVLu3bt0qlTp9SqVSv16NFDp0+f1iOPPKLhw4fnR70AAAAAAAAAcM+yOXB95513FBERITc3N/M2d3d3hYeHa9q0aXJ2dta4ceO0a9cuQwsFAAAAAAAAgHudzYFrQkKCzp8/n2X733//rcTEREmSh4eHrl27lvfqAAAAAAAAAKAQydWSAs8++6xWrFihU6dO6dSpU1qxYoUGDBigzp07S5J+/vlnPfDAA0bXCgAAAAAAAAD3NJtfmvXRRx9p+PDh6t69u65fv36jk2LFFBoaqpkzZ0qSqlatqo8//tjYSgEAAAAAAADgHmdz4Ori4qIFCxZo5syZOnbsmCSpQoUKcnFxMbepW7euYQUCAAAAAAAAQGFhc+CaycXFRbVr1zayFgAAAAAAAAAo1GwOXK9cuaKpU6cqNjZW58+fV3p6usX+zFmvAAAAAAAAAFDU2By4Pvfcc9q0aZP69Omj0qVLy2Qy5UddAHDfOBF1Qvayv2O7CqEV7kI1AAAAAAAgP9kcuK5Zs0arV69WkyZN8qMeAAAAAAAAACi07Gw9wNPTU15eXvlRCwAAAAAAAAAUajYHrhERERo3bpySk5Pzox4AAAAAAAAAKLRsXlJg+vTpOnr0qPz8/BQUFCQHBweL/bt37zasOAAAAAC4n03d8z+r276Yj3UAAADj2By4du7cOR/KAAAAAADcr9ru22VT+zW16+dTJQAA5D+bA9fx48cbdvIpU6Zo+fLl+uOPP+Tk5KTGjRvr7bffVpUqVcxtrl69qldffVXR0dFKSUlRmzZtNHfuXPn5+ZnbxMXF6cUXX9SGDRvk4uKi0NBQTZkyRcWK2Xx5AAAAAAAAAJBrNq/haqRNmzZpyJAh+umnn7Ru3TqlpqaqdevWunLlirnN8OHD9e233+rLL7/Upk2bdObMGXXt2tW8Py0tTe3bt9e1a9e0bds2LVq0SJGRkRo3blxBXBIAAAAAAACAIsyqKaBeXl76888/VapUKXl6espkMt227YULF6w++XfffWfxOTIyUr6+vtq1a5ceeeQRJSQk6JNPPlFUVJQee+wxSdLChQtVrVo1/fTTT3r44Yf1/fff68CBA1q/fr38/PxUt25dRUREaOTIkQoPD1fx4sWznDclJUUpKSnmz4mJiZKk1NRUpaamWl3/vcB0/fZfi+zYcn3X7WzL423pO7NtYbvfKBiFfZynKc3wvnH/YZyjKGCcoyiwdZzbpV+3um1hHucO6ek2tS8K30dF4RoBoKgyZWRkZNyp0aJFi9S9e3c5Ojpq0aJFObYNDQ3NdTFHjhxR5cqV9dtvv6lmzZr64Ycf1KJFC128eFEeHh7mdoGBgRo2bJiGDx+ucePGaeXKldq7d695//Hjx1WhQgXt3r1b9erVy3Ke8PBwTZgwIcv2qKgoOTs757p+AAAAAACskZycrJ49eyohIUFubm4FXQ4AwEBWzXC9OUTNS6Cak/T0dA0bNkxNmjRRzZo1JUlnz55V8eLFLcJWSfLz89PZs2fNbW5ezzVzf+a+7IwePVojRowwf05MTFS5cuXUunXrQvcX3bxL82xq/6KH9e82TZw61aa+3UaNsrptamqq1q1bp1atWsnBwcGm86DoKezjvMqlKrKX/R3bB/UMsqkW3F8Y5ygKGOcoCmwd51fjnra67YCYBTb1fS+N826/77Wp/dc16trUvjDKfNISAHD/ydVbpY4ePaqFCxfq6NGjmj17tnx9fbVmzRqVL19eNWrUyFUhQ4YM0f79+7V169ZcHW8LR0dHOTo6Ztnu4OBQ6MK/jGJ3nKBswZbrK2bjYz+5uXeF8Z7j7ivs49z+//7Lj75x/2CcoyhgnKMosHWcp9tZ/0+ywjzOU21cDqEofB8VhWsEgKLK5pdmbdq0SbVq1dKOHTu0fPlyJSUlSZJ+/fVXjR8/PldFDB06VKtWrdKGDRtUtmxZ83Z/f39du3ZNly5dsmh/7tw5+fv7m9ucO3cuy/7MfQAAAAAAAABwt9gcuI4aNUpvvfWW1q1bZ/FCqscee0w//fSTTX1lZGRo6NChWrFihX744QcFBwdb7K9fv74cHBwUGxtr3nbo0CHFxcUpJCREkhQSEqLffvtN58+fN7dZt26d3NzcVL16dVsvDwAAAAAAAAByzeYlBX777TdFRUVl2e7r66v//e9/NvU1ZMgQRUVF6b///a9cXV3Na666u7vLyclJ7u7uGjBggEaMGCEvLy+5ubnppZdeUkhIiB5++GFJUuvWrVW9enX16dNH06ZN09mzZzVmzBgNGTIk22UDAAAAAAAAACC/2By4enh4KD4+Psts1D179qhMmTI29TVv3o0F5Zs1a2axfeHCherXr58kaebMmbKzs1O3bt2UkpKiNm3aaO7cuea29vb2WrVqlV588UWFhISoZMmSCg0N1cSJE229NAAAAOCua7tvl03t19Sun0+VAAAAwAg2B67du3fXyJEj9eWXX8pkMik9PV0//vijwsLC1LdvX5v6ysi484LyJUqU0Jw5czRnzpzbtgkM
DFRMTIxN5wYAAAAAAAAAo9m8huvkyZNVtWpVlStXTklJSapevboeeeQRNW7cWGPGjMmPGgEAAAAAAACgULB5hmvx4sW1YMECjR07Vvv371dSUpLq1aunypUr50d9AAAAKKKm7rH+/QAv5mMdAAAAgC1sDlwzlS9fXuXLlzeyFgAAAAAAAAAo1GwOXJ999tkc93/66ae5LgYAAAAAAAAACjObA9eLFy9afE5NTdX+/ft16dIlPfbYY4YVBgAAAAAAAACFjc2B64oVK7JsS09P14svvqiKFSsaUhQAAAAAAAAAFEZ2hnRiZ6cRI0Zo5syZRnQHAAAAAAAAAIWSIYGrJB09elTXr183qjsAAAAAAAAAKHRsXlJgxIgRFp8zMjIUHx+v1atXKzQ01LDCAAAAAAAAAKCwsTlw3bNnj8VnOzs7+fj4aPr06Xr22WcNKwwAANze1D3/s7rti/lYBwAAAADAks2B64YNG/KjDgAAgCza7ttlU/s1tevnUyUAAAAAYJ1creF6/fp1rV+/Xh999JEuX74sSTpz5oySkpIMLQ4AAAAAAAAAChObZ7iePHlSjz/+uOLi4pSSkqJWrVrJ1dVVb7/9tlJSUvThhx/mR50AAAAAAFv8YbK+bZqTpM/zrRQAAIoSm2e4vvLKK2rQoIEuXrwoJycn8/YuXbooNjbW0OIAAAAAAAAAoDCxeYbrli1btG3bNhUvXtxie1BQkE6fPm1YYQAAAAAAAABQ2Ng8wzU9PV1paWlZtp86dUqurq6GFAUAAAAAAAAAhZHNgWvr1q01a9Ys82eTyaSkpCSNHz9e7dq1M7I2AAAAAAAAAChUbF5SYPr06WrTpo2qV6+uq1evqmfPnjp8+LBKlSqlzz9nkXUAAAAAAAAARZfNgWvZsmX166+/Kjo6Wvv27VNSUpIGDBigXr16WbxECwAAAAAAAACKGpsDV0kqVqyYevfubXQtAAAAAAAAAFCo2Ry4Ll68OMf9ffv2zXUxAAAAAAAAAFCY2Ry4vvLKKxafU1NTlZycrOLFi8vZ2ZnAFQAAAPe+P0zWt01zksS7CgAAAGAdO1sPuHjxosWvpKQkHTp0SE2bNuWlWQAAAAAAAACKtFyt4XqrypUra+rUqerdu7f++OMPI7pEEXEi6oTsZX/HdhVCK9yFagAAAAAAAIC8sXmG6+0UK1ZMZ86cMao7AAAAAAAAACh0bJ7hunLlSovPGRkZio+P1wcffKAmTZoYVhgA5NbUPf+zuu2L+VgHAAAAAAAoemwOXDt37mzx2WQyycfHR4899pimT59uVF0AAAAAAAAAUOjYHLimp6fnRx0AAAAAAAAAUOjleg3X//3vf0pMTDSyFgAAAAAAAAAo1Gya4Xrp0iW9+eab+uKLL3Tx4kVJko+Pj/r376+xY8fK2dk5X4oEAGTVdt8um9qvqV0/nyoBAAAAAACZrA5cL1y4oJCQEJ0+fVq9evVStWrVJEkHDhzQ+++/r3Xr1mnr1q3at2+ffvrpJ7388sv5VjQAAAAAAAAA3IusDlwnTpyo4sWL6+jRo/Lz88uyr3Xr1urTp4++//57vffee4YXCgAAAAAAAAD3OqsD12+++UYfffRRlrBVkvz9/TVt2jS1a9dO48ePV2hoqKFFAgCAu+gPk/Vt05wkfZ5vpQAAAABAYWP1S7Pi4+NVo0aN2+6vWbOm7OzsNH78eEMKAwAAAAAAAIDCxurAtVSpUjpx4sRt9x8/fly+vr5G1AQAAAAAAAAAhZLVgWubNm305ptv6tq1a1n2paSkaOzYsXr88ccNLQ4AAAAAAAAAChObXprVoEEDVa5cWUOGDFHVqlWVkZGhgwcPau7cuUpJSdHixYvzs1YAAAAAAAAAuKdZHbiWLVtW27dv1+DBgzV69GhlZGRIkkwmk1q1aqUPPvhA5cuXz7dCAQAAAAAAAOBeZ3XgKknBwcFas2aNLl68qMOHD0uSKlWqJC8vr3wpDgAAAAAAAAAKE5sC10yenp566KGHjK4FAAAAAAAAAAo1q1+aBQAAAAAAAADIGYErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGKVbQBQBAofKHyfq2aU6SPs+3UgAAAAAAwL2HGa4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMUaOC6efNmdejQQQEBATKZTPrmm28s9vfr108mk8ni1+OPP27R5sKFC+rVq5fc3Nzk4eGhAQMGKCkp6S5eBQAAAAAAAADcUKCB65UrV1SnTh3NmTPntm0ef/xxxcfHm399/vnnFvt79eql33//XevWrdOqVau0efNmDRw4ML9LBwAAAAAAAIAsihXkydu2bau2bdvm2MbR0VH+/v7Z7jt48KC+++477dy5Uw0aNJAkvf/++2rXrp3effddBQQEGF4zAAAAAAAAANxOgQau1ti4caN8fX3l6empxx57TG+99Za8vb0lSdu3b5eHh4c5bJWkli1bys7OTjt27FCXLl2y7TMlJUUpKSnmz4mJiZKk1NRUpaam5uPVGM903WRTe1uu77qdbROgbek7s22a0gzvG/cfW8e5Xfp1q9vaPM7TnKxvm36jbX6Nc4f0dJva8310b2OcZ49xfn9hnGePcX5/YZxnj3GeVVG4RgAoqkwZGRkZBV2EJJlMJq1YsUKdO3c2b4uOjpazs7OCg4N19OhRvfHGG3JxcdH27dtlb2+vyZMna9GiRTp06JBFX76+vpowYYJefPHFbM8VHh6uCRMmZNkeFRUlZ2dnQ68LAAAAAIBbJScnq2fPnkpISJCbm1tBlwMAMNA9PcO1e/fu5t/XqlVLtWvXVsWKFbVx40a1aNEi1/2OHj1aI0aMMH9OTExUuXLl1Lp160L3F928S/Nsan817mmr2w6IWWBT326jRlndNjU1VevWrVOVS1VkL/s7tg/qGWRTLbi/3FPjvOsUq9umpjtp3bFP822cd/t9r03tv65R16b2uLsY59ljnN9fGOfZY5zfXxjn2WOcZ5X5pCUA4P5zTweut6pQoYJKlSqlI0eOqEWLFvL399f58+ct2ly/fl0XLly47bqv0o11YR0dHbNsd3BwkIODg+F156eMYrZNUE63s/5LXszGx35yc+/s/++//Ogb9497apzb/2tTeyn/xnmqjY8V8n10b2OcZ49xfn9hnGePcX5/YZxnj3GeVVG4RgAoqmz7W6+AnTp1Sv/8849Kly4tSQoJCdGlS5e0a9cuc5sffvhB6enpatSoUUGVCQAAAAAAAKCIKtAZrklJSTpy5Ij58/Hjx7V37155eXnJy8tLEyZMULdu3eTv76+jR4/q9ddfV6VKldSmTRtJUrVq1fT444/r+eef14cffqjU1FQNHTpU3bt3V0BAQEFdFgAAAAA
AAIAiqkBnuP7yyy+qV6+e6tWrJ0kaMWKE6tWrp3Hjxsne3l779u1Tx44d9cADD2jAgAGqX7++tmzZYrEcwNKlS1W1alW1aNFC7dq1U9OmTTV//vyCuiQAAAAAAAAARViBznBt1qyZMjJuv8bR2rVr79iHl5eXoqKijCwLAAAAAAAAAHKlUL00C7BW23277tzoJmtq18+nSgAAAAAAAFCUFKqXZgEAAAAAAADAvYzAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMUqygC8B95A+T9W3TnCR9nm+lAAAAAAAAAAWBGa4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwSLGCLgAAAAAAUMgc/t7GA7zzpQwAAO5FzHAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQYoVdAEAAADAfeXw9zYe4J0vZQAAAKBgMMMVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYpFhBFwAAAIqQw9/beIB3vpQBAAAAAPmFGa4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgxQry5Js3b9Y777yjXbt2KT4+XitWrFDnzp3N+zMyMjR+/HgtWLBAly5dUpMmTTRv3jxVrlzZ3ObChQt66aWX9O2338rOzk7dunXT7Nmz5eLiUgBXBAAAAADIq9QJr1rd1mH89HysBAAA2xXoDNcrV66oTp06mjNnTrb7p02bpvfee08ffvihduzYoZIlS6pNmza6evWquU2vXr30+++/a926dVq1apU2b96sgQMH3q1LAAAAAAAAAACzAp3h2rZtW7Vt2zbbfRkZGZo1a5bGjBmjTp06SZIWL14sPz8/ffPNN+revbsOHjyo7777Tjt37lSDBg0kSe+//77atWund999VwEBAXftWgAAAAAAAACgQAPXnBw/flxnz55Vy5Ytzdvc3d3VqFEjbd++Xd27d9f27dvl4eFhDlslqWXLlrKzs9OOHTvUpUuXbPtOSUlRSkqK+XNiYqIkKTU1Vampqfl0RfnDdN1kU3u79OtWt71uZ9sE6NQ0J+vbpt9om6Y069rb+HVxSE+3qX1h+7oXNYzz7DHO7y9FZpynZdhUC+P8/sI4zx7j/P7COM+ezePczt6GxoXze4LvZQC4f5kyMjJs+5syn5hMJos1XLdt26YmTZrozJkzKl26tLnd008/LZPJpC+++EKTJ0/WokWLdOjQIYu+fH19NWHCBL344ovZnis8PFwTJkzIsj0qKkrOzs7GXRQAAAAAANlITk5Wz549lZCQIDc3t4IuBwBgoHt2hmt+Gj16tEaMGGH+nJiYqHLlyql169aF7i+6eZfm2dT+atzTVrcdELPApr7duk6xum1qupPWHftUVS5Vkb3u/NProJ5BNtXS7fe9NrX/ukZdm9rj7mKcZ49xfn8pMuO80TGbaul21cum9ozzexvjPHuM8/sL4zx7to7z6G+/tLqtw6hJNvV9r8h80hIAcP+5ZwNXf39/SdK5c+csZrieO3dOdevWNbc5f/68xXHXr1/XhQsXzMdnx9HRUY6Ojlm2Ozg4yMHBwYDq756MYrZNUE63s/5LXszGx34c7P+1qb0k2f/ff3fs28avS6qNj1sVtq97UcM4zx7j/P5SZMa5vW2P2jLO7y+M8+wxzu8vjPPs2TzO061b2kAqvN8ThbVuAMCd2fa33l0UHBwsf39/xcbGmrclJiZqx44dCgkJkSSFhITo0qVL2rVrl7nNDz/8oPT0dDVq1Oiu1wwAAAAAAACgaCvQGa5JSUk6cuSI+fPx48e1d+9eeXl5qXz58ho2bJjeeustVa5cWcHBwRo7dqwCAgLM67xWq1ZNjz/+uJ5//nl9+OGHSk1N1dChQ9W9e3cFBAQU0FUBAAAAAAAAKKoKNHD95Zdf1Lx5c/PnzHVVQ0NDFRkZqddff11XrlzRwIEDdenSJTVt2lTfffedSpQoYT5m6dKlGjp0qFq0aCE7Ozt169ZN77333l2/FgAAAAAAAAAo0MC1WbNmysi4/RpHJpNJEydO1MSJE2/bxsvLS1FRUflRHgAAAHDPSZ3wqtVtHcZPz8dKAAAAkJ17dg1XAAAAAAAAAChsCnSGKwDgJoe/t/EA73wpAwAAAAAA5B4zXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABilW0AUAAAAYJXXCq1a3dRg/PR8rAQAAAFBUMcMVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMEixgi4AAHB3pE541eq2DuOn52MlAAAAAADcv5jhCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGKVbQBQBWOfy9jQd450sZAAAAAAAAQE6Y4QoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAl
cAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxyTweu4eHhMplMFr+qVq1q3n/16lUNGTJE3t7ecnFxUbdu3XTu3LkCrBgAAAAAAABAUXZPB66SVKNGDcXHx5t/bd261bxv+PDh+vbbb/Xll19q06ZNOnPmjLp27VqA1QIAAAAAAAAoyooVdAF3UqxYMfn7+2fZnpCQoE8++URRUVF67LHHJEkLFy5UtWrV9NNPP+nhhx++26UCAAAAAAAAKOLu+cD18OHDCggIUIkSJRQSEqIpU6aofPny2rVrl1JTU9WyZUtz26pVq6p8+fLavn17joFrSkqKUlJSzJ8TExMlSampqUpNTc2/i8kHpusmm9rbpV+3uu11O9smQKemOVnfNv1G2zSlWdl3hk21OKSn29S+sH3dixrGefZsHud29jY05nvibmOcZ49xfn9hnGePcX5/YZxnj3GeFf8GAYD7lykjI8O2vynvojVr1igpKUlVqlRRfHy8JkyYoNOnT2v//v369ttv1b9/f4vgVJIeeughNW/eXG+//fZt+w0PD9eECROybI+KipKzs7Ph1wEAAAAAwM2Sk5PVs2dPJSQkyM3NraDLAQAY6J4OXG916dIlBQYGasaMGXJycsp14JrdDNdy5crpf//7X6H7i27epXk2tb8a97TVbQfELLCpb7euU6xum5rupHXHPlWVS1Vkrzv/9Dqo0TGbaul21cum9l/XqGtTe9xdjPPs2TrOo7/90uq2DqMm2dQ38o5xnj3G+f2FcZ49xvn9hXGePcZ5VomJiSpVqhSBKwDch+75JQVu5uHhoQceeEBHjhxRq1atdO3aNV26dEkeHh7mNufOnct2zdebOTo6ytHRMct2BwcHOTg4GF12vsooZltenm5n/Ze8mI2P/TjY/2tTe0my/7//7ty3bY9mpdr4uFVh+7oXNYzz7Nk8ztOte0RQ4nuiIDDOs8c4v78wzrPHOL+/MM6zxzjPqrDWDQC4M9v+1itgSUlJOnr0qEqXLq369evLwcFBsbGx5v2HDh1SXFycQkJCCrBKAAAAAAAAAEXVPT3DNSwsTB06dFBgYKDOnDmj8ePHy97eXj169JC7u7sGDBigESNGyMvLS25ubnrppZcUEhKS4wuzAAAAAAAAACC/3NOB66lTp9SjRw/9888/8vHxUdOmTfXTTz/Jx8dHkjRz5kzZ2dmpW7duSklJUZs2bTR37twCrhoAAAAAAABAUXVPB67R0dE57i9RooTmzJmjOXPm3KWKAAAAAAAAAOD2CtUargAAAAAAAABwLyNwBQAAAAAAAACD3NNLCgB3S+qEV61u6zB+ej5WAgAAAAAAgMKMGa4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYJD7JnCdM2eOgoKCVKJECTVq1Eg///xzQZcEAAAAAAAAoIi5LwLXL774QiNGjND48eO1e/du1alTR23atNH58+cLujQAAAAAAAAARUixgi7ACDNmzNDzzz+v/v37S5I+/PBDrV69Wp9++qlGjRqVpX1KSopSUlLMnxMSEiRJFy5cUGpq6t0p2iApCSl3bnSTa4kXrW578do1m/q+nlDC6rap6SWUnJyshH8TZC/7O7b/JyHJplp01cGm5v9cu251W4d//rGtFuQZ4/w2GOf3Fcb5bTDO7yuM89tgnN9XGOe3wTjP4vLly5KkjIyMAq4EAGA0U0Yh/9P92rVrcnZ21ldffaXOnTubt4eGhurSpUv673//m+WY8PBwTZgw4S5WCQAAAABAVn/99ZfKli1b0GUAAAxU6Ge4/u9//1NaWpr8/Pwstvv5+emPP/7I9pjRo0drxIgR5s/p6em6cOGCvL29ZTKZ8rVe3JCYmKhy5crpr7/+kpubW0GXA+QLxjmKAsY5igLGOYoCxvndl5GRocuXLysgIKCgSwEAGKzQB6654ejoKEdHR4ttHh4eBVNMEefm5sb/0OG+xzhHUcA4R1HAOEdRwDi/u9zd3Qu6BABAPij0L80qVaqU7O3tde7cOYvt586dk7+/fwFVBQAAAAAAAKAoKvSBa/HixVW/fn3Fxsaat6Wnpys2NlYhISEFWBkAAAAAAACAoua+WFJgxIgRCg0NVYMGDfTQQw9p1qxZunLlivr371/QpeE2HB0dNX78+CxLOwD3E8Y5igLGOYoCxjmKAsY5AADGMWVkZGQUdBFG+OCDD/TOO+/o7Nmzqlu3rt577z01atSooMsCAAAAAAAAUITcN4ErAAAAAAAAABS0Qr+GKwAAAAAAAADcKwhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAADINd69CQAAAACWCFwBAECuOTo66uDBgwVdBgAgF7Zs2aLevXsrJCREp0+fliQtWbJEW7duLeDKAAAo3IoVdAHAX3/9pfHjx+vTTz8t6FKAPPn333+1a9cueXl5qXr16hb7rl69qmXLlqlv374FVB2QNyNGjMh2e1pamqZOnSpvb29J0owZM+5mWUC++OCDD/Tzzz+rXbt26t69u5YsWaIpU6YoPT1dXbt21cSJE1WsGP8bjcLt66+/Vp8+fdSrVy/t2bNHKSkpkqSEhARNnjxZMTExBVwhAACFlymDZwFRwH799Vc9+OCDSktLK+hSgFz7888/1bp1a8XFxclkMqlp06aKjo5W6dKlJUnnzp1TQEAA4xyFlp2dnerUqSMPDw+L7Zs2bVKDBg1UsmRJmUwm/fDDDwVTIGCQt956S9OmTVPr1q314
48/atiwYXrnnXc0fPhw2dnZaebMmXrxxRc1YcKEgi4VyJN69epp+PDh6tu3r1xdXfXrr7+qQoUK2rNnj9q2bauzZ88WdIkAABRa/Gge+W7lypU57j927NhdqgTIPyNHjlTNmjX1yy+/6NKlSxo2bJiaNGmijRs3qnz58gVdHpBnkydP1vz58zV9+nQ99thj5u0ODg6KjIzMMqsbKKwiIyMVGRmprl276tdff1X9+vW1aNEi9erVS5JUtWpVvf766wSuKPQOHTqkRx75f+3dbUzV5ePH8c/3KBxvuBGGBmwiM1yEUQmHbmQITpJSCSIXMxTblE1dSitEfJCbmvM80FnTLVuhqGGiq2zZrEwtbzPBpJGmqaU2TmmIQ8CyI+f/wHX25weZ5he+cs77tZ0H5/refc7GA/hwnesa1WE8NDRUly9f7v5AAAD4EApXdLnc3FwZhnHTjVUMw+jGRID5Dhw4oC+++EIRERGKiIjQxx9/rFmzZiktLU27d+9W//79rY4I3JGysjKNGTNGkydPVnZ2tpYuXaqAgACrYwGmq6+vl8PhkCQ99NBDstlsevjhh73Hk5KSVF9fb1E6wDyRkZE6deqUYmNj243v27dPQ4cOtSYUAAA+gk2z0OWioqL0wQcfqK2trdPXkSNHrI4I3LGrV6+2W8/PMAy9+eabys7OVnp6uk6ePGlhOsAcKSkpqqmp0cWLF+VwOFRXV8c/zOBzIiMjdezYMUnSjz/+qOvXr3vfS9L333+vQYMGWRUPME1RUZGKi4t16NAhGYah+vp6VVZWqqSkRDNnzrQ6HgAAPRozXNHlkpOTVVNTo5ycnE6P/9vsV6AniI+PV3V1te6///5246tWrZIkPf3001bEAkwXFBSkdevWadOmTcrMzGRdYvicgoICFRYWKicnRzt37lRpaalKSkrU0NAgwzC0ZMkSTZw40eqYwB0rKytTW1ubxowZo9bWVo0aNUp2u10lJSWaPXu21fEAAOjR2DQLXW7v3r1qaWnRk08+2enxlpYWVVdXKz09vZuTAeZZunSp9u7d+487+s6aNUurV69WW1tbNycDus4vv/yimpoaZWZmsmwGfEZbW5ucTqcOHjyokSNHqqysTFVVVSotLVVra6uys7O1atUqfubhM65du6ZTp06publZCQkJCgoKsjoSAAA9HoUrAAAAAAAAAJiEJQUAAAAAwM+0tLTI6XRq586dunDhQodv4Zw5c8aiZAAA9HwUrgAAAADgZ6ZPn66vvvpKU6ZMUVRUFJsgAgBgIpYUAAAAAAA/M2DAAH3yySdKTU21OgoAAD7HZnUAAAAAAED3CgsLU3h4uNUxAADwSRSuAAAAAOBnFi9erAULFqi1tdXqKAAA+ByWFAAAAAAAPzNixAidPn1aHo9HsbGxCggIaHf8yJEjFiUDAKDnY9MsAAAAAPAzubm5VkcAAMBnMcMVAAAAAAAAAEzCGq4AAAAAAAAAYBKWFAAAAAAAPxAeHq6TJ08qIiJCYWFhMgzjH8+9dOlSNyYDAMC3ULgCAAAAgB9YsWKFgoODJUmvv/66tWEAAPBhrOEKAAAAAAAAACZhhisAAAAA+IGmpqZbPjckJKQLkwAA4NuY4QoAAAAAfsBms9103VZJ8ng8MgxD169f76ZUAAD4Hma4AgAAAIAf2L17t9URAADwC8xwBQAAAAA/kJeXp4qKCoWEhGj9+vXKz8+X3W63OhYAAD6HwhUAAAAA/EBgYKDOnj2rqKgo9erVSy6XS4MGDbI6FgAAPoclBQAAAADAD8THx2v+/PkaPXq0PB6PNm/e/I+bYxUWFnZzOgAAfAczXAEAAADAD+zfv1+vvPKKTp8+rUuXLik4OLjTTbQMw9ClS5csSAgAgG+gcAUAAAAAP2Oz2fTrr7+ypAAAAF3AZnUAAAAAAEDXy8vLU1NTkyRp7dq1Cg4OtjgRAAC+iRmuAAAAAOAH2DQLAIDuwaZZAAAAAOAH2DQLAIDuwQxXAAAAAPADBw4c0Msvv8ymWQAAdDEKVwAAAADwMzabTS6XS/fcc4/VUQAA8DkUrgAAAADgZ86ePauQkBCtWbNGx48flyQNHz5c06ZN+8dlBgAAwK2hcAUAAAAAP1NdXa2srCz17dtXjzzyiCTp8OHDunr1qj777DMlJydbnBAAgJ6LwhUAAAAA/ExaWpri4uL09ttvq3fvG3spu91uTZ8+XWfOnNGePXssTggAQM9F4QoAAAAAfqZv37769ttvFR8f32782LFjcjgcam1ttSgZAAA9n83qAAAAAACA7hUSEqJz5851GD9//ryCg4MtSAQAgO+gcAUAAAAAP5Ofn69p06apqqpK58+f1/nz57Vp0yZNnz5dkyZNsjoeAAA9Wm+rAwAAAAAAuteyZctkGIYKCwvldrslSQEBAZo5c6acTqfF6QAA6NlYwxUAAAAA/FRra6tOnz4tSbr33nvVr18/ixMBANDzUbgCAAAAAAAAgElYwxUAAAAAAAAATELhCgAAAAAAAAAmoXAFAAAAAAAAAJNQuAIAAAAAAACASShcAQDwQ4ZhaOvWrabd78svv5RhGLp8+bJp9wQAAACAnojCFQCAHuLgwYPq1auXxo8ff8f3crlceuqpp0xIdetiY2NlGIYMw1C/fv2UmJiod95557bvY3ZZDAAAAABmonAFAKCHKC8v1+zZs7Vnzx7V19ff9FyPxyO3291h/Nq1a5KkyMhI2e32Lsl5M4sWLZLLQDxxLgAABhpJREFU5VJdXZ0mT56soqIibd++vdtzAAAAAEBXoXAFAKAHaG5uVlVVlWbOnKnx48eroqKi3fG/v9K/fft2JScny263a9++fcrIyNCLL76ol156SREREcrKypLUfpboyJEjNW/evHb3u3jxogICArRnzx5J0oYNG+RwOBQcHKzIyEg9//zzunDhwm1/jr+vHzp0qObNm6fw8HDt2LHDe/zw4cN64oknFBERodDQUKWnp+vIkSPe47GxsZKkZ555RoZheN9L0kcffaSkpCT16dNHQ4cO1cKFCzstnQEAAACgK1G4AgDQA2zevFnx8fG67777NHnyZK1Zs0Yej6fDeWVlZXI6nTp+/LgefPBBSdK6desUGBio/fv3a/Xq1R2uKSgo0KZNm9rdr6qqStHR0UpLS5Mk/fXXX1q8eLFqa2u1detW/fzzz3rhhRf+8+dpa2vT+++/r8bGRgUGBnrHr1y5oqlTp2rfvn36+uuvNWzYMI0bN05XrlyRdKOQlaS1a9fK5XJ53+/du1eFhYUqLi7WsWPH9NZbb6miokJLliz5zxkBAAAA4L8wPJ39tQYAAO4qqampeu6551RcXCy3262oqCht2bJFGRkZkm7McB09erS2bt2qnJwc73UZGRlqampqN0tUujHD9cMPP1Rubq4uXryo6Oho7dq1y1uwjhw5UqNGjZLT6ew0T3V1tVJSUnTlyhUFBQV5n9/Y2KgBAwZ0ek1sbKxcLpcCAgL0559/yu12Kzw8XIcOHVJcXFyn17S1tWnAgAHauHGjJkyY0CH73zIzMzVmzBjNnz/f
O/buu++qtLT0X5dfAAAAAAAzMcMVAIC73IkTJ/TNN99o0qRJkqTevXsrPz9f5eXlHc51OBwdxpKTk296/4EDB2rs2LGqrKyUJP300086ePCgCgoKvOfU1NQoOztbMTExCg4OVnp6uiTp3Llzt/VZ5s6dq6NHj2rXrl169NFHtWLFinZl62+//aaioiINGzZMoaGhCgkJUXNz878+p7a2VosWLVJQUJD3VVRUJJfLpdbW1tvKCAAAAAB3orfVAQAAwM2Vl5fL7XYrOjraO+bxeGS327Vq1SqFhoZ6x/v379/h+s7G/ldBQYHmzJmjlStXauPGjUpMTFRiYqIkqaWlRVlZWcrKylJlZaUGDhyoc+fOKSsry7sJ162KiIhQXFyc4uLitGXLFiUmJsrhcCghIUGSNHXqVDU0NOiNN97QkCFDZLfb9fjjj//rc5qbm7Vw4ULl5eV1ONanT5/byggAAAAAd4IZrgAA3MXcbrfWr1+v5cuX6+jRo95XbW2toqOj9d5775nynJycHP3xxx/69NNPtXHjxnazW3/44Qc1NDTI6XQqLS1N8fHx/2nDrP81ePBg5efnt1sGYP/+/ZozZ47GjRun4cOHy2636/fff293XUBAgK5fv95uLCkpSSdOnPCWuf//ZbPx6w4AAACA7sMMVwAA7mLbtm1TY2Ojpk2b1m4mqyQ9++yzKi8v14wZM+74Of3791dubq5effVVHT9+3Lt8gSTFxMQoMDBQK1eu1IwZM1RXV6fFixff8TMlqbi4WA888ICqq6vlcDg0bNgwbdiwQQ6HQ01NTZo7d6769u3b7prY2Fjt3LlTqampstvtCgsL04IFCzRhwgTFxMRo4sSJstlsqq2tVV1dnV577TVTsgIAAADArWDKBwAAd7Hy8nJlZmZ2KFulG4VrdXW1vvvuO1OeVVBQoNraWqWlpSkmJsY7PnDgQFVUVGjLli1KSEiQ0+nUsmXLTHlmQkKCxo4dqwULFki68XkbGxuVlJSkKVOmaM6cORo0aFC7a5YvX64dO3Zo8ODBGjFihCQpKytL27Zt0+eff66UlBQ99thjWrFihYYMGWJKTgAAAAC4VYbH4/FYHQIAAAAAAAAAfAEzXAEAAAAAAADAJBSuAAAAAAAAAGASClcAAAAAAAAAMAmFKwAAAAAAAACYhMIVAAAAAAAAAExC4QoAAAAAAAAAJqFwBQAAAAAAAACTULgCAAAAAAAAgEkoXAEAAAAAAADAJBSuAAAAAAAAAGASClcAAAAAAAAAMMn/AST4flJEnzHJAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create a list to store the queueing time data\n", + "qt_data = []\n", + "\n", + "# Iterate over the models, batch sizes, and arrival rates to calculate queueing time\n", + "for ssm in small_model_names:\n", + " for batch_size in batch_sizes:\n", + " for arrival_rate in arrival_rates:\n", + " model_name = ssm.replace(\"/\", \"-\")\n", + " filepath = f\"/usr/FlexFlow/inference/output/specinfer_llm_meta-llama-Llama-3.1-70B-Instruct_ssm_{model_name}_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n", + " if os.path.exists(filepath):\n", + " qt = get_queueing_time(filepath)\n", + " qt_data.append({\n", + " 'Model': model_name,\n", + " 'Batch Size': batch_size,\n", + " 'Arrival Rate': arrival_rate,\n", + " 'Queueing Time': qt\n", + " })\n", + "# add incremental decoding entry\n", + "for batch_size in batch_sizes:\n", + " for arrival_rate in arrival_rates:\n", + " model_name = ssm.replace(\"/\", \"-\")\n", + " filepath = f\"/usr/FlexFlow/inference/output/incr_dec_llm_meta-llama-Llama-3.1-70B-Instruct_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n", + " if os.path.exists(filepath):\n", + " qt = get_queueing_time(filepath)\n", + " qt_data.append({\n", + " 'Model': \"Incr Dec (baseline)\",\n", + " 'Batch Size': batch_size,\n", + " 'Arrival Rate': arrival_rate,\n", + " 'Queueing Time': qt\n", + " })\n", + "\n", + "# Convert the list to a DataFrame\n", + "qt_df = pd.DataFrame(qt_data)\n", + "print(qt_df.head())\n", + "\n", + "# Pivot the dataframe to have models and batch sizes as columns\n", + "pivot_df = qt_df.pivot_table(index='Arrival Rate', columns=['Model', 'Batch Size'], values='Queueing Time')\n", + "\n", + "# Plot the data\n", + "fig, ax = plt.subplots(figsize=(12, 8))\n", + "\n", + "colors = ['lightgreen', 'skyblue', 'lightcoral', 'gold', 'plum', 'peachpuff', 'mediumturquoise', 'salmon']\n", + "pivot_df.plot(kind='bar', ax=ax, color=colors)\n", + "\n", + "ax.set_title('Queueing Time vs Arrival Rate for Different Models and Batch Sizes\\nLLM: LLAMA-3.1-70B-Instruct')\n", + "ax.set_xlabel('Arrival Rate (requests/sec)')\n", + "ax.set_ylabel('Queueing Time (sec)')\n", + "ax.grid(True)\n", + "ax.legend(title='Model and Batch Size', bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "\n", + "# Save the plot as a PDF\n", + "plt.savefig('/usr/FlexFlow/benchmarking/queueing_time_vs_arrival_rate.pdf', bbox_inches='tight')\n", + "\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/benchmarking/queueing_time_vs_arrival_rate.pdf b/benchmarking/queueing_time_vs_arrival_rate.pdf new file mode 100644 index 000000000..a552ebcea Binary files /dev/null and b/benchmarking/queueing_time_vs_arrival_rate.pdf differ diff --git a/benchmarking/throughput_vs_tpot.pdf b/benchmarking/throughput_vs_tpot.pdf new file mode 100644 index 000000000..064bfb661 Binary files /dev/null and b/benchmarking/throughput_vs_tpot.pdf differ diff --git a/benchmarking/ttft_vs_arrival_rate.pdf b/benchmarking/ttft_vs_arrival_rate.pdf new file mode 100644 index 000000000..041d5e501 Binary files /dev/null and 
b/benchmarking/ttft_vs_arrival_rate.pdf differ diff --git a/cmake/nccl.cmake b/cmake/nccl.cmake index c140a44ec..82cf3b412 100644 --- a/cmake/nccl.cmake +++ b/cmake/nccl.cmake @@ -2,140 +2,88 @@ set(NCCL_NAME nccl) # set(NCCL_CUDA_ARCH "-gencode=arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}") # message("NCCL_CUDA_ARCH: ${NCCL_CUDA_ARCH}") -set(NCCL_URL "") -if((FF_USE_PREBUILT_NCCL OR FF_USE_ALL_PREBUILT_LIBRARIES) AND CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") - if(LINUX_VERSION MATCHES "20.04") - if (CUDA_VERSION VERSION_EQUAL "11.0") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.0.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.1") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.1.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.2") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.2.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.3") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.3.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.4") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.4.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.5") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.5.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.6") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.6.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.7") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.7.0.tar.gz") - endif() - elseif(LINUX_VERSION MATCHES "18.04") - if (CUDA_VERSION VERSION_EQUAL "10.1") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_10.1.243.tar.gz") - elseif (CUDA_VERSION VERSION_EQUAL "10.2") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_10.2.89.tar.gz") - elseif (CUDA_VERSION VERSION_EQUAL "11.0") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.0.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.1") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.1.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.2") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.2.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.3") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.3.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.4") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.4.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.5") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.5.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.6") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.6.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.7") - set(NCCL_URL 
"https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.7.0.tar.gz") - endif() - endif() +if(NCCL_PATH) + set(NCCL_ROOT ${NCCL_PATH}) +else() + # if NCCL_PATH is not set, let's try to find it in the CUDA root + set(NCCL_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) endif() -if(NCCL_URL) - # Download and import pre-compiled NCCL library - message(STATUS "Using pre-compiled NCCL library") - message(STATUS "NCCL_URL: ${NCCL_URL}") +find_library(NCCL_LIBRARY + NAMES libnccl${LIBEXT} + PATHS ${NCCL_ROOT} ${CUDA_ROOT} + PATH_SUFFIXES lib lib64 + DOC "NCCL library." ) - include(FetchContent) - FetchContent_Declare(${NCCL_NAME} - URL ${NCCL_URL} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - ) - FetchContent_GetProperties(${NCCL_NAME}) - if(NOT ${NCCL_NAME}_POPULATED) - FetchContent_Populate(${NCCL_NAME}) - endif() - - set(NCCL_FOLDER_PATH ${${NCCL_NAME}_SOURCE_DIR}/deps/${NCCL_NAME}) - set(NCCL_INCLUDE_DIR ${NCCL_FOLDER_PATH}/include) - set(NCCL_LIB_DIR ${NCCL_FOLDER_PATH}/lib) - message(STATUS "NCCL library path: ${NCCL_FOLDER_PATH}") - add_library(nccl SHARED IMPORTED) - set_target_properties(nccl PROPERTIES IMPORTED_LOCATION ${NCCL_FOLDER_PATH}) +find_path(NCCL_INCLUDE_DIR + NAMES nccl.h + HINTS ${NCCL_ROOT} + PATH_SUFFIXES include + DOC "NCCL include directory.") - list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIR}) - list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIB_DIR}/libnccl${LIBEXT}) - install(DIRECTORY ${NCCL_INCLUDE_DIR}/ DESTINATION include) - install(DIRECTORY ${NCCL_LIB_DIR}/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE) - -else() - if(NCCL_PATH) - set(NCCL_ROOT ${NCCL_PATH}) +# find NCCL, set NCCL lib and include +if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR) + set(NCCL_FOUND ON) + set(NCCL_LIBRARIES ${NCCL_LIBRARY}) + set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR}) + + # Check NCCL version + if(EXISTS "${NCCL_INCLUDE_DIR}/nccl.h") + file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_DEFINES + REGEX "#define NCCL_MAJOR [0-9]+" ) + file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_DEFINES2 + REGEX "#define NCCL_MINOR [0-9]+" ) + string(REGEX MATCH "([0-9]+)" NCCL_MAJOR ${NCCL_VERSION_DEFINES}) + string(REGEX MATCH "([0-9]+)" NCCL_MINOR ${NCCL_VERSION_DEFINES2}) + set(NCCL_VERSION "${NCCL_MAJOR}.${NCCL_MINOR}") + if(NCCL_VERSION VERSION_LESS 2.23) + set(NCCL_OLD TRUE) + else() + set(NCCL_OLD FALSE) + endif() + message(STATUS "Found NCCL version: ${NCCL_VERSION}") else() - # if NCCL_PATH is not set, let's try to find it in the CUDA root - set(NCCL_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) + message(WARNING "NCCL header not found, unable to determine version") + set(NCCL_OLD TRUE) # Assume old version if we can't determine endif() - - find_library(NCCL_LIBRARY - NAMES libnccl${LIBEXT} - PATHS ${NCCL_ROOT} ${CUDA_ROOT} - PATH_SUFFIXES lib lib64 - DOC "NCCL library." 
) +endif() - find_path(NCCL_INCLUDE_DIR - NAMES nccl.h - HINTS ${NCCL_ROOT} - PATH_SUFFIXES include - DOC "NCCL include directory.") - - # find NCCL, set NCCL lib and include - if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR) - set(NCCL_FOUND ON) - set(NCCL_LIBRARIES ${NCCL_LIBRARY}) - set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR}) - endif() - - # find NCCL - if(NCCL_FOUND) - list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIBRARIES}) - list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIRS}) - message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" ) - message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" ) - add_library(nccl SHARED IMPORTED) - - # Build NCCL from source - else() - message(STATUS "Building NCCL from source") - list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE) - - ExternalProject_Add(${NCCL_NAME} - SOURCE_DIR ${PROJECT_SOURCE_DIR}/deps/${NCCL_NAME} - PREFIX ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} - INSTALL_DIR ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} - BUILD_BYPRODUCTS ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/libnccl${LIBEXT} - INSTALL_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND make src.build "${NCCL_BUILD_NVCC_GENCODE}" "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" "BUILDDIR=${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}" - BUILD_IN_SOURCE 1 - ) +# find NCCL +if(NCCL_FOUND AND (NOT NCCL_OLD OR CUDA_VERSION VERSION_LESS 12.0)) + list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIBRARIES}) + list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIRS}) + message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" ) + message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" ) + add_library(nccl SHARED IMPORTED) + +# Build NCCL from source +else() + message(STATUS "Building NCCL from source") + list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE) - ExternalProject_Get_Property(${NCCL_NAME} INSTALL_DIR) - message(STATUS "NCCL install dir: ${INSTALL_DIR}") - list(APPEND FLEXFLOW_INCLUDE_DIRS - ${INSTALL_DIR}/include) - list(APPEND FLEXFLOW_EXT_LIBRARIES - ${INSTALL_DIR}/lib/libnccl${LIBEXT}) - set_directory_properties(PROPERTIES ADDITIONAL_CLEAN_FILES "${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/") - - install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/include/ DESTINATION include) - install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE) + set(NCCL_BUILD_CMD make src.build "${NCCL_BUILD_NVCC_GENCODE}" "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" "BUILDDIR=${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}") + if(DEFINED ENV{MAKEFLAGS}) + set(NCCL_BUILD_CMD ${CMAKE_COMMAND} -E env MAKEFLAGS=$ENV{MAKEFLAGS} ${NCCL_BUILD_CMD}) endif() + ExternalProject_Add(${NCCL_NAME} + SOURCE_DIR ${PROJECT_SOURCE_DIR}/deps/${NCCL_NAME} + PREFIX ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} + INSTALL_DIR ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} + BUILD_BYPRODUCTS ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/libnccl${LIBEXT} + INSTALL_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND ${NCCL_BUILD_CMD} + BUILD_IN_SOURCE 1 + ) + ExternalProject_Get_Property(${NCCL_NAME} INSTALL_DIR) + message(STATUS "NCCL install dir: ${INSTALL_DIR}") + list(APPEND FLEXFLOW_INCLUDE_DIRS + ${INSTALL_DIR}/include) + list(APPEND FLEXFLOW_EXT_LIBRARIES + ${INSTALL_DIR}/lib/libnccl${LIBEXT}) + set_directory_properties(PROPERTIES ADDITIONAL_CLEAN_FILES "${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/") + + install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/include/ DESTINATION include) + install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/ DESTINATION lib PATTERN "pkgconfig" 
EXCLUDE) endif() diff --git a/config/config.linux b/config/config.linux index acffc210f..8eb4f3087 100755 --- a/config/config.linux +++ b/config/config.linux @@ -111,6 +111,11 @@ function get_build_configs() { BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" } +#install raft +echo "Building raft dependency ..." +INSTALL_PREFIX=./install $(dirname $0)/../deps/raft/build.sh libraft > /dev/null +echo "Building raft dependency ... Done" + if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then . $(dirname $0)/config.inc # Passing CMAKE_FLAGS or CUDA_PATH as $1 will print the value of the CMAKE_FLAGS/CUDA_PATH variable, diff --git a/deps/flashinfer b/deps/flashinfer new file mode 160000 index 000000000..be6bf5bb2 --- /dev/null +++ b/deps/flashinfer @@ -0,0 +1 @@ +Subproject commit be6bf5bb26f1f1b3edf094d903544600c574ee09 diff --git a/deps/legion b/deps/legion index 24e8c4523..0d32b3554 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 24e8c452341dea41427e0ce61e154d61715e6835 +Subproject commit 0d32b35542bc0e9aba5950e485b8fc3413ae664b diff --git a/deps/nccl b/deps/nccl index 6e24ef4e1..2ea4ee94b 160000 --- a/deps/nccl +++ b/deps/nccl @@ -1 +1 @@ -Subproject commit 6e24ef4e1f1eac9f104d115ef65429f179924ee7 +Subproject commit 2ea4ee94bfb04c886c79ccae60ac9961000fdee2 diff --git a/deps/raft b/deps/raft new file mode 160000 index 000000000..b79f15d2f --- /dev/null +++ b/deps/raft @@ -0,0 +1 @@ +Subproject commit b79f15d2f229849bc02425b2e4ffd7bd3db89d4c diff --git a/deps/tensorrt_llm/README.md b/deps/tensorrt_llm/README.md new file mode 100644 index 000000000..39fcecdd7 --- /dev/null +++ b/deps/tensorrt_llm/README.md @@ -0,0 +1,5 @@ +## Custom AllReduce Implementation + +This is an adapted version of the custom AllReduce plugin from NVIDIA's [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) repository. + +To replace the NCCL AllReduce call, we should also add a CUDA IPC support to the custom AllReduce usage. Our IPC&AllReduce implementation is referenced from [mlc-ai/relax](https://github.com/mlc-ai/relax). diff --git a/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu b/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu new file mode 100644 index 000000000..619eb8987 --- /dev/null +++ b/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include "custom_allreduce_kernels.h" + +namespace tensorrt_llm { + +static inline __device__ void st_flag_release(uint32_t &flag, + uint32_t *flag_addr) { +#if __CUDA_ARCH__ >= 700 + asm volatile("st.global.release.sys.b32 [%1], %0;" ::"r"(flag), + "l"(flag_addr)); +#else + __threadfence_system(); + asm volatile("st.global.volatile.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ void ld_flag_acquire(uint32_t &flag, + uint32_t *flag_addr) { +#if __CUDA_ARCH__ >= 700 + asm volatile("ld.global.acquire.sys.b32 %0, [%1];" + : "=r"(flag) + : "l"(flag_addr)); +#else + asm volatile("ld.global.volatile.b32 %0, [%1];" + : "=r"(flag) + : "l"(flag_addr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Type converters that pack the data format into a 128-bit data type +// +using PackedFloat = union { + int4 packed; + float unpacked[4]; +}; + +using PackedHalf = union { + int4 packed; + half2 unpacked[4]; +}; + +template <typename T> +struct PackedOn16Bytes {}; + +template <> +struct PackedOn16Bytes<float> { + using Type = PackedFloat; +}; + +template <> +struct PackedOn16Bytes<half> { + using Type = PackedHalf; +}; + +#ifdef ENABLE_BF16 +using PackedBFloat16 = union { + int4 packed; + __nv_bfloat162 unpacked[4]; +}; + +template <> +struct PackedOn16Bytes<__nv_bfloat16> { + using Type = PackedBFloat16; +}; +#endif + +// add two 128b data +template <typename T> +inline __device__ int4 add128b(T &a, T &b) { + T c; + c.unpacked[0] = a.unpacked[0] + b.unpacked[0]; + c.unpacked[1] = a.unpacked[1] + b.unpacked[1]; + c.unpacked[2] = a.unpacked[2] + b.unpacked[2]; + c.unpacked[3] = a.unpacked[3] + b.unpacked[3]; + return c.packed; +} + +__inline__ __device__ void multi_gpu_barrier(uint32_t **signals, + const uint32_t flag, + const size_t rank, + const size_t world_size, + int const tidx, + int const bidx) { + // At the end of the function, we know that at least block 0 from all other + // GPUs has reached that point. + uint32_t volatile *my_signals = signals[rank]; + if (tidx < world_size) { + // The 1st block notifies the other ranks. + if (bidx == 0) { + signals[tidx][rank] = flag; + } + + // Busy-wait until all ranks are ready. + while (my_signals[tidx] != flag) { + } + } + + // Make sure we can move on...
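+ // The __syncthreads() below also holds back the threads that skipped the busy-wait + // (tidx >= world_size), so the whole block observes the peer flags before it continues.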
+ __syncthreads(); +} + +__global__ void multiGpuBarrierKernel(AllReduceParams params) { + multi_gpu_barrier(params.peer_barrier_ptrs_out, + params.barrier_flag, + params.local_rank, + params.ranks_per_node, + threadIdx.x, + blockIdx.x); +} + +template +static __global__ void oneShotAllReduceKernel(AllReduceParams params) { + int const bidx = blockIdx.x; + int const tidx = threadIdx.x; + + // The number of elements packed into one for comms + static constexpr int NUM_ELTS = 16 / sizeof(T); + + // Packed data type for comms + using PackedStruct = typename PackedOn16Bytes::Type; + + multi_gpu_barrier(params.peer_barrier_ptrs_in, + params.barrier_flag, + params.local_rank, + RANKS_PER_NODE, + tidx, + bidx); + + // The source pointers. Distributed round-robin for the different warps. + T const *src_d[RANKS_PER_NODE]; +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + int rank = (params.local_rank + ii) % RANKS_PER_NODE; + src_d[ii] = reinterpret_cast(params.peer_comm_buffer_ptrs[rank]); + } + + // The location in the destination array (load 8 fp16 or load 4 fp32 using + // LDG.128). + size_t offset = bidx * params.elts_per_block + tidx * NUM_ELTS; + // The end of the segment computed by that block. + size_t max_offset = + min((bidx + 1) * params.elts_per_block, params.elts_per_rank); + + // Each block accumulates the values from the different GPUs on the same node. + for (size_t iter_offset = offset; iter_offset < max_offset; + iter_offset += blockDim.x * NUM_ELTS) { + // Iterate over the different ranks/devices on the node to load the values. + PackedStruct vals[RANKS_PER_NODE]; +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + vals[ii].packed = + *reinterpret_cast(&src_d[ii][iter_offset]); + } + + // Sum the values from the different ranks. + PackedStruct sums; + sums.packed = {0, 0, 0, 0}; +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + sums.packed = add128b(sums, vals[ii]); + } + + // Store to the destination buffer. + *reinterpret_cast(&reinterpret_cast( + params.local_output_buffer_ptr)[iter_offset]) = sums.packed; + } +} + +template +static __global__ void twoShotAllReduceKernel(AllReduceParams params) { + // The block index. + int const bidx = blockIdx.x; + // The thread index with the block. + int const tidx = threadIdx.x; + + // The number of elements packed into one for comms + static constexpr int NUM_ELTS = 16 / sizeof(T); + + // Packed data type for comms + using PackedType = typename PackedOn16Bytes::Type; + + // The location in the destination array (load 8 fp16 or load 4 fp32 using + // LDG.128). + const size_t block_offset = bidx * params.elts_per_block + tidx * NUM_ELTS; + const size_t block_start = params.rank_offset + block_offset; + // The end of the segment computed by that block. + size_t max_offset = min(block_start + params.elts_per_block, + params.rank_offset + params.elts_per_rank); + + multi_gpu_barrier(params.peer_barrier_ptrs_in, + params.barrier_flag, + params.local_rank, + RANKS_PER_NODE, + tidx, + bidx); + + // The source pointers. Distributed round-robin for the different warps. + T *src_d[RANKS_PER_NODE]; + // The destination ranks for round-robin gathering + size_t dst_rank[RANKS_PER_NODE]; +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + int rank = (params.local_rank + ii) % RANKS_PER_NODE; + src_d[ii] = reinterpret_cast(params.peer_comm_buffer_ptrs[rank]); + dst_rank[ii] = rank; + } + + // Each block accumulates the values from the different GPUs on the same node. 
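+ // Two-shot step 1 (reduce-scatter): this rank reduces only its own elts_per_rank slice and + // stores the partial sums back into its local comm buffer (src_d[0]); the gather of the other + // ranks' partial results happens after the flag barrier below.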
+ for (size_t local_offset = block_start; local_offset < max_offset; + local_offset += blockDim.x * NUM_ELTS) { + // Iterate over the different ranks/devices on the node to load the values. + PackedType vals[RANKS_PER_NODE]; +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + vals[ii].packed = + *reinterpret_cast(&src_d[ii][local_offset]); + } + + // Sum the values from the different ranks. + PackedType sums; + sums.packed = {0, 0, 0, 0}; +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + sums.packed = add128b(sums, vals[ii]); + } + + // Store to the local buffer. + *reinterpret_cast(&src_d[0][local_offset]) = sums.packed; + } + + // sync threads to make sure all block threads have the sums + __syncthreads(); + + // barriers among the blocks with the same idx (release-acquire semantics) + if (tidx < RANKS_PER_NODE) { + // The all blocks notifies the other ranks. + uint32_t flag_block_offset = RANKS_PER_NODE + bidx * RANKS_PER_NODE; + st_flag_release(params.barrier_flag, + params.peer_barrier_ptrs_in[tidx] + flag_block_offset + + params.local_rank); + + // Busy-wait until all ranks are ready. + uint32_t rank_barrier = 0; + uint32_t *peer_barrier_d = params.peer_barrier_ptrs_in[params.local_rank] + + flag_block_offset + tidx; + do { + ld_flag_acquire(rank_barrier, peer_barrier_d); + } while (rank_barrier != params.barrier_flag); + } + + // sync threads to make sure all other ranks has the final partial results + __syncthreads(); + + size_t max_block_offset = + min(block_offset + params.elts_per_block, params.elts_per_rank); + // Gather all needed elts from other intra-node ranks + for (size_t local_offset = block_offset; local_offset < max_block_offset; + local_offset += blockDim.x * NUM_ELTS) { +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + // use round-robin gathering from other ranks + size_t offset_rank = dst_rank[ii] * params.elts_per_rank + local_offset; + if (offset_rank >= params.elts_total) { + continue; + } + *reinterpret_cast( + &reinterpret_cast(params.local_output_buffer_ptr)[offset_rank]) = + *reinterpret_cast(&src_d[ii][offset_rank]); + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline int divUp(int a, int b) { + return (a + b - 1) / b; +} + +std::tuple kernelLaunchConfig(AllReduceStrategyType algo, + AllReduceParams ¶m, + size_t elts_per_thread) { + assert(param.elts_total % elts_per_thread == 0); + + int blocks_per_grid = 1, threads_per_block = DEFAULT_BLOCK_SIZE; + + const size_t total_threads = param.elts_total / elts_per_thread; + switch (algo) { + case AllReduceStrategyType::ONESHOT: { // one stage all reduce algo + if (total_threads <= DEFAULT_BLOCK_SIZE) { // local reduce + threads_per_block = WARP_SIZE * divUp(total_threads, WARP_SIZE); + blocks_per_grid = 1; + } else { // local reduce + threads_per_block = DEFAULT_BLOCK_SIZE; + blocks_per_grid = divUp(total_threads, DEFAULT_BLOCK_SIZE); + blocks_per_grid = + std::min(static_cast(MAX_ALL_REDUCE_BLOCKS), blocks_per_grid); + } + param.elts_per_rank = param.elts_total; + param.elts_per_block = + elts_per_thread * + divUp(param.elts_per_rank, elts_per_thread * blocks_per_grid); + break; + } + case AllReduceStrategyType::TWOSHOT: { // two stage all reduce algo + const size_t elts_per_rank = param.elts_total / param.ranks_per_node; + assert(elts_per_rank % elts_per_thread == 0); + + size_t total_threads = elts_per_rank / elts_per_thread; + total_threads = WARP_SIZE * ((total_threads + WARP_SIZE - 1) / 
WARP_SIZE); + assert(total_threads % WARP_SIZE == 0); + + while (total_threads % blocks_per_grid != 0 || + total_threads / blocks_per_grid > DEFAULT_BLOCK_SIZE) { + blocks_per_grid += 1; + } + + threads_per_block = total_threads / blocks_per_grid; + + // NOTE: cap the grid size at MAX_ALL_REDUCE_BLOCKS (each block then covers a larger slice) + if (static_cast<size_t>(blocks_per_grid) > MAX_ALL_REDUCE_BLOCKS) { + size_t iter_factor = 1; + while (blocks_per_grid / iter_factor > MAX_ALL_REDUCE_BLOCKS || + blocks_per_grid % iter_factor) { + iter_factor += 1; + } + blocks_per_grid /= iter_factor; + } + param.elts_per_rank = param.elts_total / param.ranks_per_node; + param.elts_per_block = param.elts_per_rank / blocks_per_grid; + param.elts_per_block = + elts_per_thread * divUp(param.elts_per_block, elts_per_thread); + param.rank_offset = param.rank * param.elts_per_rank; + break; + } + default: + assert(false && "Algorithm not supported here."); + } + + return std::make_tuple(blocks_per_grid, threads_per_block); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <typename T, int RANKS_PER_NODE> +void dispatchARKernels(AllReduceStrategyType algo, + AllReduceParams &param, + int blocks_per_grid, + int threads_per_block, + cudaStream_t stream) { + if (algo == AllReduceStrategyType::ONESHOT) { + oneShotAllReduceKernel<T, RANKS_PER_NODE> + <<<blocks_per_grid, threads_per_block, 0, stream>>>(param); + } else { + twoShotAllReduceKernel<T, RANKS_PER_NODE> + <<<blocks_per_grid, threads_per_block, 0, stream>>>(param); + } + multiGpuBarrierKernel<<<1, param.ranks_per_node, 0, stream>>>(param); +} + +template <typename T> +void invokeOneOrTwoShotAllReduceKernel(AllReduceParams &param, + AllReduceStrategyType strat, + cudaStream_t stream) { + assert(strat == AllReduceStrategyType::ONESHOT || + strat == AllReduceStrategyType::TWOSHOT); + auto last_error = cudaGetLastError(); + if (last_error != cudaSuccess) { + printf("cuda error: %s\n", cudaGetErrorString(last_error)); + assert(false && "Error before launching the kernel"); + } + + size_t elts_per_thread = 16 / sizeof(T); + auto [blocks_per_grid, threads_per_block] = + kernelLaunchConfig(strat, param, elts_per_thread); + switch (param.ranks_per_node) { + case 2: + dispatchARKernels<T, 2>( + strat, param, blocks_per_grid, threads_per_block, stream); + break; + case 4: + dispatchARKernels<T, 4>( + strat, param, blocks_per_grid, threads_per_block, stream); + break; + case 6: + dispatchARKernels<T, 6>( + strat, param, blocks_per_grid, threads_per_block, stream); + break; + case 8: + dispatchARKernels<T, 8>( + strat, param, blocks_per_grid, threads_per_block, stream); + break; + default: + break; + } + last_error = cudaGetLastError(); + if (last_error != cudaSuccess) { + printf("cuda error: %s\n", cudaGetErrorString(last_error)); + assert(false && "Error after launching the kernel"); + } +} + +void invokeMultiGpuBarrier(AllReduceParams &param, cudaStream_t stream) { + multiGpuBarrierKernel<<<1, param.ranks_per_node, 0, stream>>>(param); +} + +void customAllReduce(AllReduceParams &params, + void *data, + size_t elts, + DataType dataType, + AllReduceStrategyType strat, + cudaStream_t stream) { + params.local_output_buffer_ptr = data; + params.elts_total = elts; + + if (elts == 0) { + return; + } + + if (dataType == DT_FLOAT) { + invokeOneOrTwoShotAllReduceKernel<float>(params, strat, stream); + } else if (dataType == DT_HALF) { + invokeOneOrTwoShotAllReduceKernel<half>(params, strat, stream); + } else { + assert(false && "Unsupported data type"); + } +} + +} // namespace tensorrt_llm diff --git a/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.h b/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.h new file mode 100644 index 000000000..e56795047 --- /dev/null +++
b/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ffconst.h" + +#include +#include + +namespace tensorrt_llm { + +constexpr size_t WARP_SIZE = 32; +constexpr size_t MAX_ALL_REDUCE_BLOCKS = 24; +constexpr size_t MAX_RANKS_PER_NODE = 8; +constexpr size_t DEFAULT_BLOCK_SIZE = 1024; + +enum class AllReduceStrategyType : int8_t { + RING = 0, + ONESHOT = 1, + TWOSHOT = 2, + AUTO = 3, +}; + +struct AllReduceParams { + size_t elts_total; + size_t elts_per_rank; + size_t elts_per_block; + size_t rank_offset; + size_t ranks_per_node, rank, local_rank; + uint32_t barrier_flag; + uint32_t *peer_barrier_ptrs_in[MAX_RANKS_PER_NODE]; + uint32_t *peer_barrier_ptrs_out[MAX_RANKS_PER_NODE]; + void *peer_comm_buffer_ptrs[MAX_RANKS_PER_NODE]; + void *local_output_buffer_ptr; +}; + +inline size_t GetMaxRequiredWorkspaceSize(int world_size) { + if (world_size <= 2) { + return 16 * 1000 * 1000; + } + return 8 * 1000 * 1000; +} + +inline AllReduceStrategyType SelectImplementation(size_t message_size, + int world_size) { + const size_t maxWorkspaceSize = GetMaxRequiredWorkspaceSize(world_size); + + if (message_size > maxWorkspaceSize) { + return AllReduceStrategyType::RING; + } + + if (world_size <= 2) { + return AllReduceStrategyType::ONESHOT; + } + + if (world_size <= 4) { + if (message_size < 1 * 1000 * 1000) { + return AllReduceStrategyType::ONESHOT; + } + return AllReduceStrategyType::TWOSHOT; + } + + if (message_size < 500 * 1000) { + return AllReduceStrategyType::ONESHOT; + } + return AllReduceStrategyType::TWOSHOT; +} + +void customAllReduce(AllReduceParams ¶ms, + void *data, + size_t elts, + DataType dataType, + AllReduceStrategyType strat, + cudaStream_t stream); + +} // namespace tensorrt_llm diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 6ca337f58..db7164c84 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -37,21 +37,43 @@ RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ chmod +x ~/${MINICONDA_SCRIPT_NAME} && \ bash ~/${MINICONDA_SCRIPT_NAME} -b -p /opt/conda && \ rm ~/${MINICONDA_SCRIPT_NAME} && \ + /opt/conda/bin/conda config --set solver classic && \ /opt/conda/bin/conda upgrade --all && \ /opt/conda/bin/conda install conda-build conda-verify && \ /opt/conda/bin/conda clean -ya -# Optionally install HIP dependencies +# set MAKEFLAGS to speedup any dependency that uses make +ARG N_BUILD_CORES +ENV MAKEFLAGS "${MAKEFLAGS} -j${N_BUILD_CORES}" + +# Set env vars +ENV PATH /opt/conda/bin:$PATH +ENV CUDNN_DIR /usr/local/cuda +ENV CUDA_DIR /usr/local/cuda + +# GPU-specific dependencies +ARG FF_GPU_BACKEND "cuda" + +# Update NCCL if FF_GPU_BACKEND is cuda +RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \ + echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. 
Updating NCCL"; \ + ubuntu_version=$(lsb_release -rs); \ + ubuntu_version=${ubuntu_version//./}; \ + wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \ + DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \ + DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \ + rm -f cuda-keyring_1.0-1_all.deb; \ + DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \ + else \ + echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \ + fi' + +# Install hip dependencies if FF_GPU_BACKEND is hip_cuda or hip_rocm # Note that amd's docs say to also install the `hip-runtime-nvidia` package. This # package attempts to re-install cuda even though cuda is already installed # in the container. It also attempts to install packages for a graphical install. # For our container, we don't need `hip-runtime-nvidia` -ARG FF_GPU_BACKEND "cuda" ARG hip_version "5.6" -ARG N_BUILD_CORES -# set MAKEFLAGS to speedup any dependency that uses make -ENV MAKEFLAGS "${MAKEFLAGS} -j${N_BUILD_CORES}" - RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ]; then \ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing HIP dependencies"; \ # Check that hip_version is one of 5.3,5.4,5.5,5.6 @@ -82,11 +104,6 @@ RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ] fi RUN rm -rf /var/lib/apt/lists/* -# Set env vars -ENV PATH /opt/conda/bin:$PATH -ENV CUDNN_DIR /usr/local/cuda -ENV CUDA_DIR /usr/local/cuda - # Install python packages and other dependencies RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing # Install CPU-only Pytorch and related dependencies diff --git a/docker/flexflow/Dockerfile b/docker/flexflow/Dockerfile index 60f9d4d65..dff925965 100644 --- a/docker/flexflow/Dockerfile +++ b/docker/flexflow/Dockerfile @@ -27,9 +27,7 @@ RUN for pair in $BUILD_CONFIGS; do \ # Build and install C++ and Python versions of FlexFlow RUN mkdir -p build && cd build && \ eval "$BUILD_CONFIGS" ../config/config.linux && \ - make -j $N_BUILD_CORES && \ - eval "$BUILD_CONFIGS" ../config/config.linux && \ - make install && \ + make -j $N_BUILD_CORES install && \ ldconfig ENTRYPOINT ["/bin/bash"] diff --git a/docker/run.sh b/docker/run.sh index 666c8e112..2575150ae 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -18,8 +18,6 @@ ATTACH_GPUS=${ATTACH_GPUS:-true} gpu_arg="" if $ATTACH_GPUS ; then gpu_arg="--gpus all" ; fi -# Whether to attach inference weights / files (make sure to download the weights first) -ATTACH_INFERENCE_FILES=${ATTACH_INFERENCE_FILES:-false} # Amount of shared memory to give the Docker container access to # If you get a Bus Error, increase this value. 
If you don't have enough memory @@ -115,9 +113,11 @@ if [[ "$(docker images -q "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":lat exit 1 fi -inference_volumes="" -if $ATTACH_INFERENCE_FILES ; then - inference_volumes="-v ~/.cache/flexflow:/usr/FlexFlow/inference"; +hf_token_volume="" +hf_token_path="$HOME/.cache/huggingface/token" +if [ -f "$hf_token_path" ]; then + # If the token exists, add the volume mount to the Docker command + hf_token_volume+="-v $hf_token_path:/root/.cache/huggingface/token" fi -eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${inference_volumes}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" +eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${hf_token_volume}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" diff --git a/examples/cpp/AlexNet/alexnet.cc b/examples/cpp/AlexNet/alexnet.cc index 128496eab..350788232 100644 --- a/examples/cpp/AlexNet/alexnet.cc +++ b/examples/cpp/AlexNet/alexnet.cc @@ -26,7 +26,7 @@ using FlexFlow::ParallelTensor; using FlexFlow::SGDOptimizer; using FlexFlow::Tensor; -LegionRuntime::Logger::Category log_app("AlexNet"); +Legion::Logger log_app("AlexNet"); void parse_input_args(char **argv, int argc, AlexNetConfig &config) { for (int i = 1; i < argc; i++) { diff --git a/examples/cpp/DLRM/dlrm.cc b/examples/cpp/DLRM/dlrm.cc index 7dc49215b..d7dc16755 100644 --- a/examples/cpp/DLRM/dlrm.cc +++ b/examples/cpp/DLRM/dlrm.cc @@ -19,7 +19,7 @@ using namespace Legion; -LegionRuntime::Logger::Category log_app("DLRM"); +Legion::Logger log_app("DLRM"); void parse_input_args(char **argv, int argc, DLRMConfig &apConfig); diff --git a/examples/cpp/InceptionV3/inception.cc b/examples/cpp/InceptionV3/inception.cc index b2070cc52..6d0fa7ee5 100644 --- a/examples/cpp/InceptionV3/inception.cc +++ b/examples/cpp/InceptionV3/inception.cc @@ -21,7 +21,7 @@ using namespace Legion; using namespace FlexFlow; -LegionRuntime::Logger::Category log_app("Inceptionv3"); +Legion::Logger log_app("Inceptionv3"); Tensor InceptionA(FFModel &ff, Tensor input, int pool_features) { Tensor t1 = input; diff --git a/examples/cpp/ResNet/resnet.cc b/examples/cpp/ResNet/resnet.cc index 455eb743a..49ce934a6 100644 --- a/examples/cpp/ResNet/resnet.cc +++ b/examples/cpp/ResNet/resnet.cc @@ -24,7 +24,7 @@ using FlexFlow::Optimizer; using FlexFlow::SGDOptimizer; using FlexFlow::Tensor; -LegionRuntime::Logger::Category log_app("ResNet"); +Legion::Logger log_app("ResNet"); void parse_input_args(char **argv, int argc, ResNetConfig &config) { for (int i = 1; i < argc; i++) { diff --git a/examples/cpp/Transformer/transformer.cc b/examples/cpp/Transformer/transformer.cc index d61a63cd0..b04093b0a 100644 --- a/examples/cpp/Transformer/transformer.cc +++ b/examples/cpp/Transformer/transformer.cc @@ -17,7 +17,7 @@ using namespace Legion; -LegionRuntime::Logger::Category log_app("Transformer"); +Legion::Logger log_app("Transformer"); Tensor create_emb(FFModel *model, Tensor const &input, diff --git a/examples/cpp/XDL/xdl.cc b/examples/cpp/XDL/xdl.cc index 2e6c3cec9..a2272f36e 100644 --- a/examples/cpp/XDL/xdl.cc +++ b/examples/cpp/XDL/xdl.cc @@ -18,7 +18,7 @@ using namespace Legion; -LegionRuntime::Logger::Category log_app("XDL"); +Legion::Logger log_app("XDL"); void parse_input_args(char **argv, int argc, XDLConfig &apConfig); diff --git a/examples/cpp/candle_uno/candle_uno.cc b/examples/cpp/candle_uno/candle_uno.cc index 779b8e9c1..e9f4bf876 100644 --- a/examples/cpp/candle_uno/candle_uno.cc +++ b/examples/cpp/candle_uno/candle_uno.cc @@ -21,7 +21,7 @@ 
using namespace Legion; using namespace std; -LegionRuntime::Logger::Category log_app("Candle_Uno"); +Legion::Logger log_app("Candle_Uno"); void parse_input_args(char **argv, int argc, CandleConfig &apConfig); diff --git a/examples/cpp/mixture_of_experts/moe.cc b/examples/cpp/mixture_of_experts/moe.cc index a70731088..a25f94abd 100644 --- a/examples/cpp/mixture_of_experts/moe.cc +++ b/examples/cpp/mixture_of_experts/moe.cc @@ -20,7 +20,7 @@ using namespace Legion; -LegionRuntime::Logger::Category log_app("MoE"); +Legion::Logger log_app("MoE"); void parse_input_args(char **argv, int argc, MoeConfig &config) { for (int i = 1; i < argc; i++) { diff --git a/examples/cpp/resnext50/resnext.cc b/examples/cpp/resnext50/resnext.cc index 3c28ca27b..9b71b37cc 100644 --- a/examples/cpp/resnext50/resnext.cc +++ b/examples/cpp/resnext50/resnext.cc @@ -7,7 +7,7 @@ using FlexFlow::Optimizer; using FlexFlow::SGDOptimizer; using FlexFlow::Tensor; -LegionRuntime::Logger::Category log_app("resnext"); +Legion::Logger log_app("resnext"); Tensor resnext_block(FFModel &ff, Tensor input, diff --git a/examples/cpp/split_test/split_test.cc b/examples/cpp/split_test/split_test.cc index 97b98c321..ac9d516a5 100644 --- a/examples/cpp/split_test/split_test.cc +++ b/examples/cpp/split_test/split_test.cc @@ -3,7 +3,7 @@ using namespace Legion; using namespace FlexFlow; -LegionRuntime::Logger::Category log_app("split_test"); +Legion::Logger log_app("split_test"); void FlexFlow::top_level_task(Task const *task, std::vector const ®ions, diff --git a/examples/cpp/split_test_2/split_test_2.cc b/examples/cpp/split_test_2/split_test_2.cc index 69385d14c..fef078adb 100644 --- a/examples/cpp/split_test_2/split_test_2.cc +++ b/examples/cpp/split_test_2/split_test_2.cc @@ -9,7 +9,7 @@ using FlexFlow::PCG::Graph; using FlexFlow::PCG::GraphSearchHelper; using FlexFlow::PCG::Node; -LegionRuntime::Logger::Category log_app("split_test_2"); +Legion::Logger log_app("split_test_2"); void top_level_task(Task const *task, std::vector const ®ions, diff --git a/include/flexflow/attention_config.h b/include/flexflow/attention_config.h new file mode 100644 index 000000000..558246867 --- /dev/null +++ b/include/flexflow/attention_config.h @@ -0,0 +1,217 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _FLEXFLOW_ATTENTION_CONFIG_H_ +#define _FLEXFLOW_ATTENTION_CONFIG_H_ +#include "flexflow/batch_config.h" + +namespace FlexFlow { + +constexpr uint32_t kPagesize = 64; + +inline int round_up_pages(int const num_elements) { + return (num_elements + kPagesize - 1) / kPagesize; +} + +#define DISPATCH_HEADDIM(head_dim, HEAD_DIM, ...) 
\ + switch (head_dim) { \ + case 64: { \ + constexpr size_t HEAD_DIM = 64; \ + __VA_ARGS__ \ + break; \ + } \ + case 128: { \ + constexpr size_t HEAD_DIM = 128; \ + __VA_ARGS__ \ + break; \ + } \ + case 256: { \ + constexpr size_t HEAD_DIM = 256; \ + __VA_ARGS__ \ + break; \ + } \ + default: { \ + std::ostringstream err_msg; \ + err_msg << "Unsupported head_dim: " << head_dim; \ + throw std::invalid_argument(err_msg.str()); \ + } \ + } + +class AttentionMetaData { +public: + AttentionMetaData() { + num_q_heads_ = 0; + num_kv_heads_ = 0; + head_dim_ = 0; + q_indptr = nullptr; + kv_indptr = nullptr; + kv_indices = nullptr; + kv_last_page_len = nullptr; + qk_indptr = nullptr; + custom_mask = nullptr; + workspace = nullptr; + workspace_size = 0; + float_workspace = nullptr; + float_workspace_size = 0; + int_workspace = nullptr; + int_workspace_size = 0; + mem_size_ = 0; + enabled_ = false; + } + AttentionMetaData(AttentionMetaData const &rhs) { + num_q_heads_ = rhs.num_q_heads_; + num_kv_heads_ = rhs.num_kv_heads_; + head_dim_ = rhs.head_dim_; + q_indptr = rhs.q_indptr; + kv_indptr = rhs.kv_indptr; + kv_indices = rhs.kv_indices; + kv_last_page_len = rhs.kv_last_page_len; + qk_indptr = rhs.qk_indptr; + custom_mask = rhs.custom_mask; + workspace = rhs.workspace; + workspace_size = rhs.workspace_size; + float_workspace = rhs.float_workspace; + float_workspace_size = rhs.float_workspace_size; + int_workspace = rhs.int_workspace; + int_workspace_size = rhs.int_workspace_size; + mem_size_ = rhs.mem_size_; + enabled_ = rhs.enabled_; + decode_handler_collections = rhs.decode_handler_collections; + prompt_handler_collections = rhs.prompt_handler_collections; + } + + size_t mem_size() { + if (mem_size_ > 0) { + return mem_size_; + } + size_t batch_size = BatchConfig::max_requests_per_batch(); + size_t max_num_pages = + round_up_pages(BatchConfig::max_spec_tree_token_num() + + BatchConfig::max_sequence_length()); + size_t indices_size = std::max( + (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024); + size_t custom_mask_size = BatchConfig::max_requests_per_batch() * + ((BatchConfig::max_spec_tree_token_num() * + (BatchConfig::max_spec_tree_token_num() + + BatchConfig::max_sequence_length()) + + 7) / + 8); + + float_workspace_size = 128 * 1024 * 1024; // 128 MB + int_workspace_size = 8 * 1024 * 1024; // 8 MB + workspace_size = + float_workspace_size + int_workspace_size; // float + int workspace + + mem_size_ = alignTo(sizeof(int32_t) * indices_size + + sizeof(uint8_t) * custom_mask_size + workspace_size, + 16); + return mem_size_; + } + + void assign_address(void *ptr, int size) { + if (ptr == nullptr) { + q_indptr = nullptr; + kv_indptr = nullptr; + kv_indices = nullptr; + kv_last_page_len = nullptr; + qk_indptr = nullptr; + custom_mask = nullptr; + workspace = nullptr; + float_workspace = nullptr; + int_workspace = nullptr; + return; + } + assert(size >= mem_size() && + "Insufficient memory size for attention metadata"); + size_t batch_size = BatchConfig::max_requests_per_batch(); + size_t max_num_pages = + round_up_pages(BatchConfig::max_spec_tree_token_num() + + BatchConfig::max_sequence_length()); + size_t indices_size = std::max( + (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024); + size_t custom_mask_size = BatchConfig::max_requests_per_batch() * + ((BatchConfig::max_spec_tree_token_num() * + (BatchConfig::max_spec_tree_token_num() + + BatchConfig::max_sequence_length()) + + 7) / + 8); + + q_indptr = static_cast(ptr); + kv_indptr = q_indptr + batch_size + 1; 
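+ // The remaining index arrays are carved out of the same allocation in order: kv_indices, + // kv_last_page_len, qk_indptr; the byte-level custom mask and the float/int workspaces follow + // after the first indices_size int32 slots.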
+ kv_indices = kv_indptr + batch_size + 1; + kv_last_page_len = kv_indices + max_num_pages * batch_size; + qk_indptr = kv_last_page_len + batch_size + 1; + custom_mask = static_cast(ptr) + sizeof(int32_t) * indices_size; + workspace = static_cast(static_cast(ptr) + + sizeof(int32_t) * indices_size + + sizeof(uint8_t) * custom_mask_size); + float_workspace = workspace; + int_workspace = static_cast(static_cast(workspace) + + float_workspace_size); + } + + void set_num_q_heads(uint32_t const num_q_heads) { + num_q_heads_ = num_q_heads; + } + void set_num_kv_heads(uint32_t const num_kv_heads) { + num_kv_heads_ = num_kv_heads; + } + void set_head_dim(uint32_t const head_dim) { + head_dim_ = head_dim; + } + uint32_t num_q_heads() const { + return num_q_heads_; + } + uint32_t num_kv_heads() const { + return num_kv_heads_; + } + uint32_t head_dim() const { + return head_dim_; + } + + void set_enabled(bool const enabled) { + enabled_ = enabled; + } + bool enabled() const { + return enabled_; + } + + uint32_t num_q_heads_; + uint32_t num_kv_heads_; + uint32_t head_dim_; + + int32_t *q_indptr; + int32_t *kv_indptr; + int32_t *kv_indices; + int32_t *kv_last_page_len; + int32_t *qk_indptr; + uint8_t *custom_mask; + void *workspace; + size_t workspace_size; + void *float_workspace; + size_t float_workspace_size; + void *int_workspace; + size_t int_workspace_size; + + size_t mem_size_; + + // batchsize -> handler + bool enabled_; + std::unordered_map decode_handler_collections; + std::unordered_map prompt_handler_collections; +}; +} // namespace FlexFlow + +#endif // _FLEXFLOW_ATTENTION_CONFIG_H_ diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 009d1c250..76521e5cf 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -20,185 +20,207 @@ #include #include -// #define MAX_SEQ_LEN 1024 -// #define BATCH_SIZE 2 -// #define BATCH_SIZE 16 -// #define MAX_REQUESTS 256 - namespace FlexFlow { +inline int alignTo(int x, int y) { + return ((x + y - 1) / y) * y; +} + class InferenceResult; -class BeamInferenceResult; using BatchConfigFuture = Legion::Future; using InferenceResultFuture = Legion::Future; -using BeamSearchBatchConfigFuture = Legion::Future; -using TreeVerifyBatchConfigFuture = Legion::Future; -using BeamInferenceResultFuture = Legion::Future; + +/* + * StreamingCacheInfo is a class that manages the streaming kv cache for + * attention operator (https://arxiv.org/abs/2309.17453), and we use it in the + * draft model. It maintains a fixed-content *sink* cache and a fixed-size + * *window* cache. The *sink* cache is the foremost part of the original kv + * cache, while the *window* cache is the backmost part of the original kv cache + * and is rolling updated. The information is per-request. Note that the + * position encoding of the q&k alters each iteration (relative position), so we + * store the *pre-pos-encoding* kv value in the cache. + */ +class StreamingCacheInfo { +public: + StreamingCacheInfo(); + StreamingCacheInfo(int sink_cache_size, int window_cache_size); + StreamingCacheInfo(StreamingCacheInfo const &other); + + StreamingCacheInfo &operator=(StreamingCacheInfo const &other); + + void commit_cache(int len); + void reset_cache(); + int global_2_cache_index(int global_index); + int cache_2_global_index(int cache_index); + +public: + int sink_cache_size, window_cache_size; + // the meta info of the window cache, commit_len helps to determine if we fill + // up the window. 
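+ // window_back appears to be the rolling write cursor within the window cache and total_len the + // running token count for the request; see commit_cache() / reset_cache() for the exact update rules.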
+ int window_back, commit_len, total_len; +}; class BatchConfig { public: using RequestGuid = size_t; using TokenId = int; - BatchConfig(); + BatchConfig(InferenceMode inference_mode = INC_DECODING_MODE, + int model_id = 0); + BatchConfig(BatchConfig const &other); int num_active_requests() const; int num_active_tokens() const; static int max_requests_per_batch(); static int max_tokens_per_batch(); - static int max_verify_tokens_per_batch(); + static int max_tokens_per_ssm_batch(); + static int max_tokens_per_prefilling_batch(); static int max_spec_tree_token_num(); static int max_sequence_length(); + static int max_output_length(); + static size_t max_kv_cache_size(); + static bool streaming_cache(); + static int get_max_tree_depth(); friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc); void print() const; void save_to_file(std::string const &filename) const; virtual InferenceMode get_mode() const; static BatchConfig const *from_future(BatchConfigFuture const &future); + // Maximum possible values for different parameters // These maximum values are used for copying BatchConfig // across workers - static int const MAX_NUM_REQUESTS = 64; - static int const MAX_NUM_TOKENS = 1024; - static int const MAX_SPEC_TREE_TOKEN_NUM = 64; - - // Set by update - int num_tokens; - // number of tokens in prompt phase, start offset of tokens in inc_decoding - // phase. num_tokens - num_prompt_tokens = num_generation_tokens; - int num_generation_tokens; + inline static int const MAX_NUM_REQUESTS = 64; + inline static int const MAX_NUM_TOKENS = 1024; + inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 8; + inline static int const MAX_TREE_DEPTH = 8; + inline static int const MAX_TREE_WIDTH = 16; + inline static int const MAX_SPEC_TREE_TOKEN_NUM = + MAX_TREE_DEPTH * MAX_TREE_WIDTH; + inline static int const MAX_K_LOGITS = 16; + + // The Constants for the Streaming KVCache + inline static int const SINK_SIZE = 4; + // size_SINK + size_WINDOW + depth_DRAFT shouldn't exceed this value + inline static int const MAX_STREAMING_POS = 2048; + + int num_tokens = 0; + int num_available_requests = 0; + bool prompt_phase = false; + int num_tokens_to_commit = 0; + int model_id; + InferenceMode inference_mode; struct PerRequestInfo { - int first_token_depth_in_request; - int first_token_offset_in_batch; - int num_tokens_in_batch; - int max_sequence_length; - - // request id in batch config: - int batch_config_request_id; - bool prompt_phase = false; + int first_token_index_in_request = -1; + int first_token_offset_in_batch = -1; + int num_tokens_in_batch = 0; RequestGuid request_guid; + + static constexpr size_t request_guid_size = sizeof(RequestGuid); + static constexpr size_t alignment = 16; + static constexpr size_t padding_size = + (alignment - (sizeof(int) * 3 + request_guid_size) % alignment) % + alignment; + static constexpr size_t padding_length = padding_size / sizeof(int); + int padding[padding_length] = {}; // Padding for memory pointer alignment }; + struct PerTokenInfo { - int abs_depth_in_request; - int request_index; - TokenId token_id; + TokenId token_id = -1; + // Difference between the two: + // abs_index_in_request: non-tree cache size + index in the flattened + // speculative tree + // abs_depth_in_request: non_tree cache size + depth in the speculative tree + int abs_index_in_request = -1; + int abs_depth_in_request = -1; + int request_index = -1; }; - struct BitMask { - unsigned long long mask[MAX_SPEC_TREE_TOKEN_NUM] = {0}; + struct CommittedTokensInfo { + int 
index_in_kv_cache = -1; // the index in the temporary key-value cache + int request_index = -1; // request index in the batch + int token_depth = -1; // position of the token in the request's sequence + }; - // how many tokens before the tree, every sub requests need this part of - // cache + class BitMask { + public: + class Bitset { + public: + Bitset() : bits{0} {} + + Bitset(Bitset const &other) { + // Copy the entire array of bits from 'other' to this object + std::copy( + std::begin(other.bits), std::end(other.bits), std::begin(bits)); + } + + void set_bit(size_t pos) { + size_t idx = pos / 64; // Find the index in the array + size_t bit = pos % 64; // Find the bit position within the uint64_t + bits[idx] |= (1ULL << bit); + } + + bool test_bit(size_t pos) const { + size_t idx = pos / 64; + size_t bit = pos % 64; + return (bits[idx] & (1ULL << bit)) != 0; + } + + void clear() { + std::fill(std::begin(bits), std::end(bits), 0); + } + + uint64_t bits[(MAX_SPEC_TREE_TOKEN_NUM + 63) / 64]; + }; + + Bitset bit_mask[MAX_SPEC_TREE_TOKEN_NUM]; + // the number of generated tokens before the speculation tree (excluding the + // prompt tokens) int non_tree_cache_size = 0; - - // current tree size - int tree_size = 0; - - int this_layer_size = 0; - - // input length-> prompt/root - int prompt_size = 0; + // Tree size or prompt size. Because the prefilling phase and the decoding + // phase are separated, we only need one field to store the size of the tree + // or the prompt. + int tree_or_prompt_size = 0; + int current_layer_size = 0; + + BitMask() = default; + + BitMask(BitMask const &other) { + non_tree_cache_size = other.non_tree_cache_size; + tree_or_prompt_size = other.tree_or_prompt_size; + current_layer_size = other.current_layer_size; + for (int i = 0; i < MAX_SPEC_TREE_TOKEN_NUM; i++) { + bit_mask[i] = other.bit_mask[i]; + } + } + + void clear_bitmask() { + // Clear bit_mask but keep the other fields + for (int i = 0; i < MAX_SPEC_TREE_TOKEN_NUM; i++) { + bit_mask[i].clear(); + } + } }; BitMask causalMask[MAX_NUM_REQUESTS]; PerRequestInfo requestsInfo[MAX_NUM_REQUESTS]; + StreamingCacheInfo streamingCacheInfo[MAX_NUM_REQUESTS]; PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; - - bool request_completed[MAX_NUM_REQUESTS]; - bool request_running[MAX_NUM_REQUESTS]; -}; - -class TreeVerifyBatchConfig : public BatchConfig { -public: - TreeVerifyBatchConfig(); - ~TreeVerifyBatchConfig(); - InferenceMode get_mode() const; - friend std::ostream &operator<<(std::ostream &os, - TreeVerifyBatchConfig const &bc); - void print() const; - void save_to_file(std::string const &filename) const; - struct CommittedTokensInfo { - int token_index; // the index of the token in the previous batch - int request_index; // request index in the batch - int token_depth; // position of the token in the request's sequence - }; - - int num_tokens_to_commit; CommittedTokensInfo committed_tokens[MAX_NUM_TOKENS]; + bool request_available[MAX_NUM_REQUESTS]; }; struct InferenceResult { - static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS; - BatchConfig::TokenId token_ids[MAX_NUM_TOKENS]; -}; - -class BeamSearchBatchConfig : public BatchConfig { -public: - BeamSearchBatchConfig(); - BeamSearchBatchConfig(int model_id); - BeamSearchBatchConfig(size_t beam_width, size_t target_iterations); - BeamSearchBatchConfig(BeamSearchBatchConfig const &other, int model_id); - InferenceMode get_mode() const; - - ~BeamSearchBatchConfig(); - - friend std::ostream &operator<<(std::ostream &os, - BeamSearchBatchConfig const &bc); - void 
print() const; - void save_to_file(std::string const &filename) const; - bool done() const; - int max_beam_depth_all_requests() const; - int current_depth_all_requests() const; - int get_speculative_request_num() const; - - size_t beam_width; - size_t target_iterations; - - // how many requests is in speculative phase - int speculative_request_num = 0; - inline static int const MAX_BEAM_WIDTH = 3; - inline static int const MAX_BEAM_DEPTH = 8; - - // maximum tree branches for a request - inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3; - - int model_id; - - struct BeamSearchPerRequestInfo { - int beam_size; - int current_depth = -1; - int max_depth = MAX_BEAM_DEPTH; - - BatchConfig::TokenId - tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - int parent_id[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - int sub_request_num; - }; - - struct BeamSearchPerTokenInfo { - int sub_request_index; - }; - - BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS]; - BeamSearchPerTokenInfo - beamTokenInfo[MAX_NUM_TOKENS + - MAX_SPEC_TREE_TOKEN_NUM * MAX_NUM_REQUESTS]; - - int sub_requests[MAX_NUM_REQUESTS]; - -private: - size_t current_iteration; -}; - -struct BeamInferenceResult { - static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS; + int num_token_ids; + int num_gumbel_logits; BatchConfig::TokenId - token_ids[MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - float probs[MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - int parent_id[MAX_NUM_TOKENS * - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; + token_ids[BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_K_LOGITS]; + float probs[BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_K_LOGITS]; + float gumbel_logits[BatchConfig::MAX_NUM_TOKENS * + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; + InferenceResult() : num_token_ids(0), num_gumbel_logits(0) {} + InferenceResult(InferenceResult const &other); + friend std::ostream &operator<<(std::ostream &os, InferenceResult const &ir); }; }; // namespace FlexFlow diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 2c11ae113..1aa80112b 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -16,10 +16,14 @@ #ifndef _FLEXFLOW_CONFIG_H_ #define _FLEXFLOW_CONFIG_H_ #include "ffconst.h" +#include "flexflow/attention_config.h" #include "flexflow/batch_config.h" +#include "flexflow/ops/kernels/gemm_impl.h" #include "legion.h" +#include #include #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include #include #include #elif defined(FF_USE_HIP_ROCM) @@ -70,6 +74,8 @@ struct FFHandler { #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnHandle_t dnn; cublasHandle_t blas; + cublasLtHandle_t blasLt; + Internal::GemmEngine *gemm_engine; #else miopenHandle_t dnn; hipblasHandle_t blas; @@ -77,19 +83,24 @@ struct FFHandler { void *workSpace; size_t workSpaceSize; void *batch_config_metadata; + AttentionMetaData *incr_attention_metadata; + AttentionMetaData *tree_search_attention_metadata; + AttentionMetaData *tree_verify_attention_metadata; - // request info + token info + topolopgy mask info - size_t batch_config_metadata_size = + size_t batch_config_metadata_size = alignTo( sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask) + - 
sizeof(TreeVerifyBatchConfig::committed_tokens) + - sizeof(BatchConfig::request_completed); + sizeof(BatchConfig::request_available) + + sizeof(BatchConfig::causalMask) + + sizeof(BatchConfig::streamingCacheInfo) + + sizeof(BatchConfig::committed_tokens) + sizeof(int), + 16); + void *offload_reserve_space; size_t offload_reserve_space_size; DataType quantization_type; bool allowTensorOpMathConversion; + int num_devices; + int device_id; #ifdef FF_USE_NCCL ncclComm_t ncclComm; #endif @@ -145,6 +156,7 @@ class FFConfig { Legion::Runtime *lg_hlr; Legion::IndexSpaceT<1> all_gpu_task_is; // Legion::FieldSpace field_space; + bool log_instance_creation; bool benchmarking, profiling, perform_fusion; bool inference_debugging; size_t simulator_work_space_size; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 512645e62..f713e4592 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -68,7 +68,7 @@ enum MetricsType { enum InferenceMode { INC_DECODING_MODE = 2001, - BEAM_SEARCH_MODE = 2002, + TREE_SEARCH_MODE = 2002, TREE_VERIFY_MODE = 2003, }; @@ -137,6 +137,7 @@ enum OperatorType { OP_SHAPE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Shape OP_SIZE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Size OP_TOPK, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#TopK + OP_GUMBEL_TOPK, OP_ARG_TOPK, OP_WHERE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Where OP_CEIL, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Ceil @@ -166,7 +167,7 @@ enum OperatorType { OP_GATHER, // https://pytorch.org/docs/stable/generated/torch.gather.html OP_RMS_NORM, OP_RESIDUAL_RMS_NORM, - OP_BEAM_TOPK, + // OP_BEAM_TOPK, OP_ARGMAX, OP_INC_MULTIHEAD_SELF_ATTENTION, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 0b74b7fce..60372780e 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -133,71 +133,71 @@ flexflow_tensor_t flexflow_model_get_label_tensor(flexflow_model_t handle); void flexflow_model_zero_gradients(flexflow_model_t handle); flexflow_tensor_t flexflow_model_add_exp(flexflow_model_t handle, - const flexflow_tensor_t x, + flexflow_tensor_t const x, char const *name); flexflow_tensor_t flexflow_model_add_sin(flexflow_model_t handle, - const flexflow_tensor_t x, + flexflow_tensor_t const x, char const *name); flexflow_tensor_t flexflow_model_add_cos(flexflow_model_t handle, - const flexflow_tensor_t x, + flexflow_tensor_t const x, char const *name); flexflow_tensor_t flexflow_model_add_add(flexflow_model_t handle, - const flexflow_tensor_t x, - const flexflow_tensor_t y, + flexflow_tensor_t const x, + flexflow_tensor_t const y, bool inplace_a, char const *name); flexflow_tensor_t flexflow_model_add_subtract(flexflow_model_t handle, - const flexflow_tensor_t x, - const flexflow_tensor_t y, + flexflow_tensor_t const x, + flexflow_tensor_t const y, bool inplace_a, char const *name); flexflow_tensor_t flexflow_model_add_multiply(flexflow_model_t handle, - const flexflow_tensor_t x, - const flexflow_tensor_t y, + flexflow_tensor_t const x, + flexflow_tensor_t const y, bool inplace_a, char const *name); flexflow_tensor_t flexflow_model_add_divide(flexflow_model_t handle, - const flexflow_tensor_t x, - const flexflow_tensor_t y, + flexflow_tensor_t const x, + flexflow_tensor_t const y, bool inplace_a, char const *name); flexflow_tensor_t flexflow_model_add_max(flexflow_model_t handle, - const 
flexflow_tensor_t x, - const flexflow_tensor_t y, + flexflow_tensor_t const x, + flexflow_tensor_t const y, bool inplace_a, char const *name); flexflow_tensor_t flexflow_model_add_min(flexflow_model_t handle, - const flexflow_tensor_t x, - const flexflow_tensor_t y, + flexflow_tensor_t const x, + flexflow_tensor_t const y, bool inplace_a, char const *name); flexflow_tensor_t flexflow_model_add_reduce_sum(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int *axes, int n, bool keepdims, char const *name); flexflow_tensor_t flexflow_model_add_rsqrt(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, char const *name); flexflow_tensor_t flexflow_model_add_pow(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float const exponent, char const *name); flexflow_tensor_t flexflow_model_add_mean(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int *dims, int n, bool keepdims, @@ -205,7 +205,7 @@ flexflow_tensor_t flexflow_model_add_mean(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_conv2d(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, int out_channels, int kernel_h, int kernel_w, @@ -223,7 +223,7 @@ flexflow_tensor_t flexflow_tensor_t flexflow_model_add_embedding(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, int num_entries, int out_dim, enum AggrMode aggr, @@ -246,12 +246,12 @@ flexflow_tensor_t char const *name); flexflow_tensor_t flexflow_model_add_batch_norm(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, bool relu, char const *name); flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, int n, int *axes, bool elementwise_affine, @@ -261,9 +261,9 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle, flexflow_tensor_t * flexflow_model_add_residual_layer_norm(flexflow_model_t handle, - const flexflow_tensor_t input, - const flexflow_tensor_t residual1, - const flexflow_tensor_t residual2, + flexflow_tensor_t const input, + flexflow_tensor_t const residual1, + flexflow_tensor_t const residual2, bool use_two_residuals, int n, int *axes, @@ -274,8 +274,8 @@ flexflow_tensor_t * flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( flexflow_model_t handle, - const flexflow_tensor_t input, - const flexflow_tensor_t residual, + flexflow_tensor_t const input, + flexflow_tensor_t const residual, int n, int *axes, bool elementwise_affine, @@ -285,20 +285,21 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( flexflow_tensor_t flexflow_model_add_sigmoid_silu_multi(flexflow_model_t handle, - const flexflow_tensor_t input1, - const flexflow_tensor_t input2, + flexflow_tensor_t const input1, + flexflow_tensor_t const input2, + int intermediate_size, char const *name); flexflow_tensor_t flexflow_model_add_batch_matmul(flexflow_model_t handle, - const flexflow_tensor_t a, - const flexflow_tensor_t b, + flexflow_tensor_t const a, + flexflow_tensor_t const b, int a_seq_length_dim /* -1 */, int b_seq_length_dim /* -1 */); flexflow_tensor_t flexflow_model_add_dense( flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, int out_dim, enum ActiMode activation /* AC_MODE_NONE */, bool use_bias /* true */, @@ -329,96 +330,96 @@ 
flexflow_tensor_t flexflow_model_add_flat(flexflow_model_t handle, char const *name); flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle, - const flexflow_tensor_t input, - const flexflow_tensor_t index, + flexflow_tensor_t const input, + flexflow_tensor_t const index, int dim, char const *name); flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, int dim, char const *name); flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, int n, int *perm, char const *name); flexflow_tensor_t flexflow_model_add_reshape(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, int n, int *shape, char const *name); flexflow_tensor_t flexflow_model_add_reverse(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, int axis, char const *name); flexflow_tensor_t flexflow_model_add_relu(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, bool inplace, char const *name); flexflow_tensor_t flexflow_model_add_scalar_multiply(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, float const scalar, bool inplace, char const *name); flexflow_tensor_t flexflow_model_add_scalar_add(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, float const scalar, bool inplace, char const *name); flexflow_tensor_t flexflow_model_add_scalar_sub(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, float const scalar, bool inplace, char const *name); flexflow_tensor_t flexflow_model_add_scalar_truediv(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, float const scalar, bool inplace, char const *name); flexflow_tensor_t flexflow_model_add_gelu(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, char const *name); flexflow_tensor_t flexflow_model_add_identity(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, char const *name); flexflow_tensor_t flexflow_model_add_sigmoid(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, char const *name); flexflow_tensor_t flexflow_model_add_tanh(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, char const *name); flexflow_tensor_t flexflow_model_add_elu(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, bool inplace, char const *name); flexflow_tensor_t flexflow_model_add_dropout(flexflow_model_t handle, - const flexflow_tensor_t input, + flexflow_tensor_t const input, float rate, unsigned long long seed, char const *name); flexflow_tensor_t flexflow_model_add_multihead_attention( flexflow_model_t handle, - const flexflow_tensor_t query, - const flexflow_tensor_t key, - const flexflow_tensor_t value, + flexflow_tensor_t const query, + flexflow_tensor_t const key, + flexflow_tensor_t const value, int embed_dim, int num_heads, int kdim, @@ -432,7 +433,7 @@ flexflow_tensor_t flexflow_model_add_multihead_attention( flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_heads, int kdim, @@ -444,15 +445,22 @@ flexflow_tensor_t 
flexflow_model_add_inc_multihead_self_attention( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, + bool streaming_cache, char const *name); flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_heads, int kdim, @@ -464,15 +472,22 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, + bool streaming_cache, char const *name); flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_heads, int kdim, @@ -484,15 +499,21 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, char const *name); -flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( +flexflow_tensor_t flexflow_model_add_groupquery_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_q_heads, int num_kv_heads, @@ -505,15 +526,22 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, + bool streaming_cache, char const *name); flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_q_heads, int num_kv_heads, @@ -526,15 +554,22 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, + bool streaming_cache, char const *name); flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_q_heads, int num_kv_heads, @@ -547,6 +582,12 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( enum DataType 
data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -554,39 +595,39 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( char const *name); flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float eps, int dim, char const *name); flexflow_tensor_t * flexflow_model_add_residual_rms_norm(flexflow_model_t handle_, - const flexflow_tensor_t input1_, - const flexflow_tensor_t input2_, + flexflow_tensor_t const input1_, + flexflow_tensor_t const input2_, float eps, int dim, char const *name); flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int k, bool sorted, - bool speculative_decoding, + bool renormalize, char const *name); -flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_, - const flexflow_tensor_t input_, - int max_beam_size, - bool sorted, - char const *name); +// flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_, +// const flexflow_tensor_t +// input_, int max_beam_size, +// bool sorted, +// char const *name); flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float top_p, char const *name); flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, bool beam_search, char const *name); @@ -972,12 +1013,21 @@ void flexflow_request_manager_set_max_requests_per_batch( void flexflow_request_manager_set_max_tokens_per_batch( flexflow_request_manager_t handle_, int max_num_tokens); -void flexflow_request_manager_set_max_spec_tree_token_num( - flexflow_request_manager_t handle_, int max_num_tokens); +void flexflow_request_manager_set_max_tokens_per_ssm_batch( + flexflow_request_manager_t handle_, int max_num_ssm_tokens); + +void flexflow_request_manager_set_max_tokens_per_prefilling_batch( + flexflow_request_manager_t handle_, int max_num_prefilling_tokens); void flexflow_request_manager_set_max_sequence_length( flexflow_request_manager_t handle_, int max_seq_length); +void flexflow_request_manager_set_max_output_length( + flexflow_request_manager_t handle_, int max_output_length); + +void flexflow_request_manager_set_max_kv_cache_size( + flexflow_request_manager_t handle_, int max_kv_cache_size); + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, @@ -1027,7 +1077,7 @@ flexflow_file_data_loader_t int num_q_heads, int num_kv_heads, int hidden_dim, - int qkv_inner_dim, + int head_dim, int tensor_parallelism_degree, bool use_full_precision); diff --git a/include/flexflow/graph.h b/include/flexflow/graph.h index 2e0cf1ca4..9dc657259 100644 --- a/include/flexflow/graph.h +++ b/include/flexflow/graph.h @@ -24,7 +24,7 @@ #include "legion/legion_utilities.h" #include -extern LegionRuntime::Logger::Category log_dp; +extern Legion::Logger log_dp; namespace FlexFlow::PCG { diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index f24a797ff..a866e52cb 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -15,21 +15,47 @@ #pragma once 
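For context, the rope_theta / rope_type / rope_factor / low_freq_factor / high_freq_factor / original_max_position_embeddings arguments threaded through the attention builders above match the long-context ("llama3"-style) rotary-embedding scaling convention. The sketch below shows one common way such values are folded into per-dimension inverse frequencies; it is an illustration of that convention under the assumption that rope_type is "llama3", not necessarily the exact formula the attention kernels in this patch implement:

#include <cmath>
#include <vector>

// Hypothetical illustration of "llama3"-style rotary scaling; the function
// name and the exact rule are assumptions, not taken from this patch.
inline std::vector<float> llama3_scaled_inv_freq(int head_dim,
                                                 float rope_theta,
                                                 float factor,
                                                 float low_freq_factor,
                                                 float high_freq_factor,
                                                 int original_max_position_embeddings) {
  float const kPi = 3.14159265358979f;
  std::vector<float> inv_freq;
  for (int i = 0; i < head_dim; i += 2) {
    // Base RoPE frequencies: theta^(-i/head_dim) over the even dimensions.
    inv_freq.push_back(1.0f / std::pow(rope_theta, (float)i / (float)head_dim));
  }
  float low_freq_wavelen = (float)original_max_position_embeddings / low_freq_factor;
  float high_freq_wavelen = (float)original_max_position_embeddings / high_freq_factor;
  for (float &freq : inv_freq) {
    float wavelen = 2.0f * kPi / freq;
    if (wavelen > low_freq_wavelen) {
      freq /= factor; // long wavelengths are fully rescaled
    } else if (wavelen >= high_freq_wavelen) {
      // Wavelengths between the two cutoffs are smoothly interpolated.
      float smooth = ((float)original_max_position_embeddings / wavelen - low_freq_factor) /
                     (high_freq_factor - low_freq_factor);
      freq = (1.0f - smooth) * freq / factor + smooth * freq;
    } // short wavelengths (high frequencies) are left unchanged
  }
  return inv_freq;
}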
#include "flexflow/batch_config.h" +#include #include #include +using json = nlohmann::json; + namespace FlexFlow { struct GenerationConfig { bool do_sample = false; + bool spec_sample = false; float temperature = 0.8; + // top-p renormalization float topp = 0.6; - GenerationConfig(bool _do_sample, float _temperature, float _topp) { - temperature = _temperature > 0 ? _temperature : temperature; - topp = _topp > 0 ? _topp : topp; - do_sample = _do_sample; + // top-k renormalization + int topk = 16; + GenerationConfig(bool _do_sample = false, + float _temperature = 0.8, + float _topp = 0.6, + bool _spec_sample = false, + int _topk = 16) + : do_sample(_do_sample), temperature(_temperature), topp(_topp), + spec_sample(_spec_sample), topk(_topk) { + assert(temperature > 0.0); + assert(topk <= BatchConfig::MAX_K_LOGITS); } - GenerationConfig() {} +}; + +struct GenerationRequest { + std::string prompt; + bool add_special_tokens = true; + double slo_ratio; + double emission_time_ms; + + GenerationRequest(std::string const &prompt_, + double slo_ratio_, + double emission_time_ms_, + bool add_special_tokens_ = true) + : prompt(prompt_), slo_ratio(slo_ratio_), + emission_time_ms(emission_time_ms_), + add_special_tokens(add_special_tokens_) {} }; struct GenerationResult { @@ -40,10 +66,139 @@ struct GenerationResult { std::string output_text; std::vector input_tokens; std::vector output_tokens; + double slo_ratio; + double emission_time_ms; + int decoding_steps; }; -#include -#include +// Contains the configuration for how to emit requests to the server, +// managing the request arrival rate. +class EmissionMachine { +public: + enum class EmissionMode { Constant, Poisson, Trace }; + EmissionMode mode; + double elapsed_time_ms; + double last_request_time_ms; + double req_per_s; + std::vector> slo_ratios; + + EmissionMachine(EmissionMode mode_, + double req_per_s_, + std::vector> slo_ratios_) + : mode(mode_), elapsed_time_ms(0), last_request_time_ms(0), + req_per_s(req_per_s_), slo_ratios(slo_ratios_) { + // cumulate the slo ratios for sampling + for (size_t i = 1; i < slo_ratios.size(); i++) { + slo_ratios[i].second += slo_ratios[i - 1].second; + } + } + void wait_until_next_request(); + + // Simulate next request arrival time + virtual double get_next_interval_ms() = 0; + virtual double sample_slo_ratio(); + double get_elapsed_time_ms(); +}; + +class EmissionTrace { +public: + std::string prompt; + int input_length, output_length; + double slo_ratio; + double emission_time_ms; + + EmissionTrace(std::string prompt_, + int input_length_, + int output_length_, + double slo_ratio_, + double emission_time_ms_) + : prompt(prompt_), input_length(input_length_), + output_length(output_length_), slo_ratio(slo_ratio_), + emission_time_ms(emission_time_ms_) {} + EmissionTrace(GenerationResult const &result) + : prompt(result.input_text), input_length(result.input_tokens.size()), + output_length(result.output_tokens.size()), slo_ratio(result.slo_ratio), + emission_time_ms(result.emission_time_ms) {} + EmissionTrace(json const &json_obj); + + json to_json() const; +}; + +class ConstantEmissionMachine : public EmissionMachine { +public: + double interval_ms; + + ConstantEmissionMachine(double req_per_s_, + std::vector> slo_ratios_) + : EmissionMachine(EmissionMode::Constant, req_per_s_, slo_ratios_), + interval_ms(req_per_s_ > 0 ? 
1e3 / req_per_s_ : 0) {} + + double get_next_interval_ms() override; +}; + +class PoissonEmissionMachine : public EmissionMachine { +public: + double lambda; + + PoissonEmissionMachine(double req_per_s_, + std::vector> slo_ratios_) + : EmissionMachine(EmissionMode::Poisson, req_per_s_, slo_ratios_), + lambda(req_per_s_) {} + + double get_next_interval_ms() override; +}; + +class TraceEmissionMachine : public EmissionMachine { +public: + std::vector timestamps, ratios; + size_t idx; + + TraceEmissionMachine(std::vector const ×tamps_, + std::vector const &ratios_) + : EmissionMachine(EmissionMode::Trace, 0, {}), timestamps(timestamps_), + ratios(ratios_), idx(0) {} + + double get_next_interval_ms() override; + double sample_slo_ratio() override; +}; + +struct RotaryEmbeddingMeta { + bool apply_rotary_embedding = false; + float rope_theta = 10000.0f; + std::string rope_type = "default"; + float factor = 8.0f; + float low_freq_factor = 1.0f; + float high_freq_factor = 4.0f; + int original_max_position_embeddings = 8192; + + RotaryEmbeddingMeta(bool apply_rotary_embedding_ = false, + float rope_theta_ = 10000.0f, + std::string rope_type_ = "default", + float factor_ = 8.0f, + float low_freq_factor_ = 1.0f, + float high_freq_factor_ = 4.0f, + int original_max_position_embeddings_ = 8192) + : apply_rotary_embedding(apply_rotary_embedding_), + rope_theta(rope_theta_), rope_type(rope_type_), factor(factor_), + low_freq_factor(low_freq_factor_), high_freq_factor(high_freq_factor_), + original_max_position_embeddings(original_max_position_embeddings_) {} + + friend std::ostream &operator<<(std::ostream &os, + RotaryEmbeddingMeta const &meta) { + os << std::boolalpha // To print bool as true/false instead of 1/0 + << "RotaryEmbeddingMeta {\n" + << " apply_rotary_embedding: " << meta.apply_rotary_embedding << ",\n" + << " rope_theta: " << meta.rope_theta << ",\n" + << " rope_type: \"" << meta.rope_type << "\",\n" + << " factor: " << meta.factor << ",\n" + << " low_freq_factor: " << meta.low_freq_factor << ",\n" + << " high_freq_factor: " << meta.high_freq_factor << ",\n" + << " original_max_position_embeddings: " + << meta.original_max_position_embeddings << "\n" + << "}"; + return os; + } +}; std::string join_path(std::vector const &paths); diff --git a/include/flexflow/layer.h b/include/flexflow/layer.h index 69a57e4e1..9d9045a44 100644 --- a/include/flexflow/layer.h +++ b/include/flexflow/layer.h @@ -32,11 +32,13 @@ class Layer { void add_float_property(std::string const &key, float value); void add_int_vector_property(std::string const &key, std::vector const &value); + void add_string_property(std::string const &key, std::string const &value); void add_initializer(std::string const &key, Initializer *initializer); bool get_int_property(std::string const &key, long long &value) const; bool get_float_property(std::string const &key, float &value) const; bool get_int_vector_property(std::string const &key, std::vector &value) const; + bool get_string_property(std::string const &key, std::string &value) const; bool get_initializer(std::string const &key, Initializer *&initializer) const; Tensor get_parameter(int index); void print(); @@ -59,6 +61,7 @@ class Layer { std::unordered_map float_properties; std::unordered_map initializers; std::unordered_map> int_vector_properties; + std::unordered_map string_properties; }; }; // namespace FlexFlow diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 95be9ab58..6d9356aee 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h 
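The emission classes above only declare their sampling hooks; their definitions live elsewhere in the patch. As a rough sketch of the intended behavior (assuming slo_ratios holds std::pair<double, double> entries of {slo_ratio, weight} whose second components were turned into cumulative weights by the EmissionMachine constructor, and that Poisson arrivals use exponential inter-arrival times at rate req_per_s), hypothetical helpers could look like this:

#include <random>
#include <utility>
#include <vector>

// Hypothetical helper: exponential inter-arrival time, in milliseconds, for a
// Poisson arrival process with rate req_per_s (requests per second).
inline double poisson_interval_ms(double req_per_s) {
  if (req_per_s <= 0.0) {
    return 0.0;
  }
  static thread_local std::mt19937_64 gen{std::random_device{}()};
  std::exponential_distribution<double> dist(req_per_s); // events per second
  return dist(gen) * 1e3;                                // seconds -> ms
}

// Hypothetical helper: draw an SLO ratio from the cumulative weights that the
// EmissionMachine constructor builds up in slo_ratios.
inline double sample_slo_ratio_from_cumulative(
    std::vector<std::pair<double, double>> const &slo_ratios) {
  if (slo_ratios.empty()) {
    return 1.0; // assumed default when no ratios are configured
  }
  static thread_local std::mt19937_64 gen{std::random_device{}()};
  std::uniform_real_distribution<double> dist(0.0, slo_ratios.back().second);
  double u = dist(gen);
  for (auto const &entry : slo_ratios) {
    if (u <= entry.second) { // first bucket whose cumulative weight covers u
      return entry.first;
    }
  }
  return slo_ratios.back().first;
}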
@@ -145,6 +145,9 @@ enum TaskIDs { TOPK_INIT_TASK_ID, TOPK_FWD_TASK_ID, TOPK_BWD_TASK_ID, + GUMBEL_TOPK_INIT_TASK_ID, + GUMBEL_TOPK_INF_TASK_ID, + GUMBEL_TOPK_INF_SPECULATIVE_TASK_ID, ARG_TOPK_INIT_TASK_ID, ARG_TOPK_INF_TASK_ID, ARG_TOPK_INF_SPECULATIVE_TASK_ID, @@ -164,8 +167,8 @@ enum TaskIDs { RMSNORM_INF_TASK_ID, RESIDUAL_RMSNORM_INIT_TASK_ID, RESIDUAL_RMSNORM_INF_TASK_ID, - BEAM_TOPK_INIT_TASK_ID, - BEAM_TOPK_INF_TASK_ID, + // BEAM_TOPK_INIT_TASK_ID, + // BEAM_TOPK_INF_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID, @@ -244,11 +247,13 @@ enum TaskIDs { RM_LOAD_TOKENS_TASK_ID, RM_LOAD_POSITION_TASK_ID, RM_LOAD_BATCH_CONFIG_TASK_ID, + RM_GET_NEXT_BATCH_CONFIG_TASK_ID, RM_PREPARE_NEXT_BATCH_TASK_ID, RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, - RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, + RM_PREPARE_NEXT_BATCH_SPEC_TASK_ID, RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, RM_BACKGROUND_SERVING_TASK_ID, + LOAD_WEIGHT_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, CUSTOM_GPU_TASK_ID_1, @@ -335,11 +340,12 @@ class Reshape; class Softmax; class Split; class TopK; +class GumbelTopK; class ArgTopK; class Transpose; class RMSNorm; class ResidualRMSNorm; -class BeamTopK; +// class BeamTopK; class SpecIncMultiHeadSelfAttention; class Sampling; class ArgMax; @@ -407,74 +413,74 @@ class FFModel { bool cpu_offload; // C++ APIs for constructing models // Add an exp layer - Tensor exp(const Tensor x, char const *name = NULL); + Tensor exp(Tensor const x, char const *name = NULL); // Add an add layer - Tensor add(const Tensor x, - const Tensor y, + Tensor add(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a subtract layer - Tensor subtract(const Tensor x, - const Tensor y, + Tensor subtract(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a multiply layer - Tensor multiply(const Tensor x, - const Tensor y, + Tensor multiply(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a divide layer - Tensor divide(const Tensor x, - const Tensor y, + Tensor divide(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a max layer - Tensor max(const Tensor x, - const Tensor y, + Tensor max(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a min layer - Tensor min(const Tensor x, - const Tensor y, + Tensor min(Tensor const x, + Tensor const y, bool inplace_a = false, char const *name = NULL); // Add a rsqrt layer - Tensor rsqrt(const Tensor x, bool inplace = true, char const *name = NULL); + Tensor rsqrt(Tensor const x, bool inplace = true, char const *name = NULL); // Add a pow layer - Tensor pow(const Tensor x, + Tensor pow(Tensor const x, float const exponent, bool inplace = true, char const *name = NULL); // Add a scalar multiply layer - Tensor scalar_multiply(const Tensor x, + Tensor scalar_multiply(Tensor const x, float const scalar, bool inplace = true, char const *name = NULL); - Tensor scalar_add(const Tensor x, + Tensor scalar_add(Tensor const x, float const scalar, bool inplace = true, char const *name = NULL); - Tensor scalar_sub(const Tensor x, + Tensor scalar_sub(Tensor const x, float const scalar, bool inplace = true, char const *name = NULL); - Tensor scalar_truediv(const Tensor x, + Tensor scalar_truediv(Tensor const x, float const scalar, bool inplace = true, char const *name = NULL); // Add a sin layer - Tensor sin(const Tensor x, 
char const *name = NULL); + Tensor sin(Tensor const x, char const *name = NULL); // Add a cos layer - Tensor cos(const Tensor x, char const *name = NULL); + Tensor cos(Tensor const x, char const *name = NULL); // Add an activation layer - Tensor relu(const Tensor x, bool inplace = true, char const *name = NULL); - Tensor identity(const Tensor x, char const *name = NULL); - Tensor gelu(const Tensor x, char const *name = NULL); - Tensor sigmoid(const Tensor x, char const *name = NULL); - Tensor tanh(const Tensor x, char const *name = NULL); - Tensor elu(const Tensor x, bool inplace = true, char const *name = NULL); + Tensor relu(Tensor const x, bool inplace = true, char const *name = NULL); + Tensor identity(Tensor const x, char const *name = NULL); + Tensor gelu(Tensor const x, char const *name = NULL); + Tensor sigmoid(Tensor const x, char const *name = NULL); + Tensor tanh(Tensor const x, char const *name = NULL); + Tensor elu(Tensor const x, bool inplace = true, char const *name = NULL); // Add a 2D convolutional layer - Tensor conv2d(const Tensor input, + Tensor conv2d(Tensor const input, int outChannels, int kernelH, int kernelW, @@ -490,12 +496,12 @@ class FFModel { Initializer *bias_initializer = NULL, char const *name = NULL); // Add a dropout layer - Tensor dropout(const Tensor input, + Tensor dropout(Tensor const input, float rate, unsigned long long seed = 0, char const *name = NULL); // Add an embedding layer - Tensor embedding(const Tensor input, + Tensor embedding(Tensor const input, int num_entries, int outDim, AggrMode aggr, @@ -504,13 +510,13 @@ class FFModel { Initializer *kernel_initializer = NULL, char const *name = NULL); // Add a gather layer - Tensor gather(const Tensor input, - const Tensor index, + Tensor gather(Tensor const input, + Tensor const index, int dim, char const *name = NULL); // Add a group_by layer - void group_by(const Tensor data, - const Tensor assign, + void group_by(Tensor const data, + Tensor const assign, Tensor *outputs, int n, float alpha, @@ -532,7 +538,7 @@ class FFModel { float lambda_bal, char const *name = NULL); // Add a 2D pooling layer - Tensor pool2d(const Tensor input, + Tensor pool2d(Tensor const input, int kernelH, int kernelW, int strideH, @@ -543,7 +549,7 @@ class FFModel { ActiMode activation = AC_MODE_NONE, char const *name = NULL); // Add a layer_norm layer - Tensor layer_norm(const Tensor input, + Tensor layer_norm(Tensor const input, std::vector const &axes, bool elementwise_affine, float eps, @@ -551,9 +557,9 @@ class FFModel { DataType data_type = DT_NONE, char const *name = NULL); // Add a layer_norm layer with residual(s) - void residual_layer_norm(const Tensor input, - const Tensor residual1, - const Tensor residual2, + void residual_layer_norm(Tensor const input, + Tensor const residual1, + Tensor const residual2, Tensor *outputs, bool use_two_residuals, std::vector const &axes, @@ -563,8 +569,8 @@ class FFModel { DataType data_type = DT_NONE, char const *name = NULL); // Add a add_bias_residual_layer_norm layer - void add_bias_residual_layer_norm(const Tensor input, - const Tensor residual, + void add_bias_residual_layer_norm(Tensor const input, + Tensor const residual, Tensor *outputs, std::vector const &axes, bool elementwise_affine, @@ -573,41 +579,42 @@ class FFModel { DataType data_type = DT_NONE, char const *name = NULL); // Add a sigmoid_silu_multi layer - Tensor sigmoid_silu_multi(const Tensor input1, - const Tensor input2, + Tensor sigmoid_silu_multi(Tensor const input1, + Tensor const input2, + int 
intermediate_size, DataType data_type = DT_NONE, char const *name = NULL); // Add a batch_norm layer Tensor - batch_norm(const Tensor input, bool relu = true, char const *name = NULL); + batch_norm(Tensor const input, bool relu = true, char const *name = NULL); // Add a batch_matmul layer - Tensor batch_matmul(const Tensor A, - const Tensor B, + Tensor batch_matmul(Tensor const A, + Tensor const B, int a_seq_length_dim = -1, int b_seq_length_dim = -1, char const *name = nullptr); // Add a root mean square layer - Tensor rms_norm(const Tensor input, + Tensor rms_norm(Tensor const input, float eps, int dim, DataType data_type = DT_NONE, char const *name = NULL); // Add a residual root mean square layer - void residual_rms_norm(const Tensor input1, - const Tensor input2, + void residual_rms_norm(Tensor const input1, + Tensor const input2, Tensor *outputs, float eps, int dim, DataType data_type = DT_NONE, char const *name = NULL); - // Add a beam search top k layer - Tensor beam_top_k(const Tensor input, - int max_beam_size, - bool sorted, - char const *name = NULL); + // // Add a beam search top k layer + // Tensor beam_top_k(Tensor const input, + // int max_beam_size, + // bool sorted, + // char const *name = NULL); // Add a dense layer - Tensor dense(const Tensor input, + Tensor dense(Tensor const input, int outDim, ActiMode activation = AC_MODE_NONE, bool use_bias = true, @@ -619,7 +626,7 @@ class FFModel { float regularizer_lambda = 0.0, char const *name = NULL); // Add a cast layer - Tensor cast(const Tensor input, DataType dtype, char const *name = nullptr); + Tensor cast(Tensor const input, DataType dtype, char const *name = nullptr); // Add a concat layer Tensor concat(int n, Tensor const *tensors, int axis, char const *name = NULL); @@ -634,58 +641,64 @@ class FFModel { int experts_internal_dim_size = 0, // hidden dimension for internal layers char const *name = NULL); // Add a mean layer - Tensor mean(const Tensor input, + Tensor mean(Tensor const input, std::vector const &dims, bool keepdims, char const *name); // Add a moe layer (wrapping topk, group_by and aggregate operators) - Tensor moe(const Tensor input, + Tensor moe(Tensor const input, int num_exp, int num_select, int expert_hidden_size, float alpha, float lambda); // Add a split layer - void split(const Tensor input, + void split(Tensor const input, Tensor *outputs, std::vector const &split, int axis, char const *name = NULL); // Add a flat layer - Tensor flat(const Tensor input, char const *name = NULL); + Tensor flat(Tensor const input, char const *name = NULL); // Add a softmax layer - Tensor softmax(const Tensor input, + Tensor softmax(Tensor const input, int dim = -1, DataType data_type = DT_NONE, char const *name = NULL); // Create input tensors and constants - Tensor transpose(const Tensor input, + Tensor transpose(Tensor const input, std::vector const &perm, char const *name = NULL); - Tensor reduce_sum(const Tensor input, + Tensor reduce_sum(Tensor const input, std::vector const &axes, bool keepdims = false, char const *name = nullptr); - Tensor reshape(const Tensor input, + Tensor reshape(Tensor const input, std::vector const &shape, char const *name = NULL); - Tensor reverse(const Tensor input, int axis, char const *name = NULL); - void top_k(const Tensor input, + Tensor reverse(Tensor const input, int axis, char const *name = NULL); + void top_k(Tensor const input, Tensor *outputs, int k, bool sorted, char const *name = NULL); - Tensor arg_top_k(const Tensor input, + Tensor gumbel_top_k(Tensor const input, + // 
Tensor *outputs, + int k, + bool sorted, + bool speculative_decoding, + char const *name = NULL); + Tensor arg_top_k(Tensor const input, // Tensor *outputs, int k, bool sorted, - bool speculative_decoding, + bool renormalize, char const *name = NULL); - Tensor argmax(const Tensor input, bool beam_search, char const *name = NULL); - Tensor sampling(const Tensor input, float top_p, char const *name = NULL); - Tensor multihead_attention(const Tensor query, - const Tensor key, - const Tensor value, + Tensor argmax(Tensor const input, bool beam_search, char const *name = NULL); + Tensor sampling(Tensor const input, float top_p, char const *name = NULL); + Tensor multihead_attention(Tensor const query, + Tensor const key, + Tensor const value, int embed_dim, int num_heads, int kdim = 0, @@ -697,42 +710,7 @@ class FFModel { DataType data_type = DT_NONE, Initializer *kernel_initializer = NULL, char const *name = NULL); - Tensor inc_multihead_self_attention(const Tensor input, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); - Tensor - spec_inc_multihead_self_attention(const Tensor input, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); - Tensor inc_multihead_self_attention_verify( + Tensor inc_multihead_self_attention( const Tensor input, int embed_dim, int num_heads, @@ -744,52 +722,73 @@ class FFModel { bool add_zero_attn = false, DataType data_type = DT_NONE, Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), bool scaling_query = false, float scaling_factor = 1.0f, bool qk_prod_scaling = true, bool position_bias = false, + bool streaming_cache = false, char const *name = NULL); - Tensor inc_multiquery_self_attention(const Tensor input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); - Tensor - spec_inc_multiquery_self_attention(const Tensor input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); - Tensor inc_multiquery_self_attention_verify( + Tensor 
spec_inc_multihead_self_attention( const Tensor input, int embed_dim, + int num_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = false, + bool add_bias_kv = false, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + bool streaming_cache = false, + char const *name = NULL); + Tensor inc_multihead_self_attention_verify( + Tensor const input, + int embed_dim, + int num_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = false, + bool add_bias_kv = false, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + char const *name = NULL); + Tensor groupquery_self_attention( + Tensor const input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = false, + bool add_bias_kv = false, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + bool streaming_cache = false, + char const *name = NULL); + Tensor spec_inc_multiquery_self_attention( + Tensor const input, + int embed_dim, int num_q_heads, int num_kv_heads, int kdim = 0, @@ -800,7 +799,27 @@ class FFModel { bool add_zero_attn = false, DataType data_type = DT_NONE, Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + bool streaming_cache = false, + char const *name = NULL); + Tensor inc_multiquery_self_attention_verify( + Tensor const input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = false, + bool add_bias_kv = false, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), bool scaling_query = false, float scaling_factor = 1.0f, bool qk_prod_scaling = true, @@ -810,7 +829,11 @@ class FFModel { // Inference APIs // ======================================== std::vector generate(std::vector &prompts, - int max_seq_length); + EmissionMachine &emission_machine); + + std::vector + generate(std::vector &requests, + EmissionMachine &emission_machine); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], @@ -820,7 +843,7 @@ class FFModel { bool create_grad = true); ParallelTensor create_parallel_tensor_legion_ordering(int num_dim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, int owner_idx = 0, @@ -833,7 +856,7 @@ class FFModel { int owner_idx = 0, bool create_grad = true); ParallelTensor create_parallel_tensor(int num_dim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, int 
owner_idx = 0, @@ -846,7 +869,7 @@ class FFModel { int owner_idx = 0, bool create_grad = true); template - ParallelTensor create_parallel_tensor(const ParallelDim dims[], + ParallelTensor create_parallel_tensor(ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, int owner_idx = 0, @@ -870,7 +893,7 @@ class FFModel { ParameterSyncType sync_type = ParameterSyncType::NONE); template ParallelParameter create_parallel_weight( - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, bool create_grad = true, @@ -878,7 +901,7 @@ class FFModel { ParameterSyncType sync_type = ParameterSyncType::NONE); ParallelParameter create_parallel_weight( int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, bool create_grad = true, @@ -886,7 +909,7 @@ class FFModel { ParameterSyncType sync_type = ParameterSyncType::NONE); ParallelParameter create_parallel_weight_legion_ordering( int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op = NULL, bool create_grad = true, @@ -895,7 +918,7 @@ class FFModel { void map_tensor(ParallelTensor tensor, Op const *parallel_op); void map_weight(ParallelTensor tensor, Op const *parallel_op); - bool get_parallel_tensor_from_tensor(const Tensor tensor, + bool get_parallel_tensor_from_tensor(Tensor const tensor, ParallelTensor ¶llel_tensor) const; template @@ -936,7 +959,7 @@ class FFModel { // Internal PCG::Node creation APIs // ======================================== template - PCG::Node get_or_create_node(const typename T::Input &input, + PCG::Node get_or_create_node(typename T::Input const &input, typename T::Params const ¶ms) { using Params = typename T::Params; @@ -966,50 +989,50 @@ class FFModel { return this->new_node(op); } - PCG::Node get_or_create_noop_node(const ParallelTensor input); + PCG::Node get_or_create_noop_node(ParallelTensor const input); PCG::Node get_or_create_input_node(ParallelTensorShape const &); PCG::Node get_or_create_fused_parallel_node( - const ParallelTensor input, + ParallelTensor const input, std::vector const ¶llel_ops); - PCG::Node get_or_create_parallel_op_node(const ParallelTensor input, + PCG::Node get_or_create_parallel_op_node(ParallelTensor const input, ParallelOpInfo const &); // ======================================== // Internal APIs that should not be invoked from applications // ======================================== void create_disjoint_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], Legion::IndexSpace const &part_is, Legion::LogicalRegion const ®ion, Legion::LogicalPartition &part); template void create_disjoint_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], Legion::IndexSpaceT const &part_is, Legion::LogicalRegion const ®ion, Legion::LogicalPartition &part); void create_aliased_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, Legion::IndexSpace const &part_is, Legion::LogicalRegion const ®ion, Legion::LogicalPartition &part); template void create_aliased_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, Legion::IndexSpaceT const &part_is, Legion::LogicalRegion const ®ion, Legion::LogicalPartition &part); template - void create_disjoint_partition(const ParallelTensor tensor, + void create_disjoint_partition(ParallelTensor const tensor, Legion::IndexSpaceT const &part_is, Legion::LogicalPartition 
&part_fwd, Legion::LogicalPartition &part_bwd); template void create_data_parallel_partition_with_diff_dims( - const ParallelTensor tensor, + ParallelTensor const tensor, Legion::IndexSpaceT const &task_is, Legion::LogicalPartition &part_fwd, Legion::LogicalPartition &part_bwd); @@ -1059,6 +1082,10 @@ class FFModel { CompMode comp_mode = COMP_MODE_TRAINING); void compile_inference(); void set_transformer_layer_id(int id); + void set_num_transformer_layers(int num_layers); + void set_num_kv_heads(int num_heads); + void set_qkv_dim(int qkv_dim); + void set_size_dt(int size_dt); void set_position_offset(int offset); void graph_optimize(size_t budget, bool only_data_parallel, @@ -1078,6 +1105,7 @@ class FFModel { bool use_propagation) const; #ifdef FF_USE_NCCL ncclComm_t *find_nccl_comms(MachineView const &view) const; + void finish_nccl_comms(); #endif #ifdef FF_USE_PROPAGATE void propagate(std::map const ¤t, @@ -1097,7 +1125,7 @@ class FFModel { Legion::IndexSpace get_or_create_task_is(ParallelConfig const &pc); Legion::IndexSpace get_or_create_task_is(MachineView const &view); Legion::IndexSpace get_or_create_task_is(Legion::Domain const &domain); - Legion::IndexSpace get_or_create_task_is(const ParallelTensor); + Legion::IndexSpace get_or_create_task_is(ParallelTensor const); Legion::IndexSpace get_task_is(Legion::Domain const &domain) const; Legion::IndexSpace get_task_is(ParallelConfig const &pc) const; Legion::IndexSpace get_task_is(MachineView const &view) const; @@ -1119,6 +1147,10 @@ class FFModel { size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid; size_t current_transformer_layer_id; // positional embedding start offset + int num_transformer_layers; + int num_kv_heads; + int qkv_dim; + int size_dt; int position_offset; FFConfig config; FFIterationConfig iter_config; @@ -1203,8 +1235,8 @@ class FFModel { std::unordered_map< std::pair, IncMultiHeadSelfAttention *>, - std::unordered_map, - BeamTopK *>, + // std::unordered_map, + // BeamTopK *>, std::unordered_map, Sampling *>, std::unordered_map, @@ -1223,6 +1255,8 @@ class FFModel { std::unordered_map, Softmax *>, std::unordered_map, TopK *>, + std::unordered_map, + GumbelTopK *>, std::unordered_map, ArgTopK *>, std::unordered_map, diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 1b19bdb82..311699d92 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -19,7 +19,7 @@ namespace FlexFlow { -extern LegionRuntime::Logger::Category log_measure; +extern Legion::Logger log_measure; class OpMeta; class Simulator; @@ -233,6 +233,8 @@ class Op { std::vector const &, MachineView const *mv = nullptr) { assert(false); + Legion::FutureMap empty_map; + return empty_map; }; virtual void print_layer(FFModel const &model) = 0; template diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 5b187839e..766d4a582 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -8,7 +8,7 @@ #include "flexflow/ops/argmax_params.h" #include "flexflow/ops/attention_params.h" #include "flexflow/ops/batch_matmul_params.h" -#include "flexflow/ops/beam_topk_params.h" +// #include "flexflow/ops/beam_topk_params.h" #include "flexflow/ops/cast_params.h" #include "flexflow/ops/concat_params.h" #include "flexflow/ops/conv_2d_params.h" @@ -20,6 +20,7 @@ #include "flexflow/ops/flat_params.h" #include "flexflow/ops/gather_params.h" #include "flexflow/ops/groupby_params.h" +#include "flexflow/ops/gumbel_topk_params.h" #include 
"flexflow/ops/inc_multihead_self_attention_params.h" #include "flexflow/ops/layer_norm_params.h" #include "flexflow/ops/linear_params.h" @@ -69,7 +70,7 @@ using OperatorParameters = mp::variant namespace FlexFlow { class ArgTopKMeta : public OpMeta { public: - ArgTopKMeta(FFHandler handle, Op const *op); bool sorted; int k; - bool speculative_decoding; + bool renormalize; + Realm::RegionInstance reserveInst; + void *half_precision_output; + int max_output_size; + std::unordered_map device_resources; + ArgTopKMeta(FFHandler handle, + Op const *op, + MemoryAllocator &gpu_mem_allocator); + ~ArgTopKMeta(void); }; class ArgTopK : public Op { @@ -22,15 +32,15 @@ class ArgTopK : public Op { using Input = ParallelTensor; ArgTopK(FFModel &model, LayerID const &layer_guid, - const ParallelTensor input, + ParallelTensor const input, int k, bool sorted, - bool speculative_decoding, + bool renormalize, char const *name); ArgTopK(FFModel &model, LayerID const &layer_guid, ArgTopK const &other, - const ParallelTensor input); + ParallelTensor const input); ArgTopK(FFModel &model, Params const ¶ms, Input const input, @@ -64,7 +74,7 @@ class ArgTopK : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static BeamInferenceResult inference_speculative_task( + static InferenceResult inference_speculative_task( Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -81,28 +91,29 @@ class ArgTopK : public Op { MachineView const &pc, CostMetrics &cost_metrics) const override; template - static void forward_kernel(ArgTopKMeta const *m, + static void forward_kernel(ArgTopKMeta *m, DT const *input_ptr, - float *output_ptr, + DT *output_ptr, int *indices_ptr, size_t batch_size, int length, int k, bool sorted, - BeamSearchBatchConfig const *bc, + bool renormalize, + BatchConfig const *bc, ffStream_t stream); - static void forward_kernel_wrapper(ArgTopKMeta const *m, + static void forward_kernel_wrapper(ArgTopKMeta *m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &prob, GenericTensorAccessorW const &indices, int batch_size, - BeamSearchBatchConfig const *bc); + BatchConfig const *bc); Params get_params() const; public: int k; bool sorted; - bool speculative_decoding; + bool renormalize; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/arg_topk_params.h b/include/flexflow/ops/arg_topk_params.h index b2876c011..306ce9dd1 100644 --- a/include/flexflow/ops/arg_topk_params.h +++ b/include/flexflow/ops/arg_topk_params.h @@ -11,7 +11,7 @@ struct ArgTopKParams { LayerID layer_guid; int k; bool sorted; - bool speculative_decoding; + bool renormalize; char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/argmax.h b/include/flexflow/ops/argmax.h index 298059e3e..e58e8ca80 100644 --- a/include/flexflow/ops/argmax.h +++ b/include/flexflow/ops/argmax.h @@ -34,10 +34,10 @@ class ArgMax : public Op { using Params = ArgMaxParams; using Input = ParallelTensor; ArgMax(FFModel &model, - const ParallelTensor input, + ParallelTensor const input, bool beam_search, char const *name); - ArgMax(FFModel &model, ArgMax const &other, const ParallelTensor input); + ArgMax(FFModel &model, ArgMax const &other, ParallelTensor const input); ArgMax(FFModel &model, Params const ¶ms, Input const input, @@ -66,7 +66,7 @@ class ArgMax : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static BeamInferenceResult + static InferenceResult inference_task_beam(Legion::Task const 
*task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/beam_topk_params.h b/include/flexflow/ops/beam_topk_params.h deleted file mode 100644 index 3e09848c9..000000000 --- a/include/flexflow/ops/beam_topk_params.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef _FLEXFLOW_BEAM_TOPK_PARAMS_H -#define _FLEXFLOW_BEAM_TOPK_PARAMS_H - -#include "flexflow/ffconst.h" -#include "flexflow/fftype.h" -#include "flexflow/parallel_tensor.h" - -namespace FlexFlow { - -struct BeamTopKParams { - LayerID layer_guid; - bool sorted; - int max_beam_width; - char name[MAX_OPNAME]; - bool is_valid(ParallelTensorShape const &) const; -}; -bool operator==(BeamTopKParams const &, BeamTopKParams const &); - -} // namespace FlexFlow - -namespace std { -template <> -struct hash { - size_t operator()(FlexFlow::BeamTopKParams const &) const; -}; -} // namespace std - -#endif // _FLEXFLOW_BEAM_TOPK_PARAMS_H diff --git a/include/flexflow/ops/fused.h b/include/flexflow/ops/fused.h index a8326e9ab..b8e417ddc 100644 --- a/include/flexflow/ops/fused.h +++ b/include/flexflow/ops/fused.h @@ -1,17 +1,41 @@ #ifndef _FLEXFLOW_FUSED_H_ #define _FLEXFLOW_FUSED_H_ +#include "flexflow/batch_config.h" #include "flexflow/model.h" +#include "graph_params.h" namespace FlexFlow { +// declare Legion names +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::LogicalPartition; +using Legion::LogicalRegion; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Runtime; +using Legion::Task; + class FusedOp; class FusedOpMeta { public: - FusedOpMeta(void) {} + FusedOpMeta(void) { + graphCaptured = false; + graph_collections.reserve(BatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_NUM_TOKENS * 2); + } OpMeta *meta[MAX_NUM_FUSED_OPERATORS]; FusedOp *fused_op; int numOperators; + bool graphCaptured = false; +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + std::unordered_map graph_collections; +#else + std::unordered_map graph_collections; +#endif }; class FusedOp : public Op { diff --git a/include/flexflow/ops/graph_params.h b/include/flexflow/ops/graph_params.h new file mode 100644 index 000000000..0362801c8 --- /dev/null +++ b/include/flexflow/ops/graph_params.h @@ -0,0 +1,51 @@ +#ifndef _FLEXFLOW_GRAPH_PARAMS_H_ +#define _FLEXFLOW_GRAPH_PARAMS_H_ + +#include +#include + +namespace FlexFlow { +struct GraphParams { + int num_active_requests; + int num_active_tokens; + bool prompt_phase; + + GraphParams(int num_active_requests, int num_active_tokens, bool prompt_phase) + : num_active_requests(num_active_requests), + num_active_tokens(num_active_tokens), prompt_phase(prompt_phase) {} + + void Print() const { + printf("GraphParams, num_active_requests: %d, num_active_tokens: %d, " + "prompt_phase: %d\n\n", + num_active_requests, + num_active_tokens, + prompt_phase); + } +}; + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::GraphParams const &gp) const { + return std::hash()(gp.num_active_requests) ^ + std::hash()(gp.num_active_tokens) ^ + std::hash()(gp.prompt_phase); + } +}; +} // namespace std + +namespace std { +template <> +struct equal_to { + bool operator()(FlexFlow::GraphParams const &lhs, + FlexFlow::GraphParams const &rhs) const { + return lhs.num_active_requests == rhs.num_active_requests && + lhs.num_active_tokens == rhs.num_active_tokens && + lhs.prompt_phase == rhs.prompt_phase; + } +}; +} // namespace std + +#endif diff --git a/include/flexflow/ops/beam_topk.h 
b/include/flexflow/ops/gumbel_topk.h similarity index 53% rename from include/flexflow/ops/beam_topk.h rename to include/flexflow/ops/gumbel_topk.h index 9466ba2a3..b74361fb2 100644 --- a/include/flexflow/ops/beam_topk.h +++ b/include/flexflow/ops/gumbel_topk.h @@ -1,45 +1,58 @@ -#ifndef _FLEXFLOW_BEAM_TOPK_H_ -#define _FLEXFLOW_BEAM_TOPK_H_ +#ifndef _FLEXFLOW_GUMBEL_TOPK_H_ +#define _FLEXFLOW_GUMBEL_TOPK_H_ #include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/node.h" -#include "flexflow/ops/beam_topk_params.h" +#include "flexflow/ops/gumbel_topk_params.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include +#include +#elif defined(FF_USE_HIP_ROCM) +#include +#include +#endif #include "flexflow/utils/memory_allocator.h" namespace FlexFlow { -class BeamTopKMeta : public OpMeta { +class GumbelTopKMeta : public OpMeta { public: - BeamTopKMeta(FFHandler handle, - Op const *op, - MemoryAllocator &gpu_mem_allocator); - ~BeamTopKMeta(void); bool sorted; - int max_beam_width; - int *parent_ids; - void *acc_probs; - int *block_start_index; - int *request_id; - int *tokens_per_request; + int k; + bool speculative_decoding; Realm::RegionInstance reserveInst; +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + curandState *state; + int state_max_length; +#elif defined(FF_USE_HIP_ROCM) + hiprandState *state; +#endif + GumbelTopKMeta(FFHandler handle, + Op const *op, + MemoryAllocator &gpu_mem_allocator); + ~GumbelTopKMeta(void); }; -class BeamTopK : public Op { +class GumbelTopK : public Op { public: - using Params = BeamTopKParams; + using Params = GumbelTopKParams; using Input = ParallelTensor; - BeamTopK(FFModel &model, - const ParallelTensor input, - LayerID const &_layer_guid, - int max_beam_width, - bool sorted, - char const *name); - BeamTopK(FFModel &model, BeamTopK const &other, const ParallelTensor input); - BeamTopK(FFModel &model, - Params const ¶ms, - Input const input, - char const *name = nullptr); + GumbelTopK(FFModel &model, + LayerID const &layer_guid, + ParallelTensor const input, + int k, + bool sorted, + bool speculative_decoding, + char const *name); + GumbelTopK(FFModel &model, + LayerID const &layer_guid, + GumbelTopK const &other, + ParallelTensor const input); + GumbelTopK(FFModel &model, + Params const ¶ms, + Input const input, + char const *name = nullptr); void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, @@ -64,11 +77,16 @@ class BeamTopK : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static BeamInferenceResult + static InferenceResult inference_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static InferenceResult inference_speculative_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); void serialize(Legion::Serializer &s) const override; static PCG::Node deserialize(FFModel &ff, Legion::Deserializer &d, @@ -81,30 +99,31 @@ class BeamTopK : public Op { MachineView const &pc, CostMetrics &cost_metrics) const override; template - static void forward_kernel(BeamTopKMeta const *m, - BeamSearchBatchConfig const *bc, + static void forward_kernel(GumbelTopKMeta const *m, DT const *input_ptr, - float *output_ptr, + float *log_probs_ptr, + float *perturbed_log_probs_ptr, int *indices_ptr, - int *parent_ptr, - int batch_size, + size_t batch_size, int length, + int k, bool sorted, + BatchConfig const *bc, ffStream_t stream); - static void 
forward_kernel_wrapper(BeamTopKMeta const *m, - BeamSearchBatchConfig const *bc, - GenericTensorAccessorR const &input, - float *output_ptr, - int *indices_ptr, - int *parent_ptr, - int batch_size, - int length, - bool sorted); + static void + forward_kernel_wrapper(GumbelTopKMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &log_probs, + GenericTensorAccessorW const &perturbed_log_probs, + GenericTensorAccessorW const &indices, + int batch_size, + BatchConfig const *bc); Params get_params() const; public: + int k; bool sorted; - int max_beam_width; + bool speculative_decoding; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/gumbel_topk_params.h b/include/flexflow/ops/gumbel_topk_params.h new file mode 100644 index 000000000..480e7b9cc --- /dev/null +++ b/include/flexflow/ops/gumbel_topk_params.h @@ -0,0 +1,29 @@ +#ifndef _FLEXFLOW_GUMBEL_TOPK_PARAMS_H +#define _FLEXFLOW_GUMBEL_TOPK_PARAMS_H + +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct GumbelTopKParams { + LayerID layer_guid; + int k; + bool sorted; + bool speculative_decoding; + char name[MAX_OPNAME]; + bool is_valid(ParallelTensorShape const &) const; +}; +bool operator==(GumbelTopKParams const &, GumbelTopKParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::GumbelTopKParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_GUMBEL_TOPK_PARAMS_H diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 43dc527bc..8bc3b15a3 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_H #include "flexflow/accessor.h" +#include "flexflow/batch_config.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/inference.h" @@ -39,7 +40,7 @@ class IncMultiHeadSelfAttention : public Op { bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, @@ -47,6 +48,7 @@ class IncMultiHeadSelfAttention : public Op { bool allocate_weights, DataType _quantization_type, bool _offload, + bool _streaming_cache, int _tensor_parallelism_degree, char const *name); IncMultiHeadSelfAttention(FFModel &model, @@ -61,7 +63,7 @@ class IncMultiHeadSelfAttention : public Op { bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, @@ -69,6 +71,7 @@ class IncMultiHeadSelfAttention : public Op { bool allocate_weights, DataType _quantization_type, bool _offload, + bool _streaming_cache, int _tensor_parallelism_degree, char const *name); IncMultiHeadSelfAttention(FFModel &model, @@ -113,7 +116,7 @@ class IncMultiHeadSelfAttention : public Op { MachineView const &mv, CostMetrics &cost_metrics) const override; - static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, + static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, @@ -126,12 +129,12 @@ class IncMultiHeadSelfAttention : public Op { int num_q_heads, num_kv_heads, tensor_parallelism_degree; float 
dropout, scaling_factor; bool qkv_bias; - bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling, position_bias; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + bool final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; + int hidden_size, qk_dim, v_dim, o_dim; int qoSeqLength, kvSeqLength; DataType quantization_type; - bool offload; + bool offload, streaming_cache; }; class IncMultiHeadSelfAttentionMeta : public OpMeta { @@ -146,14 +149,11 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { IncMultiHeadSelfAttentionMeta(FFHandler handler, InferenceMode infer_mode, Op const *attn, - int _qSize, - int _kSize, - int _vSize, - int _qProjSize, - int _kProjSize, - int _vProjSize, - int _oProjSize, - bool _apply_rotary_embedding, + int _hidden_size, + int _qk_dim, + int _v_dim, + int _o_dim, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _qkv_bias, bool _scaling_query, bool _qk_prod_scaling, @@ -168,18 +168,19 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { int _num_q_heads, int _num_kv_heads, DataType _quantization_type, - bool _offload); + bool _offload, + bool _streaming_cache); ~IncMultiHeadSelfAttentionMeta(void); public: Realm::RegionInstance reserveInst; size_t weights_params, weightSize, biasSize, reserveSpaceSize, quantized_weightSize; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + int hidden_size, qk_dim, v_dim, o_dim; int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads, - hidden_size; + local_hidden_size; bool *has_load_weights; - bool *apply_rotary_embedding; + RotaryEmbeddingMeta *rotary_embedding_meta; bool *qkv_bias; bool *final_bias; bool *scaling_query; @@ -187,12 +188,20 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { bool *position_bias; float scaling_factor; void *weight_ptr, *bias_ptr; // for weight offload - void *devQKVProjArray, *keyCache, *valueCache; - void *qk_prods, *qk_prods_softmax; + void *devQKVProjArray, *queryTmp; + half *outputTmp; + void *kvCache; + bool streaming_cache; + // When enable Streaming cache, we alter relative position each iteration, so + // we need below memory buffer for storing the pre-pos-encoding key value in + // sink and window. 
+ void *streamingPrePosEncBuf; void *attn_heads; char *quantized_weight_ptr; BatchConfig::PerTokenInfo *token_infos; BatchConfig::PerRequestInfo *request_infos; + bool *request_available; + StreamingCacheInfo *streaming_cache_infos; DataType quantization_type; bool offload; #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h index 58681069e..809c4f19e 100644 --- a/include/flexflow/ops/inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -3,6 +3,7 @@ #include "flexflow/ffconst.h" #include "flexflow/fftype.h" +#include "flexflow/inference.h" #include "flexflow/parallel_tensor.h" namespace FlexFlow { @@ -12,10 +13,11 @@ struct IncMultiHeadSelfAttentionParams { int embed_dim, num_q_heads, kdim, vdim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; + bool qkv_bias, final_bias, add_zero_attn, scaling_query, qk_prod_scaling, + position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; DataType quantization_type; - bool offload; + bool offload, streaming_cache; char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/kernels/gemm_impl.h b/include/flexflow/ops/kernels/gemm_impl.h new file mode 100644 index 000000000..f0e08a67d --- /dev/null +++ b/include/flexflow/ops/kernels/gemm_impl.h @@ -0,0 +1,129 @@ +#ifndef GEMM_IMPL_H +#define GEMM_IMPL_H + +#include +#include + +namespace Internal { + +/* TODO: Consider appropriate case to use Lt */ +// #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) +// // Strangely, if mat2 has only 1 row or column, we get +// // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. +// // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == +// mat2_sizes[1] +// // is to use lt interface only when self is bias. 
+// // for cuda 11.4, cublasLtMatmul is activated +// // the last two conditions is to skip 16b transA and non-trans-B having +// // leading dim >> rows when they are sliced from a large tensor +// // see +// fbcode/caffe2/test/test_linalg.py:test_corner_cases_of_cublasltmatmul if +// (!disable_addmm_cuda_lt) { +// useLtInterface = beta.toComplexDouble() == 1.0 && self.dim() == 1 && +// result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] && +// self.is_contiguous() && result.is_contiguous() && +// (scalar_type == at::ScalarType::Double || +// scalar_type == at::ScalarType::Float || +// scalar_type == at::ScalarType::Half || +// scalar_type == at::ScalarType::BFloat16) && +// #if (defined(CUDA_VERSION) && CUDA_VERSION >= 12010) +// mat2_sizes[0] > 1 && mat2_sizes[1] > 1; +// #else +// mat2_sizes[0] > 1 && mat2_sizes[1] > 1 && +// mat2_sizes[0] < 65535 * 32 && mat2_sizes[1] < 65535 * 32 && +// mat1_sizes[0] < 65535 * 32 && mat1_sizes[1] < 65535 * 32 && +// // avoid leading dim >> rows bugs +// ((mat1.strides()[0] == 1 && mat1.strides()[1] == mat1_sizes[0]) || +// (mat1.strides()[1] == 1 && mat1.strides()[0] == mat1_sizes[1]) || +// (scalar_type != at::ScalarType::Half && +// scalar_type != at::ScalarType::BFloat16)) && +// ((mat2.strides()[0] == 1 && mat2.strides()[1] == mat2_sizes[0]) || +// (mat2.strides()[1] == 1 && mat2.strides()[0] == mat2_sizes[1]) || +// (scalar_type != at::ScalarType::Half && +// scalar_type != at::ScalarType::BFloat16)); +// #endif +// } +// #endif + +#define USE_CUBLASLT + +#ifdef USE_CUBLASLT +template +inline void gemm_internal_cublaslt(cublasLtHandle_t handle, + cudaDeviceProp *prop, + void *workspace, + size_t workspace_size, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + Dtype alpha, + Dtype const *a, + int64_t lda, + Dtype const *b, + int64_t ldb, + Dtype beta, + Dtype *c, + int64_t ldc, + cudaStream_t stream); +#else +template +inline void gemm_internal_cublas(cublasHandle_t handle, + cudaDeviceProp *prop, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + Dtype alpha, + Dtype const *a, + int64_t lda, + Dtype const *b, + int64_t ldb, + Dtype beta, + Dtype *c, + int64_t ldc, + cudaStream_t stream); +#endif + +// Wrapper for gemm +// Adopted from pytorch: +// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/cuda/CUDABlas.cpp +class GemmEngine { +public: + // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind + // defaultlt setting workspace size to 1M. 
+ GemmEngine(cublasHandle_t blas_, + cublasLtHandle_t blasLt_, + cudaDeviceProp *device_prop_ = nullptr, + size_t workspace_size_ = 1024 * 1024); + void assign_workspace(void *workspace_, size_t workspace_size_); + + template + void gemm_internal(cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + Dtype alpha, + Dtype const *a, + int64_t lda, + Dtype const *b, + int64_t ldb, + Dtype beta, + Dtype *c, + int64_t ldc, + cudaStream_t stream); + +public: + cublasHandle_t blas; + cublasLtHandle_t blasLt; + cudaDeviceProp *device_prop; + size_t workspace_size; // in bytes + void *workspace; +}; + +} // namespace Internal + +#endif // GEMM_IMPL_H diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index 9bf2f581e..4c66c1f2c 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -14,97 +14,143 @@ namespace FlexFlow { namespace Kernels { namespace IncMultiHeadAttention { +// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim] +__device__ __forceinline__ size_t + get_k_entry_offset_verify(int const token_idx, + int const page_idx, + int const num_heads, + int const head_dim) { + size_t index = ((page_idx)*kPagesize * 2 + (token_idx % kPagesize)) * + head_dim * num_heads; + return index; +} + +// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim] +__device__ __forceinline__ size_t + get_v_entry_offset_verify(int const token_idx, + int const page_idx, + int const num_heads, + int const head_dim) { + size_t index = + ((page_idx)*kPagesize * 2 + kPagesize + (token_idx % kPagesize)) * + head_dim * num_heads; + return index; +} + +// // kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim] +__device__ __forceinline__ size_t get_k_entry_offset(int const req_idx, + int const token_idx, + int const max_num_pages, + int const num_heads, + int const head_dim) { + return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 + + token_idx % kPagesize) * /* page slot index */ + num_heads * + head_dim; +} + +// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim] +__device__ __forceinline__ size_t get_v_entry_offset(int const req_idx, + int const token_idx, + int const max_num_pages, + int const num_heads, + int const head_dim) { + return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 + + kPagesize + token_idx % kPagesize) * /* page slot index */ + num_heads * + head_dim; +} + template -void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - DT *output_ptr, - ffStream_t stream); +void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + ffStream_t stream); +// [For the tokens in batch] +// Compute qkv projection for the tokens in the batch. template -void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *weight_ptr, - DT const *bias_ptr, - int num_tokens, - ffStream_t stream); +void compute_qkv(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + ffStream_t stream); +// [For the tokens in batch] +// Apply position embedding for qk. +// Note that this is only used for tokens in the current batch. 
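Editor's note: the entry-offset helpers above assume a paged KV layout of [num_pages, 2, page_size, num_kv_heads, head_dim], with the K half of each page followed by the V half. The sketch below only illustrates that index arithmetic; kPagesize and every size in it are made-up stand-ins, not values from this patch.

```cpp
#include <cstddef>
#include <cstdio>

// Illustration only: kPagesize and all sizes below are made-up stand-ins.
constexpr int kPagesize = 64;

// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
// K entries occupy the first half of each page, V entries the second half.
std::size_t k_entry_offset(int req_idx, int token_idx, int max_num_pages,
                           int num_heads, int head_dim) {
  return ((std::size_t)(req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
          token_idx % kPagesize) *
         num_heads * head_dim;
}

std::size_t v_entry_offset(int req_idx, int token_idx, int max_num_pages,
                           int num_heads, int head_dim) {
  return ((std::size_t)(req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
          kPagesize + token_idx % kPagesize) *
         num_heads * head_dim;
}

int main() {
  // Token 70 of request 1 lands in the request's page 70 / 64 = 1, slot 70 % 64 = 6.
  printf("k offset = %zu\n", k_entry_offset(1, 70, /*max_num_pages=*/32, 8, 128));
  printf("v offset = %zu\n", v_entry_offset(1, 70, /*max_num_pages=*/32, 8, 128));
  return 0;
}
```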
+// For other Key tokens like in streaming cache, we need another kernel to apply +// the position embedding. template -__global__ void apply_position_bias_qkprd(DT *input_ptr, - int num_tokens, - int num_total_tokens, - int num_heads, - int global_num_q_heads, - int shard_id); +void apply_pos_encoding_to_tokens_in_batch( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream); +// [For the tokens in streaming cache] +// Apply position embedding for k projection in the streaming cache. +// Note that before the position encoding, the projection is moved *in order* to +// the kv memory taken by the attention kernel. So our operation is applied where +// kvCache points to. template -__global__ void apply_proj_bias_w(DT *input_ptr, - DT const *bias_ptr, - int num_tokens, - int qkv_weight_size, - int oProjSize); +void apply_pos_encoding_to_streaming_proj( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); +// [For the tokens in batch] +// Update the kv cache, and compact the q array. +// Source: qkv projection array of tokens in the batch. +// Destination: q&kv ptr taken by the attention kernel. +// Note that the q&k here are the values after applying the position encoding. template -__global__ void apply_proj_bias_qkv(DT *input_ptr, - DT const *bias_ptr, - int shard_id, - int num_tokens, - int qProjSize, - int kProjSize, - int vProjSize, - int num_heads, - int num_kv_heads, - bool scaling_query, - float scaling_factor); - -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +template +void update_qkv_in_batch_paged(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream, + bool is_spec); + +// [For the tokens in streaming cache] +// Convert the out-of-order cache to in-order relative position. +// Source: pre-pos-encoding kv values in the streaming cache. +// Destination: kv ptr taken by the attention kernel. template -__global__ void - apply_rotary_embedding(DT *input_ptr, - cuFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_heads, - int num_tokens, - int num_kv_heads, - int q_block_size, - int k_block_size, - int q_array_size, - bool q_tensor); -#elif defined(FF_USE_HIP_ROCM) +void update_kv_in_streaming_cache(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +// [For the tokens in batch] +// Commit the kv values to the streaming cache. +// Source: qkv projection array of tokens in the batch. +// Destination: pre-pos-encoding kv values in the streaming cache.
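Editor's note: these streaming-cache helpers move KV entries between a pre-position-encoding sink-plus-window buffer and the cache the attention kernel reads. The sketch below is only a rough model of the sink + rolling-window policy described later in request_manager.h (fill the sink first, then wrap around inside the window); the sizes and the helper name are hypothetical and the real bookkeeping lives in StreamingCacheInfo.

```cpp
#include <cassert>
#include <cstdio>

// Hypothetical sizes; the real bookkeeping lives in StreamingCacheInfo.
constexpr int kSinkSize = 4;    // foremost tokens, never evicted
constexpr int kWindowSize = 16; // rolling window over the most recent tokens

// Cache slot occupied by the token at absolute position `pos`, given that
// `total_tokens` tokens have been processed so far; -1 if already evicted.
int streaming_cache_slot(int pos, int total_tokens) {
  if (pos < kSinkSize) {
    return pos; // sink entries keep their slot forever
  }
  int window_tokens = total_tokens - kSinkSize;
  int oldest_kept =
      kSinkSize + (window_tokens > kWindowSize ? window_tokens - kWindowSize : 0);
  if (pos < oldest_kept) {
    return -1; // overwritten by a newer token
  }
  return kSinkSize + (pos - kSinkSize) % kWindowSize; // wrap inside the window
}

int main() {
  // With 30 tokens seen: token 5 has been evicted, token 25 sits at 4 + (25 - 4) % 16 = 9.
  assert(streaming_cache_slot(5, 30) == -1);
  printf("slot of token 25 = %d\n", streaming_cache_slot(25, 30));
  return 0;
}
```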
template -__global__ void - apply_rotary_embedding(DT *input_ptr, - hipFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_heads, - int num_tokens, - int num_kv_heads, - int q_block_size, - int k_block_size, - int q_array_size, - bool q_tensor); -#endif +void commit_kv(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); template -void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT const *input_ptr, - DT const *weight_ptr, - DT *output_ptr, - DT const *bias_ptr, - ffStream_t stream); +void produce_output(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream); template -void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - ffStream_t stream); +void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *weight_ptr, + DT const *bias_ptr, + int num_tokens, + ffStream_t stream); } // namespace IncMultiHeadAttention } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index d1e0e050b..481243867 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -476,24 +476,24 @@ inline void smem_size_in_bytes_tree(int hidden_size_per_head, int max_sequence_length, int threads_per_value, int threads_per_block, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, int shared_mem[]) { int max_query_length = 0; int max_total_length = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (!bc->request_available[i]) { continue; } max_query_length = max(max_query_length, bc->requestsInfo[i].num_tokens_in_batch); max_total_length = max(max_total_length, - bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].first_token_index_in_request + bc->requestsInfo[i].num_tokens_in_batch); } // todo fix this - int max_qk_length = max_query_length; + int max_qk_length = max_total_length; // The amount of shared memory needed to store the Q*K^T values in float. size_t qk_sz = div_up(max_qk_length + 1, 4) * 16; @@ -512,7 +512,7 @@ inline void smem_size_in_bytes_tree(int hidden_size_per_head, size_t red_sz = rows_per_red * hidden_size_per_head * sizeof(float) / 2; // The max. 
shared_mem[0] = qk_sz; - shared_mem[1] = softmax_sz + red_sz + q_size; + shared_mem[1] = max(softmax_sz, red_sz) + q_size; } template @@ -520,5 +520,8 @@ struct threads_per_value_t { static int const value = Dh * sizeof(T) / 16; }; +#define test_bit(bit_mask, idx, pos) \ + (((bit_mask)[idx][(pos) / 64] & (1ULL << ((pos) % 64))) != 0) + } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_UTILS_H \ No newline at end of file diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 0eef4ca72..084898710 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -47,7 +47,8 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, GenericTensorAccessorR const &input2, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &residual_output, - GenericTensorAccessorW const &output); + GenericTensorAccessorW const &output, + int batch_size); } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/sampling.h b/include/flexflow/ops/sampling.h index 1696582cc..026be221f 100644 --- a/include/flexflow/ops/sampling.h +++ b/include/flexflow/ops/sampling.h @@ -46,10 +46,10 @@ class Sampling : public Op { using Params = SamplingParams; using Input = ParallelTensor; Sampling(FFModel &model, - const ParallelTensor input, + ParallelTensor const input, float top_p, char const *name); - Sampling(FFModel &model, Sampling const &other, const ParallelTensor input); + Sampling(FFModel &model, Sampling const &other, ParallelTensor const input); Sampling(FFModel &model, Params const ¶ms, Input const input, diff --git a/include/flexflow/ops/sigmoid_silu_multi.h b/include/flexflow/ops/sigmoid_silu_multi.h index 604438260..bc07e253e 100644 --- a/include/flexflow/ops/sigmoid_silu_multi.h +++ b/include/flexflow/ops/sigmoid_silu_multi.h @@ -19,6 +19,8 @@ class SigmoidSiluMulti : public Op { LayerID const &_layer_guid, const ParallelTensor _input1, const ParallelTensor _input2, + int _intermediate_size, + int _tensor_parallelism_degree, char const *name = nullptr); void init(FFModel const &) override; void init_inference(FFModel const &, @@ -68,18 +70,25 @@ class SigmoidSiluMulti : public Op { static void inference_kernel_wrapper(SigmoidSiluMultiMeta const *m, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, - GenericTensorAccessorW const &output); + GenericTensorAccessorW const &output, + int token_size); + +public: + int intermediate_size, tensor_parallelism_degree; }; class SigmoidSiluMultiMeta : public OpMeta { public: SigmoidSiluMultiMeta(FFHandler handle, SigmoidSiluMulti const *ln, - MemoryAllocator &gpu_mem_allocator); + MemoryAllocator &gpu_mem_allocator, + int _global_intermediate_size, + int _intermediate_size); ~SigmoidSiluMultiMeta(void); public: Realm::RegionInstance reserveInst; + int global_intermediate_size, intermediate_size; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/sigmoid_silu_multi_params.h b/include/flexflow/ops/sigmoid_silu_multi_params.h index eb152db5c..0e92c0aa6 100644 --- a/include/flexflow/ops/sigmoid_silu_multi_params.h +++ b/include/flexflow/ops/sigmoid_silu_multi_params.h @@ -8,6 +8,7 @@ namespace FlexFlow { struct SigmoidSiluMultiParams { LayerID layer_guid; + int intermediate_size, tensor_parallelism_degree; char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; diff --git 
a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index a0d01092b..e4e077e78 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -26,7 +26,7 @@ class SpecIncMultiHeadSelfAttention : public Op { SpecIncMultiHeadSelfAttention(FFModel &model, LayerID const &layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -36,16 +36,18 @@ class SpecIncMultiHeadSelfAttention : public Op { bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, bool allocate_weights, + bool _streaming_cache, + int _tensor_parallelism_degree, char const *name); SpecIncMultiHeadSelfAttention(FFModel &model, - const ParallelTensor _input, - const ParallelTensor _weight, + ParallelTensor const _input, + ParallelTensor const _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -55,16 +57,18 @@ class SpecIncMultiHeadSelfAttention : public Op { bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, bool allocate_weights, + bool _streaming_cache, + int _tensor_parallelism_degree, char const *name); SpecIncMultiHeadSelfAttention(FFModel &model, SpecIncMultiHeadSelfAttention const &other, - const ParallelTensor input, + ParallelTensor const input, bool allocate_weights); SpecIncMultiHeadSelfAttention(FFModel &model, Params const ¶ms, @@ -107,24 +111,24 @@ class SpecIncMultiHeadSelfAttention : public Op { MachineView const &mv, CostMetrics &cost_metrics) const override; - static void - inference_kernel_wrapper(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias); + static void inference_kernel_wrapper(SpecIncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &bias); Params get_params() const; public: int num_q_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; bool qkv_bias; - bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling, position_bias; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + bool final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; + int hidden_size, qk_dim, v_dim, o_dim; int qoSeqLength, kvSeqLength; + bool streaming_cache; }; class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { @@ -137,13 +141,6 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { int _num_q_heads, int _num_kv_heads); ~SpecIncMultiHeadSelfAttentionMeta(void); - -public: - Realm::RegionInstance beam_search_reserve_inst; - BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; - BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; - bool *request_completed; - BatchConfig::BitMask *causalMask; 
}; }; // namespace FlexFlow diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h index 1461224ba..75cb576dc 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h @@ -9,10 +9,13 @@ namespace FlexFlow { struct SpecIncMultiHeadSelfAttentionParams { LayerID layer_guid; - int embed_dim, num_q_heads, num_kv_heads, kdim, vdim; + int embed_dim, num_q_heads, num_kv_heads, kdim, vdim, + tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; + bool qkv_bias, final_bias, add_zero_attn, scaling_query, qk_prod_scaling, + position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; + bool streaming_cache; char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 02df0c013..3edf4dbd7 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -26,7 +26,7 @@ class TreeIncMultiHeadSelfAttention : public Op { TreeIncMultiHeadSelfAttention(FFModel &model, LayerID const &layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -36,7 +36,7 @@ class TreeIncMultiHeadSelfAttention : public Op { bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, @@ -47,8 +47,8 @@ class TreeIncMultiHeadSelfAttention : public Op { int _tensor_parallelism_degree, char const *name); TreeIncMultiHeadSelfAttention(FFModel &model, - const ParallelTensor _input, - const ParallelTensor _weight, + ParallelTensor const _input, + ParallelTensor const _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -58,7 +58,7 @@ class TreeIncMultiHeadSelfAttention : public Op { bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, @@ -70,7 +70,7 @@ class TreeIncMultiHeadSelfAttention : public Op { char const *name); TreeIncMultiHeadSelfAttention(FFModel &model, TreeIncMultiHeadSelfAttention const &other, - const ParallelTensor input, + ParallelTensor const input, bool allocate_weights); TreeIncMultiHeadSelfAttention(FFModel &model, Params const ¶ms, @@ -111,7 +111,7 @@ class TreeIncMultiHeadSelfAttention : public Op { CostMetrics &cost_metrics) const override; static void inference_kernel_wrapper(TreeIncMultiHeadSelfAttentionMeta *m, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, @@ -124,9 +124,9 @@ class TreeIncMultiHeadSelfAttention : public Op { int num_q_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; bool qkv_bias; - bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling, position_bias; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + bool final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; + int 
hidden_size, qk_dim, v_dim, o_dim; int qoSeqLength, kvSeqLength; DataType quantization_type; bool offload; @@ -145,10 +145,8 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { public: int num_active_tokens; - Realm::RegionInstance committed_token_reserve_inst; - TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; - bool *request_completed; - BatchConfig::BitMask *causalMask; + BatchConfig::CommittedTokensInfo *committed_token_infos; + int *num_tokens_to_commit; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h index d1a51b8b8..3906210d4 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h @@ -12,8 +12,9 @@ struct TreeIncMultiHeadSelfAttentionParams { int embed_dim, num_q_heads, kdim, vdim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; + bool qkv_bias, final_bias, add_zero_attn, scaling_query, qk_prod_scaling, + position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; DataType quantization_type; bool offload; char name[MAX_OPNAME]; diff --git a/include/flexflow/optimizer.h b/include/flexflow/optimizer.h index bab7e6e4e..35f0c8542 100644 --- a/include/flexflow/optimizer.h +++ b/include/flexflow/optimizer.h @@ -20,7 +20,8 @@ #include "legion.h" namespace FlexFlow { - +using Legion::Context; +using Legion::Runtime; class FFModel; class OpMeta; @@ -60,7 +61,9 @@ class SGDOptimizer : public Optimizer { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void nccl_update_task_gpu(SGDOptimizer const *op, + static void nccl_update_task_gpu(Legion::Context ctx, + Legion::Runtime *runtime, + SGDOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -103,7 +106,9 @@ class AdamOptimizer : public Optimizer { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void nccl_update_task_gpu(AdamOptimizer const *op, + static void nccl_update_task_gpu(Legion::Context ctx, + Legion::Runtime *runtime, + AdamOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, diff --git a/include/flexflow/page_manager.h b/include/flexflow/page_manager.h new file mode 100644 index 000000000..c0d6df085 --- /dev/null +++ b/include/flexflow/page_manager.h @@ -0,0 +1,162 @@ +#pragma once + +#include "flexflow/batch_config.h" +#include "flexflow/config.h" +#include "flexflow/inference.h" +#include "flexflow/model.h" +#include "flexflow/utils/file_loader.h" +#include +#include +#include +#include + +namespace FlexFlow { + +using TokenId = BatchConfig::TokenId; + +/** + * @class LogicalTokenBlock + * @brief A class to represent a sequence of tokens for each request + */ +class LogicalTokenBlock { +public: + using TokenId = BatchConfig::TokenId; + + // Constructor + LogicalTokenBlock(int block_number, uint32_t block_size); + + // Method to check if the block is empty + bool is_empty() const; + + // Method to check if the block is full + bool is_full() const; + + // Method to get the number of empty slots + int get_num_empty_slots() const; + + // Method to get the number of allocated slots + int get_num_alloc_slots() const; + + // Used to clean up the spec tokens in a block since these spec tokens may not + // be committed 
after use + void reset_num_spec_tokens(); + + // Method to append tokens + void append_tokens(std::vector const &token_ids_to_append, + bool committed); + + int get_num_tokens() const { + return num_tokens; + } + int get_num_commit_tokens() const { + return num_commit_tokens; + } + int get_num_spec_tokens() const { + return num_spec_tokens; + } + + std::vector get_token_ids() const; + +private: + int block_number; // the index of the logical token block + int block_size; // the size of the block + int num_tokens; // the number of tokens currently stored in the block + int num_commit_tokens; // the number of tokens inside this block that are + // already committed + int num_spec_tokens; // the number of tokens inside this block that are + // speculative tokens, which is stored temporarily + std::vector token_ids; // store the token ids in an order that + // corresponds to the inference sequence +}; + +/** + * @class PhysicalTokenBlock + * @brief A class to represent a physical block of tokens similar to a physical + * memory address. It keeps track of the location of the tokens stored in GPU + * memory + */ +class PhysicalTokenBlock { +public: + // Constructor + PhysicalTokenBlock(int block_number, int block_size); + + // Method to get the block number + int get_block_number() const { + return block_number; + } + void incr_ref_count() { + ref_count++; + } + void decr_ref_count() { + ref_count--; + } + int ref_count; // reference count, TODO: move to private + +private: + int block_number; // the index of the physical token block + int block_size; // the size of the block +}; + +/** + * @class BlockAllocator + * @brief A Block Manager that is responsible for maintaining a pool of free + * blocks + */ +class BlockAllocator { +public: + // Constructor + BlockAllocator(int block_size, int num_total_blocks); + + // Allocate a block + PhysicalTokenBlock allocate(); + + // Free a block + void free(PhysicalTokenBlock &block); + + // Get the number of free blocks + int get_num_free_blocks() const; + +private: + int block_size; + size_t num_total_blocks; + std::deque free_blocks; +}; + +/* + * @class PageManager + * @brief A wrapper class that manages the kv cache allocation status; + * note that all layers of the model share the same page manager because + * the position of the kv cache is the same + */ +class PageManager { +public: + // Get the singleton instance of the PageManager as it will be shared in + // multiple places + static PageManager *get_page_manager(); + static PageManager *get_page_manager(FFModel *ff, size_t kv_cache_size); + size_t get_kv_cache_size_per_layer(); + using BlockTable = std::vector; + using RequestGuid = BatchConfig::RequestGuid; + PageManager(int block_size, size_t num_total_blocks); + int allocate_one_block(RequestGuid const &request_guid); + void free_request(RequestGuid const &request_guid); + // used for the case where we want to free the last num_blocks that store spec + // tokens (which are the tokens not yet committed) + void free_multiple_blocks(RequestGuid const &request_guid, int num_blocks); + std::vector + get_block_table_indices(RequestGuid const &request_guid) const; + + void free_block_table(BlockTable &block_table); + +private: + size_t kv_cache_size_per_layer; + int block_size; // the size of the block + int num_total_blocks; // the total number of blocks + BlockAllocator block_allocator; + std::unordered_map block_tables; + + int get_num_total_free_blocks() const; + int get_num_allocated_blocks(RequestGuid const &request_guid) const; +}; + +};
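Editor's note: PageManager and BlockAllocator follow a vLLM-style paged-attention scheme: each request holds a table of physical blocks drawn from a shared free pool, and a token's logical index maps to a physical cache slot through that table. The toy sketch below illustrates that mapping only; the names and the missing reference counting are simplifications, not the FlexFlow API.

```cpp
#include <cassert>
#include <deque>
#include <unordered_map>
#include <vector>

// Toy paged-KV bookkeeping, loosely following the BlockAllocator/PageManager
// classes declared above. Illustration only.
class ToyPageManager {
public:
  ToyPageManager(int block_size, int num_total_blocks) : block_size(block_size) {
    for (int i = 0; i < num_total_blocks; i++) {
      free_blocks.push_back(i);
    }
  }

  // Make sure request `guid` has room for `num_tokens` tokens.
  void reserve(int guid, int num_tokens) {
    auto &table = block_tables[guid];
    while ((int)table.size() * block_size < num_tokens) {
      assert(!free_blocks.empty() && "out of KV-cache blocks");
      table.push_back(free_blocks.front());
      free_blocks.pop_front();
    }
  }

  // Map a logical token index within the request to a physical cache slot.
  int physical_slot(int guid, int token_idx) const {
    auto const &table = block_tables.at(guid);
    return table[token_idx / block_size] * block_size + token_idx % block_size;
  }

  // Return all of a request's blocks to the free pool.
  void free_request(int guid) {
    for (int b : block_tables[guid]) {
      free_blocks.push_back(b);
    }
    block_tables.erase(guid);
  }

private:
  int block_size;
  std::deque<int> free_blocks;
  std::unordered_map<int, std::vector<int>> block_tables;
};

int main() {
  ToyPageManager pm(/*block_size=*/16, /*num_total_blocks=*/8);
  pm.reserve(/*guid=*/42, /*num_tokens=*/20); // needs two blocks
  int slot = pm.physical_slot(42, 17);        // second block, offset 1
  (void)slot;
  pm.free_request(42);
  return 0;
}
```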
// namespace FlexFlow \ No newline at end of file diff --git a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h index bdf7aae50..3436fc2a6 100644 --- a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h +++ b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h @@ -6,23 +6,41 @@ #include "flexflow/fftype.h" #include "flexflow/op_meta.h" #include "flexflow/parallel_ops/allreduce.h" +#include "flexflow/utils/communication_buffer.h" +#include "flexflow/utils/memory_allocator.h" +#include namespace FlexFlow { class AllReduceMeta : public OpMeta { public: - AllReduceMeta(FFHandler handle, AllReduce const *reduct); + AllReduceMeta(FFHandler handle, + AllReduce const *reduct, + MemoryAllocator &gpu_mem_allocator); + ~AllReduceMeta(void); + +public: + std::unordered_map comm_bufs; + Realm::RegionInstance reserveInst; + void *allgather_src, *allgather_dst; + // reuse for communication buffer + void *barrier_in_ptr, *barrier_out_ptr; + int barrier_ptr_size, barrier_flag; }; namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, +void inference_kernel_wrapper(Legion::Context ctx, + Legion::Runtime *runtime, + AllReduceMeta *m, BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); -void forward_kernel_wrapper(AllReduceMeta const *m, +void forward_kernel_wrapper(Legion::Context ctx, + Legion::Runtime *runtime, + AllReduceMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h old mode 100644 new mode 100755 index a38a3b267..16b41285b --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -18,7 +18,9 @@ #include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/model.h" +#include "flexflow/page_manager.h" #include "flexflow/utils/file_loader.h" +#include #include #include #include @@ -26,7 +28,7 @@ namespace FlexFlow { class FFModel; -class BeamTree; +class TokenTree; class RequestManager; using tokenizers::Tokenizer; @@ -34,7 +36,7 @@ class InferenceManager { public: InferenceManager(); static InferenceManager *get_inference_manager(); - void compile_model_and_allocate_buffer(FFModel *model); + void compile_model_and_allocate_buffer(FFModel *model, bool is_llm = true); void init_operators_inference(FFModel *model); Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc); Legion::FutureMap @@ -57,6 +59,72 @@ class InferenceManager { std::unordered_map model_weights_loaders; }; +class TokenTreeNode { +public: + BatchConfig::TokenId id; + double log_accumulated_prob; + int parent_pos; + bool included = false; + bool gumbel = false; + float gumbel_logit = 0.0f; + + TokenTreeNode(BatchConfig::TokenId id, + double log_accumulated_prob, + int parent_pos, + bool gumbel = false, + float gumbel_logit = 0.0f) + : id(id), log_accumulated_prob(log_accumulated_prob), + parent_pos(parent_pos), gumbel(gumbel), gumbel_logit(gumbel_logit) {} +}; + +bool operator<(std::shared_ptr const &lhs, + std::shared_ptr const &rhs); + +bool operator<=(std::shared_ptr const &lhs, + std::shared_ptr const &rhs); + +// A comparator for std::shared_ptr +// This is used to construct a max heap for the token tree nodes +struct SharedTokenTreeNodePtrLess { + bool operator()(std::shared_ptr const &lhs, + std::shared_ptr const &rhs) const { + if (lhs->gumbel) { + 
assert(rhs->gumbel); + return lhs->gumbel_logit < rhs->gumbel_logit; + } + return lhs->log_accumulated_prob < rhs->log_accumulated_prob; + } +}; + +// A comparator for std::pair, double> +// This is used to construct a max heap for the token tree nodes +struct SharedTokenTreeNodePtrDoubleLess { + bool operator()( + std::pair, double> const &lhs, + std::pair, double> const &rhs) const { + return lhs.second < rhs.second; + } +}; + +class TokenTree { +public: + std::vector>> tree_layers = {}; + void add_layer() { + tree_layers.emplace_back(); + tree_layers.back().reserve(BatchConfig::MAX_TREE_WIDTH); + } + + void clear() { + tree_layers.clear(); + } + + TokenTree() { + tree_layers.reserve(BatchConfig::MAX_TREE_DEPTH + 1); + } +}; + +std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree); + struct Request { enum Status { PENDING = 101, // loading prompt @@ -65,46 +133,180 @@ struct Request { FINISHING = 104, // finishing request, but not yet verified }; BatchConfig::RequestGuid guid; - int max_sequence_length; - int initial_len; + int batch_index = -1; int ssm_cache_size = 0; int llm_cache_size = 0; + double slo_ratio = 1.0; + double decode_latency_ms = 0.0; + int ssm_prefill_len = 0; + int llm_prefill_len = 0; + bool attained = true; + bool add_special_tokens = true; + + int first_token_offset_in_batch = 0; + int num_tokens_in_batch = 0; Status status = PENDING; std::vector tokens; - std::vector beam_trees; -}; - -// store the result of beam search -struct BeamTree { - struct treeLayer { - BeamSearchBatchConfig::TokenId - tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - int parent_ids[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - int nodes_num_this_layer = 0; + // page attention, page_last_committed should be -1 because there are no + // blocks at the beginning + int page_last_committed = -1; + std::vector blocks; + + // TokenTree speculative_token_tree; + std::vector speculative_token_trees; + // To make request manager stateful, we need to store the causal mask here + BatchConfig::BitMask causal_mask; + // Here we maintain a struct CommittedToken which has a field `from_index` and + // `to_index`. The `from_index` is used by the LLM KV cache commitment and the + // `to_index` is used both by the the SSM KV cache recomputation and the LLM + // KV cache commitment. Details are as follows: + // + // 1. Recompute the SSM KV cache: We don't commit the KV cache of the SSM + // committed tokens but recompute them instead. That is, after the we append + // the committed tokens to the generated sequence, just like in the prefilling + // phase, and pass them into the SSM to recompute the KV cache. Here we don't + // need `from_index` because we don't copy the KV cache, but we need + // `to_index`, which is the indices of the committed tokens in the request. + // + // to_index -> BatchConfig::PerTokenInfo.abs_index_in_request + // + // 2. Commit the LLM KV cache: On the GPU, the KV cache of the speculative + // token tree and the generated tokens are stored separately. So the + // `from_index` should be the index of the token in the speculative token + // tree. `to_index` should be the place to put the KV cache in the LLM KV + // cache: prompt_length + generated_sequence_length + + // index_in_committed_tokens. 
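Editor's note: in the convention spelled out above, from_index locates a committed token's KV inside the just-verified speculative batch, while to_index is its final absolute position in the request. A minimal sketch of recording one verification step's accepted tokens under that convention (types and the helper are illustrative, not the actual RequestManager code):

```cpp
#include <utility>
#include <vector>

// Simplified stand-in for BatchConfig::CommittedTokensInfo.
struct CommittedToken {
  int from_index; // offset of the token within the verified batch / token tree
  int to_index;   // absolute index of the token in the request
  int token_id;
};

// `accepted` holds (offset-in-verify-batch, token id) pairs for the accepted
// path, in order; `tokens_so_far` is prompt length + generated length so far.
std::vector<CommittedToken>
record_committed_tokens(std::vector<std::pair<int, int>> const &accepted,
                        int tokens_so_far) {
  std::vector<CommittedToken> committed;
  committed.reserve(accepted.size());
  for (int i = 0; i < (int)accepted.size(); i++) {
    committed.push_back({/*from_index=*/accepted[i].first,
                         /*to_index=*/tokens_so_far + i,
                         /*token_id=*/accepted[i].second});
  }
  return committed;
}
```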
+ // + // from_index -> TreeVerifyBatchConfig::CommittedTokensInfo.index_in_kv_cache + // to_index -> TreeVerifyBatchConfig::CommittedTokensInfo.token_depth + // + // Actually, for a committed token, the `to_index` for the LLM KV cache and + // the SSM KV cache are the same thing, so we can use the same field to store + // the information. + // + // When storing the committed tokens: + // from_index: The offset of the committed token in the request in the + // TreeVerifyBatchConfig + // to_index: The absolute index of the token in the request + + struct CommittedToken { + int from_index; + int to_index; + BatchConfig::TokenId token_id; + CommittedToken(int from_index, int to_index, BatchConfig::TokenId token_id) + : from_index(from_index), to_index(to_index), token_id(token_id) {} }; - treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1]; + std::vector committed_tokens; + + // Enabling Streaming KVCache means we doesn't store the whole KV sequence of + // the tokens in a request. Instead, we only store the sink cache (a few + // foremost tokens) and the window cache (rolling-updated backmost tokens + // through decoding). Currently, we only use streaming cache in the *draft + // model* calculation. + // - Maintain the streaming cache: During inference, we + // first fill up the sink cache then the window cache. After the window cache + // is full, we move back to the beginning of the window cache and commit the + // tokens in replace there. + // - When to update the streaming cache: + // 1. Prefilling phase + // 2. Committing phase after the target model verification + StreamingCacheInfo streaming_cache_info; + + std::priority_queue< + std::pair, double>, + std::vector, double>>, + SharedTokenTreeNodePtrDoubleLess> + token_tree_nodes_acc_prob_pair_pq; + + double get_length_weight(); + void set_slo_ratio(double slo_ratio_); + double get_slo_ratio(); + int decode_length() const; + + Request() { + std::vector, double>> + _prealloc_vector; + _prealloc_vector.reserve(BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); + token_tree_nodes_acc_prob_pair_pq = std::priority_queue< + std::pair, double>, + std::vector, double>>, + SharedTokenTreeNodePtrDoubleLess>(SharedTokenTreeNodePtrDoubleLess(), + std::move(_prealloc_vector)); + } }; -// struct BeamTree_v2 { -// std::vector tokens; -// std::vector parent_ids; -// std::vector probs; -// }; +struct NewProfileInfo { + long long timestamp; + BatchConfig::RequestGuid request_guid; + int request_step_idx; + int num_speculated_tokens; + int num_accepted_tokens; + double speculation_score; + int num_generated_tokens; + long long speculation_start_timestamp; + long long speculation_end_timestamp; +}; +struct RequestProfileInfo { + int llm_prefilling_steps = 0; + int ssm_prefilling_steps = 0; + int llm_decoding_steps = 0; + int ssm_decoding_steps = 0; + long long start_time = 0, start_decoding_time = 0, finish_time = 0; + long long speculation_start_timestamp; + long long speculation_end_timestamp; + std::vector speculated_size_per_step; + std::vector accepted_tokens_per_step; + std::vector generated_tokens_per_step__; +}; +struct ProfileInfo { + // For SpecInfer: One step is comprised of one ssm speculation phase + a + // single llm verification phase (forward pass + verification) For Incr + // Decoding: One step is one LLM decoding phase + long long llm_step_start = 0, ssm_step_start = 0; + // Times for each LLM verification phase (in ms) + std::vector llm_step_times; + // Number of requests in batch at each step + std::vector requests_per_step; + // Times 
for each SSM speculation phase (in ms) + std::vector ssm_step_times; + // Number of requests getting decoded at each step + std::vector ssm_steps; + std::vector tree_operation_step_times; + // Number of generated tokens at each step + std::vector generated_tokens_per_step; + // To calculate the E2E time of serving + long long server_start_time = 0; + long long server_end_time = 0; +}; class RequestManager { public: - enum Status { - INITIALIZED = 1001, - SERVING = 1002, - TERMINATED = 1003, + enum State { + PREFILLING = 1001, + DECODING = 1002, + SSM_SPEC = 1003, + LLM_VERIFY = 1004, + }; + enum BackgroundServerStatus { + INITIALIZED = 2001, + SERVING = 2002, + TERMINATED = 2003, + }; + enum DecodingMode { + INCREMENTAL_DECODING = 3001, + SPECULATIVE_DECODING = 3002, }; + enum PrefillModel { + LLM = 4001, + SSM = 4002, + }; + using RequestGuid = BatchConfig::RequestGuid; using TokenId = BatchConfig::TokenId; - static const RequestGuid INVALID_GUID = 0; + inline static RequestGuid const INVALID_GUID = 0; RequestManager(); static RequestManager *get_request_manager(); size_t get_num_processed_requests(); @@ -114,104 +316,81 @@ class RequestManager { int get_max_requests_per_batch(); void set_max_tokens_per_batch(int max_num_tokens); int get_max_tokens_per_batch(); - void set_max_spec_tree_token_num(int max_num_tokens); + void set_max_tokens_per_ssm_batch(int max_num_ssm_tokens); + int get_max_tokens_per_ssm_batch(); + void set_max_tokens_per_prefilling_batch(int max_num_prefilling_tokens); + int get_max_tokens_per_prefilling_batch(); int get_max_spec_tree_token_num(); - int get_max_verify_tokens_per_batch(); void set_max_sequence_length(int max_seq_length); - void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); + void set_max_kv_cache_size(size_t max_kv_cache_size); + size_t get_max_kv_cache_size(); + void set_max_output_length(int max_output_length); + int get_max_output_length(); + void set_decoding_mode(DecodingMode mode); + void set_verbose(bool verbose_); + int get_k(); + void set_k(int k); + int get_max_tree_depth(); + void set_max_tree_depth(int max_tree_depth); + int get_max_tree_width(); + void set_max_tree_width(int max_tree_width); + int get_expansion_degree(); + void set_expansion_degree(int expansion_degree_); + void set_speculative_sampling(bool speculative_sampling); + void set_baseline_latency(double baseline_latency_ms); + double get_baseline_latency(); + void set_ssm_spec_latency(double ssm_spec_latency_ms); + double get_ssm_spec_latency(); + void set_llm_verify_latency(double llm_verify_latency_ms); + double get_llm_verify_latency(); + void set_correction_factor(double correction_factor); + double get_correction_factor(); + void set_streaming_cache(bool streaming_cache); + bool get_streaming_cache(); + bool get_memory_occupancy(); + void set_memory_occupancy(bool memory_occupancy); + void + set_slo_violation_early_termination(bool slo_violation_early_termination); + void set_spec_infer_old_version(bool spec_infer_old_version); + void set_greedy_schedule(bool greedy_schedule); + void set_equal_schedule(bool equal_schedule); + void set_fcfs_slo(bool fcfs_slo); + void set_stta(bool stta); + bool get_spec_infer_old_version(); + bool get_greedy_schedule(); + bool get_equal_schedule(); + bool get_fcfs_slo(); + bool get_stta(); + inline double get_slo_constraint(Request &request); + void set_eval_overhead_breakdown(bool eval_overhead_breakdown); + bool get_eval_overhead_breakdown(); + double get_request_expected_latency(Request &request); + Request 
&get_request_with_guid(RequestGuid guid); int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, int bos_token_id, - int eos_token_id, + std::vector eos_token_ids, std::string const &path); + std::vector tokenize(std::string const &text); void register_output_filepath(std::string const &); - void initBitMask(BatchConfig::BitMask &bitmask, int initLength); - void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength); - void appendBitMask(BatchConfig::BitMask &bitmask, - int newNodes, - int preBeamSize, - int old_sub_num, - BeamTree const tree, - int currentDepth); - void updateBitMask(BatchConfig::BitMask &bitmask, - int initLength, - int non_tree_size); FFModel *get_ssm_model(int model_id); - void serve_incr_decoding(FFModel *model); void serve_spec_infer(FFModel *model); + void serve_spec_infer_sync(FFModel *model); + void serve_decoding(FFModel *model); GenerationResult get_generation_result(RequestGuid const &guid); - RequestGuid register_new_request(std::string const &prompt, - int max_sequence_length); - RequestGuid register_new_request(std::vector const &prompt, - int max_sequence_length); + RequestGuid register_new_request(GenerationRequest const &req); // Methods to start and terminate request manager's background task void start_background_server(FFModel *model); + bool is_background_server_serving(); bool is_background_server_terminated(); void terminate_background_server(); static void terminate_background_server_at_exit(); // Methods to check and mark request completion - bool is_request_completed(RequestGuid const &guid); void trigger_request_completion_future(RequestGuid const &guid); - // Methods for preparing next batches - BatchConfig prepare_next_batch(BatchConfig const &bc, - InferenceResult const &result); - BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc, - InferenceResultFuture const &result, - Legion::Context ctx, - Legion::Runtime *runtime); - BeamSearchBatchConfig - prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, - BeamInferenceResult const &result); - BeamSearchBatchConfigFuture - prepare_next_batch_beam(BeamSearchBatchConfigFuture const &old_bc, - BeamInferenceResultFuture const &result, - Legion::Context ctx, - Legion::Runtime *runtime); - BeamSearchBatchConfig - prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, - InferenceResult const &result, - int model_id); - BeamSearchBatchConfigFuture - prepare_next_batch_init(TreeVerifyBatchConfigFuture const &old_bc, - InferenceResultFuture const &result, - int model_id, - Legion::Context ctx, - Legion::Runtime *runtime); - TreeVerifyBatchConfig prepare_next_batch_verify( - std::vector const &old_batches); - TreeVerifyBatchConfigFuture prepare_next_batch_verify( - std::vector const &old_batches, - Legion::Context ctx, - Legion::Runtime *runtime); - - void store_beam_metadata(BeamSearchBatchConfig const &old_bc, - BeamInferenceResult const &result); - void update_beam_metadata(BeamSearchBatchConfig &new_bc, - BeamSearchBatchConfig const &old_bc, - BeamTree &tree, - int request_index); - - std::vector> - traverse_beam_tree(BeamSearchBatchConfig const &old_bc, - int request_index, - int first_token_depth_in_request); - - // remove guid after put the cached tree in request - std::vector> merge_dfs_trees( - std::vector>> - input_trees, - int root_depth, - RequestGuid guid); - - std::vector> traverse_verify_tree( - size_t guid, - std::vector> const - &inputSerializedTree, - std::vector> const - &outputSerializedTree); + bool 
is_eos_token(TokenId token_id); static void background_serving_task( Legion::Task const *task, std::vector const ®ions, @@ -233,80 +412,218 @@ class RequestManager { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static BatchConfig prepare_next_batch_task( - Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - - static BeamSearchBatchConfig prepare_next_batch_beam_task( - Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - - static BeamSearchBatchConfig prepare_next_batch_init_task( - Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - - static TreeVerifyBatchConfig prepare_next_batch_verify_task( + // API for rm state machine + BatchConfigFuture get_next_batch_config(InferenceResultFuture const &result, + Legion::Context ctx, + Legion::Runtime *runtime); + static BatchConfig get_next_batch_config_task( Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + BatchConfig get_next_batch_config(InferenceResult const &result); + void update_inference_results(InferenceResult const &result); + BatchConfig prepare_next_batch(); + + int get_num_active_requests(); + int get_empty_request_index(); + + std::unordered_map get_requests_profiling(); + std::unordered_map + get_request_generation_results(); + ProfileInfo get_profiling_info(); + std::vector get_new_profiling_info(); + + // Comparters + struct SharedTokenTreeNodePtrRequestGuidWeightedLess { + bool operator()( + std::pair, RequestGuid> const &lhs, + std::pair, RequestGuid> const &rhs) + const; + }; + struct SharedTokenTreeNodePtrDoubleRequestGuidLess { + bool operator()( + std::tuple, double, RequestGuid> const + &lhs, + std::tuple, double, RequestGuid> const + &rhs) const; + }; private: // configuration parameters int max_requests_per_batch; int max_tokens_per_batch; + int max_tokens_per_ssm_batch; + int max_tokens_per_prefilling_batch; int max_spec_tree_token_num; int max_sequence_length; - Status request_manager_status; + int max_output_length; + size_t max_kv_cache_size; + int max_tree_depth; + int max_tree_width; + int k; + int expansion_degree = 3; + // Profile based latency + double baseline_latency_ms = 43; + double ssm_spec_latency_ms = 17; + double llm_verify_latency_ms = 65; + double correction_factor = 1.05; + + State request_manager_status; + BackgroundServerStatus background_server_status; + DecodingMode decoding_mode; + PrefillModel prefill_model; + bool speculative_sampling = false; + // specify if enable streaming cache for incremental decoding or draft model + bool streaming_cache = false; + bool memory_occupancy = false; + bool slo_violation_early_termination = false; + bool spec_infer_old_version = false; + bool greedy_schedule = false; + bool equal_schedule = false; + bool fcfs_slo = false; + bool stta = false; // The smallest time to attain policy + bool eval_overhead_breakdown = false; // for evaluation purpose + double eval_ssm_prefill_latency_us = 0.0; + double eval_llm_prefill_latency_us = 0.0; + double eval_ssm_spec_latency_us = 0.0; + double eval_llm_verify_latency_us = 0.0; + double eval_process_latency_us = 0.0; + double eval_schedule_latency_us = 0.0; + double eval_other_latency_us = 0.0; // load pending request, request complete - // tree width in each speculative step, if not specified 1 - std::vector spec_infer_tree_width; - - // private fields std::unique_ptr tokenizer_; 
bool verbose; ModelType model_type; int bos_token_id; - int eos_token_id; - std::string output_filepath; + std::vector eos_token_ids; + bool old_llama_tokenizer = false; + std::string output_filepath, csv_filepath; std::queue pending_request_queue; std::unordered_map all_requests; std::unordered_map request_generation_results; std::mutex request_queue_mutex; + std::condition_variable request_queue_cv; + std::mutex request_result_mutex; std::unordered_map *> request_to_promise; std::mutex request_to_promise_mutex; RequestGuid next_available_guid; - - // TODO: Move this two vector to request struct - std::unordered_map>> - dfs_tree_inputs; - std::unordered_map>> - committed_tokens; + std::queue prefilled_requests; + std::vector prefilling_requests; + + // Added to make the request manager stateful. During the processing of the + // first small model inference results, the step equals to 1. That is, every + // time a small model inference task is launched, the step is increased + // by 1. + int current_ssm_step = 0; + // Maps the index of the request in the batch config to the request guid. + // Note that we may have some prefilled requests not in the batch config, + // but should be re-considered in the decoding phase. + int guid_of_requests[BatchConfig::MAX_NUM_REQUESTS]; + int num_running_requests = 0; + // Available requests in the batch config + bool request_available[BatchConfig::MAX_NUM_REQUESTS]; + int num_available_requests = 0; + int ssm_completed = true; + int ssm_tree_depth = 0; // Multi-model support std::vector ssm_models; - // Performance profiling - size_t num_processed_requests; - // Background server handler Legion::Future background_server_handler; -private: - struct ProfileInfo { - int llm_decoding_steps; - int ssm_decoding_steps; - double start_time, finish_time; - }; - std::unordered_map profiling_requests; + // Performance profiling + // TODO: maintain this field + size_t num_processed_requests; + + ProfileInfo profiling; + std::unordered_map profiling_requests; + std::vector new_profiling_info; double total_request_run_time; + bool load_pending_request_to_batch(); + void request_update_attainment(int index, bool attained); + void request_complete_clean_up(int batch_index); + void request_offload_from_batch(int batch_index); + void request_load_onto_batch(int batch_index); + /* ---------- Incremental Decoding Helper Functions ---------- */ + bool update_llm_prefill_results(InferenceResult const &result); + bool update_llm_decode_results(InferenceResult const &result); + BatchConfig prepare_llm_prefilling_batch(); + BatchConfig prepare_decoding_batch(); + BatchConfig prepare_decoding_batch_fcfs_slo(); + BatchConfig prepare_decoding_batch_stta(); + /* ---------- Incremental Decoding Helper Functions ---------- */ + + /* ---------- Spec Decoding Helper Functions ---------- */ + BatchConfig prepare_ssm_prefilling_batch(); + bool update_llm_verify_results(InferenceResult const &llm_verify_result); + bool + update_ssm_inference_results(InferenceResult const &ssm_inference_result); + void update_ssm_prefill_results(InferenceResult const &ssm_prefill_result); + // Prepare the next speculation batch config. This function is called before + // the second step of the speculation. + BatchConfig prepare_next_spec_batch_config(); + // Prepare the first speculation batch config. This function is called before + // the first step of the speculation. 
The difference with + // prepare_next_batch_config_spec is that we put the info of the committed + // tokens into the batch config in the first speculation step to commit the KV + // cache of the small model. + BatchConfig prepare_first_spec_batch_config(); + BatchConfig prepare_verify_batch_config(); + + // LLM result verification + void get_verify_results_greedy(InferenceResult const &llm_verify_result); + void get_verify_results_sample(InferenceResult const &llm_verify_result); + + // Bitmask related + void init_bitmask_prompt(RequestGuid guid, int prompt_length); + void append_bitmask(RequestGuid guid); + void update_bitmask_prompt(RequestGuid guid, int num_committed_tokens); + void init_bitmask_spec(RequestGuid guid); + BatchConfig::BitMask create_llm_bitmask(RequestGuid guid); + + // Page Attention related + int get_num_blocks_allocated(Request &request) const; + int get_len_last_block(Request &request) const; + int get_idx_last_logical_token(Request &request) const; + int idx_logical_to_physical(Request &request, int idx_logical); + void _append_block_to_request(Request &request, bool is_commit); + int append_token_to_block(Request &request, TokenId token, bool is_commit); + void reset_block_table(Request &request); + void print_num_tokens(Request &request); + + // Token tree related + void init_token_tree(RequestGuid guid); + void add_root_to_spec_token_tree(RequestGuid guid, + BatchConfig::TokenId token_id); + void add_tokens_to_spec_token_tree( + InferenceResult const &ssm_inference_result); + void add_tokens_to_spec_token_tree_old_version( + InferenceResult const &ssm_inference_result); + void prune_token_tree(); + void prune_token_tree_equal(); + void prune_token_tree_greedy(); + void add_tokens_toward_slo(RequestGuid guid, + int &budget, + int num_req_with_slo); + void add_tokens_toward_memory_occupancy(int budget); + void add_tokens_toward_goodput(int budget); + void add_tokens_toward_goodput_per_request(int budget, int request_index); + void update_token_tree_depth(); + + /* ---------- Spec Decoding Helper Functions ---------- */ + void renormalize(std::vector> &D, + std::unordered_map &R, + TokenId token_id); + std::tuple + reject_sampling(std::vector> &D, + std::unordered_map &R, + int k); + void gumbel_conditioned_on_max(double target_max, + std::vector> &logits); + + // Profiling related functions + void reset_profiling_statistics(); }; - }; // namespace FlexFlow diff --git a/include/flexflow/substitution_loader.h b/include/flexflow/substitution_loader.h index e0c252ffd..e7367c5bb 100644 --- a/include/flexflow/substitution_loader.h +++ b/include/flexflow/substitution_loader.h @@ -103,6 +103,7 @@ NLOHMANN_JSON_SERIALIZE_ENUM( {OP_SHAPE, "OP_SHAPE"}, {OP_SIZE, "OP_SIZE"}, {OP_TOPK, "OP_TOPK"}, + {OP_GUMBEL_TOPK, "OP_GUMBEL_TOPK"}, {OP_WHERE, "OP_WHERE"}, {OP_CEIL, "OP_CEIL"}, {OP_CAST, "OP_CAST"}, diff --git a/include/flexflow/utils/communication_buffer.h b/include/flexflow/utils/communication_buffer.h new file mode 100644 index 000000000..016860bf6 --- /dev/null +++ b/include/flexflow/utils/communication_buffer.h @@ -0,0 +1,78 @@ +/* Copyright 2023 CMU, Stanford, Facebook, LANL + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _COMMUNICATION_BUFFER_H
+#define _COMMUNICATION_BUFFER_H
+
+#include "legion.h"
+#include
+#ifdef FF_USE_NCCL
+#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
+#include
+#else
+#include
+#endif
+#endif
+
+// adapted from https://github.com/mlc-ai/relax
+
+// The CUDA interdevice communication memory object,
+// which internally contains data pointers to other devices' peer memory.
+// It is useful for an efficient all-reduce implementation.
+// Right now the class members are closely tied to the customized
+// all-reduce kernel. They may also be extended for other uses in
+// the future.
+class CommunicationBuffer {
+public:
+  // The device information for the CUDA CommunicationBuffer.
+  int num_devices;
+  int device_id;
+  void *local_ptr;
+
+  // The data pointers of all all-reduce inputs.
+  // It has "num_devices" pointers. The i-th pointer is the data pointer on
+  // worker i. If "i != device_id", the pointer is a peer data pointer of
+  // another device. Otherwise, the pointer is a local CUDA data pointer.
+  std::vector comm_ptrs;
+
+  // The barrier helper data per CommunicationBuffer, which can be used
+  // by custom collective operations and allows fine-grained synchronization on
+  // each buffer. They have "num_devices" pointers, and the pointer arrangement
+  // is the same as "comm_ptrs".
+  std::vector barrier_in;
+  std::vector barrier_out;
+
+  // The integer buffer flag for all-reduce.
+  // It self-increments by 1 after each all-reduce operation.
+  int *barrier_flag;
+};
+
+// All NCCL operations need to be wrapped by Legion concurrent_task_barrier.
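As a point of reference for how comm_ptrs could be filled in, the following is a minimal single-process sketch, assuming the NCCL rank equals device_id and that peer access between the GPUs is already enabled. The helper name allgather_local_ptrs is hypothetical; it is not FlexFlow's create_comm_buf_with_local_ptr, which additionally exchanges the barrier pointers and reuses the caller-provided allgather buffers.

#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

// Hypothetical helper (illustrative only): each rank contributes the raw
// address of its local buffer, ncclAllGather replicates the pointer values
// across all ranks, and the gathered values are copied back to the host to
// populate comm_ptrs.
std::vector<void *> allgather_local_ptrs(int num_devices,
                                         int device_id,
                                         void *local_ptr,
                                         ncclComm_t nccl_comm,
                                         cudaStream_t stream) {
  void **d_send = nullptr; // staging buffer for this rank's pointer value
  void **d_recv = nullptr; // receives one pointer value per rank
  cudaMalloc(&d_send, sizeof(void *));
  cudaMalloc(&d_recv, num_devices * sizeof(void *));
  cudaMemcpyAsync(
      d_send, &local_ptr, sizeof(void *), cudaMemcpyHostToDevice, stream);
  // Gather sizeof(void *) bytes from every rank into d_recv.
  ncclAllGather(d_send, d_recv, sizeof(void *), ncclChar, nccl_comm, stream);
  std::vector<void *> comm_ptrs(num_devices);
  cudaMemcpyAsync(comm_ptrs.data(),
                  d_recv,
                  num_devices * sizeof(void *),
                  cudaMemcpyDeviceToHost,
                  stream);
  cudaStreamSynchronize(stream);
  comm_ptrs[device_id] = local_ptr; // the local slot is just our own pointer
  cudaFree(d_send);
  cudaFree(d_recv);
  return comm_ptrs;
}

The barrier_in and barrier_out pointer sets could be exchanged the same way, with one all-gather per set.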
+CommunicationBuffer *create_comm_buf_with_local_ptr(Legion::Context ctx, + Legion::Runtime *runtime, + int num_devices, + int device_id, + ncclComm_t ncclComm, + void *allgather_src, + void *allgather_dst, + void *local_ptr, + void *barrier_in_ptr, + void *barrier_out_ptr, + int *barrier_flag, + cudaStream_t stream); + +void release_comm_buf(CommunicationBuffer *comm_buf); + +#endif // _COMMUNICATION_BUFFER_H diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index f8bf67b3e..f5ea76c5b 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -3,6 +3,7 @@ #include "flexflow/accessor.h" #include "flexflow/ffconst.h" #include "legion.h" +#include #include #include #ifdef FF_USE_NCCL diff --git a/include/flexflow/utils/file_loader.h b/include/flexflow/utils/file_loader.h index 646eb18da..4ccc6db48 100644 --- a/include/flexflow/utils/file_loader.h +++ b/include/flexflow/utils/file_loader.h @@ -21,6 +21,7 @@ using namespace std; using namespace FlexFlow; +using namespace Legion; class FileDataLoader { public: @@ -29,17 +30,38 @@ class FileDataLoader { int _num_heads, int _num_kv_heads, size_t _hidden_dim, - size_t _qkv_inner_dim, + size_t _head_dim, int _tensor_parallelism_degree, bool _use_full_precision); BatchConfig::TokenId *generate_requests(int num, int length); template - void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx); + void load_single_weight_tensor(FFModel *ff, + Layer *l, + int weight_idx, + size_t volume, + size_t num_replicas, + DT *weight, + Domain weight_domain); - void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx); - void load_weights(FFModel *ff); + void load_quantization_weight(FFModel *ff, + Layer *l, + int weight_idx, + size_t volume, + size_t num_replicas, + char *weight, + DataType data_type, + Domain weight_domain); + + static void + load_weight_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + void load_weights_parallel(FFModel *ff, + Legion::Context ctx, + Legion::Runtime *runtime); void load_positions(FFModel *ff, Tensor pt, @@ -49,8 +71,26 @@ class FileDataLoader { private: int num_heads, num_kv_heads, tensor_parallelism_degree; - size_t hidden_dim, qkv_inner_dim; + size_t hidden_dim, head_dim; std::string prompts_filepath; std::string weights_folder; bool use_full_precision; }; + +struct WeightLoadTaskArgs { + FFModel *ff; + FileDataLoader *loader; + Layer *layer; + int weight_idx; + size_t volume, num_replicas; + DataType data_type; + WeightLoadTaskArgs(FFModel *_ff, + FileDataLoader *_loader, + Layer *_l, + int _idx, + size_t _volume, + size_t _num_replicas, + DataType _data_type) + : ff(_ff), loader(_loader), layer(_l), weight_idx(_idx), volume(_volume), + num_replicas(_num_replicas), data_type(_data_type) {} +}; diff --git a/include/flexflow/utils/memory_allocator.h b/include/flexflow/utils/memory_allocator.h index 8e50a4c3b..af3327b04 100644 --- a/include/flexflow/utils/memory_allocator.h +++ b/include/flexflow/utils/memory_allocator.h @@ -23,7 +23,9 @@ namespace FlexFlow { class MemoryAllocator { public: MemoryAllocator(Legion::Memory memory); - void create_legion_instance(Realm::RegionInstance &inst, size_t size); + void create_legion_instance(Realm::RegionInstance &inst, + size_t size, + char const *task_name = NULL); void register_reserved_work_space(void *base, size_t size); inline void *allocate_reserved_untyped(size_t datalen) { void *ptr = static_cast(reserved_ptr) + 
reserved_allocated_size; @@ -60,6 +62,7 @@ class MemoryAllocator { void *instance_ptr; size_t reserved_total_size, reserved_allocated_size; size_t instance_total_size, instance_allocated_size; + bool log_instance_creation; }; }; // namespace FlexFlow diff --git a/include/flexflow/utils/recursive_logger.h b/include/flexflow/utils/recursive_logger.h index 2c43b4230..d073f58f3 100644 --- a/include/flexflow/utils/recursive_logger.h +++ b/include/flexflow/utils/recursive_logger.h @@ -26,7 +26,7 @@ class DepthTag { class RecursiveLogger { public: - /* RecursiveLogger(LegionRuntime::Logger::Category const &); */ + /* RecursiveLogger(Legion::Logger const &); */ RecursiveLogger(std::string const &category_name); Realm::LoggerMessage info(); @@ -42,7 +42,7 @@ class RecursiveLogger { void print_prefix(Realm::LoggerMessage &) const; - LegionRuntime::Logger::Category logger; + Legion::Logger logger; }; }; // namespace FlexFlow diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index aae7256ff..a34d27e9a 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -20,19 +20,19 @@ #include "models/mpt.h" #include "models/opt.h" #include "models/starcoder.h" +#include #include -#include - using namespace FlexFlow; using namespace Legion; using json = nlohmann::json; -LegionRuntime::Logger::Category log_app("llama"); +Legion::Logger log_app("llama"); struct FilePaths { std::string cache_folder_path; std::string prompt_file_path; + std::string trace_file_path; std::string output_file_path; }; @@ -47,7 +47,24 @@ void parse_input_args(char **argv, float &topp, int &max_requests_per_batch, int &max_tokens_per_batch, - int &max_sequence_length) { + int &max_tokens_per_ssm_batch, + int &max_tokens_per_prefilling_batch, + int &max_sequence_length, + int &max_output_length, + size_t &max_kv_cache_size, + int &sampling_seed, + bool &streaming_cache, + bool &slo_attainment_early_termination, + double &baseline_latency_ms, + double &ssm_spec_latency_ms, + double &llm_verify_latency_ms, + double &slo_filter, + int &replica, + double &request_per_second, + std::string &emission_file_path, + bool &add_special_tokens, + bool &fcfs_slo, + bool &stta) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -67,6 +84,11 @@ void parse_input_args(char **argv, paths.prompt_file_path = std::string(argv[++i]); continue; } + // traces + if (!strcmp(argv[i], "-trace")) { + paths.trace_file_path = std::string(argv[++i]); + continue; + } // output file if (!strcmp(argv[i], "-output-file")) { paths.output_file_path = std::string(argv[++i]); @@ -101,10 +123,78 @@ void parse_input_args(char **argv, max_tokens_per_batch = std::stoi(argv[++i]); continue; } + if (!strcmp(argv[i], "--max-tokens-per-ssm-batch")) { + max_tokens_per_ssm_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-prefilling-batch")) { + max_tokens_per_prefilling_batch = std::stoi(argv[++i]); + continue; + } if (!strcmp(argv[i], "--max-sequence-length")) { max_sequence_length = std::stoi(argv[++i]); continue; } + if (!strcmp(argv[i], "--max-output-length")) { + max_output_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-kv-cache-size")) { + max_kv_cache_size = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--sampling-seed")) { + sampling_seed = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--enable-streaming-cache")) { + streaming_cache = true; + 
continue; + } + if (!strcmp(argv[i], "--slo-attainment-early-termination")) { + slo_attainment_early_termination = true; + continue; + } + if (!strcmp(argv[i], "--baseline-latency-ms")) { + baseline_latency_ms = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--ssm-spec-latency-ms")) { + ssm_spec_latency_ms = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--llm-verify-latency-ms")) { + llm_verify_latency_ms = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--eval-slo-filter")) { + slo_filter = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--eval-replica")) { + replica = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--request-per-second")) { + request_per_second = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--emission-file-path")) { + emission_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--no-special-tokens")) { + add_special_tokens = false; + continue; + } + if (!strcmp(argv[i], "--fcfs-serving")) { + fcfs_slo = true; + continue; + } + if (!strcmp(argv[i], "--stta-serving")) { + stta = true; + continue; + } } if (paths.cache_folder_path.empty()) { char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); @@ -131,11 +221,30 @@ void FlexFlow::top_level_task(Task const *task, bool use_full_precision = false; bool verbose = false; bool do_sample = false; - float temperature = 0.0f; - float topp = 0.0f; - int max_requests_per_batch = 8; + float temperature = 0.8f; + float topp = 0.6f; + int max_requests_per_batch = 1; int max_tokens_per_batch = 128; + int max_tokens_per_ssm_batch = -1; + int max_tokens_per_prefilling_batch = -1; int max_sequence_length = 256; + int max_output_length = 512; + size_t max_kv_cache_size = 0; // if -1, then use the default value + RequestManager::DecodingMode decoding_mode = + RequestManager::INCREMENTAL_DECODING; + int sampling_seed = 0; + bool streaming_cache = false; + bool slo_attainment_early_termination = false; + double baseline_latency_ms = 50; + double ssm_spec_latency_ms = 20; + double llm_verify_latency_ms = 50; + double slo_filter = 0.0; + int replica = 1; + double request_per_second = 1.0; + bool add_special_tokens = true; + bool fcfs_slo = false; + bool stta = false; + std::string emission_file_path; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -151,7 +260,33 @@ void FlexFlow::top_level_task(Task const *task, topp, max_requests_per_batch, max_tokens_per_batch, - max_sequence_length); + max_tokens_per_ssm_batch, + max_tokens_per_prefilling_batch, + max_sequence_length, + max_output_length, + max_kv_cache_size, + sampling_seed, + streaming_cache, + slo_attainment_early_termination, + baseline_latency_ms, + ssm_spec_latency_ms, + llm_verify_latency_ms, + slo_filter, + replica, + request_per_second, + emission_file_path, + add_special_tokens, + fcfs_slo, + stta); + if (max_tokens_per_ssm_batch == -1) { + max_tokens_per_ssm_batch = max_tokens_per_batch; + } + if (max_tokens_per_prefilling_batch == -1) { + max_tokens_per_prefilling_batch = max_tokens_per_batch; + } + if (slo_filter == 0.0) { + replica = 1; + } assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == @@ -179,7 +314,8 @@ void FlexFlow::top_level_task(Task const *task, ModelType model_type = ModelType::UNKNOWN; auto architectures = model_config["architectures"]; for (auto const &str : architectures) { - if (str == "LlamaForCausalLM" || str == 
"LLaMAForCausalLM") { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" || + str == "MistralForCausalLM") { model_type = ModelType::LLAMA; break; } else if (str == "OPTForCausalLM") { @@ -199,20 +335,48 @@ void FlexFlow::top_level_task(Task const *task, int bos_token_id = model_config.find("bos_token_id") == model_config.end() ? -1 : (int)model_config.at("bos_token_id"); - int eos_token_id = model_config.find("eos_token_id") == model_config.end() - ? -1 - : (int)model_config.at("eos_token_id"); + // int eos_token_id = model_config.find("eos_token_id") == model_config.end() + // ? -1 + // : (int)model_config.at("eos_token_id"); + std::vector eos_token_ids; + if (model_config.find("eos_token_id") != model_config.end()) { + if (model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : model_config["eos_token_id"]) { + eos_token_ids.push_back(eos_token_id); + } + } else { + eos_token_ids.push_back(model_config["eos_token_id"]); + } + } else { + eos_token_ids.push_back(-1); + } assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); + srand(sampling_seed); GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); rm->set_max_requests_per_batch(max_requests_per_batch); rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_tokens_per_ssm_batch(max_tokens_per_ssm_batch); + rm->set_max_tokens_per_prefilling_batch(max_tokens_per_prefilling_batch); rm->set_max_sequence_length(max_sequence_length); + rm->set_max_output_length(max_output_length); + rm->set_max_kv_cache_size(max_kv_cache_size); + rm->set_decoding_mode(decoding_mode); + rm->set_slo_violation_early_termination(slo_attainment_early_termination); + rm->set_baseline_latency(baseline_latency_ms); + rm->set_ssm_spec_latency(ssm_spec_latency_ms); + rm->set_llm_verify_latency(llm_verify_latency_ms); + rm->set_max_tree_depth(8); + rm->set_max_tree_width(16); + rm->set_verbose(verbose); + rm->set_streaming_cache(streaming_cache); + rm->set_fcfs_slo(fcfs_slo); + rm->set_stta(stta); rm->register_tokenizer( - model_type, bos_token_id, eos_token_id, tokenizer_filepath); + model_type, bos_token_id, eos_token_ids, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); FFModel model(ffconfig, ffconfig.cpu_offload); @@ -222,6 +386,7 @@ void FlexFlow::top_level_task(Task const *task, weights_filepath, INC_DECODING_MODE, generationConfig, + streaming_cache, use_full_precision); } else if (model_type == ModelType::OPT) { OPT::create_opt_model(model, @@ -255,24 +420,84 @@ void FlexFlow::top_level_task(Task const *task, rm->start_background_server(&model); - int total_num_requests = 0; { - using json = nlohmann::json; - std::ifstream file_handle(file_paths.prompt_file_path); - assert(file_handle.good() && "Prompt file does not exist."); - json prompt_json = json::parse(file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - std::vector prompts; - for (auto &prompt : prompt_json) { - std::string text = prompt.get(); - printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - total_num_requests++; - prompts.push_back(text); - } - std::vector result = - model.generate(prompts, 128 /*max_sequence_length*/); + std::vector requests; + std::vector results; + + if (!file_paths.prompt_file_path.empty()) { + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json 
prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + // Parse slo_ratios + std::vector> slo_ratios; + if (prompt_json[0].contains("slo_ratios")) { + for (auto &[key, value] : prompt_json[0]["slo_ratios"].items()) { + slo_ratios.emplace_back(std::stod(key), value.get()); + } + } + double total = std::accumulate( + slo_ratios.begin(), + slo_ratios.end(), + 0.0, + [](double sum, std::pair const &pair) { + return sum + pair.second; + }); + if (std::abs(total - 1.0) > 1e-6) { + std::cerr << "Error: slo_ratios values do not sum to 1. Total sum: " + << total << std::endl; + assert(false); + } + for (size_t i = 1; i < prompt_json.size(); ++i) { + requests.push_back( + GenerationRequest(prompt_json[i]["prompt"].get(), + -1.0, + 0, + add_special_tokens)); + } + PoissonEmissionMachine emission_machine(request_per_second, slo_ratios); + // ConstantEmissionMachine emission_machine(-1, slo_ratios); + results = model.generate(requests, emission_machine); + } else if (!file_paths.trace_file_path.empty()) { + std::ifstream file_handle(file_paths.trace_file_path); + assert(file_handle.good() && "Trace file does not exist."); + json trace_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + std::vector timestamps, ratios; + for (auto const &json_obj : trace_json) { + EmissionTrace trace(json_obj); + if (slo_filter != 0.0 && + std::fabs(trace.slo_ratio - slo_filter) > 1e-6) { + continue; + } + for (size_t i = 0; i < replica; ++i) { + requests.push_back( + GenerationRequest(trace.prompt, -1.0, 0, add_special_tokens)); + timestamps.push_back(trace.emission_time_ms); + ratios.push_back(trace.slo_ratio); + } + } + TraceEmissionMachine emission_machine(timestamps, ratios); + results = model.generate(requests, emission_machine); + } else { + assert(false && "No prompt or trace file provided."); + } + + // output generation results as json + if (!emission_file_path.empty()) { + json output_json; + for (size_t i = 0; i < results.size(); ++i) { + EmissionTrace trace(results[i]); + output_json.push_back(trace.to_json()); + } + std::ofstream emission_file_handle(emission_file_path); + emission_file_handle << output_json.dump(2) << std::endl; + } } // terminate the request manager by stopping the background thread diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index a529411dd..9049b3885 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -40,9 +40,10 @@ void FALCON::create_falcon_model(FFModel &ff, { // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); int const token_dims[] = { - (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), + std::max(mode == TREE_SEARCH_MODE + ? 
BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()), 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -62,6 +63,11 @@ void FALCON::create_falcon_model(FFModel &ff, Tensor mha = nullptr, mlp_output = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; + ff.set_num_transformer_layers(falcon_config.n_layer); + ff.set_num_kv_heads(falcon_config.n_head_kv); + ff.set_qkv_dim(falcon_config.hidden_size / falcon_config.n_head * 2); + ff.set_size_dt(data_type_size(input->data_type)); + for (int i = 0; i < falcon_config.n_layer; i++) { // set transformer layer id ff.set_transformer_layer_id(i); @@ -76,7 +82,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.layer_norm_epsilon, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_layer_norm( @@ -90,14 +96,14 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.layer_norm_epsilon, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = res_ln_outputs[0]; att_norm = res_ln_outputs[1]; } switch (mode) { - case BEAM_SEARCH_MODE: { + case TREE_SEARCH_MODE: { mha = ff.spec_inc_multiquery_self_attention( att_norm, falcon_config.hidden_size, @@ -111,12 +117,13 @@ void FALCON::create_falcon_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + falcon_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + false, /*streaming_cache*/ + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -136,19 +143,19 @@ void FALCON::create_falcon_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + falcon_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; } case INC_DECODING_MODE: { - mha = ff.inc_multiquery_self_attention( + mha = ff.groupquery_self_attention( att_norm, falcon_config.hidden_size, falcon_config.n_head, @@ -161,12 +168,13 @@ void FALCON::create_falcon_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + falcon_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + false, /*streaming_cache*/ + std::string("layers." 
+ std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -187,7 +195,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_h_to_4h") + std::string("layers." + std::to_string(i) + ".mlp.dense_h_to_4h") .c_str()); dense_h_to_4h = ff.gelu(dense_h_to_4h); @@ -203,7 +211,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_4h_to_h") + std::string("layers." + std::to_string(i) + ".mlp.dense_4h_to_h") .c_str()); } // final normalization and linear @@ -233,7 +241,7 @@ void FALCON::create_falcon_model(FFModel &ff, "lm_head"); Tensor output; - if (mode == BEAM_SEARCH_MODE) { + if (mode == TREE_SEARCH_MODE) { Tensor softmax = ff.softmax(lm_head, -1); output = ff.argmax(softmax, /*beam_Search*/ true); } else { diff --git a/inference/models/falcon.h b/inference/models/falcon.h index fce2dade3..a15c28991 100644 --- a/inference/models/falcon.h +++ b/inference/models/falcon.h @@ -16,6 +16,7 @@ // #include "file_loader.h" #include "flexflow/batch_config.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" #include @@ -50,6 +51,26 @@ class FALCON { : model_config["num_hidden_layers"]; parallel_attn = model_config["parallel_attn"]; vocab_size = model_config["vocab_size"]; + rotary_embedding_meta.apply_rotary_embedding = true; + if (model_config.find("rope_theta") != model_config.end()) { + rotary_embedding_meta.rope_theta = model_config["rope_theta"]; + } else { + rotary_embedding_meta.rope_theta = 10000.0f; + } + if (model_config.find("scaling_factor") != model_config.end() && + !model_config["scaling_factor"].is_null()) { + rotary_embedding_meta.rope_type = + model_config["scaling_factor"]["rope_type"]; + rotary_embedding_meta.factor = + model_config["scaling_factor"]["factor"]; + rotary_embedding_meta.low_freq_factor = + model_config["scaling_factor"]["low_freq_factor"]; + rotary_embedding_meta.high_freq_factor = + model_config["scaling_factor"]["high_freq_factor"]; + rotary_embedding_meta.original_max_position_embeddings = + model_config["scaling_factor"] + ["original_max_position_embeddings"]; + } } catch (json::exception const &e) { std::cerr << "Error parsing JSON file: " << e.what() << std::endl; assert(false); @@ -61,8 +82,7 @@ class FALCON { } // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; - max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; - max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; + k_of_arg_topk = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; } void print() const { @@ -76,18 +96,19 @@ class FALCON { std::cout << "\tn_layer: " << n_layer << std::endl; std::cout << "\tparallel_attn: " << parallel_attn << std::endl; std::cout << "\tvocab_size: " << vocab_size << std::endl; - + std::cout << "\trotary_embedding_meta: " << rotary_embedding_meta + << std::endl; // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; - std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; - std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl; + std::cout << "\tk_of_arg_topk: " << k_of_arg_topk << std::endl; } bool bias, multi_query, parallel_attn; int hidden_size, n_head, n_head_kv, n_layer, vocab_size; float layer_norm_epsilon; + RotaryEmbeddingMeta rotary_embedding_meta; // int max_seq_len, max_num_tokens; - int 
max_beam_width, max_beam_depth; + int k_of_arg_topk; }; static void create_falcon_model(FFModel &ff, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 517f53443..414306877 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -25,6 +25,7 @@ void LLAMA::create_llama_model(FFModel &ff, std::string const &weight_file_path, InferenceMode mode, GenerationConfig generation_config, + bool streaming_cache, bool use_full_precision) { // do not apply cpu offload in beam search model. LLAMAConfig llama_config(model_config_file_path); @@ -42,9 +43,10 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor input; { int const token_dims[] = { - (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), + std::max(mode == TREE_SEARCH_MODE + ? BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()), 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -58,10 +60,17 @@ void LLAMA::create_llama_model(FFModel &ff, use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "tok_embeddings"); + "embed_tokens"); Tensor w2 = nullptr; + // metadata that needs to be sent to page manager in order to calculate the kv + // cache per layer + ff.set_num_transformer_layers(llama_config.num_hidden_layers); + ff.set_num_kv_heads(llama_config.num_key_value_heads); + int qkv_dim = llama_config.hidden_size / llama_config.num_attention_heads * 2; + ff.set_qkv_dim(qkv_dim); + ff.set_size_dt(data_type_size(input->data_type)); for (int i = 0; i < llama_config.num_hidden_layers; i++) { // set transformer layer id ff.set_transformer_layer_id(i); @@ -75,7 +84,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.rms_norm_eps, llama_config.hidden_size, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_rms_norm( @@ -85,7 +94,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.rms_norm_eps, llama_config.hidden_size, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = token_att_norm[0]; att_norm = token_att_norm[1]; @@ -93,11 +102,12 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor mha; switch (mode) { - case BEAM_SEARCH_MODE: { - mha = ff.spec_inc_multihead_self_attention( + case TREE_SEARCH_MODE: { + mha = ff.spec_inc_multiquery_self_attention( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -106,21 +116,23 @@ void LLAMA::create_llama_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + llama_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + streaming_cache, + std::string("layers." 
+ std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; } case TREE_VERIFY_MODE: { - mha = ff.inc_multihead_self_attention_verify( + mha = ff.inc_multiquery_self_attention_verify( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -129,21 +141,22 @@ void LLAMA::create_llama_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + llama_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; } case INC_DECODING_MODE: { - mha = ff.inc_multihead_self_attention( + mha = ff.groupquery_self_attention( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -152,12 +165,13 @@ void LLAMA::create_llama_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + llama_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + streaming_cache, /*streaming_cache*/ + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -176,53 +190,52 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.rms_norm_eps, llama_config.hidden_size, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ffn_norm").c_str()); + std::string("layers." + std::to_string(i) + ".post_attention_layernorm") + .c_str()); token = token_ff_norm[0]; Tensor ff_norm = token_ff_norm[1]; - Tensor w1 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w1") - .c_str()); + Tensor w1 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.gate_proj").c_str()); - Tensor w3 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w3") - .c_str()); + Tensor w3 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." 
+ std::to_string(i) + ".mlp.up_proj").c_str()); - Tensor multi = ff.sigmoid_silu_multi(w1, w3); + Tensor multi = + ff.sigmoid_silu_multi(w1, w3, llama_config.intermediate_size); - w2 = - ff.dense(multi, - llama_config.hidden_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w2") - .c_str()); + w2 = ff.dense( + multi, + llama_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; @@ -244,23 +257,29 @@ void LLAMA::create_llama_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "output"); + "lm_head"); Tensor output; - if (mode == BEAM_SEARCH_MODE) { + if (mode == TREE_SEARCH_MODE) { Tensor softmax = ff.softmax(dense, -1); - // output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); - // output = ff.argmax(softmax, /*beam_Search*/ true); - output = ff.arg_top_k(softmax, llama_config.max_beam_width, false, true); - // output = ff.top_k(softmax, ) + output = ff.arg_top_k(softmax, llama_config.k_of_arg_topk, false, false); + } else if (mode == INC_DECODING_MODE) { + if (generation_config.do_sample) { + Tensor softmax = ff.softmax(dense, -1); + output = ff.sampling(softmax, generation_config.topp); + } else { + output = ff.argmax(dense, /*beam_Search*/ false); + } } else { - // Tensor softmax = ff.softmax(dense, -1); if (generation_config.do_sample) { dense = ff.scalar_truediv(dense, generation_config.temperature, false); Tensor softmax = ff.softmax(dense, -1); - output = ff.sampling(softmax, generation_config.topp); + if (generation_config.spec_sample) { + output = ff.gumbel_top_k(softmax, generation_config.topk, false, true); + } else { + output = ff.sampling(softmax, generation_config.topp); + } } else { - // output = ff.arg_top_k(dense, /*k=*/1, false); output = ff.argmax(dense, /*beam_Search*/ false); } } @@ -269,7 +288,7 @@ void LLAMA::create_llama_model(FFModel &ff, "", weight_file_path, llama_config.num_attention_heads, - llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size, llama_config.hidden_size / llama_config.num_attention_heads, ff.config.tensor_parallelism_degree, diff --git a/inference/models/llama.h b/inference/models/llama.h index ba1f0236f..cd6f9c5cc 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -16,6 +16,7 @@ // #include "file_loader.h" #include "flexflow/batch_config.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" #include @@ -36,9 +37,34 @@ class LLAMA { num_hidden_layers = model_config["num_hidden_layers"]; vocab_size = model_config["vocab_size"]; num_attention_heads = model_config["num_attention_heads"]; + if (model_config.find("num_key_value_heads") != model_config.end()) { + num_key_value_heads = model_config["num_key_value_heads"]; + } else { + num_key_value_heads = num_attention_heads; + } hidden_size = model_config["hidden_size"]; rms_norm_eps = model_config["rms_norm_eps"]; intermediate_size = model_config["intermediate_size"]; + rotary_embedding_meta.apply_rotary_embedding = true; + if (model_config.find("rope_theta") != model_config.end()) { + rotary_embedding_meta.rope_theta = model_config["rope_theta"]; + } else { + rotary_embedding_meta.rope_theta = 10000.0f; + } + if 
(model_config.find("scaling_factor") != model_config.end() && + !model_config["scaling_factor"].is_null()) { + rotary_embedding_meta.rope_type = + model_config["scaling_factor"]["rope_type"]; + rotary_embedding_meta.factor = + model_config["scaling_factor"]["factor"]; + rotary_embedding_meta.low_freq_factor = + model_config["scaling_factor"]["low_freq_factor"]; + rotary_embedding_meta.high_freq_factor = + model_config["scaling_factor"]["high_freq_factor"]; + rotary_embedding_meta.original_max_position_embeddings = + model_config["scaling_factor"] + ["original_max_position_embeddings"]; + } } catch (json::exception const &e) { std::cerr << "Error parsing LLAMA config from JSON file: " << e.what() << std::endl; @@ -49,10 +75,7 @@ class LLAMA { << std::endl; assert(false); } - // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; - max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; - max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; + k_of_arg_topk = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; } void print() const { @@ -61,21 +84,24 @@ class LLAMA { std::cout << "\tvocab_size: " << vocab_size << std::endl; std::cout << "\tnum_attention_heads: " << num_attention_heads << std::endl; + std::cout << "\tnum_key_value_heads: " << num_key_value_heads + << std::endl; std::cout << "\thidden_size: " << hidden_size << std::endl; std::cout << "\trms_norm_eps: " << rms_norm_eps << std::endl; std::cout << "\tintermediate_size: " << intermediate_size << std::endl; - + std::cout << "\trotary_embedding_meta: " << rotary_embedding_meta + << std::endl; // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; - std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; - std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl; + std::cout << "\tk_of_arg_topk : " << k_of_arg_topk << std::endl; } // int max_seq_len, max_num_tokens; - int max_beam_width, max_beam_depth; - int num_hidden_layers, vocab_size, num_attention_heads, hidden_size, - intermediate_size; + int k_of_arg_topk; + int num_hidden_layers, vocab_size, num_attention_heads, num_key_value_heads, + hidden_size, intermediate_size; float rms_norm_eps; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_llama_model(FFModel &ff, @@ -83,6 +109,7 @@ class LLAMA { std::string const &weight_file_path, InferenceMode mode, GenerationConfig generation_config, + bool streaming_cache, bool use_full_precision = false); }; diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 70e2b5e9c..b95cb5c91 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -41,9 +41,10 @@ void MPT::create_mpt_model(FFModel &ff, Tensor input; { int const token_dims[] = { - (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), + std::max(mode == TREE_SEARCH_MODE + ? BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()), 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -58,11 +59,15 @@ void MPT::create_mpt_model(FFModel &ff, use_full_precision ? 
DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor intermediate_output = nullptr, layernorm_output = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; + ff.set_num_transformer_layers(mpt_config.n_layers); + ff.set_num_kv_heads(mpt_config.n_heads); + ff.set_qkv_dim(mpt_config.hidden_size / mpt_config.n_heads * 2); + ff.set_size_dt(data_type_size(input->data_type)); for (int i = 0; i < mpt_config.n_layers; i++) { ff.set_transformer_layer_id(i); @@ -74,7 +79,7 @@ void MPT::create_mpt_model(FFModel &ff, 1e-05, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); } else { ff.residual_layer_norm( intermediate_output, @@ -87,14 +92,14 @@ void MPT::create_mpt_model(FFModel &ff, 1e-05, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; } Tensor attn_outputs; switch (mode) { - case BEAM_SEARCH_MODE: { + case TREE_SEARCH_MODE: { attn_outputs = ff.spec_inc_multihead_self_attention( layernorm_output, mpt_config.hidden_size, @@ -107,13 +112,13 @@ void MPT::create_mpt_model(FFModel &ff, false, DT_NONE, /*data_type*/ NULL, - false, + mpt_config.rotary_embedding_meta, /*scaling query*/ true, /*scaling factor*/ pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -131,13 +136,13 @@ void MPT::create_mpt_model(FFModel &ff, false, DT_NONE, /*data_type*/ NULL, - false, + mpt_config.rotary_embedding_meta, /*scaling query*/ true, /*scaling factor*/ pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -155,13 +160,13 @@ void MPT::create_mpt_model(FFModel &ff, false, DT_NONE, /*data_type*/ NULL, - false, + mpt_config.rotary_embedding_meta, /*scaling query*/ true, /*scaling factor*/ pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -182,7 +187,7 @@ void MPT::create_mpt_model(FFModel &ff, 1e-05, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_2").c_str()); + std::string("layers." + std::to_string(i) + ".norm_2").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; @@ -198,7 +203,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_up_proj").c_str()); + std::string("layers." + std::to_string(i) + ".ffn.up_proj").c_str()); layernorm_output = ff.gelu(layernorm_output); intermediate_output = ff.dense( layernorm_output, @@ -211,7 +216,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_down_proj").c_str()); + std::string("layers." 
+ std::to_string(i) + ".ffn.down_proj").c_str()); } // final @@ -225,7 +230,7 @@ void MPT::create_mpt_model(FFModel &ff, 1e-05, false, DT_NONE, - "transformer_norm_f"); + "norm_f"); Tensor all_final_norm = res_ln_outputs[1]; Tensor lm_head = ff.dense(all_final_norm, @@ -241,7 +246,7 @@ void MPT::create_mpt_model(FFModel &ff, "lm_head"); Tensor output; - if (mode == BEAM_SEARCH_MODE) { + if (mode == TREE_SEARCH_MODE) { Tensor softmax = ff.softmax(lm_head, -1); output = ff.argmax(softmax, /*beam_Search*/ true); } else { diff --git a/inference/models/mpt.h b/inference/models/mpt.h index 08597e1d7..8466ea1cb 100644 --- a/inference/models/mpt.h +++ b/inference/models/mpt.h @@ -16,6 +16,7 @@ // #include "file_loader.h" #include "flexflow/batch_config.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" #include @@ -37,6 +38,7 @@ class MPT { n_heads = model_config["n_heads"]; n_layers = model_config["n_layers"]; vocab_size = model_config["vocab_size"]; + rotary_embedding_meta.apply_rotary_embedding = false; } catch (json::exception const &e) { std::cerr << "Error parsing JSON file: " << e.what() << std::endl; assert(false); @@ -48,8 +50,7 @@ class MPT { } // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; - max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; - max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; + k_of_arg_topk = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; } void print() const { @@ -61,8 +62,9 @@ class MPT { } // int max_seq_len, max_num_tokens; - int max_beam_width, max_beam_depth; + int k_of_arg_topk; int hidden_size, n_heads, n_layers, vocab_size; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_mpt_model(FFModel &ff, diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 5677d5658..352809ede 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -43,9 +43,10 @@ void OPT::create_opt_model(FFModel &ff, ff.set_position_offset(2); { int const token_dims[] = { - (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), + std::max(mode == TREE_SEARCH_MODE + ? BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()), 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); @@ -76,6 +77,10 @@ void OPT::create_opt_model(FFModel &ff, Tensor fc2 = nullptr, added = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; + ff.set_num_transformer_layers(opt_config.num_hidden_layers); + ff.set_num_kv_heads(opt_config.num_attention_heads); + ff.set_qkv_dim(opt_config.hidden_size / opt_config.num_attention_heads * 2); + ff.set_size_dt(data_type_size(input->data_type)); for (int i = 0; i < opt_config.num_hidden_layers; i++) { // set transformer layer id ff.set_transformer_layer_id(i); @@ -95,14 +100,14 @@ void OPT::create_opt_model(FFModel &ff, 1e-05, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_layer_norm") + std::string("layers." 
+ std::to_string(i) + ".self_attn_layer_norm") .c_str()); Tensor residual = res_ln_outputs[0]; Tensor hidden_states = res_ln_outputs[1]; Tensor mha; switch (mode) { - case BEAM_SEARCH_MODE: { + case TREE_SEARCH_MODE: { mha = ff.spec_inc_multihead_self_attention( hidden_states, opt_config.hidden_size, @@ -115,13 +120,13 @@ void OPT::create_opt_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - false, /*apply_rotary_embedding*/ - true, /*scaling query*/ + opt_config.rotary_embedding_meta, + true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -139,13 +144,13 @@ void OPT::create_opt_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - false, /*apply_rotary_embedding*/ - true, /*scaling query*/ + opt_config.rotary_embedding_meta, + true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -163,13 +168,13 @@ void OPT::create_opt_model(FFModel &ff, false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - false, /*apply_rotary_embedding*/ - true, /*scaling query*/ + opt_config.rotary_embedding_meta, + true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -187,8 +192,8 @@ void OPT::create_opt_model(FFModel &ff, 1e-05, true, DT_NONE, - std::string("layers_" + std::to_string(i) + - "_add_bias_residual_layer_norm") + std::string("layers." + std::to_string(i) + + ".add_bias_residual_layer_norm") .c_str()); added = res_ln_outputs[0]; Tensor final_norm = res_ln_outputs[1]; @@ -205,7 +210,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc1").c_str()); + std::string("layers." + std::to_string(i) + ".fc1").c_str()); fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, @@ -216,7 +221,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc2").c_str()); + std::string("layers." 
+ std::to_string(i) + ".fc2").c_str()); } // final @@ -243,13 +248,12 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "embed_tokens_weight_lm_head"); + "lm_head"); Tensor output; - if (mode == BEAM_SEARCH_MODE) { + if (mode == TREE_SEARCH_MODE) { Tensor softmax = ff.softmax(lm_head, -1); - // output = ff.beam_top_k(softmax, opt_config.max_beam_width, false); - output = ff.argmax(softmax, /*beam_Search*/ true); + output = ff.arg_top_k(softmax, opt_config.k_of_arg_topk, false, false); } else { // output = ff.arg_top_k(lm_head, /*k=*/1, false); output = ff.argmax(lm_head, /*beam_Search*/ false); diff --git a/inference/models/opt.h b/inference/models/opt.h index 7c736a26d..23ba8888b 100644 --- a/inference/models/opt.h +++ b/inference/models/opt.h @@ -16,6 +16,7 @@ // #include "file_loader.h" #include "flexflow/batch_config.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" #include @@ -45,6 +46,7 @@ class OPT { num_hidden_layers = model_config["num_hidden_layers"]; vocab_size = model_config["vocab_size"]; word_embed_proj_dim = model_config["word_embed_proj_dim"]; + rotary_embedding_meta.apply_rotary_embedding = false; } catch (json::exception const &e) { std::cerr << "Error parsing JSON file: " << e.what() << std::endl; assert(false); @@ -56,8 +58,7 @@ class OPT { } // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; - max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; - max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; + k_of_arg_topk = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; } void print() const { @@ -78,19 +79,20 @@ class OPT { std::cout << "\tvocab_size: " << vocab_size << std::endl; std::cout << "\tword_embed_proj_dim: " << word_embed_proj_dim << std::endl; - + std::cout << "\trotary_embedding_meta: " << rotary_embedding_meta + << std::endl; // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; - std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; - std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl; + std::cout << "\tk_of_arg_topk : " << k_of_arg_topk << std::endl; } // int max_seq_len, max_num_tokens; - int max_beam_width, max_beam_depth; + int k_of_arg_topk; bool do_layer_norm_before, enable_bias, layer_norm_elementwise_affine; float dropout; int ffn_dim, hidden_size, max_position_embeddings, num_attention_heads, num_hidden_layers, vocab_size, word_embed_proj_dim; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_opt_model(FFModel &ff, diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 8b0dc1098..401a754d0 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -49,9 +49,10 @@ void STARCODER::create_starcoder_model( { // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); int const token_dims[] = { - (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), + std::max(mode == TREE_SEARCH_MODE + ? BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()), 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); @@ -66,7 +67,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? 
DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor positional_embedding = ff.embedding(position_input, @@ -76,11 +77,16 @@ void STARCODER::create_starcoder_model( use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wpe"); + "wpe"); Tensor residual = nullptr, c_proj = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; + ff.set_num_transformer_layers(startcoder_config.num_hidden_layers); + ff.set_num_kv_heads(startcoder_config.num_attention_heads); + ff.set_qkv_dim(startcoder_config.hidden_size / + startcoder_config.num_attention_heads * 2); + ff.set_size_dt(data_type_size(input->data_type)); for (int i = 0; i < startcoder_config.num_hidden_layers; i++) { // set transformer layer id ff.set_transformer_layer_id(i); @@ -97,14 +103,14 @@ void STARCODER::create_starcoder_model( startcoder_config.layer_norm_epsilon, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_1").c_str()); + std::string("layers." + std::to_string(i) + ".ln_1").c_str()); Tensor hidden_states = res_ln_outputs[0]; Tensor ln_1 = res_ln_outputs[1]; Tensor mha; switch (mode) { case INC_DECODING_MODE: { - mha = ff.inc_multiquery_self_attention( + mha = ff.groupquery_self_attention( ln_1, startcoder_config.hidden_size, startcoder_config.num_attention_heads, @@ -113,18 +119,19 @@ void STARCODER::create_starcoder_model( startcoder_config.num_attention_heads, startcoder_config.hidden_size / startcoder_config.num_attention_heads, - startcoder_config.dropout_p, /*dropout*/ - true, /*bias*/ - false, /*add_bias_kv*/ - false, /*add_zero_attn*/ - DT_NONE, /*data_type*/ - nullptr, /*kernel_initializer*/ - false, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + startcoder_config.dropout_p, /*dropout*/ + true, /*bias*/ + false, /*add_bias_kv*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + nullptr, /*kernel_initializer*/ + startcoder_config.rotary_embedding_meta, /*apply_rotary_embedding*/ + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ + false, /*streaming_cache*/ + std::string("layers." + std::to_string(i) + ".attn.c_attn") .c_str() /*name*/ ); break; @@ -145,7 +152,7 @@ void STARCODER::create_starcoder_model( startcoder_config.layer_norm_epsilon, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_2").c_str()); + std::string("layers." + std::to_string(i) + ".ln_2").c_str()); residual = res_ln_outputs[0]; Tensor l2_norm = res_ln_outputs[1]; @@ -161,7 +168,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_fc").c_str()); + std::string("layers." + std::to_string(i) + ".mlp.c_fc").c_str()); c_fc = ff.gelu(c_fc); @@ -176,7 +183,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_proj").c_str()); + std::string("layers." 
+ std::to_string(i) + ".mlp.c_proj").c_str()); } // final normalization and linear ff.residual_layer_norm(residual, @@ -189,7 +196,7 @@ void STARCODER::create_starcoder_model( startcoder_config.layer_norm_epsilon, true, DT_NONE, - "transformer_ln_f"); + "ln_f"); Tensor ln_f = res_ln_outputs[1]; Tensor lm_head = ff.dense(ln_f, @@ -205,10 +212,10 @@ void STARCODER::create_starcoder_model( "lm_head"); Tensor output; - if (mode == BEAM_SEARCH_MODE) { + if (mode == TREE_SEARCH_MODE) { Tensor softmax = ff.softmax(lm_head, -1); - // output = ff.beam_top_k(softmax, startcoder_config.max_beam_width, false); - output = ff.argmax(softmax, /*beam_Search*/ true); + output = + ff.arg_top_k(softmax, startcoder_config.k_of_arg_topk, false, false); } else { // Tensor softmax = ff.softmax(dense, -1); if (generationConfig.do_sample) { diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h index 0e9577d56..57e1229f1 100644 --- a/inference/models/starcoder.h +++ b/inference/models/starcoder.h @@ -16,6 +16,7 @@ // #include "file_loader.h" #include "flexflow/batch_config.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" #include @@ -41,6 +42,7 @@ class STARCODER { intermediate_size = model_config["n_inner"]; dropout_p = model_config["attn_pdrop"]; max_position_embeddings = model_config["n_positions"]; + rotary_embedding_meta.apply_rotary_embedding = false; } catch (json::exception const &e) { std::cerr << "Error parsing STARCODER config from JSON file: " << e.what() << std::endl; @@ -53,17 +55,17 @@ class STARCODER { } // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; - max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; - max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; + k_of_arg_topk = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; } void print() const {} // int max_seq_len, max_num_tokens; - int max_beam_width, max_beam_depth; + int k_of_arg_topk; int num_hidden_layers, vocab_size, num_attention_heads, hidden_size, intermediate_size, max_position_embeddings; float layer_norm_epsilon, dropout_p; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_starcoder_model(FFModel &ff, diff --git a/inference/python/chat.py b/inference/python/chat.py new file mode 100644 index 000000000..95132443a --- /dev/null +++ b/inference/python/chat.py @@ -0,0 +1,103 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
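+# This example serves one chat-style request with incremental decoding, using the
+# 8-GPU, tensor-parallel (degree 8) configuration returned by get_configs() below.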
+ +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + + +def get_configs(): + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 8, + "memory_per_gpu": 34000, + "zero_copy_memory_per_node": 200000, + # optional parameters + "num_cpus": 16, + "legion_utility_processors": 16, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 8, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "profiling": False, + "benchmarking": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required parameters + "llm_model": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", + # optional parameters + "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "refresh_cache": False, + "full_precision": False, + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + +def main(): + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + # Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + ) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=2048, + max_tokens_per_batch=256, + ) + + llm.start_server() + + nemotron_system = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature." + llama_generic_system = "You are a helpful an honest programming assistant." 
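# --- Editorial sketch (not part of the patch): chat.py above discards the
# value returned by llm.generate(). A minimal way to inspect the generated
# text, assuming the Python binding mirrors the C++ GenerationResult used
# elsewhere in this patch (an `output_text` field per request); treat the
# exact attribute name and return shape as assumptions.
def print_results(results):
    # generate() may hand back a single result or a list of results
    for r in results if isinstance(results, list) else [results]:
        print(r.output_text)
# usage: print_results(llm.generate(messages, max_new_tokens=1024))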
+ + + messages=[ + {"role": "system", "content": nemotron_system}, + {"role": "user", "content": "Is Rust better than Python?"}, + ] + llm.generate(messages, max_new_tokens=1024) + + llm.stop_server() + + +if __name__ == "__main__": + print("flexflow inference example (incremental decoding)") + main() diff --git a/inference/simplified_infer/CMakeLists.txt b/inference/simplified_infer/CMakeLists.txt new file mode 100644 index 000000000..35ee40711 --- /dev/null +++ b/inference/simplified_infer/CMakeLists.txt @@ -0,0 +1,74 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlow_SpecInfer) +set(project_target1 specinfer) + + +set(CPU_SRC1 + ${FLEXFLOW_CPP_DRV_SRC} + specinfer.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target1} ${CPU_SRC1}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC1} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target1} ${CPU_SRC1}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target1} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target1} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target1} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target1} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target1} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target1} DESTINATION ${BIN_DEST}) + + +project(FlexFlow_IncrDecoding) +set(project_target3 incr_dec) + + +set(CPU_SRC3 + ${FLEXFLOW_CPP_DRV_SRC} + incr_dec.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target3} ${CPU_SRC3}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC3} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target3} ${CPU_SRC3}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target3} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target3} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target3} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target3} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target3} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target3} DESTINATION ${BIN_DEST}) diff --git a/inference/simplified_infer/incr_dec.cc b/inference/simplified_infer/incr_dec.cc new file mode 100644 index 000000000..ed6125d0f --- /dev/null +++ 
b/inference/simplified_infer/incr_dec.cc @@ -0,0 +1,473 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string trace_file_path; + std::string trace_output_path; + std::string log_file_path; + std::string csv_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + bool &use_full_precision, + bool &verbose, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_output_length, + bool &do_sample, + int &request_per_second, + bool &add_special_tokens, + std::string &target_partition) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // traces + if (!strcmp(argv[i], "-trace")) { + paths.trace_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-trace-output-path")) { + paths.trace_output_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-target-partition")) { + target_partition = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-log-output-path")) { + paths.log_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-csv-output-path")) { + paths.csv_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-output-length")) { + max_output_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--request-per-second")) { + request_per_second = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--add-special-tokens")) { + add_special_tokens = true; + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + 
paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const &regions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 512; + int max_output_length = 512; + int num_warmup_requests = 0; + double warmup_delay = 15.0; + RequestManager::DecodingMode decoding_mode = + RequestManager::INCREMENTAL_DECODING; + int sampling_seed = 0; + int request_per_second = -1; + bool add_special_tokens = false; + std::string target_partition = "FEATURE_EXTRACTION"; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + use_full_precision, + verbose, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_output_length, + do_sample, + request_per_second, + add_special_tokens, + target_partition); + + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + // Get dataset + std::ifstream input_file(file_paths.trace_file_path); + assert(input_file.good() && "Prompt file does not exist."); + nlohmann::ordered_json j = nlohmann::ordered_json::parse(input_file); + input_file.close(); + + // Find the partition with name "FEATURE_EXTRACTION" + auto &partitions = j["partitions"]; + auto it = + std::find_if(partitions.begin(), + partitions.end(), + [target_partition](nlohmann::ordered_json const &partition) { + return partition["partition_name"] == target_partition; + }); + if (it == partitions.end()) { + std::cerr << "Partition " << target_partition + << " not found in the trace file." << std::endl; + assert(false); + } + nlohmann::ordered_json &partition = *it; + // check that the max prompt + response length sum in the eval_entries in the + // partition does not exceed the max_sequence_length + int max_prompt_response_length = 0; + for (auto &eval_entry : partition["eval_entries"]) { + int prompt_length = eval_entry["prompt_length"]; + int response_length = eval_entry["response_length"]; + if (response_length >= max_output_length) { + std::cerr << "Error: A response length from the target partition in the " + "dataset (=" + << response_length + << ") exceeds the max_output_length(=" << max_output_length + << ")." << std::endl; + assert(false); + } + max_prompt_response_length = + std::max(max_prompt_response_length, prompt_length + response_length); + } + if (max_prompt_response_length >= max_sequence_length) { + std::cerr << "Error: max prompt + response length sum (=" + << max_prompt_response_length + << ") in the eval_entries in the partition exceeds the " + "max_sequence_length(=" + << max_sequence_length << ")."
<< std::endl; + assert(false); + } + + // Get model configs + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" || + str == "MistralForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + std::vector eos_token_ids; + if (model_config.find("eos_token_id") != model_config.end()) { + if (model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : model_config["eos_token_id"]) { + eos_token_ids.push_back(eos_token_id); + } + } else { + eos_token_ids.push_back(model_config["eos_token_id"]); + } + } else { + eos_token_ids.push_back(-1); + } + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // set request manager properties + srand(sampling_seed); + GenerationConfig generationConfig(do_sample, 0.8, 0.6, false, 16); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch(max_requests_per_batch); + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_tokens_per_ssm_batch(max_tokens_per_batch); + rm->set_max_tokens_per_prefilling_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->set_max_output_length(max_output_length); + rm->set_decoding_mode(decoding_mode); + rm->set_slo_violation_early_termination(false); + rm->set_baseline_latency(50); + rm->set_ssm_spec_latency(20); + rm->set_llm_verify_latency(50); + rm->set_spec_infer_old_version(true); + rm->set_greedy_schedule(false); + rm->set_equal_schedule(false); + rm->set_max_tree_depth(8); + rm->set_max_tree_width(16); + rm->set_verbose(verbose); + rm->set_streaming_cache(false); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_ids, tokenizer_filepath); + rm->register_output_filepath(file_paths.log_file_path); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + false, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + 
config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + rm->start_background_server(&model); + + int total_num_requests = 0; + { + // Iterate through eval_entries + std::vector requests; + std::vector timestamps, ratios; + if (partition.contains("num_warmup_requests")) { + num_warmup_requests = partition["num_warmup_requests"]; + } + for (auto &entry : partition["eval_entries"]) { + std::string text = entry["prompt"]; + int max_new_tokens_ = entry["response_length"]; + + bool is_warmup_request = total_num_requests < num_warmup_requests; + double request_delay = + 1000.0 * + (request_per_second > 0 ? (1.0 / (double)request_per_second) : 0); + double emission_time_ms = + is_warmup_request + ? 0.0 + : (warmup_delay + + request_delay * (total_num_requests - num_warmup_requests)); + + GenerationRequest inference_req(text, // prompt + -1.0, // slo_ratio + emission_time_ms, // emission_time_ms + add_special_tokens); + + requests.push_back(inference_req); + timestamps.push_back(emission_time_ms); + ratios.push_back(1.0); + total_num_requests++; + + if (verbose) { + break; + } + } + TraceEmissionMachine emission_machine(timestamps, ratios); + std::vector result = + model.generate(requests, emission_machine); + assert(result.size() == requests.size()); + assert(result.size() == total_num_requests); + assert(result.size() == partition["eval_entries"].size()); + int i = 0; + for (auto &entry : partition["eval_entries"]) { + entry["original_response"] = entry["response"]; + entry["original_response_length"] = entry["response_length"]; + std::string ff_out = result[i].output_text; + int tot_length = result[i].output_text.length(); + entry["response"] = ff_out; + entry["response_length"] = result[i].output_tokens.size(); + entry["specinfer_decoding_steps"] = result[i].decoding_steps; + i++; + } + + // Write the modified JSON to a file + std::ofstream output_file(file_paths.trace_output_path); + if (output_file.is_open()) { + output_file << j.dump(2); + output_file.close(); + std::cout << "Modified JSON has been saved to " + << file_paths.trace_output_path << std::endl; + } else { + std::cerr << "Unable to open file for writing." 
<< std::endl; + } + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + std::string header = + "llm,partition,max_requests_per_batch,max_tokens_per_" + "batch,request_per_second,is_warmup_request,request_guid," + "request_step_idx,timestamp,num_generated_tokens"; + // csv filepath + // create csv filepath and add header if it doesn't exist + + bool csv_file_exists = std::filesystem::exists(file_paths.csv_file_path); + if (!csv_file_exists) { + // Create new file and write header + std::ofstream file(file_paths.csv_file_path); + if (!file.is_open()) { + std::cerr << "Failed to open file: " << file_paths.csv_file_path + << std::endl; + assert(false); + } + file << header << "\n"; + file.close(); + } + + // Append the new row + std::ofstream file(file_paths.csv_file_path, std::ios::app); + if (!file.is_open()) { + std::cerr << "Failed to open file: " << file_paths.csv_file_path + << std::endl; + } + + std::vector new_profiling_info = rm->get_new_profiling_info(); + for (auto const &info : new_profiling_info) { + file << llm_model_name + ","; + file << target_partition + ","; + file << std::to_string(max_requests_per_batch) + ","; + file << std::to_string(max_tokens_per_batch) + ","; + file << std::to_string(request_per_second) + ","; + bool is_warmup_request = + (info.request_guid - 1000000) < num_warmup_requests; + file << std::to_string(is_warmup_request) + ","; + file << info.request_guid << "," << info.request_step_idx << "," + << info.timestamp << "," << info.num_generated_tokens << "\n"; + } + file.close(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/simplified_infer/specinfer.cc b/inference/simplified_infer/specinfer.cc new file mode 100644 index 000000000..58f302075 --- /dev/null +++ b/inference/simplified_infer/specinfer.cc @@ -0,0 +1,692 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include +#include +#include +#include + +using namespace FlexFlow; +using namespace Legion; +using RequestGuid = BatchConfig::RequestGuid; + +Legion::Logger log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string trace_file_path; + std::string trace_output_path; + std::string log_file_path; + std::string csv_file_path; +}; + +struct ModelNames { + std::string llm_model_name; + std::vector ssm_model_names; +}; + +struct ModelMeta { + ModelNames model_names; + + ModelType llm_model_type; + std::string llm_tokenizer_path; + std::string llm_weights_path; + std::string llm_model_config_path; + + int bos_token_id; + std::vector eos_token_ids; + + std::vector ssm_model_types; + std::vector ssm_model_config_paths; + std::vector ssm_model_weights_paths; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + ModelNames &model_names, + bool &use_full_precision, + bool &verbose, + int &ssm_tp_degree, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_output_length, + int &max_tree_width, + int &max_tree_depth, + int &expansion_degree, + bool &do_sample, + int &request_per_second, + bool &add_special_tokens, + std::string &target_partition) { + for (int i = 1; i < argc; i++) { + // llm model name + if (!strcmp(argv[i], "-llm-model")) { + model_names.llm_model_name = std::string(argv[++i]); + for (char &c : model_names.llm_model_name) { + c = std::tolower(c); + } + continue; + } + // ssm models names + if (!strcmp(argv[i], "-ssm-model")) { + std::string ssm_model_name = std::string(argv[++i]); + for (char &c : ssm_model_name) { + c = std::tolower(c); + } + model_names.ssm_model_names.push_back(ssm_model_name); + continue; + } + if (!strcmp(argv[i], "-ssm-tp-degree")) { + ssm_tp_degree = std::stoi(argv[++i]); + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // trace + if (!strcmp(argv[i], "-trace")) { + paths.trace_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-trace-output-path")) { + paths.trace_output_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-target-partition")) { + target_partition = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-log-output-path")) { + paths.log_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-csv-output-path")) { + paths.csv_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-output-length")) { + max_output_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tree-width")) { + max_tree_width = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tree-depth")) { + max_tree_depth = 
std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--expansion-degree")) { + expansion_degree = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--request-per-second")) { + request_per_second = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--add-special-tokens")) { + add_special_tokens = true; + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void get_model_meta(FilePaths &file_paths, + ModelMeta &model_metadata, + bool use_full_precision) { + if (model_metadata.model_names.llm_model_name.empty() || + model_metadata.model_names.ssm_model_names.size() == 0) { + assert(false && "SpecInfer needs at least one LLM and one SSM for " + "speculative inference"); + } + model_metadata.llm_model_config_path = + join_path({file_paths.cache_folder_path, + "configs", + model_metadata.model_names.llm_model_name, + "config.json"}); + model_metadata.llm_tokenizer_path = + join_path({file_paths.cache_folder_path, + "tokenizers", + model_metadata.model_names.llm_model_name}); + model_metadata.llm_weights_path = + join_path({file_paths.cache_folder_path, + "weights", + model_metadata.model_names.llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + + std::ifstream llm_config_file_handle(model_metadata.llm_model_config_path); + if (!llm_config_file_handle.good()) { + std::cout << "LLM Model config file " + << model_metadata.llm_model_config_path << " not found." + << std::endl; + assert(false); + } + nlohmann::ordered_json llm_model_config = + nlohmann::ordered_json::parse(llm_config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + model_metadata.llm_model_type = ModelType::UNKNOWN; + auto architectures = llm_model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" || + str == "MistralForCausalLM") { + model_metadata.llm_model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_metadata.llm_model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_metadata.llm_model_type = ModelType::FALCON; + break; + } else if (str == "MPTForCausalLM") { + model_metadata.llm_model_type = ModelType::MPT; + break; + } + } + model_metadata.bos_token_id = + llm_model_config.find("bos_token_id") == llm_model_config.end() + ? -1 + : (int)llm_model_config.at("bos_token_id"); + // model_metadata.eos_token_id = + // llm_model_config.find("eos_token_id") == llm_model_config.end() + // ? 
-1 + // : (int)llm_model_config.at("eos_token_id"); + if (llm_model_config.find("eos_token_id") != llm_model_config.end()) { + if (llm_model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : llm_model_config["eos_token_id"]) { + model_metadata.eos_token_ids.push_back(eos_token_id); + } + } else { + model_metadata.eos_token_ids.push_back(llm_model_config["eos_token_id"]); + } + } else { + model_metadata.eos_token_ids.push_back(-1); + } + + for (auto ssm_model_name : model_metadata.model_names.ssm_model_names) { + std::string ssm_config_path = join_path({file_paths.cache_folder_path, + "configs", + ssm_model_name, + "config.json"}); + std::string ssm_tokenizer_path = + join_path({file_paths.cache_folder_path, "tokenizers", ssm_model_name}); + std::string ssm_weights_path = + join_path({file_paths.cache_folder_path, + "weights", + ssm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + + std::ifstream ssm_config_file_handle(ssm_config_path); + if (!ssm_config_file_handle.good()) { + std::cout << "SSM Model config file " << ssm_config_path << " not found." + << std::endl; + assert(false); + } + nlohmann::ordered_json ssm_model_config = + nlohmann::ordered_json::parse(ssm_config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + ModelType ssm_model_type = ModelType::UNKNOWN; + auto architectures = ssm_model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" || + str == "MistralForCausalLM") { + ssm_model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + ssm_model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM") { + ssm_model_type = ModelType::FALCON; + break; + } else if (str == "MPTForCausalLM") { + ssm_model_type = ModelType::MPT; + break; + } + } + int ssm_bos_id = + ssm_model_config.find("bos_token_id") == ssm_model_config.end() + ? -1 + : (int)ssm_model_config.at("bos_token_id"); + // int ssm_eos_id = + // ssm_model_config.find("eos_token_id") == ssm_model_config.end() + // ? 
-1 + // : (int)ssm_model_config.at("eos_token_id"); + // if (ssm_bos_id != model_metadata.bos_token_id || + // ssm_eos_id != model_metadata.eos_token_id) { + // printf("Warning: bos/eos token id mismatch between LLM and one of the " + // "SSMs!\n"); + // } + model_metadata.ssm_model_types.push_back(ssm_model_type); + model_metadata.ssm_model_config_paths.push_back(ssm_config_path); + model_metadata.ssm_model_weights_paths.push_back(ssm_weights_path); + } + + assert(model_metadata.llm_model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + for (auto mt : model_metadata.ssm_model_types) { + if (mt == ModelType::UNKNOWN) { + assert(false && "One of the SSM model types passed is invalid."); + } + } +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + FilePaths file_paths; + ModelMeta model_metadata; + bool use_full_precision = false; + bool verbose = false; + int ssm_tp_degree = 1; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 512; + int max_output_length = 512; + int expansion_degree = 3; + int max_tree_depth = 8; + int max_tree_width = 16; + RequestManager::DecodingMode decoding_mode = + RequestManager::SPECULATIVE_DECODING; + bool do_sample = false; + int sampling_seed = 0; + int request_per_second = -1; + int num_warmup_requests = 0; + double warmup_delay = 15.0; + bool add_special_tokens = false; + std::string target_partition = "FEATURE_EXTRACTION"; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + model_metadata.model_names, + use_full_precision, + verbose, + ssm_tp_degree, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_output_length, + max_tree_width, + max_tree_depth, + expansion_degree, + do_sample, + request_per_second, + add_special_tokens, + target_partition); + + get_model_meta(file_paths, model_metadata, use_full_precision); + + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + assert(ssm_tp_degree >= 1 && + ssm_tp_degree <= ffconfig.numNodes * ffconfig.workersPerNode); + + std::ifstream input_file(file_paths.trace_file_path); + assert(input_file.good() && "Prompt file does not exist."); + nlohmann::ordered_json j = nlohmann::ordered_json::parse(input_file); + input_file.close(); + + // Find the partition with name "FEATURE_EXTRACTION" + auto &partitions = j["partitions"]; + auto it = + std::find_if(partitions.begin(), + partitions.end(), + [target_partition](nlohmann::ordered_json const &partition) { + return partition["partition_name"] == target_partition; + }); + nlohmann::ordered_json &partition = *it; + if (it == partitions.end()) { + std::cerr << "Partition " << target_partition + << " not found in the trace file." 
<< std::endl; + assert(false); + } + // check that the max prompt + response length sum in the eval_entries in the + // partition does not exceed the max_sequence_length + int max_prompt_response_length = 0; + for (auto &eval_entry : partition["eval_entries"]) { + int prompt_length = eval_entry["prompt_length"]; + int response_length = eval_entry["response_length"]; + if (response_length >= max_output_length) { + std::cerr << "Error: A response length from the targt partition in the " + "dataset (=" + << response_length + << ") exceeds the max_output_length(=" << max_output_length + << ")." << std::endl; + assert(false); + } + max_prompt_response_length = + std::max(max_prompt_response_length, prompt_length + response_length); + } + if (max_prompt_response_length >= max_sequence_length) { + std::cerr << "Error: max prompt + response length sum (=" + << max_prompt_response_length + << ") in the eval_entries in the partition exceeds the " + "max_sequence_length(=" + << max_sequence_length << ")." << std::endl; + assert(false); + } + + // Sanity check for SpecInfer old version + assert(max_tree_depth <= 8); + assert(max_tree_width >= 3); + // Total verified tokens + assert(max_tokens_per_batch >= max_requests_per_batch * 21); + + // Create SentencePiece tokenizer or OPT tokenizer + srand(sampling_seed); + GenerationConfig generationConfig(do_sample, 0.8, 0.6, false, 16); + InferenceManager *im = InferenceManager::get_inference_manager(); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch(max_requests_per_batch); + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_tokens_per_ssm_batch(max_tokens_per_batch); + rm->set_max_tokens_per_prefilling_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->set_max_output_length(max_output_length); + rm->set_max_tree_depth(max_tree_depth); + rm->set_max_tree_width(max_tree_width); + rm->set_expansion_degree(expansion_degree); + rm->set_verbose(verbose); + rm->set_streaming_cache(false); + rm->register_tokenizer(model_metadata.llm_model_type, + model_metadata.bos_token_id, + model_metadata.eos_token_ids, + model_metadata.llm_tokenizer_path); + rm->set_decoding_mode(decoding_mode); + rm->set_slo_violation_early_termination(false); + rm->set_baseline_latency(50); + rm->set_ssm_spec_latency(20); + rm->set_llm_verify_latency(50); + rm->set_spec_infer_old_version(true); + rm->set_greedy_schedule(false); + rm->set_equal_schedule(false); + rm->register_output_filepath(file_paths.log_file_path); + + // Create LLM model + FFModel tree_model(ffconfig, ffconfig.cpu_offload); + if (model_metadata.llm_model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(tree_model, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, + TREE_VERIFY_MODE, + generationConfig, + false, + use_full_precision); + } else if (model_metadata.llm_model_type == ModelType::OPT) { + OPT::create_opt_model(tree_model, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, + TREE_VERIFY_MODE, + use_full_precision); + } else if (model_metadata.llm_model_type == ModelType::FALCON) { + FALCON::create_falcon_model(tree_model, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, + TREE_VERIFY_MODE, + use_full_precision); + } else if (model_metadata.llm_model_type == ModelType::MPT) { + MPT::create_mpt_model(tree_model, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, + TREE_VERIFY_MODE, + generationConfig, + 
use_full_precision); + } else { + assert(false && "Invalid LLM model type passed (or no type was passed)."); + } + + // Create SSM models + int num_ssms = model_metadata.ssm_model_types.size(); + std::vector ssm_model_ids; + std::vector ssm_models; + FFConfig bm_config = ffconfig; + std::cout << "SSM TP Degree: " << ssm_tp_degree << std::endl; + // bm_config.data_parallelism_degree = bm_config.tensor_parallelism_degree = + // bm_config.pipeline_parallelism_degree = 1; + bm_config.data_parallelism_degree = 1; + bm_config.tensor_parallelism_degree = ssm_tp_degree; + bm_config.pipeline_parallelism_degree = 1; + for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { + FFModel beam_model(bm_config); + ssm_models.push_back(beam_model); + } + + for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { + FFModel &beam_model = ssm_models[ssm_id]; + if (model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA) { + LLAMA::create_llama_model(beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + TREE_SEARCH_MODE, + generationConfig, + false, + use_full_precision); + } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::OPT) { + OPT::create_opt_model(beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + TREE_SEARCH_MODE, + use_full_precision); + } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::FALCON) { + FALCON::create_falcon_model( + beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + TREE_SEARCH_MODE, + use_full_precision); + } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::MPT) { + MPT::create_mpt_model(beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + TREE_SEARCH_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "Invalid SSM model type passed."); + } + + rm->register_ssm_model(&beam_model); + } + + rm->start_background_server(&tree_model); + + int total_num_requests = 0; + { + // Iterate through eval_entries + std::vector requests; + std::vector timestamps, ratios; + if (partition.contains("num_warmup_requests")) { + num_warmup_requests = partition["num_warmup_requests"]; + } + for (auto &entry : partition["eval_entries"]) { + std::string text = entry["prompt"]; + int max_new_tokens_ = entry["response_length"]; + + bool is_warmup_request = total_num_requests < num_warmup_requests; + double request_delay = + 1000.0 * + (request_per_second > 0 ? (1.0 / (double)request_per_second) : 0); + double emission_time_ms = + is_warmup_request + ? 
0.0 + : (warmup_delay + + request_delay * (total_num_requests - num_warmup_requests)); + + GenerationRequest inference_req(text, // prompt + -1.0, // slo_ratio + emission_time_ms, // emission_time_ms + add_special_tokens); + requests.push_back(inference_req); + timestamps.push_back(emission_time_ms); + ratios.push_back(1.0); + total_num_requests++; + + if (verbose) { + break; + } + } + TraceEmissionMachine emission_machine(timestamps, ratios); + std::vector result = + tree_model.generate(requests, emission_machine); + assert(result.size() == requests.size()); + assert(result.size() == total_num_requests); + assert(result.size() == partition["eval_entries"].size()); + int i = 0; + for (auto &entry : partition["eval_entries"]) { + entry["original_response"] = entry["response"]; + entry["original_response_length"] = entry["response_length"]; + std::string ff_out = result[i].output_text; + int tot_length = result[i].output_text.length(); + entry["response"] = ff_out; + entry["response_length"] = result[i].output_tokens.size(); + entry["specinfer_decoding_steps"] = result[i].decoding_steps; + i++; + } + + // Write the modified JSON to a file + std::ofstream output_file(file_paths.trace_output_path); + if (output_file.is_open()) { + output_file << j.dump(2); + output_file.close(); + std::cout << "Modified JSON has been saved to " + << file_paths.trace_output_path << std::endl; + } else { + std::cerr << "Unable to open file for writing." << std::endl; + } + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + std::string header = + "llm,ssm,partition,expansion_degree,max_tree_depth,max_tree_width,max_" + "requests_per_batch,max_tokens_per_batch,request_per_second,is_warmup_" + "request,request_guid," + "request_step_idx," + "timestamp,speculation_start_timestamp,speculation_end_timestamp,num_" + "speculated_tokens,num_accepted_tokens,num_generated_tokens"; + // csv filepath + // create csv filepath and add header if it doesn't exist + + bool csv_file_exists = std::filesystem::exists(file_paths.csv_file_path); + if (!csv_file_exists) { + // Create new file and write header + std::ofstream file(file_paths.csv_file_path); + if (!file.is_open()) { + std::cerr << "Failed to open file: " << file_paths.csv_file_path + << std::endl; + assert(false); + } + file << header << "\n"; + file.close(); + } + + // Append the new row + std::ofstream file(file_paths.csv_file_path, std::ios::app); + if (!file.is_open()) { + std::cerr << "Failed to open file: " << file_paths.csv_file_path + << std::endl; + } + + std::vector new_profiling_info = rm->get_new_profiling_info(); + for (auto const &info : new_profiling_info) { + file << model_metadata.model_names.llm_model_name + ","; + file << model_metadata.model_names.ssm_model_names[0] + ","; + file << target_partition + ","; + file << std::to_string(expansion_degree) + ","; + file << std::to_string(max_tree_depth) + ","; + file << std::to_string(max_tree_width) + ","; + file << std::to_string(max_requests_per_batch) + ","; + file << std::to_string(max_tokens_per_batch) + ","; + file << std::to_string(request_per_second) + ","; + bool is_warmup_request = + (info.request_guid - 1000000) < num_warmup_requests; + file << std::to_string(is_warmup_request) + ","; + file << info.request_guid << "," << info.request_step_idx << "," + << info.timestamp << "," << info.speculation_start_timestamp << "," + << info.speculation_end_timestamp << "," << info.num_speculated_tokens + << "," << info.num_accepted_tokens << "," 
<< info.num_generated_tokens + << "\n"; + } + file.close(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index f7edfd769..ddf92cbf6 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -18,19 +18,21 @@ #include "models/llama.h" #include "models/mpt.h" #include "models/opt.h" +#include #include -#include +#include #include using namespace FlexFlow; using namespace Legion; using json = nlohmann::json; -LegionRuntime::Logger::Category log_app("llama"); +Legion::Logger log_app("llama"); struct FilePaths { std::string cache_folder_path; std::string prompt_file_path; + std::string trace_file_path; std::string output_file_path; }; @@ -47,7 +49,8 @@ struct ModelMeta { std::string llm_weights_path; std::string llm_model_config_path; - int bos_token_id, eos_token_id; + int bos_token_id; + std::vector eos_token_ids; std::vector ssm_model_types; std::vector ssm_model_config_paths; @@ -60,10 +63,32 @@ void parse_input_args(char **argv, ModelNames &model_names, bool &use_full_precision, bool &verbose, + int &ssm_tp_degree, int &max_requests_per_batch, int &max_tokens_per_batch, + int &max_tokens_per_ssm_batch, + int &max_tokens_per_prefilling_batch, int &max_sequence_length, - int &expansion_degree) { + int &max_output_length, + size_t &max_kv_cache_size, + int &max_tree_width, + int &max_tree_depth, + int &expansion_degree, + bool &spec_sampling, + bool &do_sample, + int &sampling_seed, + bool &streaming_cache, + bool &slo_attainment_early_termination, + double &baseline_latency_ms, + double &ssm_spec_latency_ms, + double &llm_verify_latency_ms, + double &request_per_second, + bool &spec_infer_old_version, + bool &greedy_schedule, + bool &equal_schedule, + std::string &emission_file_path, + bool &add_special_tokens, + bool &eval_overhead_breakdown) { for (int i = 1; i < argc; i++) { // llm model name if (!strcmp(argv[i], "-llm-model")) { @@ -82,6 +107,10 @@ void parse_input_args(char **argv, model_names.ssm_model_names.push_back(ssm_model_name); continue; } + if (!strcmp(argv[i], "-ssm-tp-degree")) { + ssm_tp_degree = std::stoi(argv[++i]); + continue; + } // cache folder if (!strcmp(argv[i], "-cache-folder")) { paths.cache_folder_path = std::string(argv[++i]); @@ -92,6 +121,11 @@ void parse_input_args(char **argv, paths.prompt_file_path = std::string(argv[++i]); continue; } + // traces + if (!strcmp(argv[i], "-trace")) { + paths.trace_file_path = std::string(argv[++i]); + continue; + } // output file if (!strcmp(argv[i], "-output-file")) { paths.output_file_path = std::string(argv[++i]); @@ -114,14 +148,99 @@ void parse_input_args(char **argv, max_tokens_per_batch = std::stoi(argv[++i]); continue; } + if (!strcmp(argv[i], "--max-tokens-per-ssm-batch")) { + max_tokens_per_ssm_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-prefilling-batch")) { + max_tokens_per_prefilling_batch = std::stoi(argv[++i]); + continue; + } if (!strcmp(argv[i], "--max-sequence-length")) { max_sequence_length = std::stoi(argv[++i]); continue; } + if (!strcmp(argv[i], "--max-output-length")) { + max_output_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-kv-cache-size")) { + max_kv_cache_size = std::stoi(argv[++i]); + continue; + } + 
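// --- Editorial sketch (not part of the patch): the prompt JSON layout that
// this driver's -prompt path expects after this change. Element 0 may carry
// an "slo_ratios" map whose values must sum to 1; each following element
// carries a "prompt" string. Field names come from the parsing code added
// later in this file; the concrete keys and values below are illustrative
// only, and <fstream> plus nlohmann/json are assumed to be available (both
// are already used by this file).
//
//   nlohmann::json prompt_json = nlohmann::json::array();
//   prompt_json.push_back({{"slo_ratios", {{"1.0", 0.5}, {"2.0", 0.5}}}});
//   prompt_json.push_back({{"prompt", "What is speculative decoding?"}});
//   std::ofstream("prompts.json") << prompt_json.dump(2);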
if (!strcmp(argv[i], "--max-tree-width")) { + max_tree_width = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tree-depth")) { + max_tree_depth = std::stoi(argv[++i]); + continue; + } if (!strcmp(argv[i], "--expansion-degree")) { expansion_degree = std::stoi(argv[++i]); continue; } + if (!strcmp(argv[i], "--sampling-seed")) { + sampling_seed = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--spec-sampling")) { + spec_sampling = true; + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--enable-streaming-cache")) { + streaming_cache = true; + continue; + } + if (!strcmp(argv[i], "--slo-attainment-early-termination")) { + slo_attainment_early_termination = true; + continue; + } + if (!strcmp(argv[i], "--baseline-latency-ms")) { + baseline_latency_ms = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--ssm-spec-latency-ms")) { + ssm_spec_latency_ms = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--llm-verify-latency-ms")) { + llm_verify_latency_ms = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--request-per-second")) { + request_per_second = std::stod(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--spec-infer-old-version")) { + spec_infer_old_version = true; + continue; + } + if (!strcmp(argv[i], "--greedy-schedule")) { + greedy_schedule = true; + continue; + } + if (!strcmp(argv[i], "--equal-schedule")) { + equal_schedule = true; + continue; + } + if (!strcmp(argv[i], "--emission-file-path")) { + emission_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--no-special-tokens")) { + add_special_tokens = false; + continue; + } + if (!strcmp(argv[i], "--eval-overhead-breakdown")) { + eval_overhead_breakdown = true; + continue; + } } if (paths.cache_folder_path.empty()) { char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); @@ -173,7 +292,8 @@ void get_model_meta(FilePaths &file_paths, model_metadata.llm_model_type = ModelType::UNKNOWN; auto architectures = llm_model_config["architectures"]; for (auto const &str : architectures) { - if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" || + str == "MistralForCausalLM") { model_metadata.llm_model_type = ModelType::LLAMA; break; } else if (str == "OPTForCausalLM") { @@ -191,10 +311,21 @@ void get_model_meta(FilePaths &file_paths, llm_model_config.find("bos_token_id") == llm_model_config.end() ? -1 : (int)llm_model_config.at("bos_token_id"); - model_metadata.eos_token_id = - llm_model_config.find("eos_token_id") == llm_model_config.end() - ? -1 - : (int)llm_model_config.at("eos_token_id"); + // model_metadata.eos_token_id = + // llm_model_config.find("eos_token_id") == llm_model_config.end() + // ? 
-1 + // : (int)llm_model_config.at("eos_token_id"); + if (llm_model_config.find("eos_token_id") != llm_model_config.end()) { + if (llm_model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : llm_model_config["eos_token_id"]) { + model_metadata.eos_token_ids.push_back(eos_token_id); + } + } else { + model_metadata.eos_token_ids.push_back(llm_model_config["eos_token_id"]); + } + } else { + model_metadata.eos_token_ids.push_back(-1); + } for (auto ssm_model_name : model_metadata.model_names.ssm_model_names) { std::string ssm_config_path = join_path({file_paths.cache_folder_path, @@ -223,7 +354,8 @@ void get_model_meta(FilePaths &file_paths, ModelType ssm_model_type = ModelType::UNKNOWN; auto architectures = ssm_model_config["architectures"]; for (auto const &str : architectures) { - if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" || + str == "MistralForCausalLM") { ssm_model_type = ModelType::LLAMA; break; } else if (str == "OPTForCausalLM") { @@ -241,15 +373,15 @@ void get_model_meta(FilePaths &file_paths, ssm_model_config.find("bos_token_id") == ssm_model_config.end() ? -1 : (int)ssm_model_config.at("bos_token_id"); - int ssm_eos_id = - ssm_model_config.find("eos_token_id") == ssm_model_config.end() - ? -1 - : (int)ssm_model_config.at("eos_token_id"); - if (ssm_bos_id != model_metadata.bos_token_id || - ssm_eos_id != model_metadata.eos_token_id) { - printf("Warning: bos/eos token id mismatch between LLM and one of the " - "SSMs!\n"); - } + // int ssm_eos_id = + // ssm_model_config.find("eos_token_id") == ssm_model_config.end() + // ? -1 + // : (int)ssm_model_config.at("eos_token_id"); + // if (ssm_bos_id != model_metadata.bos_token_id || + // ssm_eos_id != model_metadata.eos_token_id) { + // printf("Warning: bos/eos token id mismatch between LLM and one of the " + // "SSMs!\n"); + // } model_metadata.ssm_model_types.push_back(ssm_model_type); model_metadata.ssm_model_config_paths.push_back(ssm_config_path); model_metadata.ssm_model_weights_paths.push_back(ssm_weights_path); @@ -274,11 +406,34 @@ void FlexFlow::top_level_task(Task const *task, ModelMeta model_metadata; bool use_full_precision = false; bool verbose = false; - int max_requests_per_batch = 16; - int max_tokens_per_batch = 256; - int max_sequence_length = 1024; - int max_spec_tree_token_num = 23; + int ssm_tp_degree = 1; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_tokens_per_ssm_batch = -1; + int max_tokens_per_prefilling_batch = -1; + int max_sequence_length = 512; + int max_output_length = 512; + size_t max_kv_cache_size = 0; // if 0, then use the default value int expansion_degree = 3; + int max_tree_depth = 8; + int max_tree_width = 16; + RequestManager::DecodingMode decoding_mode = + RequestManager::SPECULATIVE_DECODING; + bool spec_sampling = false; + bool do_sample = false; + int sampling_seed = 0; + bool streaming_cache = false; + bool slo_attainment_early_termination = false; + double baseline_latency_ms = 50; + double ssm_spec_latency_ms = 20; + double llm_verify_latency_ms = 50; + double request_per_second = 1.0; + bool spec_infer_old_version = false; + bool greedy_schedule = false; + bool equal_schedule = false; + bool add_special_tokens = true; + bool eval_overhead_breakdown = false; + std::string emission_file_path; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -289,37 +444,85 @@ void FlexFlow::top_level_task(Task const *task, 
model_metadata.model_names, use_full_precision, verbose, + ssm_tp_degree, max_requests_per_batch, max_tokens_per_batch, + max_tokens_per_ssm_batch, + max_tokens_per_prefilling_batch, max_sequence_length, - expansion_degree); + max_output_length, + max_kv_cache_size, + max_tree_width, + max_tree_depth, + expansion_degree, + spec_sampling, + do_sample, + sampling_seed, + streaming_cache, + slo_attainment_early_termination, + baseline_latency_ms, + ssm_spec_latency_ms, + llm_verify_latency_ms, + request_per_second, + spec_infer_old_version, + greedy_schedule, + equal_schedule, + emission_file_path, + add_special_tokens, + eval_overhead_breakdown); + if (max_tokens_per_ssm_batch == -1) { + max_tokens_per_ssm_batch = max_tokens_per_batch; + } + if (max_tokens_per_prefilling_batch == -1) { + max_tokens_per_prefilling_batch = max_tokens_per_batch; + } get_model_meta(file_paths, model_metadata, use_full_precision); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); + assert(ssm_tp_degree >= 1 && + ssm_tp_degree <= ffconfig.numNodes * ffconfig.workersPerNode); + + // Sanity check for SpecInfer old version + if (spec_infer_old_version) { + assert(max_tree_depth <= 8); + assert(max_tree_width >= 3); + // Total verified tokens + assert(max_tokens_per_batch >= max_requests_per_batch * 21); + } // Create SentencePiece tokenizer or OPT tokenizer - GenerationConfig generationConfig; + srand(sampling_seed); + GenerationConfig generationConfig(do_sample, 0.8, 0.6, spec_sampling, 16); InferenceManager *im = InferenceManager::get_inference_manager(); RequestManager *rm = RequestManager::get_request_manager(); rm->set_max_requests_per_batch(max_requests_per_batch); rm->set_max_tokens_per_batch(max_tokens_per_batch); - rm->set_max_spec_tree_token_num(max_spec_tree_token_num); + rm->set_max_tokens_per_ssm_batch(max_tokens_per_ssm_batch); + rm->set_max_tokens_per_prefilling_batch(max_tokens_per_prefilling_batch); rm->set_max_sequence_length(max_sequence_length); + rm->set_max_output_length(max_output_length); + rm->set_max_kv_cache_size(max_kv_cache_size); + rm->set_max_tree_depth(max_tree_depth); + rm->set_max_tree_width(max_tree_width); + rm->set_verbose(verbose); + rm->set_streaming_cache(streaming_cache); rm->register_tokenizer(model_metadata.llm_model_type, model_metadata.bos_token_id, - model_metadata.eos_token_id, + model_metadata.eos_token_ids, model_metadata.llm_tokenizer_path); + rm->set_decoding_mode(decoding_mode); + rm->set_slo_violation_early_termination(slo_attainment_early_termination); + rm->set_baseline_latency(baseline_latency_ms); + rm->set_ssm_spec_latency(ssm_spec_latency_ms); + rm->set_llm_verify_latency(llm_verify_latency_ms); + rm->set_spec_infer_old_version(spec_infer_old_version); + rm->set_greedy_schedule(greedy_schedule); + rm->set_equal_schedule(equal_schedule); rm->register_output_filepath(file_paths.output_file_path); - - // first decoding step: 3 results - if (expansion_degree != -1) { - rm->push_spec_infer_tree_width(1); - rm->push_spec_infer_tree_width(1); - rm->push_spec_infer_tree_width(expansion_degree); - } + rm->set_eval_overhead_breakdown(eval_overhead_breakdown); // Create LLM model FFModel tree_model(ffconfig, ffconfig.cpu_offload); @@ -329,6 +532,7 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.llm_weights_path, TREE_VERIFY_MODE, generationConfig, + false, use_full_precision); } else if (model_metadata.llm_model_type == ModelType::OPT) {
OPT::create_opt_model(tree_model, @@ -358,8 +562,12 @@ void FlexFlow::top_level_task(Task const *task, std::vector ssm_model_ids; std::vector ssm_models; FFConfig bm_config = ffconfig; - bm_config.data_parallelism_degree = bm_config.tensor_parallelism_degree = - bm_config.pipeline_parallelism_degree = 1; + std::cout << "SSM TP Degree: " << ssm_tp_degree << std::endl; + // bm_config.data_parallelism_degree = bm_config.tensor_parallelism_degree = + // bm_config.pipeline_parallelism_degree = 1; + bm_config.data_parallelism_degree = 1; + bm_config.tensor_parallelism_degree = ssm_tp_degree; + bm_config.pipeline_parallelism_degree = 1; for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { FFModel beam_model(bm_config); ssm_models.push_back(beam_model); @@ -371,27 +579,28 @@ void FlexFlow::top_level_task(Task const *task, LLAMA::create_llama_model(beam_model, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], - BEAM_SEARCH_MODE, + TREE_SEARCH_MODE, generationConfig, + streaming_cache, use_full_precision); } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::OPT) { OPT::create_opt_model(beam_model, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], - BEAM_SEARCH_MODE, + TREE_SEARCH_MODE, use_full_precision); } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::FALCON) { FALCON::create_falcon_model( beam_model, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], - BEAM_SEARCH_MODE, + TREE_SEARCH_MODE, use_full_precision); } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::MPT) { MPT::create_mpt_model(beam_model, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], - BEAM_SEARCH_MODE, + TREE_SEARCH_MODE, generationConfig, use_full_precision); } else { @@ -404,25 +613,79 @@ void FlexFlow::top_level_task(Task const *task, rm->start_background_server(&tree_model); // Register requests from prompt file - int total_num_requests = 0; { - using json = nlohmann::json; - std::ifstream file_handle(file_paths.prompt_file_path); - assert(file_handle.good() && "Prompt file does not exist."); - json prompt_json = json::parse(file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - - std::vector prompts; - for (auto &prompt : prompt_json) { - std::string text = prompt.get(); - printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - total_num_requests++; - prompts.push_back(text); - // tree_model.generate(text, 128 /*max_sequence_length*/); - } - tree_model.generate(prompts, 128 /*max_sequence_length*/); + std::vector requests; + std::vector results; + + if (!file_paths.prompt_file_path.empty()) { + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + // Parse slo_ratios + std::vector> slo_ratios; + if (prompt_json[0].contains("slo_ratios")) { + for (auto &[key, value] : prompt_json[0]["slo_ratios"].items()) { + slo_ratios.emplace_back(std::stod(key), value.get()); + } + } + double total = std::accumulate( + slo_ratios.begin(), + slo_ratios.end(), + 0.0, + [](double sum, std::pair const &pair) { + return sum + pair.second; + }); + if (std::abs(total - 1.0) > 1e-6) { + std::cerr << "Error: slo_ratios values do not sum to 1. 
Total sum: " + << total << std::endl; + assert(false); + } + for (size_t i = 1; i < prompt_json.size(); ++i) { + requests.push_back( + GenerationRequest(prompt_json[i]["prompt"].get(), + -1.0, + 0, + add_special_tokens)); + } + PoissonEmissionMachine emission_machine(request_per_second, slo_ratios); + // ConstantEmissionMachine emission_machine(-1, slo_ratios); + results = tree_model.generate(requests, emission_machine); + } else if (!file_paths.trace_file_path.empty()) { + std::ifstream file_handle(file_paths.trace_file_path); + assert(file_handle.good() && "Trace file does not exist."); + json trace_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + std::vector timestamps, ratios; + for (auto const &json_obj : trace_json) { + EmissionTrace trace(json_obj); + requests.push_back( + GenerationRequest(trace.prompt, -1.0, 0, add_special_tokens)); + timestamps.push_back(trace.emission_time_ms); + ratios.push_back(trace.slo_ratio); + } + timestamps.erase(timestamps.begin()); + timestamps.push_back(timestamps.back() + 1000.0); + TraceEmissionMachine emission_machine(timestamps, ratios); + results = tree_model.generate(requests, emission_machine); + } else { + assert(false && "No prompt or trace file provided."); + } + + // output generation results as json + if (!emission_file_path.empty()) { + json output_json; + for (size_t i = 0; i < results.size(); ++i) { + EmissionTrace trace(results[i]); + output_json.push_back(trace.to_json()); + } + std::ofstream emission_file_handle(emission_file_path); + emission_file_handle << output_json.dump(2) << std::endl; + } } // terminate the request manager by stopping the background thread diff --git a/inference/trace_generator/CMakeLists.txt b/inference/trace_generator/CMakeLists.txt new file mode 100644 index 000000000..f18eb712c --- /dev/null +++ b/inference/trace_generator/CMakeLists.txt @@ -0,0 +1,37 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlow_TraceGenerator) +set(project_target trace_generator) + + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + trace_generator.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target} ${CPU_SRC}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target} ${CPU_SRC}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) diff --git a/inference/trace_generator/Makefile b/inference/trace_generator/Makefile new file mode 100644 index 000000000..0e4b79f51 --- /dev/null +++ 
b/inference/trace_generator/Makefile @@ -0,0 +1,37 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= llama_pipeline +# List all the application source files here +ifndef CUDA_HOME +CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) +endif + + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/inference/trace_generator/trace_generator.cc b/inference/trace_generator/trace_generator.cc new file mode 100644 index 000000000..0b9285a0c --- /dev/null +++ b/inference/trace_generator/trace_generator.cc @@ -0,0 +1,558 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/inference.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +struct FilePaths { + std::string cache_folder_path; + std::string log_file_path; + std::string emission_file_path; +}; + +struct Prompts { + std::vector file_paths; + std::vector proportions; + std::vector slo_ratios; + + std::vector jsons; + std::vector idxs; +}; + +struct ModelNames { + std::string llm_model_name; + std::vector ssm_model_names; +}; + +struct ModelMeta { + ModelNames model_names; + + ModelType llm_model_type; + std::string llm_tokenizer_path; + std::string llm_weights_path; + std::string llm_model_config_path; + + int bos_token_id; + std::vector eos_token_ids; + + std::vector ssm_model_types; + std::vector ssm_model_config_paths; + std::vector ssm_model_weights_paths; +}; + +template +std::vector split_by_comma(std::string const &input) { + std::vector result; + std::stringstream ss(input); + std::string item; + while (std::getline(ss, item, ',')) { + std::stringstream item_stream(item); + if constexpr (std::is_same::value) { + double value; + if (item_stream >> value) { + result.push_back(value); + } + } else if constexpr (std::is_same::value) { + result.push_back(item); + } + } + return result; +} + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + Prompts &prompts, + ModelNames &model_names, + bool &use_full_precision, + bool &verbose, + int &max_sequence_length, + int &max_output_length, + size_t &max_kv_cache_size, + double &scaling_factor) { + for (int i = 1; i < argc; i++) { + // llm model name + if (!strcmp(argv[i], "-llm-model")) { + model_names.llm_model_name = std::string(argv[++i]); + for (char &c : model_names.llm_model_name) { + c = std::tolower(c); + } + continue; + } + // ssm models names + if (!strcmp(argv[i], "-ssm-model")) { + std::string ssm_model_name = std::string(argv[++i]); + for (char &c : ssm_model_name) { + c = std::tolower(c); + } + model_names.ssm_model_names.push_back(ssm_model_name); + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "--prompt-files")) { + prompts.file_paths = split_by_comma(std::string(argv[++i])); + continue; + } + if (!strcmp(argv[i], "--prompt-proportions")) { + prompts.proportions = split_by_comma(std::string(argv[++i])); + continue; + } + if (!strcmp(argv[i], "--prompt-slo-ratios")) { + prompts.slo_ratios = split_by_comma(std::string(argv[++i])); + continue; + } + // traces + if (!strcmp(argv[i], "-log")) { + paths.log_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--emission-file-path")) { + paths.emission_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-output-length")) { + max_output_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-kv-cache-size")) { + max_kv_cache_size = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--scaling-factor")) { + scaling_factor = 
std::stod(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void get_model_meta(FilePaths &file_paths, + ModelMeta &model_metadata, + bool use_full_precision) { + if (model_metadata.model_names.llm_model_name.empty() || + model_metadata.model_names.ssm_model_names.size() == 0) { + assert(false && "SpecInfer needs at least one LLM and one SSM for " + "speculative inference"); + } + model_metadata.llm_model_config_path = + join_path({file_paths.cache_folder_path, + "configs", + model_metadata.model_names.llm_model_name, + "config.json"}); + model_metadata.llm_tokenizer_path = + join_path({file_paths.cache_folder_path, + "tokenizers", + model_metadata.model_names.llm_model_name}); + model_metadata.llm_weights_path = + join_path({file_paths.cache_folder_path, + "weights", + model_metadata.model_names.llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + + std::ifstream llm_config_file_handle(model_metadata.llm_model_config_path); + if (!llm_config_file_handle.good()) { + std::cout << "LLM Model config file " + << model_metadata.llm_model_config_path << " not found." + << std::endl; + assert(false); + } + json llm_model_config = json::parse(llm_config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + model_metadata.llm_model_type = ModelType::UNKNOWN; + auto architectures = llm_model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" || + str == "MistralForCausalLM") { + model_metadata.llm_model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_metadata.llm_model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_metadata.llm_model_type = ModelType::FALCON; + break; + } else if (str == "MPTForCausalLM") { + model_metadata.llm_model_type = ModelType::MPT; + break; + } + } + model_metadata.bos_token_id = + llm_model_config.find("bos_token_id") == llm_model_config.end() + ? -1 + : (int)llm_model_config.at("bos_token_id"); + // model_metadata.eos_token_id = + // llm_model_config.find("eos_token_id") == llm_model_config.end() + // ? -1 + // : (int)llm_model_config.at("eos_token_id"); + if (llm_model_config.find("eos_token_id") != llm_model_config.end()) { + if (llm_model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : llm_model_config["eos_token_id"]) { + model_metadata.eos_token_ids.push_back(eos_token_id); + } + } else { + model_metadata.eos_token_ids.push_back(llm_model_config["eos_token_id"]); + } + } else { + model_metadata.eos_token_ids.push_back(-1); + } + + for (auto ssm_model_name : model_metadata.model_names.ssm_model_names) { + std::string ssm_config_path = join_path({file_paths.cache_folder_path, + "configs", + ssm_model_name, + "config.json"}); + std::string ssm_tokenizer_path = + join_path({file_paths.cache_folder_path, "tokenizers", ssm_model_name}); + std::string ssm_weights_path = + join_path({file_paths.cache_folder_path, + "weights", + ssm_model_name, + use_full_precision ? 
"full-precision" : "half-precision"}); + + std::ifstream ssm_config_file_handle(ssm_config_path); + if (!ssm_config_file_handle.good()) { + std::cout << "SSM Model config file " << ssm_config_path << " not found." + << std::endl; + assert(false); + } + json ssm_model_config = json::parse(ssm_config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + ModelType ssm_model_type = ModelType::UNKNOWN; + auto architectures = ssm_model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" || + str == "MistralForCausalLM") { + ssm_model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + ssm_model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM") { + ssm_model_type = ModelType::FALCON; + break; + } else if (str == "MPTForCausalLM") { + ssm_model_type = ModelType::MPT; + break; + } + } + int ssm_bos_id = + ssm_model_config.find("bos_token_id") == ssm_model_config.end() + ? -1 + : (int)ssm_model_config.at("bos_token_id"); + // int ssm_eos_id = + // ssm_model_config.find("eos_token_id") == ssm_model_config.end() + // ? -1 + // : (int)ssm_model_config.at("eos_token_id"); + // if (ssm_bos_id != model_metadata.bos_token_id || + // ssm_eos_id != model_metadata.eos_token_id) { + // printf("Warning: bos/eos token id mismatch between LLM and one of the " + // "SSMs!\n"); + // } + model_metadata.ssm_model_types.push_back(ssm_model_type); + model_metadata.ssm_model_config_paths.push_back(ssm_config_path); + model_metadata.ssm_model_weights_paths.push_back(ssm_weights_path); + } + + assert(model_metadata.llm_model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + for (auto mt : model_metadata.ssm_model_types) { + if (mt == ModelType::UNKNOWN) { + assert(false && "One of the SSM model types passed is invalid."); + } + } +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + FilePaths file_paths; + Prompts prompts; + ModelMeta model_metadata; + bool use_full_precision = false; + bool verbose = false; + int max_sequence_length = 256; + int max_output_length = 512; + size_t max_kv_cache_size = 0; + double scaling_factor = 1.0; + + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_tokens_per_ssm_batch = -1; + int max_tokens_per_prefilling_batch = -1; + int expansion_degree = 3; + int max_tree_depth = 8; + int max_tree_width = 16; + RequestManager::DecodingMode decoding_mode = + RequestManager::SPECULATIVE_DECODING; + bool spec_sampling = false; + bool do_sample = false; + int sampling_seed = 0; + bool streaming_cache = false; + bool slo_attainment_early_termination = false; + double baseline_latency_ms = 50; + double ssm_spec_latency_ms = 20; + double llm_verify_latency_ms = 50; + double request_per_second = 1.0; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + prompts, + model_metadata.model_names, + use_full_precision, + verbose, + max_sequence_length, + max_output_length, + max_kv_cache_size, + scaling_factor); + if (max_tokens_per_ssm_batch == -1) { + max_tokens_per_ssm_batch = max_tokens_per_batch; + } + if (max_tokens_per_prefilling_batch == -1) { + max_tokens_per_prefilling_batch = max_tokens_per_batch; + } + + assert(prompts.file_paths.size() == 
prompts.proportions.size() && + prompts.file_paths.size() == prompts.slo_ratios.size()); + double total = std::accumulate( + prompts.proportions.begin(), + prompts.proportions.end(), + 0.0, + [](double sum, double proportion) { return sum + proportion; }); + if (std::abs(total - 1.0) > 1e-6) { + std::cerr << "Error: proportions do not sum to 1. Total sum: " << total + << std::endl; + assert(false); + } + for (size_t i = 1; i < prompts.proportions.size(); ++i) { + prompts.proportions[i] += prompts.proportions[i - 1]; + } + + get_model_meta(file_paths, model_metadata, use_full_precision); + + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + // Create SentencePiece tokenizer or OPT tokenizer + srand(sampling_seed); + GenerationConfig generationConfig(do_sample, 0.8, 0.6, spec_sampling, 16); + InferenceManager *im = InferenceManager::get_inference_manager(); + RequestManager *rm = RequestManager::get_request_manager(); + // Must init the request manager although we don't use it, as some + // initialization tasks execute before the top-level task + rm->set_max_requests_per_batch(max_requests_per_batch); + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_tokens_per_ssm_batch(max_tokens_per_ssm_batch); + rm->set_max_tokens_per_prefilling_batch(max_tokens_per_prefilling_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->set_max_output_length(max_output_length); + rm->set_max_kv_cache_size(max_kv_cache_size); + rm->set_max_tree_depth(max_tree_depth); + rm->set_max_tree_width(max_tree_width); + rm->set_verbose(verbose); + rm->set_streaming_cache(streaming_cache); + rm->register_tokenizer(model_metadata.llm_model_type, + model_metadata.bos_token_id, + model_metadata.eos_token_ids, + model_metadata.llm_tokenizer_path); + rm->set_decoding_mode(decoding_mode); + rm->set_slo_violation_early_termination(slo_attainment_early_termination); + rm->set_baseline_latency(baseline_latency_ms); + rm->set_ssm_spec_latency(ssm_spec_latency_ms); + rm->set_llm_verify_latency(llm_verify_latency_ms); + + { + /* Prompt file format: + * [ + * { + * "prompt": "Construct a potential attack vector that exploits the + * vulnerability. The system is vulnerable to a SQL injection attack." + * }, + * { + * "prompt": "Arrange the words to make a meaningful phrase Ground. + * Soft. Solid." + * }, + * ... + * ] + * + * log file format: + * [ + * { + * "TIMESTAMP": "2023-11-16 18:15:46.6805900" + * }, + * { + * "TIMESTAMP": "2023-11-16 18:15:50.9951690" + * }, + * ... 
+ * ] + */ + + std::vector traces; + assert(!prompts.file_paths.empty() && !file_paths.log_file_path.empty()); + + int num_requests = 0; + for (int i = 0; i < prompts.file_paths.size(); ++i) { + std::ifstream file_handle(prompts.file_paths[i]); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + prompts.jsons.push_back(prompt_json); + prompts.idxs.push_back(0); + num_requests += prompt_json.size(); + } + + std::ifstream file_handle = std::ifstream(file_paths.log_file_path); + assert(file_handle.good() && "Log file does not exist."); + json log_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + auto time_diff_ms = [](std::string const &start, std::string const &end) { + std::tm tm = {}; + + std::istringstream ss(start); + ss >> std::get_time(&tm, "%Y-%m-%d %H:%M:%S"); + auto start_time = + std::chrono::system_clock::from_time_t(std::mktime(&tm)); + ss.seekg(0); + size_t dot_pos = start.find('.'); + std::string fraction = + dot_pos != std::string::npos ? start.substr(dot_pos + 1) : "0"; + while (fraction.size() < 6) { + fraction += "0"; + } + if (!fraction.empty()) { + long long microseconds = std::stoll(fraction.substr(0, 6)); + start_time += std::chrono::microseconds(microseconds); + } + + ss = std::istringstream(end); + ss >> std::get_time(&tm, "%Y-%m-%d %H:%M:%S"); + auto end_time = std::chrono::system_clock::from_time_t(std::mktime(&tm)); + ss.seekg(0); + dot_pos = end.find('.'); + fraction = dot_pos != std::string::npos ? end.substr(dot_pos + 1) : "0"; + while (fraction.size() < 6) { + fraction += "0"; + } + if (!fraction.empty()) { + long long microseconds = std::stoll(fraction.substr(0, 6)); + end_time += std::chrono::microseconds(microseconds); + } + + return std::chrono::duration_cast(end_time - + start_time) + .count() / + 1000.0; + }; + + num_requests = min((unsigned long)num_requests, log_json.size()); + std::string start_time = log_json[0]["TIMESTAMP"].get(); + srand(time(0)); + for (int i = 0; i < num_requests; ++i) { + // sample from proportions + double sample = (double)rand() / RAND_MAX; + int ptr = 0; + for (size_t j = 0; j < prompts.proportions.size(); ++j) { + if (sample < prompts.proportions[j]) { + ptr = j; + break; + } + } + int &idx = prompts.idxs[ptr]; + std::string prompt = prompts.jsons[ptr][idx]["prompt"].get(); + idx = (idx + 1) % prompts.jsons[ptr].size(); + std::vector input_tokens = rm->tokenize(prompt); + std::string timestamp = log_json[i]["TIMESTAMP"].get(); + EmissionTrace trace(prompt, + input_tokens.size(), + max_output_length, + prompts.slo_ratios[ptr], + time_diff_ms(start_time, timestamp) * scaling_factor); + traces.push_back(trace); + } + + // output generation results as json + assert(!file_paths.emission_file_path.empty()); + json output_json; + for (EmissionTrace const &trace : traces) { + output_json.push_back(trace.to_json()); + } + std::ofstream emission_file_handle(file_paths.emission_file_path); + emission_file_handle << output_json.dump(2) << std::endl; + } + + // float* data + std::cout << "----------trace generated--------------" << std::endl; +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/utils/mem_analysis.py b/inference/utils/mem_analysis.py new file mode 100644 index 000000000..5168e7003 --- /dev/null +++ b/inference/utils/mem_analysis.py @@ -0,0 +1,115 @@ +import pandas as pd 
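Note on the trace_generator inputs above: the prompt files passed via --prompt-files and the request log passed via -log must follow the JSON shapes documented in the format comment of trace_generator.cc (a list of {"prompt": ...} objects and a list of {"TIMESTAMP": "YYYY-MM-DD HH:MM:SS.ffffff"} objects). The following is a minimal Python sketch that writes a compatible pair of files; the file names prompts.json and requests_log.json, and the prompt strings, are illustrative and not part of this patch.

    # make_trace_inputs.py -- illustrative helper, not part of this patch
    import json
    from datetime import datetime, timedelta

    # Prompt file: a JSON array of {"prompt": ...} objects (see the format
    # comment in trace_generator.cc above).
    prompts = [{"prompt": "Construct a potential attack vector."},
               {"prompt": "Arrange the words to make a meaningful phrase."}]
    with open("prompts.json", "w") as f:
        json.dump(prompts, f, indent=2)

    # Log file: a JSON array of {"TIMESTAMP": ...} objects. trace_generator.cc
    # parses "%Y-%m-%d %H:%M:%S" plus an optional fractional-second suffix.
    start = datetime(2023, 11, 16, 18, 15, 46)
    log = [{"TIMESTAMP": (start + timedelta(seconds=4 * i)).strftime("%Y-%m-%d %H:%M:%S.%f")}
           for i in range(len(prompts))]
    with open("requests_log.json", "w") as f:
        json.dump(log, f, indent=2)

Passing these two files with --prompt-files prompts.json --prompt-proportions 1.0 --prompt-slo-ratios 1.0 -log requests_log.json --emission-file-path trace.json (together with the required -llm-model, -ssm-model, and -cache-folder arguments) would exercise the proportion-based sampling path in top_level_task; the flag names are those handled in parse_input_args above.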
+import re, os, math, argparse + +# Usage: +# Run FlexFlow code with --log-instance-creation flag and redirect the output to a file +# python mem_analysis.py --file_path /path/to/log_file.txt + +def extract_data(file_path): + # Define regex patterns + memory_allocator_pattern = re.compile(r'MemoryAllocator.*memory_kind: (\w+).*memory_id: (\w+).*size: (\d+).*capacity (\d+).*task_name: (.+)') + mapper_pattern = re.compile(r'Mapper.*memory_kind: (\w+).*memory_id: (\w+).*size: (\d+).*capacity (\d+).*task: (.+)') + parallel_tensor_pattern = re.compile(r'ParallelTensor.*memory_kind: (\w+).*memory_id: (\w+).*size: (\d+).*capacity (\d+).*task_name: (.+)') + + # Initialize lists to store extracted data + memory_kinds = [] + memory_ids = [] + sizes = [] + capacities = [] + tasks = [] + + # Read the file + with open(file_path, 'r') as file: + for line in file: + if 'MemoryAllocator' in line: + match = memory_allocator_pattern.search(line) + if match: + memory_kinds.append(match.group(1)) + memory_ids.append(match.group(2)) + sizes.append(int(match.group(3))) + capacities.append(int(match.group(4))) + tasks.append(match.group(5)) + elif 'Mapper' in line: + match = mapper_pattern.search(line) + if match: + memory_kinds.append(match.group(1)) + memory_ids.append(match.group(2)) + sizes.append(int(match.group(3))) + capacities.append(int(match.group(4))) + tasks.append(match.group(5)) + elif 'ParallelTensor' in line: + match = parallel_tensor_pattern.search(line) + if match: + memory_kinds.append(match.group(1)) + memory_ids.append(match.group(2)) + sizes.append(int(match.group(3))) + capacities.append(int(match.group(4))) + tasks.append(match.group(5)) + + # Create a DataFrame + df = pd.DataFrame({ + 'Memory Kind': memory_kinds, + 'Device ID': memory_ids, + 'Size': sizes, + 'Capacity': capacities, + 'Task': tasks + }) + + return df + +def human_readable_size(size_bytes): + if size_bytes == 0: + return "0B" + size_name = ("B", "KB", "MB", "GB", "TB") + i = int(math.floor(math.log(size_bytes, 1000))) + p = math.pow(1000, i) + s = round(size_bytes / p, 2) + return f"{s} {size_name[i]}" + +def print_grouped_by_device(df): + grouped_df = df.groupby(['Memory Kind', 'Device ID']).agg({'Size': 'sum', 'Capacity': 'first'}) + # Check that all entries that share the same memory id have the same capacity + for (memory_kind, memory_id), group in df.groupby(['Memory Kind', 'Device ID']): + capacities = group['Capacity'].unique() + if len(capacities) > 1: + print(f"Warning: Device ID {memory_id} in Memory Kind {memory_kind} has multiple capacities: {capacities}") + # Convert sizes to human-readable format + grouped_df['Size'] = grouped_df['Size'].apply(human_readable_size) + grouped_df['Capacity'] = grouped_df['Capacity'].apply(human_readable_size) + print("############## Memory usage (by device) ##############") + print(grouped_df) + +def print_grouped_by_task(df): + # Group by 'Memory Kind', 'Device ID', and 'Task', and sum the 'Size' column + task_grouped_df = df.groupby(['Memory Kind', 'Device ID', 'Task']).agg({'Size': 'sum'}).reset_index() + # Sort the DataFrame by 'Memory Kind', 'Device ID', and 'Size' in descending order + task_grouped_df = task_grouped_df.sort_values(by=['Memory Kind', 'Device ID', 'Size'], ascending=[True, True, False]) + print("\n\n############## Memory usage (by task) ##############") + for (memory_kind, memory_id), group in task_grouped_df.groupby(['Memory Kind', 'Device ID']): + print("\n-------------------------------------------------------------") + print(f"Memory Kind: {memory_kind}, 
Device ID: {memory_id}") + group['Size'] = group['Size'].apply(human_readable_size) + print(group[['Task', 'Size']].to_string(index=False)) + print("-------------------------------------------------------------") + +def print_notes(): + print("\n\n############## Notes ##############") + print("* Check that each GPU retains enough capacity in GPU_FB_MEM to hold the weights from Z_COPY_MEM (total size / tp_degree)") + print("* Check whether the memory usage is balanced across devices") + print("* `set_tensor` generally refers to the memory used to load the model weights") + print() + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Analyze memory usage from a FlexFlow log file.') + parser.add_argument('--file_path', '-fp', type=str, help='Path to the input log file') + args = parser.parse_args() + + # Change working directory to the directory holding the script + # script_dir = os.path.dirname(os.path.abspath(__file__)) + # os.chdir(script_dir) + + df = extract_data(args.file_path) + print_grouped_by_device(df) + print_grouped_by_task(df) + + print_notes() \ No newline at end of file diff --git a/inference/utils/process_prompts.py b/inference/utils/process_prompts.py new file mode 100644 index 000000000..902662191 --- /dev/null +++ b/inference/utils/process_prompts.py @@ -0,0 +1,28 @@ +import json +import argparse + +def read_prompts_from_json(file_path): + with open(file_path, 'r', encoding='utf-8') as file: + data = json.load(file) + return data + +def write_prompts_to_json(file_path, data): + with open(file_path, 'w', encoding='utf-8') as file: + json.dump(data, file, ensure_ascii=False, indent=4) + +def process_prompts(input_file, output_file): + prompts = read_prompts_from_json(input_file) + processed_prompts = [{"prompt": prompt, "slo_ratio": 1.0} for prompt in prompts] + write_prompts_to_json(output_file, processed_prompts) + +def main(): + parser = argparse.ArgumentParser(description="Process prompts JSON file and generate slo_ratio for each prompt.") + parser.add_argument('input_file', type=str, help="Input JSON file containing prompts.") + parser.add_argument('output_file', type=str, help="Output JSON file to save the processed prompts.") + + args = parser.parse_args() + + process_prompts(args.input_file, args.output_file) + +if __name__ == '__main__': + main() diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index 2820cf485..24bb15889 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -40,6 +40,7 @@ "zero_copy_memory_per_node": "-ll:zsize", "num_cpus": "-ll:cpu", "legion_utility_processors": "-ll:util", + "log_instance_creation": "--log-instance-creation", "profiling": "--profiling", "benchmarking": "--benchmarking", "inference_debugging": "--inference-debugging", diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 14cf4eebf..b17f36f72 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -15,6 +15,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals +from dataclasses import dataclass import warnings import numpy as np from .flexflow_logger import fflogger @@ -1241,6 +1242,21 @@ def get_weights(self, ffmodel): assert ret_val == True return np_array +# ----------------------------------------------------------------------- +# RotaryEmbeddingMeta +# ----------------------------------------------------------------------- + + +@dataclass +class RotaryEmbeddingMeta: + 
apply_rotary_embedding: bool = False + rope_theta: float = 10000.0 + rope_type: str = "default" + factor: float = 8.0 + low_freq_factor: float = 1.0 + high_freq_factor: float = 4.0 + original_max_position_embeddings: int = 8192 + # ----------------------------------------------------------------------- # FFModel @@ -2046,10 +2062,10 @@ def add_bias_residual_layer_norm( handles_array[0], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM ), Tensor(handles_array[1], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM) - def sigmoid_silu_multi(self, input1, input2, name=None): + def sigmoid_silu_multi(self, input1, input2, intermediate_size, name=None): c_name = get_c_name(name) handle = ffc().flexflow_model_add_sigmoid_silu_multi( - self.handle, input1.handle, input2.handle, c_name + self.handle, input1.handle, input2.handle, intermediate_size, c_name ) self.add_layer(OpType.SIGMOID_SILU_MULTI, name) return Tensor(handle, owner_op_type=OpType.SIGMOID_SILU_MULTI) @@ -2676,7 +2692,7 @@ def inc_multihead_self_attention( add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -2720,8 +2736,8 @@ def inc_multihead_self_attention( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. :type scaling_query: bool @@ -2756,7 +2772,13 @@ def inc_multihead_self_attention( add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -2779,11 +2801,12 @@ def spec_inc_multihead_self_attention( add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, position_bias=False, + streaming_cache=False, name=None, ): """Defines the MultiHead Attention operation as described in Attention Is All You Need @@ -2823,8 +2846,8 @@ def spec_inc_multihead_self_attention( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. 
:type scaling_query: bool @@ -2859,11 +2882,18 @@ def spec_inc_multihead_self_attention( add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, position_bias, + streaming_cache, c_name, ) self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) @@ -2882,7 +2912,7 @@ def inc_multihead_self_attention_verify( add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -2926,8 +2956,8 @@ def inc_multihead_self_attention_verify( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. :type scaling_query: bool @@ -2962,7 +2992,13 @@ def inc_multihead_self_attention_verify( add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -2972,7 +3008,7 @@ def inc_multihead_self_attention_verify( self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) - def inc_multiquery_self_attention( + def groupquery_self_attention( self, input, embed_dim, @@ -2986,11 +3022,12 @@ def inc_multiquery_self_attention( add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, position_bias=False, + streaming_cache=False, name=None, ): """Defines the multi-query head attention, which allows a different number of Q and KV heads, @@ -3033,8 +3070,8 @@ def inc_multiquery_self_attention( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. 
:type scaling_query: bool @@ -3056,7 +3093,7 @@ def inc_multiquery_self_attention( c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multiquery_self_attention( + handle = ffc().flexflow_model_add_groupquery_self_attention( self.handle, input.handle, embed_dim, @@ -3070,11 +3107,18 @@ def inc_multiquery_self_attention( add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, position_bias, + streaming_cache, c_name, ) self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) @@ -3094,7 +3138,7 @@ def spec_inc_multiquery_self_attention( add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -3141,8 +3185,8 @@ def spec_inc_multiquery_self_attention( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. :type scaling_query: bool @@ -3178,7 +3222,13 @@ def spec_inc_multiquery_self_attention( add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -3202,7 +3252,7 @@ def inc_multiquery_self_attention_verify( add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -3249,8 +3299,8 @@ def inc_multiquery_self_attention_verify( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. 
:type scaling_query: bool @@ -3286,7 +3336,13 @@ def inc_multiquery_self_attention_verify( add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -4208,6 +4264,14 @@ def set_max_sequence_length(self, max_length): return ffc().flexflow_request_manager_set_max_sequence_length( self.handle, max_length) + def set_max_output_length(self, max_length): + return ffc().flexflow_request_manager_set_max_output_length( + self.handle, max_length) + + def set_max_kv_cache_size(self, max_size): + return ffc().flexflow_request_manager_set_max_kv_cache_size( + self.handle, max_size) + def start_server(self, model): return ffc().flexflow_request_manager_start_background_server( self.handle, model.handle @@ -4257,7 +4321,7 @@ def __init__( num_q_heads, num_kv_heads, hidden_dim, - qkv_inner_dim, + head_dim, tensor_parallelism_degree, use_full_precision ): @@ -4267,7 +4331,7 @@ def __init__( num_q_heads, num_kv_heads, hidden_dim, - qkv_inner_dim, + head_dim, tensor_parallelism_degree, use_full_precision ) diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 5af077273..df630462a 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -200,7 +200,7 @@ def init( if configs_dict.get("offload", None) is None: configs_dict["offload"] = False if configs_dict.get("offload_reserve_space_size", None) is None: - configs_dict["offload_reserve_space_size"] = 1024**2 + configs_dict["offload_reserve_space_size"] = 8 * 1024**3 if configs_dict.get("use_4bit_quantization", None) is None: configs_dict["use_4bit_quantization"] = False if configs_dict.get("use_8bit_quantization", None) is None: diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 7a55da26e..ab3bc4623 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -41,6 +41,17 @@ def __init__(self, hf_config): ) self.parallel_attn = hf_config.parallel_attn self.vocab_size = hf_config.vocab_size + self.rotary_embedding_meta = RotaryEmbeddingMeta( + apply_rotary_embedding=True, + rope_theta=hf_config.rope_theta if "rope_theta" in hf_config.__dict__ else 10000.0, + ) + if "rope_scaling" in hf_config.__dict__: + if hf_config.rope_scaling is not None: + self.rotary_embedding_meta.rope_type = hf_config.rope_scaling["rope_type"] + self.rotary_embedding_meta.factor = hf_config.rope_scaling["factor"] + self.rotary_embedding_meta.low_freq_factor = hf_config.rope_scaling["low_freq_factor"] + self.rotary_embedding_meta.high_freq_factor = hf_config.rope_scaling["high_freq_factor"] + self.rotary_embedding_meta.original_max_position_embeddings = hf_config.rope_scaling["original_max_position_embeddings"] # Standardized FlexFlow num heads fields below self.num_attention_heads = self.n_head self.num_key_value_heads = self.n_head_kv @@ -54,8 +65,6 @@ def __init__( ffconfig, hf_config, data_type, - # max_batch_size=1, - # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -63,11 +72,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size 
self.data_type = data_type self.falcon_config = FalconConfig(hf_config) - # self.falcon_config.max_seq_length = max_seq_length - # self.falcon_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -124,7 +130,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.falcon_config.layer_norm_epsilon, - name=f"layers_{i}_input_layernorm", + name=f"layers.{i}.input_layernorm", ) else: token, att_norm = ffmodel.residual_layer_norm( @@ -135,7 +141,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.falcon_config.layer_norm_epsilon, - name=f"layers_{i}_input_layernorm", + name=f"layers.{i}.input_layernorm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -152,8 +158,8 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding - name=f"layers_{i}_attention", + self.falcon_config.rotary_embedding_meta, + name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -169,11 +175,11 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding - name=f"layers_{i}_attention", + self.falcon_config.rotary_embedding_meta, + name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.INC_DECODING_MODE: - mha = ffmodel.inc_multiquery_self_attention( + mha = ffmodel.groupquery_self_attention( att_norm, self.falcon_config.hidden_size, self.falcon_config.n_head, @@ -186,8 +192,8 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding - name=f"layers_{i}_attention", + self.falcon_config.rotary_embedding_meta, + name=f"layers.{i}.self_attention", ) else: assert False @@ -197,7 +203,7 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.hidden_size * 4, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_h_to_4h", + name=f"layers.{i}.mlp.dense_h_to_4h", ) dense_h_to_4h = ffmodel.gelu(dense_h_to_4h) mlp_output = ffmodel.dense( @@ -205,7 +211,7 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_4h_to_h", + name=f"layers.{i}.mlp.dense_4h_to_h", ) _, ln_f = ffmodel.residual_layer_norm( @@ -243,6 +249,13 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel + # TODO: finish this + def convert_hf_weight_name(name): + return (name.replace("transformer.h.", "layers.") + .replace("transformer.", "") + .replace("self_attention.dense", "self_attention.o_proj") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) n_head = ( @@ -258,10 +271,10 @@ def convert_hf_model(model, dst_folder): .replace("self_attention_dense", "attention_wo") ) # Split Q,K,V attention weights - if "self_attention_query_key_value" in name: - name_q = name.replace("self_attention_query_key_value", "attention_wq") - name_k = name.replace("self_attention_query_key_value", "attention_wk") - name_v = name.replace("self_attention_query_key_value", "attention_wv") + if "self_attention.query_key_value" in name: + name_q = name.replace("self_attention.query_key_value", "self_attention.q_proj") + name_k = name.replace("self_attention.query_key_value", "self_attention.k_proj") + 
name_v = name.replace("self_attention.query_key_value", "self_attention.v_proj") q, k, v = torch.split( params, [ @@ -278,5 +291,5 @@ def convert_hf_model(model, dst_folder): params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) # LM head weight model.lm_head.weight.detach().cpu().numpy().tofile( - os.path.join(dst_folder, "lm_head_weight") + os.path.join(dst_folder, "lm_head.weight") ) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 6b33030f6..e58ed57bc 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -19,8 +19,6 @@ class LLAMAConfig: def __init__(self, hf_config): - # self.max_seq_len = 256 - # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.max_spec_tree_token_num = 20 @@ -29,6 +27,17 @@ def __init__(self, hf_config): self.hidden_size = hf_config.hidden_size self.rms_norm_eps = hf_config.rms_norm_eps self.intermediate_size = hf_config.intermediate_size + self.rotary_embedding_meta = RotaryEmbeddingMeta( + apply_rotary_embedding=True, + rope_theta=hf_config.rope_theta if "rope_theta" in hf_config.__dict__ else 10000.0, + ) + if "rope_scaling" in hf_config.__dict__: + if hf_config.rope_scaling is not None: + self.rotary_embedding_meta.rope_type = hf_config.rope_scaling["rope_type"] + self.rotary_embedding_meta.factor = hf_config.rope_scaling["factor"] + self.rotary_embedding_meta.low_freq_factor = hf_config.rope_scaling["low_freq_factor"] + self.rotary_embedding_meta.high_freq_factor = hf_config.rope_scaling["high_freq_factor"] + self.rotary_embedding_meta.original_max_position_embeddings = hf_config.rope_scaling["original_max_position_embeddings"] # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.num_attention_heads self.num_key_value_heads = ( @@ -55,11 +64,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.llama_config = LLAMAConfig(hf_config) - # self.llama_config.max_seq_length = max_seq_length - # self.llama_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -106,7 +112,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="tok_embeddings", + name="embed_tokens", ) for i in range(self.llama_config.num_hidden_layers): @@ -117,7 +123,7 @@ def build_model(self, max_tokens_per_batch): token, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_attention_norm", + name=f"layers.{i}.input_layernorm", ) else: token, attn_norm = ffmodel.residual_rms_norm( @@ -125,7 +131,7 @@ def build_model(self, max_tokens_per_batch): w2, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_attention_norm", + name=f"layers.{i}.input_layernorm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -144,8 +150,8 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding - name=f"layers_{i}_attention", + self.llama_config.rotary_embedding_meta, + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -163,11 +169,11 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel 
initializer - True, # apply_rotary_embedding - name=f"layers_{i}_attention", + self.llama_config.rotary_embedding_meta, + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: - mha = ffmodel.inc_multiquery_self_attention( + mha = ffmodel.groupquery_self_attention( attn_norm, self.llama_config.hidden_size, self.llama_config.num_attention_heads, @@ -182,8 +188,8 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding - name=f"layers_{i}_attention", + self.llama_config.rotary_embedding_meta, + name=f"layers.{i}.self_attn", ) else: assert False @@ -193,29 +199,29 @@ def build_model(self, max_tokens_per_batch): mha, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_ffn_norm", + name=f"layers.{i}.post_attention_layernorm", ) w1 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w1", + name=f"layers.{i}.mlp.gate_proj", ) w3 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w3", + name=f"layers.{i}.mlp.up_proj", ) - multi = ffmodel.sigmoid_silu_multi(w1, w3) + multi = ffmodel.sigmoid_silu_multi(w1, w3, self.llama_config.intermediate_size) w2 = ffmodel.dense( multi, self.llama_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w2", + name=f"layers.{i}.mlp.down_proj", ) _, token = ffmodel.residual_rms_norm( @@ -230,7 +236,7 @@ def build_model(self, max_tokens_per_batch): self.llama_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="output", + name="lm_head", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -250,6 +256,9 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel + def convert_hf_weight_name(name): + return name.replace("model.", "") + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): @@ -271,3 +280,7 @@ def convert_hf_model(model, dst_folder): .replace("model_", "") ) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") + # LM head weight + model.lm_head.weight.detach().cpu().numpy().tofile( + os.path.join(dst_folder, "output_weight") + ) diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 92867fd49..a68bbd2a0 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -19,8 +19,6 @@ class MPTConfig: def __init__(self, hf_config): - # self.max_seq_len = 256 - # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.max_spec_tree_token_num = 20 @@ -28,6 +26,7 @@ def __init__(self, hf_config): self.n_heads = hf_config.n_heads self.n_layers = hf_config.n_layers self.vocab_size = hf_config.vocab_size + self.rotary_embedding_meta = RotaryEmbeddingMeta(apply_rotary_embedding=False) # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.n_heads self.num_key_value_heads = hf_config.n_heads @@ -50,11 +49,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.mpt_config = MPTConfig(hf_config) - # self.mpt_config.max_seq_length = max_seq_length - # self.mpt_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 
2**31 - 1 @@ -97,7 +93,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wte", + name="wte", ) axes = [ @@ -114,7 +110,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_1", + name=f"layers.{i}.norm_1", ) else: hidden_states, layernorm_output = ffmodel.residual_layer_norm( @@ -126,7 +122,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_1", + name=f"layers.{i}.norm_1", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -142,13 +138,13 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.mpt_config.rotary_embedding_meta, True, # scaling_query (self.mpt_config.hidden_size / self.mpt_config.n_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: attn_outputs = ffmodel.inc_multihead_self_attention_verify( @@ -163,13 +159,13 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.mpt_config.rotary_embedding_meta, True, # scaling_query (self.mpt_config.hidden_size / self.mpt_config.n_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: attn_outputs = ffmodel.inc_multihead_self_attention( @@ -184,13 +180,13 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.mpt_config.rotary_embedding_meta, True, # scaling_query (self.mpt_config.hidden_size / self.mpt_config.n_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) else: assert False @@ -204,7 +200,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_2", + name=f"layers.{i}.norm_2", ) # mlp layernorm_output = ffmodel.dense( @@ -212,7 +208,7 @@ def build_model(self, max_tokens_per_batch): 4 * self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_up_proj", + name=f"layers.{i}.ffn.up_proj", ) layernorm_output = ffmodel.gelu(layernorm_output) intermediate_output = ffmodel.dense( @@ -220,7 +216,7 @@ def build_model(self, max_tokens_per_batch): self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_down_proj", + name=f"layers.{i}.ffn.down_proj", ) _, all_final_norm = ffmodel.residual_layer_norm( @@ -232,7 +228,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"transformer_norm_f", + name=f"norm_f", ) lm_head = ffmodel.dense( all_final_norm, @@ -253,14 +249,22 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel + # TODO: finish this + def convert_hf_weight_name(name): + return ( + name.replace("transformer.blocks.", "layers.") + .replace("transformer.", "") + .replace("attn.out_proj", "attn.o_proj") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): name = name.replace("transformer.blocks.", "layers.").replace(".", "_") if "Wqkv" in name: - name_q = name.replace("attn_Wqkv", 
"attention_wq") - name_k = name.replace("attn_Wqkv", "attention_wk") - name_v = name.replace("attn_Wqkv", "attention_wv") + name_q = name.replace("attn.Wqkv", "attn.q_proj") + name_k = name.replace("attn.Wqkv", "attn.k_proj") + name_v = name.replace("attn.Wqkv", "attn.v_proj") q, k, v = torch.split( params, [ @@ -280,6 +284,6 @@ def convert_hf_model(model, dst_folder): params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) shutil.copy( - os.path.join(dst_folder, "transformer_wte_weight"), - os.path.join(dst_folder, "lm_head_weight"), + os.path.join(dst_folder, "wte.weight"), + os.path.join(dst_folder, "lm_head.weight"), ) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index b715f5f35..abf88b784 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -34,6 +34,7 @@ def __init__(self, hf_config): self.num_hidden_layers = hf_config.num_hidden_layers self.vocab_size = hf_config.vocab_size self.word_embed_proj_dim = hf_config.word_embed_proj_dim + self.rotary_embedding_meta = RotaryEmbeddingMeta(apply_rotary_embedding=False) # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.num_attention_heads self.num_key_value_heads = hf_config.num_attention_heads @@ -47,8 +48,6 @@ def __init__( ffconfig, hf_config, data_type, - # max_batch_size=1, - # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -56,11 +55,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.opt_config = OPTConfig(hf_config) - # self.opt_config.max_seq_length = max_seq_length - # self.opt_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -139,7 +135,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_attention_layer_norm", + name=f"layers.{i}.self_attn_layer_norm", ) else: hidden_states = ffmodel.add(token, positional_embedding) @@ -158,12 +154,12 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.opt_config.rotary_embedding_meta, True, # scaling_query (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multihead_self_attention_verify( @@ -178,12 +174,12 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.opt_config.rotary_embedding_meta, True, # scaling_query (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multihead_self_attention( @@ -198,12 +194,12 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.opt_config.rotary_embedding_meta, True, # scaling_query (self.opt_config.hidden_size / 
self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) else: assert False @@ -215,7 +211,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_add_bias_residual_layer_norm", + name=f"layers.{i}.add_bias_residual_layer_norm", ) if not self.opt_config.do_layer_norm_before: @@ -226,14 +222,14 @@ def build_model(self, max_tokens_per_batch): self.opt_config.ffn_dim, ActiMode.AC_MODE_RELU, True, - name=f"layers_{i}_fc1", + name=f"layers.{i}.fc1", ) fc2 = ffmodel.dense( fc1, self.opt_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_fc2", + name=f"layers.{i}.fc2", ) if not self.opt_config.do_layer_norm_before: @@ -245,7 +241,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_final_layer_norm", + name=f"layers.{i}.final_layer_norm", ) _, all_final_norm = ffmodel.residual_layer_norm( @@ -263,7 +259,7 @@ def build_model(self, max_tokens_per_batch): self.opt_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="embed_tokens_weight_lm_head", + name="lm_head", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -283,6 +279,17 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel + def convert_hf_weight_name(name): + return ( + name.replace("decoder.", "") + .replace("model.", "") + .replace("self_attn.out_proj", "self_attn.o_proj") + .replace("self_attn.o_proj.bias", "add_bias_residual_layer_norm.attn_bias") + .replace( + ".final_layer_norm", ".add_bias_residual_layer_norm" + ) # important to use the leading "_" to avoid matching the last LayerNorm + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): @@ -303,6 +310,6 @@ def convert_hf_model(model, dst_folder): params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") # copy embedding weights shutil.copy( - os.path.join(dst_folder, "embed_tokens_weight"), - os.path.join(dst_folder, "embed_tokens_weight_lm_head"), + os.path.join(dst_folder, "embed_tokens.weight"), + os.path.join(dst_folder, "lm_head.weight"), ) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 37edaa4c4..762ad24c4 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -19,8 +19,6 @@ class STARCODERConfig: def __init__(self, hf_config): - # self.max_seq_len = 256 - # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.max_spec_tree_token_num = 20 @@ -32,6 +30,7 @@ def __init__(self, hf_config): self.vocab_size = hf_config.vocab_size self.intermediate_size = hf_config.n_inner self.n_head_kv = 1 if hf_config.multi_query else hf_config.n_head + self.rotary_embedding_meta = RotaryEmbeddingMeta(apply_rotary_embedding=False) # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.n_head self.num_key_value_heads = self.n_head_kv @@ -45,8 +44,6 @@ def __init__( ffconfig, hf_config, data_type, - # max_batch_size=1, - # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -54,11 +51,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.starcoder_config = STARCODERConfig(hf_config) - # 
self.starcoder_config.max_seq_length = max_seq_length - # self.starcoder_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -111,7 +105,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wte", + name="wte", ) positional_embedding = ffmodel.embedding( position_tensor, @@ -121,7 +115,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wpe", + name="wpe", ) axes = [ @@ -139,11 +133,11 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_1", + name=f"layers.{i}.ln_1", ) assert self.mode == InferenceMode.INC_DECODING_MODE - mha = ffmodel.inc_multiquery_self_attention( + mha = ffmodel.groupquery_self_attention( ln_1, self.starcoder_config.hidden_size, self.starcoder_config.num_attention_heads, @@ -158,8 +152,8 @@ def build_model(self, max_tokens_per_batch): False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding - name=f"layers_{i}_attention", + self.starcoder_config.rotary_embedding_meta, + name=f"layers.{i}.attn.c_attn", ) residual, l2_norm = ffmodel.residual_layer_norm( @@ -171,7 +165,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_2", + name=f"layers.{i}.ln_2", ) # mlp @@ -181,7 +175,7 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.intermediate_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_fc", + name=f"layers.{i}.mlp.c_fc", ) activation = ffmodel.gelu(c_fc, False) c_proj = ffmodel.dense( @@ -189,7 +183,7 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_proj", + name=f"layers.{i}.mlp.c_proj", ) _, ln_f = ffmodel.residual_layer_norm( @@ -200,7 +194,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"transformer_ln_f", + name=f"ln_f", ) lm_head = ffmodel.dense( ln_f, @@ -224,11 +218,11 @@ def build_model(self, max_tokens_per_batch): def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = name.replace("transformer.h", "layers").replace(".", "_") - if "c_attn_weight" in name: - name_q = name.replace("attn_c_attn", "attention_wq") - name_k = name.replace("attn_c_attn", "attention_wk") - name_v = name.replace("attn_c_attn", "attention_wv") + name = name.replace("transformer.h", "layers").replace("transformer", "") + if "attn.c_attn.weight" in name: + name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") + name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj") + name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj") q, k, v = torch.split( params, [ @@ -241,10 +235,10 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "c_attn_bias" in name: - name_q = name.replace("attn_c_attn", "attention_wq") - name_k = name.replace("attn_c_attn", "attention_wk") - name_v = name.replace("attn_c_attn", "attention_wv") + elif "attn.c_attn.bias" in name: + name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") + name_k = 
name.replace("attn.c_attn", "attn.c_attn.k_proj") + name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj") q, k, v = torch.split( params, [ @@ -257,14 +251,14 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "c_proj_bias" in name: - name = name.replace("attn_c_proj", "attention_wo") + elif "attn.c_proj.bias" in name: + name = name.replace("attn.c_proj", "attn.c_attn.o_proj") params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) - elif "c_proj_weight" in name: - name = name.replace("attn_c_proj", "attention_wo") + elif "attn.c_proj.weight" in name: + name = name.replace("attn.c_proj", "attn.c_attn.o_proj") params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) else: params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) model.lm_head.weight.detach().cpu().numpy().tofile( - os.path.join(dst_folder, "lm_head_weight") + os.path.join(dst_folder, "lm_head.weight") ) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index ac622b333..37606e875 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -27,11 +27,11 @@ MPTConfig, ) from flexflow.core import * -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer +from transformers import AutoConfig, AutoModelForCausalLM from huggingface_hub import HfApi import sys, torch, shutil, hashlib from typing import Union, List - +from huggingface_hub import snapshot_download class GenerationConfig: """A class to store the sampling configs.""" @@ -95,6 +95,7 @@ def __init__( self.supported_models = { "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), + "MistralForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT, OPTConfig), "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), "FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), @@ -219,7 +220,13 @@ def download_hf_weights_if_needed(self): ) # Download model from HuggingFace, or load it from the local folder hf_model = AutoModelForCausalLM.from_pretrained( - self.model_name, trust_remote_code=True + self.model_name, + trust_remote_code=True, + torch_dtype=( + torch.float32 + if self.data_type == DataType.DT_FLOAT + else torch.float16 + ), ) # Print log message to notify user download of model has finished if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): @@ -261,29 +268,21 @@ def download_hf_tokenizer_if_needed(self): ) if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model - print( - f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..." - ) - else: - # Remote model - print( - f"'{self.model_name}' local tokenizer was updated! Saving new tokenizer now..." - ) - # Download tokenizer from HuggingFace, or load it from the local folder - if self.model_type == ModelType.LLAMA: - hf_tokenizer = LlamaTokenizer.from_pretrained( - self.model_name, use_fast=True - ) + print( + f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..." 
+ ) + # Load/download the tokenizer files + target_tokenizer_files = ["tokenizer.json", "tokenizer_config.json", "special_tokens_map.json", "vocab.json", "merges.txt", "tokenizer.model"] + if os.path.exists(self.model_name): + hf_tokenizer_path = self.model_name else: - hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name) - # Print log message to notify user download of tokenizer has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading tokenizer. Saving it now...") - # Save tokenizer - hf_tokenizer.save_pretrained(self.tokenizer_path) - print("Done saving HF tokenizer.") + hf_tokenizer_path = snapshot_download(repo_id=self.model_name, allow_patterns=target_tokenizer_files) + for file in target_tokenizer_files: + src_path = os.path.join(hf_tokenizer_path, file) + dst_path = os.path.join(self.tokenizer_path, file) + if os.path.exists(src_path): + shutil.copy(src_path, dst_path) + print("Done updating HF tokenizer.") # Save new revision hash to file with open(ff_revision_file, "w+") as f: f.write(latest_revision) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 5714c8fe3..8b3403653 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -18,6 +18,7 @@ #include "flexflow/mapper.h" #include "flexflow/request_manager.h" #include "flexflow/utils/file_loader.h" +#include using namespace Legion; using namespace FlexFlow; @@ -59,10 +60,9 @@ class FFCObjectWrapper { FF_NEW_OPAQUE_WRAPPER(flexflow_single_dataloader_t, SingleDataLoader *); // inference FF_NEW_OPAQUE_WRAPPER(flexflow_batch_config_t, BatchConfig *); - FF_NEW_OPAQUE_WRAPPER(flexflow_tree_verify_batch_config_t, - TreeVerifyBatchConfig *); - FF_NEW_OPAQUE_WRAPPER(flexflow_beam_search_batch_config_t, - BeamSearchBatchConfig *); + // FF_NEW_OPAQUE_WRAPPER(flexflow_tree_verify_batch_config_t, BatchConfig + // *); FF_NEW_OPAQUE_WRAPPER(flexflow_beam_search_batch_config_t, + // BatchConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_inference_manager_t, InferenceManager *); FF_NEW_OPAQUE_WRAPPER(flexflow_request_manager_t, RequestManager *); FF_NEW_OPAQUE_WRAPPER(flexflow_file_data_loader_t, FileDataLoader *); @@ -253,56 +253,56 @@ void flexflow_model_zero_gradients(flexflow_model_t handle_) { } flexflow_tensor_t flexflow_model_add_exp(flexflow_model_t handle_, - const flexflow_tensor_t x_, + flexflow_tensor_t const x_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); Tensor tensor = handle->exp(x, name); DEBUG_PRINT("[Exp] new Tensor %p, x %p, name %s", tensor, x, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_sin(flexflow_model_t handle_, - const flexflow_tensor_t x_, + flexflow_tensor_t const x_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); Tensor tensor = handle->sin(x, name); DEBUG_PRINT("[Sin] new Tensor %p, x %p, name %s", tensor, x, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_cos(flexflow_model_t handle_, - const flexflow_tensor_t x_, + flexflow_tensor_t const x_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); Tensor tensor = handle->cos(x, name); DEBUG_PRINT("[Cos] new Tensor %p, x 
%p, name %s", tensor, x, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_add(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->add(x, y, inplace_a, name); DEBUG_PRINT("[Add] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_subtract(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->subtract(x, y, inplace_a, name); DEBUG_PRINT( "[Subtract] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name); @@ -310,13 +310,13 @@ flexflow_tensor_t flexflow_model_add_subtract(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_multiply(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->multiply(x, y, inplace_a, name); DEBUG_PRINT( "[Multiply] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name); @@ -324,13 +324,13 @@ flexflow_tensor_t flexflow_model_add_multiply(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_divide(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->divide(x, y, inplace_a, name); DEBUG_PRINT( "[Divide] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name); @@ -338,33 +338,33 @@ flexflow_tensor_t flexflow_model_add_divide(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_max(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->max(x, y, inplace_a, name); DEBUG_PRINT("[Max] new Tensor %p, x %p, y %p, name %s", tensor, 
x, y, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_min(flexflow_model_t handle_, - const flexflow_tensor_t x_, - const flexflow_tensor_t y_, + flexflow_tensor_t const x_, + flexflow_tensor_t const y_, bool inplace_a, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor x = FFCObjectWrapper::unwrap_const(x_); - const Tensor y = FFCObjectWrapper::unwrap_const(y_); + Tensor const x = FFCObjectWrapper::unwrap_const(x_); + Tensor const y = FFCObjectWrapper::unwrap_const(y_); Tensor tensor = handle->min(x, y, inplace_a, name); DEBUG_PRINT("[Min] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_reduce_sum(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int *axes, int n, bool keepdims, @@ -385,21 +385,21 @@ flexflow_tensor_t flexflow_model_add_reduce_sum(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_rsqrt(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); + Tensor const input = FFCObjectWrapper::unwrap(input_); Tensor tensor = handle->rsqrt(input, name); DEBUG_PRINT("[Rsqrt] new Tensor %p, input %p, name %s", tensor, input, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_pow(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float const exponent, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); + Tensor const input = FFCObjectWrapper::unwrap(input_); Tensor tensor = handle->pow(input, exponent, name); DEBUG_PRINT("[Pow] new Tensor %p, input %p, exponent %f, name %s", tensor, @@ -410,13 +410,13 @@ flexflow_tensor_t flexflow_model_add_pow(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_mean(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int *dims, int n, bool keepdims, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); + Tensor const input = FFCObjectWrapper::unwrap(input_); std::vector dims_vec; char cbuffer[256]; char *cbuffer_ptr = cbuffer; @@ -441,7 +441,7 @@ flexflow_tensor_t flexflow_model_add_mean(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_conv2d(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int out_channels, int kernel_h, int kernel_w, @@ -457,7 +457,7 @@ flexflow_tensor_t flexflow_initializer_t bias_initializer_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap_const(input_); + Tensor const input = FFCObjectWrapper::unwrap_const(input_); Layer *shared_op = FFCObjectWrapper::unwrap(shared_op_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); @@ -505,7 +505,7 @@ flexflow_tensor_t flexflow_tensor_t flexflow_model_add_embedding(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int num_entries, int out_dim, enum AggrMode aggr, @@ -514,7 +514,7 @@ flexflow_tensor_t flexflow_initializer_t kernel_initializer_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const 
Tensor input = FFCObjectWrapper::unwrap_const(input_); + Tensor const input = FFCObjectWrapper::unwrap_const(input_); Layer *shared_op = FFCObjectWrapper::unwrap(shared_op_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); @@ -588,7 +588,7 @@ flexflow_tensor_t } flexflow_tensor_t flexflow_model_add_batch_norm(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, bool relu, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -608,7 +608,7 @@ flexflow_tensor_t flexflow_model_add_batch_norm(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int n, int *axes, bool elementwise_affine, @@ -616,7 +616,7 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, bool use_bias, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); + Tensor const input = FFCObjectWrapper::unwrap(input_); std::vector axes_vec; for (int i = 0; i < n; i++) { axes_vec.push_back(axes[i]); @@ -640,9 +640,9 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_, flexflow_tensor_t * flexflow_model_add_residual_layer_norm(flexflow_model_t handle_, - const flexflow_tensor_t input_, - const flexflow_tensor_t residual1_, - const flexflow_tensor_t residual2_, + flexflow_tensor_t const input_, + flexflow_tensor_t const residual1_, + flexflow_tensor_t const residual2_, bool use_two_residuals, int n, int *axes, @@ -651,9 +651,9 @@ flexflow_tensor_t * bool use_bias, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); - const Tensor residual1 = FFCObjectWrapper::unwrap(residual1_); - const Tensor residual2 = + Tensor const input = FFCObjectWrapper::unwrap(input_); + Tensor const residual1 = FFCObjectWrapper::unwrap(residual1_); + Tensor const residual2 = use_two_residuals ? 
FFCObjectWrapper::unwrap(residual2_) : nullptr; Tensor tensor_outputs[2]; std::vector axes_vec; @@ -699,8 +699,8 @@ flexflow_tensor_t * flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( flexflow_model_t handle_, - const flexflow_tensor_t input_, - const flexflow_tensor_t residual_, + flexflow_tensor_t const input_, + flexflow_tensor_t const residual_, int n, int *axes, bool elementwise_affine, @@ -708,8 +708,8 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( bool use_bias, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap(input_); - const Tensor residual = FFCObjectWrapper::unwrap(residual_); + Tensor const input = FFCObjectWrapper::unwrap(input_); + Tensor const residual = FFCObjectWrapper::unwrap(residual_); Tensor tensor_outputs[2]; std::vector axes_vec; for (int i = 0; i < n; i++) { @@ -746,14 +746,15 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( flexflow_tensor_t flexflow_model_add_sigmoid_silu_multi(flexflow_model_t handle_, - const flexflow_tensor_t input1_, - const flexflow_tensor_t input2_, + flexflow_tensor_t const input1_, + flexflow_tensor_t const input2_, + int intermediate_size, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input1 = FFCObjectWrapper::unwrap(input1_); - const Tensor input2 = FFCObjectWrapper::unwrap(input2_); - Tensor tensor = - handle->sigmoid_silu_multi(input1, input2, input1->data_type, name); + Tensor const input1 = FFCObjectWrapper::unwrap(input1_); + Tensor const input2 = FFCObjectWrapper::unwrap(input2_); + Tensor tensor = handle->sigmoid_silu_multi( + input1, input2, intermediate_size, input1->data_type, name); DEBUG_PRINT("[SigmoidSiluMulti] new Tensor %p, input1 %p, input2 %p, name %s", tensor, input1, @@ -763,8 +764,8 @@ flexflow_tensor_t } flexflow_tensor_t flexflow_model_add_batch_matmul(flexflow_model_t handle_, - const flexflow_tensor_t a_, - const flexflow_tensor_t b_, + flexflow_tensor_t const a_, + flexflow_tensor_t const b_, int a_seq_length_dim, int b_seq_length_dim) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -778,7 +779,7 @@ flexflow_tensor_t flexflow_model_add_batch_matmul(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_dense( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int out_dim, enum ActiMode activation /* AC_MODE_NONE */, bool use_bias /* true */, @@ -790,7 +791,7 @@ flexflow_tensor_t flexflow_model_add_dense( float kernel_reg_lambda, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - const Tensor input = FFCObjectWrapper::unwrap_const(input_); + Tensor const input = FFCObjectWrapper::unwrap_const(input_); Layer *shared_op = FFCObjectWrapper::unwrap(shared_op_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); @@ -896,8 +897,8 @@ flexflow_tensor_t flexflow_model_add_flat(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle_, - const flexflow_tensor_t input_, - const flexflow_tensor_t index_, + flexflow_tensor_t const input_, + flexflow_tensor_t const index_, int dim, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -914,7 +915,7 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int dim, char 
const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -926,7 +927,7 @@ flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int n, int *perm, char const *name) { @@ -946,7 +947,7 @@ flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_reshape(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int n, int *shape, char const *name) { @@ -966,7 +967,7 @@ flexflow_tensor_t flexflow_model_add_reshape(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_reverse(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int axis, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -982,7 +983,7 @@ flexflow_tensor_t flexflow_model_add_reverse(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_scalar_multiply(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float const scalar, bool inplace, char const *name) { @@ -998,7 +999,7 @@ flexflow_tensor_t } flexflow_tensor_t flexflow_model_add_scalar_add(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float const scalar, bool inplace, char const *name) { @@ -1014,7 +1015,7 @@ flexflow_tensor_t flexflow_model_add_scalar_add(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_scalar_sub(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float const scalar, bool inplace, char const *name) { @@ -1032,7 +1033,7 @@ flexflow_tensor_t flexflow_model_add_scalar_sub(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_scalar_truediv(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float const scalar, bool inplace, char const *name) { @@ -1049,7 +1050,7 @@ flexflow_tensor_t } flexflow_tensor_t flexflow_model_add_gelu(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1059,7 +1060,7 @@ flexflow_tensor_t flexflow_model_add_gelu(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_identity(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1070,7 +1071,7 @@ flexflow_tensor_t flexflow_model_add_identity(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_relu(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, bool inplace, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1081,7 +1082,7 @@ flexflow_tensor_t flexflow_model_add_relu(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_sigmoid(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1092,7 +1093,7 @@ flexflow_tensor_t flexflow_model_add_sigmoid(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_tanh(flexflow_model_t handle_, - const 
flexflow_tensor_t input_, + flexflow_tensor_t const input_, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); @@ -1102,7 +1103,7 @@ flexflow_tensor_t flexflow_model_add_tanh(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_elu(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, bool inplace, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1113,7 +1114,7 @@ flexflow_tensor_t flexflow_model_add_elu(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_dropout(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float rate, unsigned long long seed, char const *name) { @@ -1131,9 +1132,9 @@ flexflow_tensor_t flexflow_model_add_dropout(flexflow_model_t handle_, flexflow_tensor_t flexflow_model_add_multihead_attention( flexflow_model_t handle_, - const flexflow_tensor_t query_, - const flexflow_tensor_t key_, - const flexflow_tensor_t value_, + flexflow_tensor_t const query_, + flexflow_tensor_t const key_, + flexflow_tensor_t const value_, int embed_dim, int num_heads, int kdim, @@ -1186,7 +1187,7 @@ flexflow_tensor_t flexflow_model_add_multihead_attention( flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_heads, int kdim, @@ -1198,15 +1199,29 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, + bool streaming_cache, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->inc_multihead_self_attention(input, embed_dim, num_heads, @@ -1218,18 +1233,19 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, position_bias, + streaming_cache, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_heads, int kdim, @@ -1241,15 +1257,29 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, + bool streaming_cache, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = 
FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->spec_inc_multihead_self_attention(input, embed_dim, @@ -1262,18 +1292,19 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, position_bias, + streaming_cache, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_heads, int kdim, @@ -1285,6 +1316,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -1294,6 +1331,13 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->inc_multihead_self_attention_verify(input, embed_dim, @@ -1306,7 +1350,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -1315,9 +1359,9 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( return FFCObjectWrapper::wrap(tensor); } -flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( +flexflow_tensor_t flexflow_model_add_groupquery_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_q_heads, int num_kv_heads, @@ -1330,39 +1374,54 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, + bool streaming_cache, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); - Tensor tensor = handle->inc_multiquery_self_attention(input, - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - data_type, - kernel_initializer, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - name); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + 
original_max_position_embeddings); + Tensor tensor = handle->groupquery_self_attention(input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + data_type, + kernel_initializer, + rotary_embedding_meta, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + streaming_cache, + name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_q_heads, int num_kv_heads, @@ -1375,15 +1434,29 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, bool position_bias, + bool streaming_cache, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->spec_inc_multiquery_self_attention(input, embed_dim, @@ -1397,18 +1470,19 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, position_bias, + streaming_cache, name); return FFCObjectWrapper::wrap(tensor); } flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int embed_dim, int num_q_heads, int num_kv_heads, @@ -1421,6 +1495,12 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -1430,6 +1510,13 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->inc_multiquery_self_attention_verify(input, embed_dim, @@ -1443,7 +1530,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -1453,7 +1540,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( } flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float eps, int dim, char const *name) { @@ -1465,8 +1552,8 @@ 
flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, flexflow_tensor_t * flexflow_model_add_residual_rms_norm(flexflow_model_t handle_, - const flexflow_tensor_t input1_, - const flexflow_tensor_t input2_, + flexflow_tensor_t const input1_, + flexflow_tensor_t const input2_, float eps, int dim, char const *name) { @@ -1486,31 +1573,30 @@ flexflow_tensor_t * } flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, int k, bool sorted, - bool speculative_decoding, + bool renormalize, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); - Tensor tensor = - handle->arg_top_k(input, k, sorted, speculative_decoding, name); + Tensor tensor = handle->arg_top_k(input, k, sorted, name); return FFCObjectWrapper::wrap(tensor); } -flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_, - const flexflow_tensor_t input_, - int max_beam_size, - bool sorted, - char const *name) { - FFModel *handle = FFCObjectWrapper::unwrap(handle_); - Tensor input = FFCObjectWrapper::unwrap(input_); - Tensor tensor = handle->beam_top_k(input, max_beam_size, sorted, name); - return FFCObjectWrapper::wrap(tensor); -} +// flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_, +// flexflow_tensor_t const +// input_, int max_beam_size, +// bool sorted, +// char const *name) { +// FFModel *handle = FFCObjectWrapper::unwrap(handle_); +// Tensor input = FFCObjectWrapper::unwrap(input_); +// Tensor tensor = handle->beam_top_k(input, max_beam_size, sorted, name); +// return FFCObjectWrapper::wrap(tensor); +// } flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, float top_p, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1520,7 +1606,7 @@ flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_, } flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, - const flexflow_tensor_t input_, + flexflow_tensor_t const input_, bool beam_search, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1600,8 +1686,10 @@ void flexflow_model_generate(flexflow_model_t handle_, text_str.c_str(), max_seq_length); } + std::vector> slo_ratios = {std::pair(10.0, 1.0)}; + ConstantEmissionMachine emission_machine(1.0, slo_ratios); std::vector results = - handle->generate(prompts, max_seq_length); + handle->generate(prompts, emission_machine); // If the prompt exceeds max seq len, check that we return the prompt with no // additional token. Otherwise, check that the output does not exceed the max // sequence length. 
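To make the new serving wiring above easier to follow, here is the emission-machine call path from the flexflow_model_generate hunk as a self-contained sketch. It is an illustrative reconstruction, not the library's exact code: the element types (std::pair<double, double> for the SLO ratios, GenerationResult for the outputs), the prompts being plain strings, and the header that declares ConstantEmissionMachine are assumptions layered on top of what the hunk shows.

    // Sketch only: how generate() is now driven through an emission machine
    // instead of a per-call max_seq_length. The types and the include below are
    // assumptions; only the two calls mirror the hunk above.
    #include <string>
    #include <utility>
    #include <vector>
    #include "flexflow/request_manager.h" // assumed location of ConstantEmissionMachine

    std::vector<GenerationResult> generate_with_emission_machine(
        FFModel *handle, std::vector<std::string> const &prompts) {
      // One (SLO ratio, weight) entry, matching the hard-coded value in the hunk.
      std::vector<std::pair<double, double>> slo_ratios = {{10.0, 1.0}};
      // Requests are emitted at a constant rate (1.0 here) and tagged with the SLO ratios.
      ConstantEmissionMachine emission_machine(1.0, slo_ratios);
      // generate() now takes the emission machine; length limits are configured
      // once on the RequestManager (set_max_sequence_length, set_max_output_length)
      // rather than being passed per call.
      return handle->generate(prompts, emission_machine);
    }

As far as the diff shows, the design choice is to move request emission and SLO tagging into the emission machine, while sequence-length limits become RequestManager-level settings instead of per-generate() arguments.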
@@ -2525,37 +2613,37 @@ void flexflow_batch_config_destroy(flexflow_batch_config_t handle_) { // TreeVerifyBatchConfig // ----------------------------------------------------------------------- -flexflow_tree_verify_batch_config_t - flexflow_tree_verify_batch_config_create(void) { - TreeVerifyBatchConfig *config = new TreeVerifyBatchConfig(); - DEBUG_PRINT("[TreeVerifyBatchConfig] new %p", config); - return FFCObjectWrapper::wrap(config); -} +// flexflow_tree_verify_batch_config_t +// flexflow_tree_verify_batch_config_create(void) { +// BatchConfig *config = new BatchConfig(); +// DEBUG_PRINT("[BatchConfig] new %p", config); +// return FFCObjectWrapper::wrap(config); +// } -void flexflow_tree_verify_batch_config_destroy( - flexflow_tree_verify_batch_config_t handle_) { - TreeVerifyBatchConfig *handle = FFCObjectWrapper::unwrap(handle_); - DEBUG_PRINT("[TreeVerifyBatchConfig] delete %p", handle); - delete handle; -} +// void flexflow_tree_verify_batch_config_destroy( +// flexflow_tree_verify_batch_config_t handle_) { +// BatchConfig *handle = FFCObjectWrapper::unwrap(handle_); +// DEBUG_PRINT("[BatchConfig] delete %p", handle); +// delete handle; +// } // ----------------------------------------------------------------------- // BeamSearchBatchConfig // ----------------------------------------------------------------------- -flexflow_beam_search_batch_config_t - flexflow_beam_search_batch_config_create(void) { - BeamSearchBatchConfig *config = new BeamSearchBatchConfig(); - DEBUG_PRINT("[BeamSearchBatchConfig] new %p", config); - return FFCObjectWrapper::wrap(config); -} +// flexflow_beam_search_batch_config_t +// flexflow_beam_search_batch_config_create(void) { +// BatchConfig *config = new BatchConfig(); +// DEBUG_PRINT("[BeamSearchBatchConfig] new %p", config); +// return FFCObjectWrapper::wrap(config); +// } -void flexflow_beam_search_batch_config_destroy( - flexflow_beam_search_batch_config_t handle_) { - BeamSearchBatchConfig *handle = FFCObjectWrapper::unwrap(handle_); - DEBUG_PRINT("[BeamSearchBatchConfig] delete %p", handle); - delete handle; -} +// void flexflow_beam_search_batch_config_destroy( +// flexflow_beam_search_batch_config_t handle_) { +// BatchConfig *handle = FFCObjectWrapper::unwrap(handle_); +// DEBUG_PRINT("[BeamSearchBatchConfig] delete %p", handle); +// delete handle; +// } // ----------------------------------------------------------------------- // RequestManager @@ -2582,12 +2670,20 @@ void flexflow_request_manager_set_max_tokens_per_batch( DEBUG_PRINT("[RequestManager] set max_tokens_per_batch %d", max_num_tokens); } -void flexflow_request_manager_set_max_spec_tree_token_num( - flexflow_request_manager_t handle_, int max_num_tokens) { +void flexflow_request_manager_set_max_tokens_per_ssm_batch( + flexflow_request_manager_t handle_, int max_num_ssm_tokens) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_max_tokens_per_ssm_batch(max_num_ssm_tokens); + DEBUG_PRINT("[RequestManager] set max_tokens_per_ssm_batch %d", + max_num_ssm_tokens); +} + +void flexflow_request_manager_set_max_tokens_per_prefilling_batch( + flexflow_request_manager_t handle_, int max_num_prefilling_tokens) { RequestManager *handle = FFCObjectWrapper::unwrap(handle_); - handle->set_max_spec_tree_token_num(max_num_tokens); - DEBUG_PRINT("[RequestManager] set max_spec_tree_token_num %d", - max_num_tokens); + handle->set_max_tokens_per_prefilling_batch(max_num_prefilling_tokens); + DEBUG_PRINT("[RequestManager] set max_tokens_per_prefilling_batch %d", + 
max_num_prefilling_tokens); } void flexflow_request_manager_set_max_sequence_length( @@ -2597,6 +2693,20 @@ void flexflow_request_manager_set_max_sequence_length( DEBUG_PRINT("[RequestManager] set max_sequence_length %d", max_seq_length); } +void flexflow_request_manager_set_max_output_length( + flexflow_request_manager_t handle_, int max_output_length) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_max_output_length(max_output_length); + DEBUG_PRINT("[RequestManager] set max_output_length %d", max_output_length); +} + +void flexflow_request_manager_set_max_kv_cache_size( + flexflow_request_manager_t handle_, int max_kv_cache_size) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_max_kv_cache_size(max_kv_cache_size); + DEBUG_PRINT("[RequestManager] set max_kv_cache_size %d", max_kv_cache_size); +} + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, @@ -2608,7 +2718,7 @@ void flexflow_request_manager_register_tokenizer( "Cannot convert nullptr char * to std::string"); std::string const tokenizer_filepath_str(tokenizer_filepath); handle->register_tokenizer( - model_type, bos_token_id, eos_token_id, tokenizer_filepath_str); + model_type, bos_token_id, {eos_token_id}, tokenizer_filepath_str); DEBUG_PRINT( "[RequestManager] register tokenizer %p %s", handle, tokenizer_filepath); } @@ -2700,7 +2810,7 @@ flexflow_file_data_loader_t int num_q_heads, int num_kv_heads, int hidden_dim, - int qkv_inner_dim, + int head_dim, int tensor_parallelism_degree, bool use_full_precision) { assert(weight_file_path != nullptr && @@ -2711,7 +2821,7 @@ flexflow_file_data_loader_t num_q_heads, num_kv_heads, hidden_dim, - qkv_inner_dim, + head_dim, tensor_parallelism_degree, use_full_precision); DEBUG_PRINT("[FileDataLoader] new %p", handle); @@ -2728,5 +2838,7 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_) { FileDataLoader *handle = FFCObjectWrapper::unwrap(handle_); FFModel *model = FFCObjectWrapper::unwrap(model_handle_); - handle->load_weights(model); + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; + handle->load_weights_parallel(model, ctx, runtime); } diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index c293aecb1..38127a1cf 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -20,7 +20,7 @@ namespace FlexFlow { using namespace Legion; using namespace Mapping; -LegionRuntime::Logger::Category log_ff_mapper("Mapper"); +Legion::Logger log_ff_mapper("Mapper"); FFShardingFunctor::FFShardingFunctor(int _gpus_per_node, int _cpus_per_node, @@ -33,7 +33,7 @@ FFShardingFunctor::~FFShardingFunctor(void) {} ShardID FFShardingFunctor::shard(DomainPoint const &point, Domain const &full_space, - const size_t total_shards) { + size_t const total_shards) { assert(point.get_dim() == full_space.get_dim()); int device_id = machine_view.start_device_id; for (int i = 0; i < point.get_dim(); i++) { @@ -259,7 +259,7 @@ Mapper::MapperSyncModel FFMapper::get_mapper_sync_model(void) const { return SERIALIZED_REENTRANT_MAPPER_MODEL; } -void FFMapper::select_task_options(const MapperContext ctx, +void FFMapper::select_task_options(MapperContext const ctx, Task const &task, TaskOptions &output) { unsigned long long task_hash = compute_task_hash(task); @@ -285,9 +285,14 @@ void FFMapper::select_task_options(const MapperContext ctx, } if ((task.task_id == RM_PREPARE_NEXT_BATCH_TASK_ID) || 
(task.task_id == RM_PREPARE_NEXT_BATCH_INIT_TASK_ID) || - (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) || + (task.task_id == RM_PREPARE_NEXT_BATCH_SPEC_TASK_ID) || (task.task_id == RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID) || - (task.task_id == RM_BACKGROUND_SERVING_TASK_ID)) { + (task.task_id == RM_BACKGROUND_SERVING_TASK_ID) || + (task.task_id == RM_GET_NEXT_BATCH_CONFIG_TASK_ID)) { + output.initial_proc = all_cpus[0]; + return; + } + if (task.task_id == LOAD_WEIGHT_TASK_ID) { output.initial_proc = all_cpus[0]; return; } @@ -296,6 +301,7 @@ void FFMapper::select_task_options(const MapperContext ctx, // control replicate top level task if (enable_control_replication) { output.replicate = true; + output.map_locally = false; } return; } @@ -374,7 +380,7 @@ void FFMapper::select_task_options(const MapperContext ctx, assert(task.is_index_space); } -void FFMapper::slice_task(const MapperContext ctx, +void FFMapper::slice_task(MapperContext const ctx, Task const &task, SliceTaskInput const &input, SliceTaskOutput &output) { @@ -480,7 +486,7 @@ void FFMapper::slice_task(const MapperContext ctx, } } -void FFMapper::premap_task(const MapperContext ctx, +void FFMapper::premap_task(MapperContext const ctx, Task const &task, PremapTaskInput const &input, PremapTaskOutput &output) { @@ -506,7 +512,7 @@ std::string humanReadableSize(size_t size, bool mb = false) { return std::string(buffer); } -void FFMapper::map_task(const MapperContext ctx, +void FFMapper::map_task(MapperContext const ctx, Task const &task, MapTaskInput const &input, MapTaskOutput &output) { @@ -560,6 +566,10 @@ void FFMapper::map_task(const MapperContext ctx, assert(output.target_procs[i].address_space() == node_id); } } + if (input.shard_processor.exists()) { + output.target_procs = std::vector{input.shard_processor}; + } + // Find instances that still need to be mapped std::vector> missing_fields(task.regions.size()); runtime->filter_instances(ctx, @@ -643,17 +653,18 @@ void FFMapper::map_task(const MapperContext ctx, task.regions[idx], created, &footprint)) { - if (log_instance_creation) { - for (size_t idx = 0; idx < created_instances.size(); idx++) { - log_ff_mapper.print("Instance[%zu]: memory:" IDFMT " proc:" IDFMT - " size:%zu task:%s", - idx, - created_instances[idx].memory.id, - created_instances[idx].processor.id, - created_instances[idx].size, - created_instances[idx].task_name.c_str()); - } - } + // if (log_instance_creation) { + // for (size_t idx = 0; idx < created_instances.size(); idx++) { + // log_ff_mapper.print("Instance[%zu]: memory: " IDFMT " proc: " + // IDFMT + // " size: %zu task: %s", + // idx, + // created_instances[idx].memory.id, + // created_instances[idx].processor.id, + // created_instances[idx].size, + // created_instances[idx].task_name.c_str()); + // } + // } // Report failed to creation log_ff_mapper.error( "Out of memory! 
FlexFlow failed to reserve block of size %s" @@ -681,17 +692,27 @@ void FFMapper::map_task(const MapperContext ctx, clog.memory = target_mem; clog.processor = task.target_proc; created_instances.push_back(clog); + log_ff_mapper.print( + "Created Instance[%lu]: memory_kind: %s memory_id: %llx " + "proc: " IDFMT " size: %zu (capacity %lu) task: %s", + created_instances.size() - 1, + Legion::Mapping::Utilities::to_string(clog.memory.kind()), + clog.memory.id, + clog.processor.id, + clog.size, + clog.memory.capacity(), + clog.task_name.c_str()); } } // for idx } -void FFMapper::replicate_task(const MapperContext ctx, +void FFMapper::replicate_task(MapperContext const ctx, Task const &task, ReplicateTaskInput const &input, ReplicateTaskOutput &output) { // Should only be replicated for the top-level task assert((task.get_depth() == 0) && (task.regions.size() == 0)); - const Processor::Kind target_kind = task.target_proc.kind(); + Processor::Kind const target_kind = task.target_proc.kind(); VariantID vid; { std::vector variant_ids; @@ -707,7 +728,7 @@ void FFMapper::replicate_task(const MapperContext ctx, procs.only_kind(target_kind); for (Machine::ProcessorQuery::iterator it = procs.begin(); it != procs.end(); it++) { - const AddressSpace space = it->address_space(); + AddressSpace const space = it->address_space(); if (handled[space]) { continue; } @@ -718,21 +739,21 @@ void FFMapper::replicate_task(const MapperContext ctx, assert(count == total_nodes); } -void FFMapper::select_task_variant(const MapperContext ctx, +void FFMapper::select_task_variant(MapperContext const ctx, Task const &task, SelectVariantInput const &input, SelectVariantOutput &output) { assert(false); } -void FFMapper::postmap_task(const MapperContext ctx, +void FFMapper::postmap_task(MapperContext const ctx, Task const &task, PostMapInput const &input, PostMapOutput &output) { assert(false); } -void FFMapper::select_task_sources(const MapperContext ctx, +void FFMapper::select_task_sources(MapperContext const ctx, Task const &task, SelectTaskSrcInput const &input, SelectTaskSrcOutput &output) { @@ -817,26 +838,26 @@ void FFMapper::default_policy_select_sources( } void FFMapper::create_task_temporary_instance( - const MapperContext ctx, + MapperContext const ctx, Task const &task, CreateTaskTemporaryInput const &input, CreateTaskTemporaryOutput &output) { assert(false); } -void FFMapper::speculate(const MapperContext ctx, +void FFMapper::speculate(MapperContext const ctx, Task const &task, SpeculativeOutput &output) { assert(false); } -void FFMapper::report_profiling(const MapperContext ctx, +void FFMapper::report_profiling(MapperContext const ctx, Task const &task, TaskProfilingInfo const &input) { assert(false); } -void FFMapper::select_sharding_functor(const MapperContext ctx, +void FFMapper::select_sharding_functor(MapperContext const ctx, Task const &task, SelectShardingFunctorInput const &input, SelectShardingFunctorOutput &output) { @@ -865,7 +886,7 @@ void FFMapper::select_sharding_functor(const MapperContext ctx, } } -void FFMapper::map_inline(const MapperContext ctx, +void FFMapper::map_inline(MapperContext const ctx, InlineMapping const &inline_op, MapInlineInput const &input, MapInlineOutput &output) { @@ -968,7 +989,7 @@ void FFMapper::map_inline(const MapperContext ctx, } } -void FFMapper::select_inline_sources(const MapperContext ctx, +void FFMapper::select_inline_sources(MapperContext const ctx, InlineMapping const &inline_op, SelectInlineSrcInput const &input, SelectInlineSrcOutput &output) { @@ -978,27 
+999,27 @@ void FFMapper::select_inline_sources(const MapperContext ctx, } void FFMapper::create_inline_temporary_instance( - const MapperContext ctx, + MapperContext const ctx, InlineMapping const &inline_op, CreateInlineTemporaryInput const &input, CreateInlineTemporaryOutput &output) { assert(false); } -void FFMapper::report_profiling(const MapperContext ctx, +void FFMapper::report_profiling(MapperContext const ctx, InlineMapping const &inline_op, InlineProfilingInfo const &input) { assert(false); } -void FFMapper::map_copy(const MapperContext ctx, +void FFMapper::map_copy(MapperContext const ctx, Copy const ©, MapCopyInput const &input, MapCopyOutput &output) { assert(false); } -void FFMapper::select_copy_sources(const MapperContext ctx, +void FFMapper::select_copy_sources(MapperContext const ctx, Copy const ©, SelectCopySrcInput const &input, SelectCopySrcOutput &output) { @@ -1006,26 +1027,26 @@ void FFMapper::select_copy_sources(const MapperContext ctx, } void FFMapper::create_copy_temporary_instance( - const MapperContext ctx, + MapperContext const ctx, Copy const ©, CreateCopyTemporaryInput const &input, CreateCopyTemporaryOutput &output) { assert(false); } -void FFMapper::speculate(const MapperContext ctx, +void FFMapper::speculate(MapperContext const ctx, Copy const ©, SpeculativeOutput &output) { assert(false); } -void FFMapper::report_profiling(const MapperContext ctx, +void FFMapper::report_profiling(MapperContext const ctx, Copy const ©, CopyProfilingInfo const &input) { assert(false); } -void FFMapper::select_sharding_functor(const MapperContext ctx, +void FFMapper::select_sharding_functor(MapperContext const ctx, Copy const ©, SelectShardingFunctorInput const &input, SelectShardingFunctorOutput &output) { @@ -1033,14 +1054,14 @@ void FFMapper::select_sharding_functor(const MapperContext ctx, assert(false); } -void FFMapper::map_close(const MapperContext ctx, +void FFMapper::map_close(MapperContext const ctx, Close const &close, MapCloseInput const &input, MapCloseOutput &output) { assert(false); } -void FFMapper::select_close_sources(const MapperContext ctx, +void FFMapper::select_close_sources(MapperContext const ctx, Close const &close, SelectCloseSrcInput const &input, SelectCloseSrcOutput &output) { @@ -1048,20 +1069,20 @@ void FFMapper::select_close_sources(const MapperContext ctx, } void FFMapper::create_close_temporary_instance( - const MapperContext ctx, + MapperContext const ctx, Close const &close, CreateCloseTemporaryInput const &input, CreateCloseTemporaryOutput &output) { assert(false); } -void FFMapper::report_profiling(const MapperContext ctx, +void FFMapper::report_profiling(MapperContext const ctx, Close const &close, CloseProfilingInfo const &input) { assert(false); } -void FFMapper::select_sharding_functor(const MapperContext ctx, +void FFMapper::select_sharding_functor(MapperContext const ctx, Close const &close, SelectShardingFunctorInput const &input, SelectShardingFunctorOutput &output) { @@ -1069,26 +1090,26 @@ void FFMapper::select_sharding_functor(const MapperContext ctx, assert(false); } -void FFMapper::map_acquire(const MapperContext ctx, +void FFMapper::map_acquire(MapperContext const ctx, Acquire const &acquire, MapAcquireInput const &input, MapAcquireOutput &output) { assert(false); } -void FFMapper::speculate(const MapperContext ctx, +void FFMapper::speculate(MapperContext const ctx, Acquire const &acquire, SpeculativeOutput &output) { assert(false); } -void FFMapper::report_profiling(const MapperContext ctx, +void 
FFMapper::report_profiling(MapperContext const ctx, Acquire const &acquire, AcquireProfilingInfo const &input) { assert(false); } -void FFMapper::select_sharding_functor(const MapperContext ctx, +void FFMapper::select_sharding_functor(MapperContext const ctx, Acquire const &acquire, SelectShardingFunctorInput const &input, SelectShardingFunctorOutput &output) { @@ -1096,14 +1117,14 @@ void FFMapper::select_sharding_functor(const MapperContext ctx, assert(false); } -void FFMapper::map_release(const MapperContext ctx, +void FFMapper::map_release(MapperContext const ctx, Release const &release, MapReleaseInput const &input, MapReleaseOutput &output) { assert(false); } -void FFMapper::select_release_sources(const MapperContext ctx, +void FFMapper::select_release_sources(MapperContext const ctx, Release const &release, SelectReleaseSrcInput const &input, SelectReleaseSrcOutput &output) { @@ -1111,26 +1132,26 @@ void FFMapper::select_release_sources(const MapperContext ctx, } void FFMapper::create_release_temporary_instance( - const MapperContext ctx, + MapperContext const ctx, Release const &release, CreateReleaseTemporaryInput const &input, CreateReleaseTemporaryOutput &output) { assert(false); } -void FFMapper::speculate(const MapperContext ctx, +void FFMapper::speculate(MapperContext const ctx, Release const &release, SpeculativeOutput &output) { assert(false); } -void FFMapper::report_profiling(const MapperContext ctx, +void FFMapper::report_profiling(MapperContext const ctx, Release const &release, ReleaseProfilingInfo const &input) { assert(false); } -void FFMapper::select_sharding_functor(const MapperContext ctx, +void FFMapper::select_sharding_functor(MapperContext const ctx, Release const &release, SelectShardingFunctorInput const &input, SelectShardingFunctorOutput &output) { @@ -1138,21 +1159,21 @@ void FFMapper::select_sharding_functor(const MapperContext ctx, } void FFMapper::select_partition_projection( - const MapperContext ctx, + MapperContext const ctx, Partition const &partition, SelectPartitionProjectionInput const &input, SelectPartitionProjectionOutput &output) { assert(false); } -void FFMapper::map_partition(const MapperContext ctx, +void FFMapper::map_partition(MapperContext const ctx, Partition const &partition, MapPartitionInput const &input, MapPartitionOutput &output) { assert(false); } -void FFMapper::select_partition_sources(const MapperContext ctx, +void FFMapper::select_partition_sources(MapperContext const ctx, Partition const &partition, SelectPartitionSrcInput const &input, SelectPartitionSrcOutput &output) { @@ -1160,34 +1181,34 @@ void FFMapper::select_partition_sources(const MapperContext ctx, } void FFMapper::create_partition_temporary_instance( - const MapperContext ctx, + MapperContext const ctx, Partition const &partition, CreatePartitionTemporaryInput const &input, CreatePartitionTemporaryOutput &output) { assert(false); } -void FFMapper::report_profiling(const MapperContext ctx, +void FFMapper::report_profiling(MapperContext const ctx, Partition const &partition, PartitionProfilingInfo const &input) { assert(false); } -void FFMapper::select_sharding_functor(const MapperContext ctx, +void FFMapper::select_sharding_functor(MapperContext const ctx, Partition const &partition, SelectShardingFunctorInput const &input, SelectShardingFunctorOutput &output) { assert(false); } -void FFMapper::select_sharding_functor(const MapperContext ctx, +void FFMapper::select_sharding_functor(MapperContext const ctx, Fill const &fill, SelectShardingFunctorInput const 
&input, SelectShardingFunctorOutput &output) { assert(false); } -void FFMapper::configure_context(const MapperContext ctx, +void FFMapper::configure_context(MapperContext const ctx, Task const &task, ContextConfigOutput &output) { // Increase max_window_size to allow Legion tracing to capture larger traces @@ -1195,21 +1216,21 @@ void FFMapper::configure_context(const MapperContext ctx, // Use the default values and do nothing else } -void FFMapper::select_tunable_value(const MapperContext ctx, +void FFMapper::select_tunable_value(MapperContext const ctx, Task const &task, SelectTunableInput const &input, SelectTunableOutput &output) { assert(false); } -void FFMapper::select_sharding_functor(const MapperContext ctx, +void FFMapper::select_sharding_functor(MapperContext const ctx, MustEpoch const &epoch, SelectShardingFunctorInput const &input, MustEpochShardingFunctorOutput &output) { assert(false); } -void FFMapper::map_must_epoch(const MapperContext ctx, +void FFMapper::map_must_epoch(MapperContext const ctx, MapMustEpochInput const &input, MapMustEpochOutput &output) { // Directly assign each task to its target_proc @@ -1220,13 +1241,13 @@ void FFMapper::map_must_epoch(const MapperContext ctx, assert(input.constraints.size() == 0); } -void FFMapper::map_dataflow_graph(const MapperContext ctx, +void FFMapper::map_dataflow_graph(MapperContext const ctx, MapDataflowGraphInput const &input, MapDataflowGraphOutput &output) { assert(false); } -void FFMapper::memoize_operation(const MapperContext ctx, +void FFMapper::memoize_operation(MapperContext const ctx, Mappable const &mappable, MemoizeInput const &input, MemoizeOutput &output) { @@ -1240,7 +1261,7 @@ void FFMapper::memoize_operation(const MapperContext ctx, } // Mapping control and stealing -void FFMapper::select_tasks_to_map(const MapperContext ctx, +void FFMapper::select_tasks_to_map(MapperContext const ctx, SelectMappingInput const &input, SelectMappingOutput &output) { // Just map all the ready tasks @@ -1251,13 +1272,13 @@ void FFMapper::select_tasks_to_map(const MapperContext ctx, } } -void FFMapper::select_steal_targets(const MapperContext ctx, +void FFMapper::select_steal_targets(MapperContext const ctx, SelectStealingInput const &input, SelectStealingOutput &output) { // Nothing to do, no stealing in FFMapper } -void FFMapper::permit_steal_request(const MapperContext ctx, +void FFMapper::permit_steal_request(MapperContext const ctx, StealRequestInput const &intput, StealRequestOutput &output) { // Nothing to do, no stealing in FFMapper diff --git a/src/ops/add_bias_residual_layer_norm.cpp b/src/ops/add_bias_residual_layer_norm.cpp index 1add43ecd..ae66d9b86 100644 --- a/src/ops/add_bias_residual_layer_norm.cpp +++ b/src/ops/add_bias_residual_layer_norm.cpp @@ -38,7 +38,8 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( eps = ln->eps; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "AddBiasResidualLayerNormMeta"); mean_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index ceb1a6514..2ce5605b6 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -37,7 +37,8 @@ 
AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( eps = ln->eps; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "AddBiasResidualLayerNormMeta"); mean_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index 780a77450..ebed5ab0c 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -48,10 +48,10 @@ using PCG::Node; // For an input tensor, computes the top k entries in each row // (resp. vector along the last dimension). Thus, // values.shape = indices.shape = input.shape[:-1] + [k] -Tensor FFModel::arg_top_k(const Tensor input, +Tensor FFModel::arg_top_k(Tensor const input, int k, bool sorted, - bool speculative_decoding, + bool renormalize, char const *name) { Layer *li = new Layer(this, OP_ARG_TOPK, @@ -59,7 +59,7 @@ Tensor FFModel::arg_top_k(const Tensor input, name, 1 /*inputs*/, 0 /*weights*/, - speculative_decoding ? 2 : 1 /*outputs*/, + 2 /*outputs*/, input); { int numdims = input->num_dims; @@ -72,14 +72,12 @@ Tensor FFModel::arg_top_k(const Tensor input, // numdims, dims, input->data_type, li, 0, true /*create_grad*/); li->outputs[0] = create_tensor_legion_ordering( numdims, dims, DT_INT32, li, 0, false /*create_grad*/); - if (speculative_decoding) { - li->outputs[1] = create_tensor_legion_ordering( - numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/); - } + li->outputs[1] = create_tensor_legion_ordering( + numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/); } li->add_int_property("k", k); li->add_int_property("sorted", sorted); - li->add_int_property("speculative_decoding", speculative_decoding); + li->add_int_property("renormalize", renormalize); layers.push_back(li); // outputs[0] = li->outputs[0]; // outputs[1] = li->outputs[1]; @@ -95,23 +93,18 @@ Op *ArgTopK::create_operator_from_layer( int k = value; layer->get_int_property("sorted", value); bool sorted = (bool)value; - layer->get_int_property("speculative_decoding", value); - bool speculative_decoding = (bool)value; - - return new ArgTopK(model, - layer->layer_guid, - inputs[0], - k, - sorted, - speculative_decoding, - layer->name); + layer->get_int_property("renormalize", value); + bool renormalize = (bool)value; + + return new ArgTopK( + model, layer->layer_guid, inputs[0], k, sorted, renormalize, layer->name); } ArgTopKParams ArgTopK::get_params() const { ArgTopKParams params; params.k = this->k; params.sorted = this->sorted; - params.speculative_decoding = this->speculative_decoding; + params.renormalize = this->renormalize; if (this->name != nullptr) { strcpy(params.name, this->name); } @@ -125,15 +118,15 @@ bool ArgTopKParams::is_valid(ParallelTensorShape const &) const { bool operator==(ArgTopKParams const &lhs, ArgTopKParams const &rhs) { return lhs.k == rhs.k && lhs.sorted == rhs.sorted && - lhs.speculative_decoding == rhs.speculative_decoding; + lhs.renormalize == rhs.renormalize; } ArgTopK::ArgTopK(FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int _k, bool _sorted, - bool _speculative_decoding, + bool _renormalize, char const *name) : Op(model, OP_ARG_TOPK, @@ -141,9 +134,9 @@ ArgTopK::ArgTopK(FFModel &model, name, 1 /*inputs*/, 0 /*weights*/, - _speculative_decoding ? 
2 : 1 /*outputs*/, + 2 /*outputs*/, _input), - k(_k), sorted(_sorted), speculative_decoding(_speculative_decoding) { + k(_k), sorted(_sorted), renormalize(_renormalize) { // overwrite layer_guid layer_guid = _layer_guid; int numdim = inputs[0]->num_dims; @@ -158,22 +151,20 @@ ArgTopK::ArgTopK(FFModel &model, outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, DT_INT32, this, 0 /*owner_idx*/); - if (_speculative_decoding) { - outputs[1] = model.create_parallel_tensor_legion_ordering( - numdim, dims, DT_FLOAT, this, 1 /*owner_idx*/); - } + outputs[1] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_FLOAT, this, 1 /*owner_idx*/); } ArgTopK::ArgTopK(FFModel &model, LayerID const &layer_guid, ArgTopK const &other, - const ParallelTensor input) + ParallelTensor const input) : ArgTopK(model, layer_guid, input, other.k, other.sorted, - other.speculative_decoding, + other.renormalize, other.name) {} ArgTopK::ArgTopK(FFModel &model, @@ -185,7 +176,7 @@ ArgTopK::ArgTopK(FFModel &model, input, params.k, params.sorted, - params.speculative_decoding, + params.renormalize, params.name) {} void ArgTopK::init_inference(FFModel const &ff, @@ -275,14 +266,19 @@ OpMeta *ArgTopK::init_task(Task const *task, Runtime *runtime) { ArgTopK *topk = (ArgTopK *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - ArgTopKMeta *m = new ArgTopKMeta(handle, topk); + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + ArgTopKMeta *m = new ArgTopKMeta(handle, topk, gpu_mem_allocator); m->profiling = topk->profiling; m->inference_debugging = topk->inference_debugging; m->sorted = topk->sorted; m->k = topk->k; std::strcpy(m->op_name, topk->name); m->layer_guid = topk->layer_guid; - m->speculative_decoding = topk->speculative_decoding; + m->renormalize = topk->renormalize; return m; } @@ -305,66 +301,38 @@ FutureMap ArgTopK::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "ArgTopK op machine_view: " << *(MachineView const *)mv << std::endl; */ - if (speculative_decoding) { - IndexLauncher launcher(ARG_TOPK_INF_SPECULATIVE_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - - launcher.add_region_requirement( - RegionRequirement(batch_outputs[1]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[1]->region)); - launcher.add_field(2, FID_DATA); - return runtime->execute_index_space(ctx, launcher); - - } else { - IndexLauncher launcher(ARG_TOPK_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part, - 0 
/*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - return runtime->execute_index_space(ctx, launcher); - } + IndexLauncher launcher(ARG_TOPK_INF_SPECULATIVE_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); } +// just output the indices InferenceResult ArgTopK::inference_task(Task const *task, std::vector const ®ions, @@ -399,23 +367,23 @@ InferenceResult } InferenceResult ir; + ir.num_token_ids = batch_size * m->k; download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; } -BeamInferenceResult ArgTopK::inference_speculative_task( +InferenceResult ArgTopK::inference_speculative_task( Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { assert(regions.size() == 3); assert(task->regions.size() == 3); - BeamSearchBatchConfig const &bc = - Future(task->futures[0]).get_result(); - if (bc.num_active_tokens() == 0) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { // Directly return for empty batch config - BeamInferenceResult ir; + InferenceResult ir; return ir; } ArgTopKMeta *m = *((ArgTopKMeta **)task->local_args); @@ -427,10 +395,11 @@ BeamInferenceResult ArgTopK::inference_speculative_task( GenericTensorAccessorW probs = helperGetGenericTensorAccessorWO( DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); - int batch_size = bc.num_active_tokens(); - ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc); + int batch_size = bc->num_active_tokens(); + ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, bc); - BeamInferenceResult ir; + InferenceResult ir; + ir.num_token_ids = batch_size * m->k; download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size * m->k); download_tensor(probs.get_float_ptr(), ir.probs, batch_size * m->k); @@ -448,7 +417,7 @@ void ArgTopK::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->k); sez.serialize(this->sorted); - sez.serialize(this->speculative_decoding); + sez.serialize(this->renormalize); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -465,10 +434,10 @@ Node ArgTopK::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); int k; bool sorted; - bool speculative_decoding; + bool renormalize; dez.deserialize(k); dez.deserialize(sorted); - dez.deserialize(speculative_decoding); + dez.deserialize(renormalize); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -477,7 +446,7 @@ Node ArgTopK::deserialize(FFModel &ff, params.layer_guid = layer_guid; params.k = k; params.sorted = sorted; - params.speculative_decoding = speculative_decoding; + 
params.renormalize = renormalize; strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } @@ -504,7 +473,7 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.k); hash_combine(key, params.sorted); - hash_combine(key, params.speculative_decoding); + hash_combine(key, params.renormalize); return key; } }; // namespace std diff --git a/src/ops/arg_topk.cpp b/src/ops/arg_topk.cpp index f431d3d4b..90dbb5909 100644 --- a/src/ops/arg_topk.cpp +++ b/src/ops/arg_topk.cpp @@ -379,7 +379,7 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, int length, int k, bool sorted, - BeamSearchBatchConfig const *bc, + BatchConfig const *bc, hipStream_t stream) { // Adopted from TensorFlow's ArgTopK implementation // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h @@ -398,29 +398,17 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry
); // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; size_t num_blocks = batch_size; - // all requests are in the same beam stages + // all requests share the same number of branches if (m->speculative_decoding) { assert(bc->num_active_requests() >= 0); - // check - int beam_size = -1; - for (int i = 1; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } else if (beam_size == -1) { - beam_size = bc->beamRequestsInfo[i].beam_size; - } else { - assert(beam_size == bc->beamRequestsInfo[i].beam_size); - } - } - - assert(num_shards >= (size_t)beam_size); + assert(num_shards >= (size_t)BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); num_shards = k; arg_topk_forward_kernel<<>>( input_ptr, shared_memory_size, length, - beam_size, + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, sorted, output_ptr, indices_ptr, @@ -448,7 +436,7 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, // float *output_ptr, GenericTensorAccessorW const &indices, int batch_size, - BeamSearchBatchConfig const *bc) { + BatchConfig const *bc) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); // Domain in1_domain = runtime->get_index_space_domain( diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 5b7978812..0d4ea2045 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -15,446 +15,125 @@ #include "flexflow/ops/arg_topk.h" #include "flexflow/utils/cuda_helper.h" +#include "raft/matrix/detail/select_k.cuh" namespace FlexFlow { // declare Legion names using Legion::coord_t; -enum class HeapType { kMinHeap, kMaxHeap }; -enum class PreferIndices { kLower, kHigher }; +__global__ void half2float_kernel(half const *__restrict__ in, + float *__restrict__ out, + int size) { + // int stride = blockDim.x * gridDim.x, + // tid = blockIdx.x * blockDim.x + threadIdx.x; -template -struct Entry { - int index; - T value; -}; - -template -struct LinearData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index]; - } - - __device__ int get_index(int i) const { - return data[i].index; - } - __device__ T get_value(int i) const { - return data[i].value; - } - - Entry *const data; -}; - -template -struct IndirectLinearData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index]; - } - - __device__ int get_index(int i) const { - return backing_data[data[i].index].index; - } - __device__ T get_value(int i) const { - return data[i].value; - } - - Entry *const data; - Entry *const backing_data; -}; - -template -struct StridedData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index * blockDim.x + threadIdx.x]; - } - - __device__ int get_index(int i) const { - return (*this)[i].index; - } - __device__ T get_value(int i) const { - return (*this)[i].value; - } - - Entry *const data; -}; - -// A heap of Entry that can either work as a min-heap or as a max-heap. 
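// A minimal, self-contained sketch of what the new half2float_kernel above is
// for: the rewritten top-k path produces its values in half precision, and this
// conversion widens them into the float `probs` output tensor. The grid-stride
// loop is written out explicitly instead of using FlexFlow's CUDA_KERNEL_LOOP
// helper, and the kernel name below is illustrative only, not part of the patch.
#include <cuda_fp16.h>

__global__ void half_to_float_sketch(half const *__restrict__ in,
                                     float *__restrict__ out,
                                     int size) {
  // Each thread walks the array with a stride equal to the full grid size.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
       i += blockDim.x * gridDim.x) {
    out[i] = __half2float(in[i]); // standard CUDA half->float intrinsic
  }
}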
-template - class Data, - typename T> -struct IndexedHeap { - typedef typename Data::Entry Entry; - Data const data; - __device__ IndexedHeap(Data const &d) : data(d) {} - - __device__ bool is_above(int left, int right) { - T left_value = data.get_value(left); - T right_value = data.get_value(right); - if (left_value == right_value) { - if (preferIndices == PreferIndices::kLower) { - return data.get_index(left) < data.get_index(right); - } else { - return data.get_index(left) > data.get_index(right); - } - } - if (heapType == HeapType::kMinHeap) { - return left_value < right_value; - } else { - return left_value > right_value; - } - } - - __device__ void assign(int i, Entry const &entry) { - data[i] = entry; - } - - __device__ void push_up(int i) { - int child = i; - int parent; - for (; child > 0; child = parent) { - parent = (child - 1) / 2; - if (!is_above(child, parent)) { - // Heap property satisfied. - break; - } - swap(child, parent); - } - } - - __device__ void swap(int a, int b) { - auto tmp = data[b]; - data[b] = data[a]; - data[a] = tmp; - } - - __device__ void push_root_down(int k) { - push_down(0, k); - } - - // MAX-HEAPIFY in Cormen - __device__ void push_down(int node, int k) { - while (true) { - int const left = 2 * node + 1; - int const right = left + 1; - int smallest = node; - if (left < k && is_above(left, smallest)) { - smallest = left; - } - if (right < k && is_above(right, smallest)) { - smallest = right; - } - if (smallest == node) { - break; - } - swap(smallest, node); - node = smallest; - } - } - - // BUILD-MAX-HEAPIFY in Cormen - __device__ void build(int k) { - for (int node = (k - 1) / 2; node >= 0; node--) { - push_down(node, k); - } - } - - // HEAP-EXTRACT-MAX in Cormen - __device__ void remove_root(int k) { - data[0] = data[k - 1]; - push_root_down(k - 1); - } - - // in-place HEAPSORT in Cormen - // This method destroys the heap property. - __device__ void sort(int k) { - for (int slot = k - 1; slot > 0; slot--) { - // This is like remove_root but we insert the element at the end. - swap(slot, 0); - // Heap is now an element smaller. - push_root_down(/*k=*/slot); - } - } - - __device__ void replace_root(Entry const &entry, int k) { - data[0] = entry; - push_root_down(k); - } - - __device__ Entry const &root() { - return data[0]; - } -}; - -template - class Data, - typename T> -__device__ IndexedHeap - make_indexed_heap(typename Data::Entry *data) { - return IndexedHeap{Data{data}}; -} - -// heapArgTopK walks over [input, input+length) with `step_size` stride starting -// at `start_index`. It builds a top-`k` heap that is stored in `heap_entries` -// using `Accessor` to access elements in `heap_entries`. If sorted=true, the -// elements will be sorted at the end. -template class Data = LinearData> -__device__ void heapArgTopK(T const *__restrict__ input, - int length, - int k, - Entry *__restrict__ heap_entries, - bool sorted = false, - int start_index = 0, - int step_size = 1) { - assert(k <= length); - - auto heap = - make_indexed_heap( - heap_entries); - - int heap_end_index = start_index + k * step_size; - if (heap_end_index > length) { - heap_end_index = length; - } - // Initialize the min-heap. - for (int index = start_index, slot = 0; index < heap_end_index; - index += step_size, slot++) { - heap.assign(slot, {index, input[index]}); - } - - heap.build(k); - - // Now iterate over the remaining items. - // If an item is smaller than the min element, it is not amongst the top k. - // Otherwise, replace the min element with it and push upwards. 
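// The heap/shard machinery being deleted here is replaced by a single batched
// top-k selection plus an optional renormalization of the selected
// probabilities. A host-side reference of the intended per-row semantics,
// assuming `renormalize` simply rescales the k selected values to sum to one
// (with the same 1e-6 epsilon used by renormalize_kernel below); the function
// name is hypothetical and k is assumed to be <= row.size().
#include <algorithm>
#include <numeric>
#include <vector>

void arg_topk_row_reference(std::vector<float> const &row, int k,
                            bool renormalize,
                            std::vector<float> &values,
                            std::vector<int> &indices) {
  indices.resize(row.size());
  std::iota(indices.begin(), indices.end(), 0);
  // Keep the k indices with the largest values, ordered descending.
  std::partial_sort(indices.begin(), indices.begin() + k, indices.end(),
                    [&](int a, int b) { return row[a] > row[b]; });
  indices.resize(k);
  values.resize(k);
  for (int i = 0; i < k; i++) {
    values[i] = row[indices[i]];
  }
  if (renormalize) {
    float sum = std::accumulate(values.begin(), values.end(), 0.0f) + 1e-6f;
    for (float &v : values) {
      v /= sum;
    }
  }
}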
- for (int index = heap_end_index; index < length; index += step_size) { - // We prefer elements with lower indices. This is given here. - // Later elements automatically have higher indices, so can be discarded. - if (input[index] > heap.root().value) { - // This element should replace the min. - heap.replace_root({index, input[index]}, k); - } - } - - // Sort if wanted. - if (sorted) { - heap.sort(k); + // for (int i = tid; i < size; i += stride) { + // out[i] = __half2float(in[i]); + // } + CUDA_KERNEL_LOOP(i, size) { + out[i] = __half2float(in[i]); } } -// mergeShards performs a top-k merge on `num_shards` many sorted streams that -// are sorted and stored in `entries` in a strided way: -// |s_1 1st|s_2 1st|...s_{num_shards} 1st|s_1 2nd|s_2 2nd|... -// The overall top k elements are written to `top_k_values` and their indices -// to top_k_indices. -// `top_k_heap` is used as temporary storage for the merge heap. -template -__device__ void mergeShards(int num_shards, - int k, - Entry *__restrict__ entries, - Entry *__restrict__ top_k_heap, - float *top_k_values, - int *top_k_indices, - bool speculative_decoding) { - // If k < num_shards, we can use a min-heap with k elements to get the top k - // of the sorted blocks. - // If k > num_shards, we can initialize a min-heap with the top element from - // each sorted block. - int const heap_size = k < num_shards ? k : num_shards; - - // Min-heap part. - { - auto min_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; - // Initialize the heap as a min-heap. - for (int slot = 0; slot < heap_size; slot++) { - min_heap.assign(slot, {slot, entries[slot].value}); - } - min_heap.build(heap_size); - - // Now perform top k with the remaining shards (if num_shards > heap_size). - for (int shard = heap_size; shard < num_shards; shard++) { - auto const entry = entries[shard]; - auto const root = min_heap.root(); - if (entry.value < root.value) { - continue; - } - if (entry.value == root.value && - entry.index > entries[root.index].index) { - continue; - } - // This element should replace the min. - min_heap.replace_root({shard, entry.value}, heap_size); - } - } - - // Max-part. - { - // Turn the min-heap into a max-heap in-place. - auto max_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; - // Heapify into a max heap. - max_heap.build(heap_size); - - // Now extract the minimum k-1 times. - // k is treated specially. - int const last_k = k - 1; - for (int rank = 0; rank < last_k; rank++) { - Entry const &max_element = max_heap.root(); - if (speculative_decoding) { - assert(top_k_values != nullptr); - top_k_values[rank] = static_cast(max_element.value); +template +__global__ void insertion_sort_kernel(DT *topk_values, + int *topk_indices, + int batch_size, + int k) { + int batch_index = blockIdx.x * blockDim.x + threadIdx.x; + if (batch_index < batch_size) { + DT *values = topk_values + batch_index * k; + int *indices = topk_indices + batch_index * k; + + for (int i = 1; i < k; i++) { + DT key_val = values[i]; + int key_idx = indices[i]; + int j = i - 1; + while (j >= 0 && values[j] < key_val) { + values[j + 1] = values[j]; + indices[j + 1] = indices[j]; + j = j - 1; } - - int shard_index = max_element.index; - top_k_indices[rank] = entries[shard_index].index; - int next_shard_index = shard_index + num_shards; - // For rank < k-1, each top k heap still contains at least 1 element, - // so we can draw a replacement. 
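// The merge path being deleted here is superseded later in this patch: the new
// ArgTopK::forward_kernel delegates batched selection to RAFT. As shown in the
// replacement code further below, the call has roughly this shape
// (device_resources handle caching omitted; exact template parameters are not
// reproduced here):
//
//   raft::matrix::detail::select_k(*handle,
//                                  input_ptr,
//                                  (int *)nullptr,
//                                  batch_size,
//                                  (size_t)length,
//                                  k,
//                                  output_ptr,
//                                  indices_ptr,
//                                  /*select_min=*/false,
//                                  sorted);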
- max_heap.replace_root({next_shard_index, entries[next_shard_index].value}, - heap_size); + values[j + 1] = key_val; + indices[j + 1] = key_idx; } - - // rank == last_k. - Entry const &max_element = max_heap.root(); - // top_k_values[last_k] = max_element.value; - int shard_index = max_element.index; - top_k_indices[last_k] = entries[shard_index].index; } } -template -__global__ void arg_topk_forward_kernel(T const *__restrict__ input, - size_t shared_memory_size, - int length, - int k, - bool sorted, - float *__restrict__ output, - int *__restrict__ indices, - bool speculative_decoding) { - __shared__ char shared_memory[48 << 10]; - int const batch_index = blockIdx.x; - T const *batch_input = input + batch_index * length; - int const thread_index = threadIdx.x; - int const thread_count = blockDim.x; - Entry *shared_entries = (Entry *)shared_memory; - heapArgTopK( - batch_input, length, k, shared_entries, true, thread_index, thread_count); - __syncthreads(); - if (thread_index == 0) { - int const offset = batch_index * k; - auto batch_output = output + offset; - auto batch_indices = indices + offset; - Entry *top_k_heap = shared_entries + thread_count * k; - mergeShards(thread_count, - k, - shared_entries, - top_k_heap, - batch_output, - batch_indices, - speculative_decoding); +template +__global__ void renormalize_kernel(DT *topk_values, + int batch_size, + int k, + float epsilon = 1e-6) { + int batch_index = blockIdx.x * blockDim.x + threadIdx.x; + assert(batch_index < batch_size); + DT *values = topk_values + batch_index * k; + DT sum = 0; + for (int i = 0; i < k; i++) { + sum += values[i]; + } + sum += epsilon; + for (int i = 0; i < k; i++) { + values[i] /= sum; } } +// Adopted from Raft's select_k +// https://github.com/rapidsai/raft/blob/branch-24.10/cpp/include/raft/matrix/detail/select_k.cuh + /*static*/ template -void ArgTopK::forward_kernel(ArgTopKMeta const *m, +void ArgTopK::forward_kernel(ArgTopKMeta *m, DT const *input_ptr, - float *output_ptr, + DT *output_ptr, int *indices_ptr, size_t batch_size, int length, int k, bool sorted, - BeamSearchBatchConfig const *bc, + bool renormalize, + BatchConfig const *bc, cudaStream_t stream) { - // Adopted from TensorFlow's ArgTopK implementation - // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h - int num_shards = 0; - { - constexpr auto shared_memory_size = 48 << 10; - auto const heap_size = k * sizeof(Entry
); - // shared_memory_size = (num_shards + 1) * heap_size <=> - num_shards = shared_memory_size / heap_size - 1; - assert(num_shards > 0); - if (num_shards > CUDA_NUM_THREADS) { - num_shards = CUDA_NUM_THREADS; - } - } - // We are limited by the amount of shared memory we have per block. - size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry
); - // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; - size_t num_blocks = batch_size; - - // all requests are in the same beam stages - if (m->speculative_decoding) { - assert(bc->num_active_requests() >= 0); - - // check - // allow last request different with others - int beam_size = -1; - int num_activate_requests = bc->num_active_requests(); - int last_request_idx = - bc->requestsInfo[num_activate_requests - 1].batch_config_request_id; - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } else if (beam_size == -1) { - beam_size = bc->beamRequestsInfo[i].beam_size; - - } else if (i != last_request_idx) { - assert(beam_size == bc->beamRequestsInfo[i].beam_size); - } else if (i == last_request_idx) { - } - } - assert(num_shards >= (size_t)beam_size); - num_shards = k; - arg_topk_forward_kernel<<>>( - input_ptr, - shared_memory_size, - length, - beam_size, - sorted, - output_ptr, - indices_ptr, - m->speculative_decoding); - } else { - - assert(num_shards >= (size_t)k); - num_shards = k; - arg_topk_forward_kernel<<>>( - input_ptr, - shared_memory_size, - length, - k, - sorted, - nullptr, - indices_ptr, - false); + assert(bc->num_active_requests() >= 0); + if (m->device_resources.find(stream) == m->device_resources.end()) { + m->device_resources[stream] = new raft::device_resources(stream); + } + raft::device_resources *handle = m->device_resources[stream]; + raft::matrix::detail::select_k(*handle, + input_ptr, + (int *)nullptr, + batch_size, + (size_t)length, + k, + output_ptr, + indices_ptr, + /*select_min=*/false, + sorted); + // if (sorted) { + // assert(output_ptr != nullptr); + // insertion_sort_kernel<<>>(output_ptr, indices_ptr, batch_size, + // k); + // } + if (renormalize) { + assert(output_ptr != nullptr); + renormalize_kernel<<>>(output_ptr, batch_size, k); } } /*static*/ -void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, +void ArgTopK::forward_kernel_wrapper(ArgTopKMeta *m, GenericTensorAccessorR const &input, // float *output_ptr, GenericTensorAccessorW const &probs, GenericTensorAccessorW const &indices, int batch_size, - BeamSearchBatchConfig const *bc) { + BatchConfig const *bc) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -497,28 +176,36 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, } if (input.data_type == DT_HALF) { + // printf("ArgTopK: length = %d, batch_size = %d\n", length, batch_size); ArgTopK::forward_kernel(m, input.get_half_ptr(), - m->speculative_decoding ? probs.get_float_ptr() - : nullptr, + (half *)m->half_precision_output, indices.get_int32_ptr(), batch_size, length, k, m->sorted, - m->speculative_decoding ? bc : nullptr, + m->renormalize, + bc, stream); + // transfer data from half to float (half_precision_output to output) + int size = k * batch_size; + half2float_kernel<<>>( + (half const *)m->half_precision_output, probs.get_float_ptr(), size); } else if (input.data_type == DT_FLOAT) { ArgTopK::forward_kernel(m, input.get_float_ptr(), - m->speculative_decoding ? probs.get_float_ptr() - : nullptr, + probs.get_float_ptr(), indices.get_int32_ptr(), batch_size, length, k, m->sorted, - m->speculative_decoding ? 
bc : nullptr, + m->renormalize, + bc, stream); } else { assert(false && "Unsupported data type"); @@ -535,7 +222,23 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, } } -ArgTopKMeta::ArgTopKMeta(FFHandler handler, Op const *op) - : OpMeta(handler, op) {} +ArgTopKMeta::ArgTopKMeta(FFHandler handler, + Op const *op, + MemoryAllocator &gpu_mem_allocator) + : OpMeta(handler, op) { + max_output_size = BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_K_LOGITS; + gpu_mem_allocator.create_legion_instance( + reserveInst, sizeof(half) * max_output_size, "ArgTopKMeta"); + half_precision_output = gpu_mem_allocator.allocate_instance_untyped( + sizeof(half) * max_output_size); +} +ArgTopKMeta::~ArgTopKMeta() { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } + for (auto &kv : device_resources) { + delete kv.second; + } +} }; // namespace FlexFlow diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index a52ce1886..0524defce 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -44,7 +44,7 @@ using Legion::TaskArgument; using Legion::TaskLauncher; using PCG::Node; -Tensor FFModel::argmax(const Tensor input, bool beam_search, char const *name) { +Tensor FFModel::argmax(Tensor const input, bool beam_search, char const *name) { Layer *li = new Layer(this, OP_ARGMAX, input->data_type, @@ -106,7 +106,7 @@ bool operator==(ArgMaxParams const &lhs, ArgMaxParams const &rhs) { } ArgMax::ArgMax(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, bool _beam_search, char const *name) : Op(model, @@ -136,12 +136,12 @@ ArgMax::ArgMax(FFModel &model, } } -ArgMax::ArgMax(FFModel &model, ArgMax const &other, const ParallelTensor input) +ArgMax::ArgMax(FFModel &model, ArgMax const &other, ParallelTensor const input) : ArgMax(model, input, other.beam_search, other.name) {} ArgMax::ArgMax(FFModel &model, ArgMaxParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) : ArgMax(model, input, params.beam_search, params.name) {} @@ -332,7 +332,7 @@ FutureMap ArgMax::inference(FFModel const &ff, } } -BeamInferenceResult +InferenceResult ArgMax::inference_task_beam(Task const *task, std::vector const ®ions, Context ctx, @@ -342,7 +342,7 @@ BeamInferenceResult BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { // Directly return for empty batch config - BeamInferenceResult ir; + InferenceResult ir; return ir; } ArgMaxMeta *m = *((ArgMaxMeta **)task->local_args); @@ -355,17 +355,17 @@ BeamInferenceResult GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( DT_INT32, regions[2], task->regions[2], FID_DATA, ctx, runtime); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); - BeamInferenceResult ir; + InferenceResult ir; + ir.num_token_ids = batch_size; download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); download_tensor(m->probs, ir.probs, batch_size); - download_tensor(parent.get_int32_ptr(), ir.parent_id, batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; ArgMax::save_inference_tensors_to_file( - m, shard_id, bc, {}, {}, {input, indices, parent}); + m, shard_id, bc, {}, {}, {input, indices}); } return ir; @@ -394,6 +394,7 @@ InferenceResult int batch_size = bc->num_active_tokens(); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); InferenceResult ir; + ir.num_token_ids = batch_size; if (m->inference_debugging) { 
assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index 8a1cf0b3b..bd0b2bd19 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -493,7 +493,8 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, size_t prob_size = batch_size; assert(data_type == DT_FLOAT || data_type == DT_HALF); size_t total_size = prob_size * sizeof(float); - gpu_mem_allocator.create_legion_instance(reserveInst, total_size); + gpu_mem_allocator.create_legion_instance( + reserveInst, total_size, "ArgMaxMeta"); probs = gpu_mem_allocator.allocate_instance(prob_size); } ArgMaxMeta::~ArgMaxMeta(void) { diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 05c84719c..42d1a96f3 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -23,10 +23,11 @@ __global__ void init_offset(int batch_size, int vocab_size, int total_eles, int *d_offsets) { - CUDA_KERNEL_LOOP(i, total_eles) { - if (i % vocab_size == 0) { - d_offsets[i / vocab_size] = i; - } + CUDA_KERNEL_LOOP(i, (total_eles) / vocab_size + 1) { + // if (i % vocab_size == 0) { + // d_offsets[i / vocab_size] = i; + // } + d_offsets[i] = i * vocab_size; } } @@ -83,7 +84,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, prob_ptr, batch_size, m->beam_search); - // print_tensor(indices_ptr, 32, "argmax op"); + // print_tensor(indices_ptr, 4, "argmax op"); } /*static*/ @@ -151,7 +152,7 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - size_t d_offsets_size = batch_size; + size_t d_offsets_size = batch_size + 1; size_t prob_size = batch_size; assert(data_type == DT_FLOAT || data_type == DT_HALF); size_t total_size = @@ -160,7 +161,8 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, ? sizeof(cub::KeyValuePair) * batch_size : sizeof(cub::KeyValuePair) * batch_size) + prob_size * sizeof(float); - gpu_mem_allocator.create_legion_instance(reserveInst, total_size); + gpu_mem_allocator.create_legion_instance( + reserveInst, total_size, "ArgMaxMeta"); d_offsets = gpu_mem_allocator.allocate_instance(d_offsets_size); d_out = data_type == DT_FLOAT ? gpu_mem_allocator.allocate_instance_untyped( @@ -199,7 +201,8 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, stream)); } - gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes); + gpu_mem_allocator.create_legion_instance( + reserveInst, temp_storage_bytes, "ArgMaxMeta"); d_temp_storage = gpu_mem_allocator.allocate_instance_untyped(temp_storage_bytes); } diff --git a/src/ops/attention_impl.cu b/src/ops/attention_impl.cu new file mode 100644 index 000000000..f3cc8df92 --- /dev/null +++ b/src/ops/attention_impl.cu @@ -0,0 +1,818 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
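// The new attention_impl.cu translation unit below consists entirely of
// explicit template instantiations of the flashinfer attention kernels, so the
// heavy template bodies are compiled once here rather than in every file that
// includes the headers. A minimal illustration of the mechanism, using
// placeholder names that are not part of flashinfer:
#include <cuda_runtime.h>

// Template definition (in flashinfer's case this lives in the .cuh headers).
template <int HeadDim, typename T>
__global__ void scale_rows(T *data, T alpha, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n * HeadDim) {
    data[i] *= alpha;
  }
}

// Explicit instantiations: force device code generation for the shapes used at
// runtime, exactly once, in this translation unit.
template __global__ void scale_rows<64, float>(float *, float, int);
template __global__ void scale_rows<128, float>(float *, float, int);
template __global__ void scale_rows<256, float>(float *, float, int);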
+ */ +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "cuComplex.h" +#endif +#include "flashinfer/attention_impl.cuh" + +// This is for instantiating the template attention kernels +namespace flashinfer { + +// warp_layout_literal[] = { +// "WarpLayout::k4x1x2", +// "WarpLayout::k4x1x1", +// "WarpLayout::k1x4x1", +// } +// head_dim[] = {64, 128, 256}; + +/********** batch append instantiations for half precision **********/ + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float 
logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +/********** batch prefill instantiations for half precision **********/ + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t 
*q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + 
int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +/********** batch decode instantiations for half precision **********/ +template cudaError_t + BatchDecodeWithPagedKVCacheDispatched<64, + PageStorage::kIndices, + LogitsPostHook::kNone, + PosEncodingMode::kNone, + half, + half, + half, + int32_t>( + half *q, + int32_t *q_offset, + paged_kv_t paged_kv, + kv_partition_info_t kv_partition_info, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + bool *block_valid_mask, + uint32_t padded_batch_size, + uint32_t num_qo_heads, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchDecodeWithPagedKVCacheDispatched<128, + PageStorage::kIndices, + LogitsPostHook::kNone, + PosEncodingMode::kNone, + half, + half, + half, + int32_t>( + half *q, + int32_t *q_offset, + paged_kv_t paged_kv, + kv_partition_info_t kv_partition_info, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + bool *block_valid_mask, + uint32_t padded_batch_size, + uint32_t num_qo_heads, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchDecodeWithPagedKVCacheDispatched<256, + PageStorage::kIndices, + LogitsPostHook::kNone, + PosEncodingMode::kNone, + half, + half, + half, + int32_t>( + half *q, + int32_t *q_offset, + paged_kv_t paged_kv, + kv_partition_info_t kv_partition_info, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + bool *block_valid_mask, + uint32_t padded_batch_size, + uint32_t num_qo_heads, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +} // namespace flashinfer diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc deleted file mode 100644 index d2054cacb..000000000 --- a/src/ops/beam_topk.cc +++ /dev/null @@ -1,476 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "flexflow/ops/beam_topk.h" -#include "flexflow/model.h" -#include "flexflow/utils/hash_utils.h" -#include "legion/legion_utilities.h" -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) -#include "flexflow/utils/cuda_helper.h" -#else -#include "flexflow/utils/hip_helper.h" -#endif - -namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::Future; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::InlineLauncher; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; -using PCG::Node; - -// For an input tensor, computes the top k entries in each row -// (resp. vector along the last dimension). Thus, -// values.shape = indices.shape = input.shape[:-1] + [k] -Tensor FFModel::beam_top_k(const Tensor input, - int max_beam_width, - bool sorted, - char const *name) { - Layer *li = new Layer(this, - OP_BEAM_TOPK, - input->data_type, - name, - 1 /*inputs*/, - 0 /*weights*/, - 3 /*outputs*/, - input); - { - int numdims = input->num_dims; - - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = input->dims[i]; - } - dims[0] = max_beam_width; - - std::cout << "beam input dimen:" << numdims << "\n"; - for (int i = 0; i < numdims; i++) { - std::cout << input->dims[i] << ", "; - } - - // beam width is dynamic - li->outputs[0] = create_tensor_legion_ordering( - numdims, dims, DT_INT32, li, 0, false /*create_grad*/); - li->outputs[1] = create_tensor_legion_ordering( - numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/); - li->outputs[2] = create_tensor_legion_ordering( - numdims, dims, DT_INT32, li, 1, false /*create_grad*/); - } - li->add_int_property("sorted", sorted); - li->add_int_property("max_beam_width", max_beam_width); - layers.push_back(li); - // outputs[0] = li->outputs[0]; - // outputs[1] = li->outputs[1]; - return li->outputs[1]; -} - -Op *BeamTopK::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - long long value; - layer->get_int_property("sorted", value); - bool sorted = (bool)value; - layer->get_int_property("max_beam_width", value); - int max_beam_width = value; - return new BeamTopK( - model, inputs[0], layer->layer_guid, max_beam_width, sorted, layer->name); -} - -BeamTopKParams BeamTopK::get_params() const { - BeamTopKParams params; - params.layer_guid = this->layer_guid; - params.sorted = this->sorted; - params.max_beam_width = this->max_beam_width; - return params; -} - -bool BeamTopKParams::is_valid(ParallelTensorShape const &) const { - // topk is always valid - return true; -} - -bool operator==(BeamTopKParams const &lhs, BeamTopKParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid && lhs.sorted == rhs.sorted && - lhs.max_beam_width == rhs.max_beam_width; -} - -BeamTopK::BeamTopK(FFModel &model, - const ParallelTensor _input, - LayerID 
const &_layer_guid, - int _max_beam_width, - bool _sorted, - char const *name) - : Op(model, - OP_BEAM_TOPK, - _input->data_type, - name, - 1 /*inputs*/, - 0 /*weights*/, - 3 /*outputs*/, - _input) { - sorted = _sorted; - max_beam_width = _max_beam_width; - layer_guid = _layer_guid; - int numdim = inputs[0]->num_dims; - assert(inputs[0]->dims[0].degree == 1); - assert(inputs[0]->dims[0].parallel_idx == -1); - // outputs[0] = model.create_parallel_tensor_legion_ordering( - // numdim, dims, _input->data_type, this, 0 /*owner_idx*/); - outputs[0] = model.create_parallel_tensor_legion_ordering( - numdim, inputs[0]->dims, DT_INT32, this, 0 /*owner_idx*/); - outputs[1] = model.create_parallel_tensor_legion_ordering( - numdim, inputs[0]->dims, DT_FLOAT, this, 1 /*owner_idx*/); - outputs[2] = model.create_parallel_tensor_legion_ordering( - numdim, inputs[0]->dims, DT_INT32, this, 2 /*owner_idx*/); -} - -BeamTopK::BeamTopK(FFModel &model, - BeamTopK const &other, - const ParallelTensor input) - : BeamTopK(model, - input, - other.layer_guid, - other.max_beam_width, - other.sorted, - other.name) {} - -BeamTopK::BeamTopK(FFModel &model, - BeamTopKParams const ¶ms, - const ParallelTensor input, - char const *name) - : BeamTopK(model, - input, - params.layer_guid, - params.max_beam_width, - params.sorted, - params.name) {} - -void BeamTopK::init_inference(FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = batch_outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); - IndexLauncher launcher(BEAM_TOPK_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(BeamTopK)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[1]->region)); - launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[2]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[2]->region)); - launcher.add_field(3, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); -} - -void BeamTopK::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - IndexLauncher launcher(BEAM_TOPK_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(BeamTopK)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - 
READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[1]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[1]->region)); - launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[2]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[2]->region)); - launcher.add_field(3, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); -} - -OpMeta *BeamTopK::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - BeamTopK *topk = (BeamTopK *)task->args; - FFHandler handle = *((FFHandler *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); - MemoryAllocator gpu_mem_allocator(gpu_mem); - BeamTopKMeta *m = new BeamTopKMeta(handle, topk, gpu_mem_allocator); - m->profiling = topk->profiling; - m->inference_debugging = topk->inference_debugging; - std::strcpy(m->op_name, topk->name); - m->layer_guid = topk->layer_guid; - m->sorted = topk->sorted; - m->max_beam_width = topk->max_beam_width; - m->input_type[0] = topk->inputs[0]->data_type; - return m; -} - -void BeamTopK::forward(FFModel const &ff) { - assert(false); -} - -FutureMap BeamTopK::inference(FFModel const &ff, - BatchConfigFuture const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - parallel_is = batch_outputs[0]->parallel_is; - MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); - size_t machine_view_hash = view->hash(); - - IndexLauncher launcher(BEAM_TOPK_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[1]->region)); - launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[2]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[2]->region)); - launcher.add_field(3, FID_DATA); - - return runtime->execute_index_space(ctx, launcher); -} - -BeamInferenceResult - BeamTopK::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - - assert(regions.size() == 4); - assert(task->regions.size() == 4); - - BeamTopKMeta *m = *((BeamTopKMeta **)task->local_args); - BeamSearchBatchConfig const &bc = - Future(task->futures[0]).get_result(); - - if (bc.num_tokens == 0) { - BeamInferenceResult ir; - return ir; - } - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW index = helperGetGenericTensorAccessorWO( - DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW value = helperGetGenericTensorAccessorWO( - DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( - DT_INT32, regions[3], task->regions[3], FID_DATA, ctx, runtime); - - Domain input_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - - int *index_ptr = index.get_int32_ptr(); - float *value_ptr = value.get_float_ptr(); - int *parent_ptr = parent.get_int32_ptr(); - - // embedding size: eg. 
4096 - int length = input_domain.hi()[0] - input_domain.lo()[0] + 1; - // total token nums - size_t batch_size = bc.num_active_tokens(); - - // need meta for: how many sub requests in a main request - BeamTopK::forward_kernel_wrapper(m, - &bc, - input, - value_ptr, - index_ptr, - parent_ptr, - batch_size, - length, - m->sorted); - - BeamInferenceResult ir; - - download_tensor(index_ptr, ir.token_ids, batch_size * m->max_beam_width); - download_tensor(value_ptr, ir.probs, batch_size * m->max_beam_width); - download_tensor( - parent_ptr, ir.parent_id, batch_size * m->max_beam_width); - - if (m->inference_debugging) { - assert(task->index_point.get_dim() == 1); - int shard_id = task->index_point.point_data[0]; - BeamTopK::save_inference_tensors_to_file( - m, shard_id, &bc, {input}, {}, {index, value, parent}); - } - - return ir; -} - -void BeamTopK::backward(FFModel const &ff) { - assert(false); -} - -void BeamTopK::serialize(Legion::Serializer &sez) const { - sez.serialize(this->layer_guid.id); - sez.serialize(this->layer_guid.transformer_layer_id); - sez.serialize(this->layer_guid.model_id); - sez.serialize(this->sorted); - sez.serialize(this->max_beam_width); - sez.serialize(strlen(this->name)); - sez.serialize(this->name, strlen(this->name)); -} - -Node BeamTopK::deserialize(FFModel &ff, - Legion::Deserializer &dez, - ParallelTensor inputs[], - int num_inputs) { - assert(num_inputs == 1); - bool sorted; - size_t id, transformer_layer_id, deserialized_model_id; - int max_beam_width; - dez.deserialize(id); - dez.deserialize(transformer_layer_id); - dez.deserialize(deserialized_model_id); - LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); - dez.deserialize(sorted); - dez.deserialize(max_beam_width); - size_t name_len; - char name[MAX_OPNAME] = {0}; - dez.deserialize(name_len); - dez.deserialize(name, name_len); - - BeamTopKParams params; - params.layer_guid = layer_guid; - params.sorted = sorted; - params.max_beam_width = max_beam_width; - strcpy(params.name, name); - return ff.get_or_create_node(inputs[0], params); -} - -Op *BeamTopK::materialize(FFModel &ff, - ParallelTensor inputs[], - int num_inputs) const { - BeamTopKParams params = get_params(); - return new BeamTopK(ff, params, inputs[0], this->name); -} - -bool BeamTopK::measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const { - return false; -} - -}; // namespace FlexFlow - -namespace std { -size_t hash::operator()( - FlexFlow::BeamTopKParams const ¶ms) const { - size_t key = 0; - hash_combine(key, params.layer_guid.id); - hash_combine(key, params.sorted); - hash_combine(key, params.max_beam_width); - return key; -} -}; // namespace std diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp deleted file mode 100644 index 18534455a..000000000 --- a/src/ops/beam_topk.cpp +++ /dev/null @@ -1,724 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "flexflow/ops/beam_topk.h" -#include "flexflow/ffconst_utils.h" -#include "flexflow/utils/hip_helper.h" -#include - -namespace FlexFlow { -// declare Legion names -using Legion::coord_t; - -enum class HeapType { kMinHeap, kMaxHeap }; -enum class PreferIndices { kLower, kHigher }; - -LegionRuntime::Logger::Category log_beam_topk("BeamTopK"); - -template -struct Entry { - int index; - T value; -}; - -template -struct LinearData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index]; - } - - __device__ int get_index(int i) const { - return data[i].index; - } - __device__ T get_value(int i) const { - return data[i].value; - } - - Entry *const data; -}; - -template -struct IndirectLinearData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index]; - } - - __device__ int get_index(int i) const { - return backing_data[data[i].index].index; - } - __device__ T get_value(int i) const { - return data[i].value; - } - - Entry *const data; - Entry *const backing_data; -}; - -template -struct StridedData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index * blockDim.x + threadIdx.x]; - } - - __device__ int get_index(int i) const { - return (*this)[i].index; - } - __device__ T get_value(int i) const { - return (*this)[i].value; - } - - Entry *const data; -}; - -// A heap of Entry that can either work as a min-heap or as a max-heap. -template - class Data, - typename T> -struct IndexedHeap { - typedef typename Data::Entry Entry; - Data const data; - __device__ IndexedHeap(Data const &d) : data(d) {} - - __device__ bool is_above(int left, int right) { - T left_value = data.get_value(left); - T right_value = data.get_value(right); - if (left_value == right_value) { - if (preferIndices == PreferIndices::kLower) { - return data.get_index(left) < data.get_index(right); - } else { - return data.get_index(left) > data.get_index(right); - } - } - if (heapType == HeapType::kMinHeap) { - return left_value < right_value; - } else { - return left_value > right_value; - } - } - - __device__ void assign(int i, Entry const &entry) { - data[i] = entry; - } - - __device__ void push_up(int i) { - int child = i; - int parent; - for (; child > 0; child = parent) { - parent = (child - 1) / 2; - if (!is_above(child, parent)) { - // Heap property satisfied. - break; - } - swap(child, parent); - } - } - - __device__ void swap(int a, int b) { - auto tmp = data[b]; - data[b] = data[a]; - data[a] = tmp; - } - - __device__ void push_root_down(int k) { - push_down(0, k); - } - - // MAX-HEAPIFY in Cormen - __device__ void push_down(int node, int k) { - while (true) { - int const left = 2 * node + 1; - int const right = left + 1; - int smallest = node; - if (left < k && is_above(left, smallest)) { - smallest = left; - } - if (right < k && is_above(right, smallest)) { - smallest = right; - } - if (smallest == node) { - break; - } - swap(smallest, node); - node = smallest; - } - } - - // BUILD-MAX-HEAPIFY in Cormen - __device__ void build(int k) { - for (int node = (k - 1) / 2; node >= 0; node--) { - push_down(node, k); - } - } - - // HEAP-EXTRACT-MAX in Cormen - __device__ void remove_root(int k) { - data[0] = data[k - 1]; - push_root_down(k - 1); - } - - // in-place HEAPSORT in Cormen - // This method destroys the heap property. 
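// [Editor's note] The in-place HEAPSORT described above (and implemented by the
// __device__ sort() that follows) is the classic "min-heap of k entries, then heapsort"
// top-k pattern used throughout this deleted file. A minimal, self-contained host-side
// sketch of the same idea, for illustration only; HostEntry and host_top_k are
// hypothetical names and are not part of FlexFlow:
#include <algorithm>
#include <vector>
struct HostEntry {
  int index;
  float value;
};
// Keep the k largest values of input[0..length); return them sorted largest-first.
inline std::vector<HostEntry> host_top_k(float const *input, int length, int k) {
  // Min-heap on value: the root is the smallest of the current top-k candidates.
  auto by_value_desc = [](HostEntry const &a, HostEntry const &b) {
    return a.value > b.value;
  };
  std::vector<HostEntry> heap;
  heap.reserve(k);
  for (int i = 0; i < length; ++i) {
    if ((int)heap.size() < k) {
      heap.push_back({i, input[i]});
      std::push_heap(heap.begin(), heap.end(), by_value_desc);
    } else if (input[i] > heap.front().value) {
      // New element beats the current minimum: replace the root.
      std::pop_heap(heap.begin(), heap.end(), by_value_desc);
      heap.back() = {i, input[i]};
      std::push_heap(heap.begin(), heap.end(), by_value_desc);
    }
  }
  // In-place heapsort; with a greater-than comparator this yields descending values.
  std::sort_heap(heap.begin(), heap.end(), by_value_desc);
  return heap;
}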
- __device__ void sort(int k) { - for (int slot = k - 1; slot > 0; slot--) { - // This is like remove_root but we insert the element at the end. - swap(slot, 0); - // Heap is now an element smaller. - push_root_down(/*k=*/slot); - } - } - - __device__ void replace_root(Entry const &entry, int k) { - data[0] = entry; - push_root_down(k); - } - - __device__ Entry const &root() { - return data[0]; - } -}; - -template - class Data, - typename T> -__device__ IndexedHeap - make_indexed_heap(typename Data::Entry *data) { - return IndexedHeap{Data{data}}; -} - -// heapBeamTopK walks over [input, input+length) with `step_size` stride -// starting at `start_index`. It builds a top-`k` heap that is stored in -// `heap_entries` using `Accessor` to access elements in `heap_entries`. If -// sorted=true, the elements will be sorted at the end. -template class Data = LinearData> -__device__ void heapBeamTopK(T const *__restrict__ input, - int batch_index, - int length, - int k, - Entry *__restrict__ heap_entries, - bool sorted = false, - int start_index = 0, - int step_size = 1) { - assert(k <= length); - auto heap = - make_indexed_heap( - heap_entries); - - int heap_end_index = start_index + k * step_size; - if (heap_end_index > length) { - heap_end_index = length; - } - // Initialize the min-heap. - for (int index = start_index, slot = 0; index < heap_end_index; - index += step_size, slot++) { - heap.assign(slot, {index, input[index]}); - } - - heap.build(k); - - // Now iterate over the remaining items. - // If an item is smaller than the min element, it is not amongst the top k. - // Otherwise, replace the min element with it and push upwards. - for (int index = heap_end_index; index < length; index += step_size) { - // We prefer elements with lower indices. This is given here. - // Later elements automatically have higher indices, so can be discarded. - if (input[index] > heap.root().value) { - // This element should replace the min. - heap.replace_root({index, input[index]}, k); - } - } - - // Sort if wanted. - if (sorted) { - heap.sort(k); - } - - // if(batch_index == 0){ - // printf("top elemmments: %d, value %.15f\n", start_index, - // heap.root().value); - // } -} - -template -__device__ void mergeBeamShards(int num_shards, - int batch_index, - int k, - int max_heap_size, - int request_id, - int *parent_id, - T *probs, - Entry *__restrict__ entries, - Entry *__restrict__ top_k_heap, - float *top_k_values, - int *top_k_indices, - int *top_k_parents, - bool verbose) { - // If k < num_shards, we can use a min-heap with k elements to get the top k - // of the sorted blocks. - // If k > num_shards, we can initialize a min-heap with the top element from - // each sorted block. - int const heap_size = k < num_shards ? k : num_shards; - // printf("see value: %f", entries[0].value); - // Min-heap part. - - { - auto min_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; - // Initialize the heap as a min-heap. - for (int slot = 0; slot < heap_size; slot++) { - // int beam = (slot % max_heap_size) / k; - T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((slot % max_heap_size) / k)]; - min_heap.assign(slot, {slot, (entries[slot].value * prob)}); - } - min_heap.build(heap_size); - - // Now perform top k with the remaining shards (if num_shards > heap_size). 
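// [Editor's note] Throughout mergeBeamShards, each shard's candidate logit is rescaled
// by the accumulated probability of the sub-request (beam) it came from:
// probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + beam], where the beam index
// is recovered from the flat slot/shard id as (slot % max_heap_size) / k. For example,
// with k = 4 and max_heap_size = 8 (two sub-requests), slots 0-3 map to beam 0 and
// slots 4-7 map to beam 1.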
- for (int shard = heap_size; shard < num_shards; shard++) { - auto const entry = entries[shard]; - auto const root = min_heap.root(); - - T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((shard % max_heap_size) / k)]; - if (entry.value * prob < root.value) { - continue; - } - if (entry.value * prob == root.value && - entry.index > entries[root.index].index) { - continue; - } - // This element should replace the min. - min_heap.replace_root({shard, entry.value * prob}, heap_size); - } - } - - // Max-part. - { - // Turn the min-heap into a max-heap in-place. - auto max_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; - // Heapify into a max heap. - max_heap.build(heap_size); - - // Now extract the minimum k-1 times. - // k is treated specially. - int const last_k = k - 1; - for (int rank = 0; rank < last_k; rank++) { - Entry const &max_element = max_heap.root(); - top_k_values[rank] = __half2float(max_element.value); - int shard_index = max_element.index; - top_k_indices[rank] = entries[shard_index].index; - top_k_parents[rank] = - parent_id[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((shard_index % max_heap_size) / k)]; - int next_shard_index = shard_index + num_shards; - - T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((next_shard_index % max_heap_size) / k)]; - - max_heap.replace_root( - {next_shard_index, entries[next_shard_index].value * prob}, - heap_size); - } - - // rank == last_k. - Entry const &max_element = max_heap.root(); - top_k_values[last_k] = __half2float(max_element.value); - int shard_index = max_element.index; - top_k_indices[last_k] = entries[shard_index].index; - top_k_parents[last_k] = - parent_id[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((shard_index % max_heap_size) / k)]; - } -} - -template -__global__ void - mergeSubRequestsKernel(int64_t N, T const *X, T const *rstd, T *Y) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rstd[i]); - } -} - -template -__global__ void beam_topk_forward_kernel(T const *__restrict__ input, - size_t shared_memory_size, - int length, - int k, - int max_heap_size, - int *parent_ids, - T *acc_probs, - int *gpu_block_start_index, - int *gpu_request_id, - int *tokens_per_request, - bool sorted, - float *__restrict__ output, - int *__restrict__ indices, - int *__restrict__ parents, - bool verbose) { - __shared__ char shared_memory[48 << 10]; - int const batch_index = blockIdx.x; - // T const *batch_input = input + batch_index * length; - int const thread_index = threadIdx.x; - int const thread_count = blockDim.x; - int const request_id = gpu_request_id[batch_index]; - int const token_nums = tokens_per_request[batch_index]; - Entry *shared_entries = (Entry *)shared_memory; - - int sub_request_id = thread_index / k; - // if (verbose) { - // printf("beam kernel: batch_index: %d, thread_index %d, sub_request_id %d, - // " - // "request_id %d, token_nums %d\n", - // batch_index, - // thread_index, - // sub_request_id, - // request_id, - // token_nums); - // } - - T const *batch_input = input + gpu_block_start_index[batch_index] + - (sub_request_id * token_nums * length); - - // printf("thread index %d, thread_count %d, batch_index %d\n", thread_index, - // thread_count, batch_index); - heapBeamTopK(batch_input, - batch_index, - length, - k, - shared_entries, - true, - thread_index % k, - k); - __syncthreads(); - // 
printf("beam thread index %d, thread_count %d, thread index %d, batch_index - // " - // "%d, k %d, parent_id %d, acc_prob: %f, sub id: %d, request_id: %d, - // offset: %d, offset2 %d, sub_request_id %d\n", thread_index, - // thread_count, - // thread_index, - // batch_index, - // k, - // parent_ids[request_id * BatchConfig::MAX_NUM_BEAMS + - // sub_request_id], acc_probs[request_id * BatchConfig::MAX_NUM_BEAMS + - // sub_request_id], sub_request_id, request_id, - // gpu_block_start_index[batch_index], - // batch_index * length, - // sub_request_id); - - if (thread_index == 0) { - // merge beam_width heaps and store the parent - // find which req it belongs to, replace the offset - // printf("merge heaps, batch index: %d, sub_request_id %d, value %f\n", - // batch_index, - // sub_request_id, - // acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - // sub_request_id]); - int const offset = batch_index * k; - auto batch_output = output + offset; - auto batch_indices = indices + offset; - auto batch_parents = parents + offset; - Entry *top_k_heap = shared_entries + thread_count * k; - - // if(batch_index == 0 && verbose) { - // for(int i = 0; i < 18; i++){ - // printf("see value: %.15f\n", shared_entries[i].value); - // } - // } - - // get parent/acc based on the sub request and main request - mergeBeamShards(thread_count, - batch_index, - k, - max_heap_size, - request_id, - parent_ids, - acc_probs, - shared_entries, - top_k_heap, - batch_output, - batch_indices, - batch_parents, - verbose /*verbose prints*/); - } -} - -/*static*/ -template -void BeamTopK::forward_kernel(BeamTopKMeta const *m, - BeamSearchBatchConfig const *bc, - DT const *input_ptr, - float *output_ptr, - int *indices_ptr, - int *parent_ptr, - int batch_size, - int length, - bool sorted, - hipStream_t stream) { - // Adopted from TensorFlow's BeamTopK implementation - // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h - - int num_shards = 0; - int max_heap_size = 0; - int max_beam_width = 0; - int req_index = 0; - - // sub request - int const *sub_requests = bc->sub_requests; - - // std::vector beam_slots = bc->beam_slots; - // assert(bc->beam_slots.size() > 0); - - int beam_num_blocks = 0; - std::vector beam_block_start_index; - std::vector request_id; - std::vector tokens_per_request; - - int block_start_index = 0; - - // a data structure for prob, parent_id, - int max_total_requests = - BeamSearchBatchConfig::MAX_BEAM_WIDTH * bc->num_active_requests(); - int parent_ids[max_total_requests]; - DT acc_probs[max_total_requests]; - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - assert(bc->beamRequestsInfo[i].beam_size > 0); - - // int num_new_tokens = bc->num_processing_tokens[i]; - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - - // get beam size; - int beam_size = bc->beamRequestsInfo[i].beam_size; - - // initial request - log_beam_topk.debug() << "sub_requests: " << i << ", " << sub_requests[i] - << "\n"; - assert(sub_requests[i] > 0); - // process sub requests - for (int j = 0; j < sub_requests[i]; j++) { - parent_ids[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = j; - // beam_slots[i].parent_id[j]; - acc_probs[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = - bc->beamRequestsInfo[i].probs[j]; - log_beam_topk.debug() - << "probbbb req: " << i - << ", sub req probability : " << bc->beamRequestsInfo[i].probs[j] - << ", sub request id " << j << ", parent id " - << 
bc->beamRequestsInfo[i].parent_id[j] << ", data inddd" - << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j << "\n"; - } - - // process tokens - for (int k = 0; k < num_new_tokens; k++) { - beam_block_start_index.push_back(block_start_index); - request_id.push_back(i); - tokens_per_request.push_back(num_new_tokens); - block_start_index += length; - beam_num_blocks++; - } - - max_heap_size = std::max(max_heap_size, beam_size * sub_requests[i]); - max_beam_width = std::max(max_beam_width, beam_size); - req_index += 1; - block_start_index += (sub_requests[i] - 1) * num_new_tokens * length; - } - log_beam_topk.debug() << "what index: " << block_start_index - << ", block num: " << beam_num_blocks << "\n"; - - assert(batch_size >= beam_num_blocks); - assert(bc->num_active_requests() == req_index); - - { - constexpr auto shared_memory_size = 48 << 10; - auto const heap_size = max_heap_size * sizeof(Entry
); - // shared_memory_size = (num_shards + 1) * heap_size <=> - num_shards = shared_memory_size / heap_size - 1; - assert(num_shards > 0); - if (num_shards > CUDA_NUM_THREADS) { - num_shards = CUDA_NUM_THREADS; - } - log_beam_topk.debug() << "maxheap size: " << max_heap_size << "\n"; - log_beam_topk.debug() << "maxbeam width: " << max_beam_width - << ", heap size: " << heap_size << "\n"; - } - // We are limited by the amount of shared memory we have per block. - size_t shared_memory_size = - (num_shards + 1) * max_heap_size * sizeof(Entry
); - - assert(num_shards >= (size_t)max_heap_size); - num_shards = max_heap_size; - - checkCUDA(hipMemcpy(m->parent_ids, - parent_ids, - sizeof(int) * max_total_requests, - hipMemcpyHostToDevice)); - checkCUDA(hipMemcpy(m->acc_probs, - acc_probs, - sizeof(DT) * max_total_requests, - hipMemcpyHostToDevice)); - checkCUDA(hipMemcpy(m->block_start_index, - beam_block_start_index.data(), - sizeof(int) * beam_num_blocks, - hipMemcpyHostToDevice)); - checkCUDA(hipMemcpy(m->request_id, - request_id.data(), - sizeof(int) * beam_num_blocks, - hipMemcpyHostToDevice)); - checkCUDA(hipMemcpy(m->tokens_per_request, - tokens_per_request.data(), - sizeof(int) * beam_num_blocks, - hipMemcpyHostToDevice)); - // int depth = - // bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; - beam_topk_forward_kernel<<>>( - input_ptr, - shared_memory_size, - length, - max_beam_width, - max_heap_size, - m->parent_ids, - static_cast
(m->acc_probs), - m->block_start_index, - m->request_id, - m->tokens_per_request, - sorted, - output_ptr, - indices_ptr, - parent_ptr, - false /*verbose*/ // depth == 1 - ); - - // merge sub -} - -/*static*/ -void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, - BeamSearchBatchConfig const *bc, - GenericTensorAccessorR const &input, - float *output_ptr, - int *indices_ptr, - int *parent_ptr, - int batch_size, - int length, - bool sorted) { - hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); - - hipEvent_t t_start, t_end; - if (m->profiling) { - checkCUDA(hipEventCreate(&t_start)); - checkCUDA(hipEventCreate(&t_end)); - checkCUDA(hipEventRecord(t_start, stream)); - } - - if (input.data_type == DT_HALF) { - BeamTopK::forward_kernel(m, - bc, - input.get_half_ptr(), - output_ptr, - indices_ptr, - parent_ptr, - batch_size, - length, - sorted, - stream); - } else if (input.data_type == DT_FLOAT) { - BeamTopK::forward_kernel(m, - bc, - input.get_float_ptr(), - output_ptr, - indices_ptr, - parent_ptr, - batch_size, - length, - sorted, - stream); - } - - if (m->profiling) { - checkCUDA(hipEventRecord(t_end, stream)); - checkCUDA(hipEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - checkCUDA(hipEventDestroy(t_start)); - checkCUDA(hipEventDestroy(t_end)); - printf("[BeamTopK] forward time = %.2lfms\n", elapsed); - } -} - -BeamTopKMeta::BeamTopKMeta(FFHandler handler, - Op const *op, - MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler) { - DataType data_type = op->inputs[0]->data_type; - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - int max_requests_per_batch = BatchConfig::max_requests_per_batch(); - size_t parent_id_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch; - size_t acc_probs_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch; - size_t block_start_index_size = max_tokens_per_batch * max_requests_per_batch; - size_t request_id_size = max_tokens_per_batch * max_requests_per_batch; - size_t tokens_per_request_size = - max_tokens_per_batch * max_requests_per_batch; - size_t totalSize = sizeof(int) * parent_id_size + - data_type_size(data_type) * acc_probs_size + - sizeof(int) * block_start_index_size + - sizeof(int) * request_id_size + - sizeof(int) * tokens_per_request_size; - - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); - parent_ids = gpu_mem_allocator.allocate_instance(parent_id_size); - if (data_type == DT_FLOAT) { - acc_probs = gpu_mem_allocator.allocate_instance(acc_probs_size); - } else if (data_type == DT_HALF) { - acc_probs = gpu_mem_allocator.allocate_instance(acc_probs_size); - } else { - assert(false); - } - - block_start_index = - gpu_mem_allocator.allocate_instance(block_start_index_size); - request_id = gpu_mem_allocator.allocate_instance(request_id_size); - tokens_per_request = - gpu_mem_allocator.allocate_instance(tokens_per_request_size); -} - -BeamTopKMeta::~BeamTopKMeta(void) { - if (reserveInst != Realm::RegionInstance::NO_INST) { - reserveInst.destroy(); - } -} -}; // namespace FlexFlow diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu deleted file mode 100644 index a958786be..000000000 --- a/src/ops/beam_topk.cu +++ /dev/null @@ -1,766 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "flexflow/ffconst_utils.h" -#include "flexflow/ops/beam_topk.h" -#include "flexflow/request_manager.h" -#include "flexflow/utils/cuda_helper.h" - -namespace FlexFlow { -// declare Legion names -using Legion::coord_t; - -enum class HeapType { kMinHeap, kMaxHeap }; -enum class PreferIndices { kLower, kHigher }; - -LegionRuntime::Logger::Category log_beam_topk("BeamTopK"); - -template -struct Entry { - int index; - T value; -}; - -template -struct LinearData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index]; - } - - __device__ int get_index(int i) const { - return data[i].index; - } - __device__ T get_value(int i) const { - return data[i].value; - } - - Entry *const data; -}; - -template -struct IndirectLinearData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index]; - } - - __device__ int get_index(int i) const { - return backing_data[data[i].index].index; - } - __device__ T get_value(int i) const { - return data[i].value; - } - - Entry *const data; - Entry *const backing_data; -}; - -template -struct StridedData { - typedef Entry Entry; - - __device__ Entry &operator[](std::size_t index) const { - return data[index * blockDim.x + threadIdx.x]; - } - - __device__ int get_index(int i) const { - return (*this)[i].index; - } - __device__ T get_value(int i) const { - return (*this)[i].value; - } - - Entry *const data; -}; - -// A heap of Entry that can either work as a min-heap or as a max-heap. -template - class Data, - typename T> -struct IndexedHeap { - typedef typename Data::Entry Entry; - Data const data; - __device__ IndexedHeap(Data const &d) : data(d) {} - - __device__ bool is_above(int left, int right) { - T left_value = data.get_value(left); - T right_value = data.get_value(right); - if (left_value == right_value) { - if (preferIndices == PreferIndices::kLower) { - return data.get_index(left) < data.get_index(right); - } else { - return data.get_index(left) > data.get_index(right); - } - } - if (heapType == HeapType::kMinHeap) { - return left_value < right_value; - } else { - return left_value > right_value; - } - } - - __device__ void assign(int i, Entry const &entry) { - data[i] = entry; - } - - __device__ void push_up(int i) { - int child = i; - int parent; - for (; child > 0; child = parent) { - parent = (child - 1) / 2; - if (!is_above(child, parent)) { - // Heap property satisfied. 
- break; - } - swap(child, parent); - } - } - - __device__ void swap(int a, int b) { - auto tmp = data[b]; - data[b] = data[a]; - data[a] = tmp; - } - - __device__ void push_root_down(int k) { - push_down(0, k); - } - - // MAX-HEAPIFY in Cormen - __device__ void push_down(int node, int k) { - while (true) { - int const left = 2 * node + 1; - int const right = left + 1; - int smallest = node; - if (left < k && is_above(left, smallest)) { - smallest = left; - } - if (right < k && is_above(right, smallest)) { - smallest = right; - } - if (smallest == node) { - break; - } - swap(smallest, node); - node = smallest; - } - } - - // BUILD-MAX-HEAPIFY in Cormen - __device__ void build(int k) { - for (int node = (k - 1) / 2; node >= 0; node--) { - push_down(node, k); - } - } - - // HEAP-EXTRACT-MAX in Cormen - __device__ void remove_root(int k) { - data[0] = data[k - 1]; - push_root_down(k - 1); - } - - // in-place HEAPSORT in Cormen - // This method destroys the heap property. - __device__ void sort(int k) { - for (int slot = k - 1; slot > 0; slot--) { - // This is like remove_root but we insert the element at the end. - swap(slot, 0); - // Heap is now an element smaller. - push_root_down(/*k=*/slot); - } - } - - __device__ void replace_root(Entry const &entry, int k) { - data[0] = entry; - push_root_down(k); - } - - __device__ Entry const &root() { - return data[0]; - } -}; - -template - class Data, - typename T> -__device__ IndexedHeap - make_indexed_heap(typename Data::Entry *data) { - return IndexedHeap{Data{data}}; -} - -// heapBeamTopK walks over [input, input+length) with `step_size` stride -// starting at `start_index`. It builds a top-`k` heap that is stored in -// `heap_entries` using `Accessor` to access elements in `heap_entries`. If -// sorted=true, the elements will be sorted at the end. -template class Data = LinearData> -__device__ void heapBeamTopK(T const *__restrict__ input, - int batch_index, - int length, - int k, - Entry *__restrict__ heap_entries, - bool sorted = false, - int start_index = 0, - int step_size = 1) { - assert(k <= length); - auto heap = - make_indexed_heap( - heap_entries); - - int heap_end_index = start_index + k * step_size; - if (heap_end_index > length) { - heap_end_index = length; - } - // Initialize the min-heap. - for (int index = start_index, slot = 0; index < heap_end_index; - index += step_size, slot++) { - heap.assign(slot, {index, input[index]}); - } - - heap.build(k); - - // Now iterate over the remaining items. - // If an item is smaller than the min element, it is not amongst the top k. - // Otherwise, replace the min element with it and push upwards. - for (int index = heap_end_index; index < length; index += step_size) { - // We prefer elements with lower indices. This is given here. - // Later elements automatically have higher indices, so can be discarded. - if (input[index] > heap.root().value) { - // This element should replace the min. - heap.replace_root({index, input[index]}, k); - } - } - - // Sort if wanted. 
- if (sorted) { - heap.sort(k); - } - - // if(batch_index == 0){ - // printf("top elemmments: %d, value %.15f\n", start_index, - // heap.root().value); - // } -} - -template -__device__ void mergeBeamShards(int num_shards, - int batch_index, - int k, - int max_heap_size, - int request_id, - int *parent_id, - T *probs, - Entry *__restrict__ entries, - Entry *__restrict__ top_k_heap, - float *top_k_values, - int *top_k_indices, - int *top_k_parents, - bool verbose) { - // If k < num_shards, we can use a min-heap with k elements to get the top k - // of the sorted blocks. - // If k > num_shards, we can initialize a min-heap with the top element from - // each sorted block. - int const heap_size = k < num_shards ? k : num_shards; - // printf("see value: %f", entries[0].value); - // Min-heap part. - - { - auto min_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; - // Initialize the heap as a min-heap. - for (int slot = 0; slot < heap_size; slot++) { - // int beam = (slot % max_heap_size) / k; - T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((slot % max_heap_size) / k)]; - min_heap.assign(slot, {slot, (entries[slot].value * prob)}); - if (verbose && batch_index == 0) { - printf("slot %d, value %.15f, prob %15f\n", - slot, - static_cast(entries[slot].value), - static_cast(prob)); - } - } - min_heap.build(heap_size); - - // Now perform top k with the remaining shards (if num_shards > heap_size). - for (int shard = heap_size; shard < num_shards; shard++) { - auto const entry = entries[shard]; - auto const root = min_heap.root(); - - T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((shard % max_heap_size) / k)]; - if (verbose && batch_index == 0) { - printf("shard %d, index %d, value %.15f, prob %.15f\n", - shard, - entry.index, - static_cast(entry.value), - static_cast(prob)); - } - if (entry.value * prob < root.value) { - continue; - } - if (entry.value * prob == root.value && - entry.index > entries[root.index].index) { - continue; - } - // This element should replace the min. - min_heap.replace_root({shard, entry.value * prob}, heap_size); - } - } - - // Max-part. - { - // Turn the min-heap into a max-heap in-place. - auto max_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; - // Heapify into a max heap. - max_heap.build(heap_size); - - // Now extract the minimum k-1 times. - // k is treated specially. - int const last_k = k - 1; - for (int rank = 0; rank < last_k; rank++) { - Entry const &max_element = max_heap.root(); - top_k_values[rank] = __half2float(max_element.value); - int shard_index = max_element.index; - top_k_indices[rank] = entries[shard_index].index; - top_k_parents[rank] = - parent_id[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((shard_index % max_heap_size) / k)]; - int next_shard_index = shard_index + num_shards; - - T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((next_shard_index % max_heap_size) / k)]; - // if (batch_index == 0) { - // printf("next_shard_index %d, value %.15f, prob %.15f\n", - // next_shard_index, - // entries[next_shard_index].value, - // prob); - // } - max_heap.replace_root( - {next_shard_index, entries[next_shard_index].value * prob}, - heap_size); - } - - // rank == last_k. 
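// [Editor's note] Ranks 0..k-2 were emitted by the loop above, each time refilling the
// max-heap root with the next candidate from the same shard (stride num_shards in the
// shared-memory entry layout); the final rank below only reads the root, with no
// replacement, again converting the half-precision value with __half2float and
// recording the token index and the parent beam id.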
- Entry const &max_element = max_heap.root(); - top_k_values[last_k] = __half2float(max_element.value); - int shard_index = max_element.index; - top_k_indices[last_k] = entries[shard_index].index; - top_k_parents[last_k] = - parent_id[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - ((shard_index % max_heap_size) / k)]; - } -} - -template -__global__ void - mergeSubRequestsKernel(int64_t N, T const *X, T const *rstd, T *Y) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rstd[i]); - } -} - -template -__global__ void beam_topk_forward_kernel(T const *__restrict__ input, - size_t shared_memory_size, - int length, - int k, - int max_heap_size, - int *parent_ids, - T *acc_probs, - int *gpu_block_start_index, - int *gpu_request_id, - int *tokens_per_request, - bool sorted, - float *__restrict__ output, - int *__restrict__ indices, - int *__restrict__ parents, - bool verbose) { - __shared__ char shared_memory[48 << 10]; - int const batch_index = blockIdx.x; - // T const *batch_input = input + batch_index * length; - int const thread_index = threadIdx.x; - int const thread_count = blockDim.x; - int const request_id = gpu_request_id[batch_index]; - int const token_nums = tokens_per_request[batch_index]; - Entry *shared_entries = (Entry *)shared_memory; - - int sub_request_id = thread_index / k; - // if (verbose) { - // printf("beam kernel: batch_index: %d, thread_index %d, sub_request_id %d, - // " - // "request_id %d, token_nums %d\n", - // batch_index, - // thread_index, - // sub_request_id, - // request_id, - // token_nums); - // } - - T const *batch_input = input + gpu_block_start_index[batch_index] + - (sub_request_id * token_nums * length); - - if (verbose && batch_index == 0) { - printf("request 0 start index: thread index %d, offset %d, batch_input %p, " - "acc index %d acc " - "prob %f, thread_count %d, request_id %d\n", - thread_index, - gpu_block_start_index[batch_index] + - (sub_request_id * token_nums * length), - batch_input, - request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + sub_request_id, - static_cast( - acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - sub_request_id]), - thread_count, - request_id); - } - // printf("thread index %d, thread_count %d, batch_index %d\n", thread_index, - // thread_count, batch_index); - heapBeamTopK(batch_input, - batch_index, - length, - k, - shared_entries, - true, - thread_index % k, - k); - __syncthreads(); - // printf("beam thread index %d, thread_count %d, thread index %d, batch_index - // " - // "%d, k %d, parent_id %d, acc_prob: %f, sub id: %d, request_id: %d, - // offset: %d, offset2 %d, sub_request_id %d\n", thread_index, - // thread_count, - // thread_index, - // batch_index, - // k, - // parent_ids[request_id * BatchConfig::MAX_NUM_BEAMS + - // sub_request_id], acc_probs[request_id * BatchConfig::MAX_NUM_BEAMS + - // sub_request_id], sub_request_id, request_id, - // gpu_block_start_index[batch_index], - // batch_index * length, - // sub_request_id); - - if (thread_index == 0) { - // merge beam_width heaps and store the parent - // find which req it belongs to, replace the offset - // printf("merge heaps, batch index: %d, sub_request_id %d, value %f\n", - // batch_index, - // sub_request_id, - // acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + - // sub_request_id]); - int const offset = batch_index * k; - auto batch_output = output + offset; - auto 
batch_indices = indices + offset; - auto batch_parents = parents + offset; - Entry *top_k_heap = shared_entries + thread_count * k; - - // if(batch_index == 0 && verbose) { - // for(int i = 0; i < 18; i++){ - // printf("see value: %.15f\n", shared_entries[i].value); - // } - // } - - // get parent/acc based on the sub request and main request - mergeBeamShards(thread_count, - batch_index, - k, - max_heap_size, - request_id, - parent_ids, - acc_probs, - shared_entries, - top_k_heap, - batch_output, - batch_indices, - batch_parents, - verbose /*verbose prints*/); - } -} - -/*static*/ -template -void BeamTopK::forward_kernel(BeamTopKMeta const *m, - BeamSearchBatchConfig const *bc, - DT const *input_ptr, - float *output_ptr, - int *indices_ptr, - int *parent_ptr, - int batch_size, - int length, - bool sorted, - cudaStream_t stream) { - // Adopted from TensorFlow's BeamTopK implementation - // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h - - int num_shards = 0; - int max_heap_size = 0; - int max_beam_width = 0; - int req_index = 0; - - // sub request - int const *sub_requests = bc->sub_requests; - - // std::vector beam_slots = bc->beam_slots; - // assert(bc->beam_slots.size() > 0); - - int beam_num_blocks = 0; - std::vector beam_block_start_index; - std::vector request_id; - std::vector tokens_per_request; - - int block_start_index = 0; - - // a data structure for prob, parent_id, - int max_total_requests = - BeamSearchBatchConfig::MAX_BEAM_WIDTH * bc->num_active_requests(); - int parent_ids[max_total_requests]; - DT acc_probs[max_total_requests]; - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - assert(bc->beamRequestsInfo[i].beam_size > 0); - - // int num_new_tokens = bc->num_processing_tokens[i]; - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - - // get beam size; - int beam_size = bc->beamRequestsInfo[i].beam_size; - - // initial request - assert(sub_requests[i] > 0); - // process sub requests - for (int j = 0; j < sub_requests[i]; j++) { - parent_ids[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = j; - // beam_slots[i].parent_id[j]; - acc_probs[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = - bc->beamRequestsInfo[i].probs[j]; - // std::cout << "probbbb req: " << i << ", sub req probability : " - // << bc->beamRequestsInfo[i].probs[j] << ", sub request id " << - // j - // << ", parent id " << bc->beamRequestsInfo[i].parent_id[j] - // << ", data inddd" - // << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j - // << "\n"; - } - - // process tokens - for (int k = 0; k < num_new_tokens; k++) { - beam_block_start_index.push_back(block_start_index); - request_id.push_back(i); - tokens_per_request.push_back(num_new_tokens); - block_start_index += length; - beam_num_blocks++; - } - - max_heap_size = std::max(max_heap_size, beam_size * sub_requests[i]); - max_beam_width = std::max(max_beam_width, beam_size); - - req_index += 1; - block_start_index += (sub_requests[i] - 1) * num_new_tokens * length; - } - log_beam_topk.debug() << "what index: " << block_start_index - << ", block num: " << beam_num_blocks << "\n"; - - assert(batch_size >= beam_num_blocks); - assert(bc->num_active_requests() == req_index); - - { - constexpr auto shared_memory_size = 48 << 10; - auto const heap_size = max_heap_size * sizeof(Entry
); - // shared_memory_size = (num_shards + 1) * heap_size <=> - num_shards = shared_memory_size / heap_size - 1; - assert(num_shards > 0); - if (num_shards > CUDA_NUM_THREADS) { - num_shards = CUDA_NUM_THREADS; - } - log_beam_topk.debug() << "maxheap size: " << max_heap_size << "\n"; - log_beam_topk.debug() << "maxbeam width: " << max_beam_width - << ", heap size: " << heap_size << "\n"; - } - // We are limited by the amount of shared memory we have per block. - size_t shared_memory_size = - (num_shards + 1) * max_heap_size * sizeof(Entry
); - - assert(num_shards >= (size_t)max_heap_size); - num_shards = max_heap_size; - - checkCUDA(cudaMemcpyAsync(m->parent_ids, - parent_ids, - sizeof(int) * max_total_requests, - cudaMemcpyHostToDevice, - stream)); - checkCUDA(cudaMemcpyAsync(m->acc_probs, - acc_probs, - sizeof(DT) * max_total_requests, - cudaMemcpyHostToDevice, - stream)); - // trick, set acc_probs to 0; - checkCUDA(cudaMemsetAsync( - m->acc_probs, 1.0, max_total_requests * sizeof(DT), stream)); - checkCUDA(cudaMemcpyAsync(m->block_start_index, - beam_block_start_index.data(), - sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice, - stream)); - checkCUDA(cudaMemcpyAsync(m->request_id, - request_id.data(), - sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice, - stream)); - checkCUDA(cudaMemcpyAsync(m->tokens_per_request, - tokens_per_request.data(), - sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice, - stream)); - // int depth = - // bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; - beam_num_blocks = bc->num_active_tokens(); - beam_topk_forward_kernel<<>>( - input_ptr, - shared_memory_size, - length, - max_beam_width, - max_heap_size, - m->parent_ids, - static_cast
(m->acc_probs), - m->block_start_index, - m->request_id, - m->tokens_per_request, - sorted, - output_ptr, - indices_ptr, - parent_ptr, - false /*verbose*/ // depth == 1 - ); - - // merge sub -} - -/*static*/ -void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, - BeamSearchBatchConfig const *bc, - GenericTensorAccessorR const &input, - float *output_ptr, - int *indices_ptr, - int *parent_ptr, - int batch_size, - int length, - bool sorted) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - - cudaEvent_t t_start, t_end; - if (m->profiling) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start, stream); - } - - if (input.data_type == DT_HALF) { - BeamTopK::forward_kernel(m, - bc, - input.get_half_ptr(), - output_ptr, - indices_ptr, - parent_ptr, - batch_size, - length, - sorted, - stream); - } else if (input.data_type == DT_FLOAT) { - BeamTopK::forward_kernel(m, - bc, - input.get_float_ptr(), - output_ptr, - indices_ptr, - parent_ptr, - batch_size, - length, - sorted, - stream); - } - - if (m->profiling) { - cudaEventRecord(t_end, stream); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - printf("[BeamTopK] forward time = %.2lfms\n", elapsed); - } -} - -BeamTopKMeta::BeamTopKMeta(FFHandler handler, - Op const *op, - MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler) { - DataType data_type = op->inputs[0]->data_type; - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - int max_requests_per_batch = BatchConfig::max_requests_per_batch(); - size_t parent_id_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch; - size_t acc_probs_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch; - size_t block_start_index_size = max_tokens_per_batch * max_requests_per_batch; - size_t request_id_size = max_tokens_per_batch * max_requests_per_batch; - size_t tokens_per_request_size = - max_tokens_per_batch * max_requests_per_batch; - size_t totalSize = sizeof(int) * parent_id_size + - data_type_size(data_type) * acc_probs_size + - sizeof(int) * block_start_index_size + - sizeof(int) * request_id_size + - sizeof(int) * tokens_per_request_size; - - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); - parent_ids = gpu_mem_allocator.allocate_instance(parent_id_size); - if (data_type == DT_FLOAT) { - acc_probs = gpu_mem_allocator.allocate_instance(acc_probs_size); - } else if (data_type == DT_HALF) { - acc_probs = gpu_mem_allocator.allocate_instance(acc_probs_size); - } else { - assert(false); - } - - block_start_index = - gpu_mem_allocator.allocate_instance(block_start_index_size); - request_id = gpu_mem_allocator.allocate_instance(request_id_size); - tokens_per_request = - gpu_mem_allocator.allocate_instance(tokens_per_request_size); -} - -BeamTopKMeta::~BeamTopKMeta(void) { - if (reserveInst != Realm::RegionInstance::NO_INST) { - reserveInst.destroy(); - } -} -}; // namespace FlexFlow diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index e630563b6..3cc8ceea0 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -469,7 +469,7 @@ FutureMap Embedding::inference(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(EMBED_FWD_TASK_ID, + IndexLauncher launcher(EMBED_INF_TASK_ID, parallel_is, TaskArgument(NULL, 0), argmap, @@ -625,6 +625,8 @@ void 
Embedding::inference_task(Task const *task, effective_batch_size = output.domain.get_volume() / out_dim; assert(effective_batch_size * in_dim == input.domain.get_volume()); } + // use active batch size + effective_batch_size = bc->num_active_tokens(); forward_kernel_wrapper( m, input, output, kernel, in_dim, out_dim, effective_batch_size); if (m->inference_debugging) { diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 9ad5c4dc9..a22873847 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -355,6 +355,7 @@ void FusedOp::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + // launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); switch (domain.get_dim()) { @@ -445,6 +446,7 @@ void FusedOp::init_inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + // launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); switch (domain.get_dim()) { @@ -479,6 +481,7 @@ void FusedOp::forward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; int offset = 0; for (int i = 0; i < numInputs; i++) { assert(inputs[i]->part != LogicalPartition::NO_PART); @@ -538,6 +541,7 @@ FutureMap FusedOp::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); int offset = 0; for (int i = 0; i < numInputs; i++) { @@ -589,6 +593,7 @@ void FusedOp::backward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; int idx = 0; for (int i = 0; i < numInputs; i++) { launcher.add_region_requirement(RegionRequirement(inputs[i]->part, diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 3282bc57d..6111a8fd0 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -871,8 +871,8 @@ __host__ void assert(fused->op_num_outputs[op] == 1); TreeIncMultiHeadSelfAttentionMeta *m = (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - TreeVerifyBatchConfig const &tree_bc = - Future(task->futures[0]).get_result(); + BatchConfig const &verify_bc = + Future(task->futures[0]).get_result(); assert(fused->op_num_weights[op] == (1 + (int)(*m->qkv_bias || *m->final_bias))); GenericTensorAccessorR biases; @@ -882,7 +882,7 @@ __host__ void } TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - &tree_bc, + &verify_bc, task->index_point.point_data[0], my_input_accessor[0], my_weight_accessor[0], @@ -895,10 +895,10 @@ __host__ void assert(fused->op_num_outputs[op] == 1); SpecIncMultiHeadSelfAttentionMeta const *m = (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // BeamSearchBatchConfig const *beam_bc = - // (BeamSearchBatchConfig *)task->args; - BeamSearchBatchConfig const &beam_bc = - Future(task->futures[0]).get_result(); + // TreeSearchBatchConfig const *search_bc = + // (TreeSearchBatchConfig *)task->args; + BatchConfig const &search_bc = + Future(task->futures[0]).get_result(); assert(fused->op_num_weights[op] == (1 + (int)(*m->qkv_bias || *m->final_bias))); GenericTensorAccessorR biases; @@ -908,7 +908,7 @@ __host__ void } SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - &beam_bc, + &search_bc, task->index_point.point_data[0], my_input_accessor[0], my_weight_accessor[0], @@ -1047,7 +1047,7 @@ __host__ void assert(fused->op_num_outputs[op] == 1); AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; 
Kernels::AllReduce::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + ctx, runtime, m, bc, my_input_accessor[0], my_output_accessor[0]); break; } default: { diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 483028599..78983d579 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -13,7 +13,9 @@ * limitations under the License. */ +#include "cuda.h" #include "flexflow/accessor.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/batch_norm.h" @@ -45,17 +47,6 @@ #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -// declare Legion names -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::Future; -using Legion::LogicalPartition; -using Legion::LogicalRegion; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; OpMeta *FusedOp::init_task(Task const *task, std::vector const ®ions, @@ -142,6 +133,8 @@ __host__ void FusedOp::forward_task(Task const *task, for (int op = start + 1; op < fused->numOperators; op++) { if (metas->meta[op] != NULL) { assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.blasLt == + metas->meta[op]->handle.blasLt); assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); } } @@ -539,7 +532,6 @@ __host__ void // const FusedOp* fused = (FusedOp*) task->args; FusedOpMeta *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; - // BatchConfig const *bc = (BatchConfig *)task->args; BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); // Return if no active tokens if (bc->num_tokens == 0) { @@ -604,534 +596,596 @@ __host__ void for (int op = start + 1; op < fused->numOperators; op++) { if (metas->meta[op] != NULL) { assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.blasLt == + metas->meta[op]->handle.blasLt); assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); } } - int ioff = 0, woff = 0, ooff = 0; - for (int op = 0; op < fused->numOperators; op++) { - // Domain my_id[MAX_NUM_INPUTS]; - // Domain my_wd[MAX_NUM_WEIGHTS]; - // Domain my_od[MAX_NUM_OUTPUTS]; - GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - // my_id[i] = input_domain[my_off]; - assert(my_off < fused->numInputs); - my_input_accessor[i] = input_accessor[my_off]; - } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - // my_id[i] = output_domain[my_off]; - assert(my_off < fused->numOutputs); - my_input_accessor[i] = output_accessor[my_off]; - } else { - assert(false); - } - } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; - // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; - assert(fused->op_weight_idx[i + woff] < fused->numWeights); - my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; - } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - int my_off = fused->op_output_idx[i + ooff]; - assert(fused->op_output_source[i + ooff] == 
SOURCE_OUTPUT); - assert(my_off < fused->numOutputs); - // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; - // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; - my_output_accessor[i] = output_accessor[my_off]; - } - switch (fused->op_op_type[op]) { - case OP_CONCAT: { - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - ConcatMeta *m = (ConcatMeta *)metas->meta[op]; - int num_inputs = fused->op_num_inputs[op]; - Kernels::Concat::forward_kernel_wrapper(m, - my_output_accessor[0], - my_input_accessor, - num_inputs, - m->legion_axis); - break; - } - case OP_BATCHNORM: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 2); - assert(my_weight_accessor[1].domain.get_dim() == 2); - BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; - BatchNorm::forward_kernel(m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_accessor[1].get_float_ptr()); - break; + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + // create new cuda graph + cudaGraphExec_t instance; + + GraphParams graph_params = { + bc->num_active_requests(), bc->num_active_tokens(), bc->prompt_phase}; + // int shard_id = task->index_point.point_data[0]; + + // bool use_cuda_graph = (bc->get_mode() == TREE_SEARCH_MODE or bc->get_mode() + // == TREE_VERIFY_MODE); + bool use_cuda_graph = + (bc->get_mode() == TREE_SEARCH_MODE && bc->prompt_phase == 0); + // bool use_cuda_graph = (bc->get_mode() == TREE_VERIFY_MODE); + // bool use_cuda_graph = false; + bool captured = false; + + if (use_cuda_graph && metas->graph_collections.count(graph_params) != 0) { + captured = true; + instance = metas->graph_collections[graph_params]; + } + + if (!captured) { + cudaGraph_t graph; + { + if (use_cuda_graph) { + cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal); } - case OP_LINEAR: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - Domain kernel_domain = my_weight_accessor[0].domain; - int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; - int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; - int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; - assert(my_output_accessor[0].domain.get_volume() == - out_dim * batch_size); - assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - void const *bias_ptr = nullptr; - LinearMeta *m = (LinearMeta *)metas->meta[op]; - if (fused->op_num_weights[op] == 2) { - assert(my_weight_accessor[1].domain.get_volume() == out_dim); - if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].ptr; + int ioff = 0, woff = 0, ooff = 0; + for (int op = 0; op < fused->numOperators; op++) { + // Domain my_id[MAX_NUM_INPUTS]; + // Domain my_wd[MAX_NUM_WEIGHTS]; + // Domain my_od[MAX_NUM_OUTPUTS]; + GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + // my_id[i] = input_domain[my_off]; + assert(my_off < fused->numInputs); + my_input_accessor[i] = 
input_accessor[my_off]; + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + // my_id[i] = output_domain[my_off]; + assert(my_off < fused->numOutputs); + my_input_accessor[i] = output_accessor[my_off]; + } else { + assert(false); } - } else { - assert(fused->op_num_weights[op] == 1); } - assert(m->input_type[0] == my_input_accessor[0].data_type); - assert(m->input_type[0] == my_output_accessor[0].data_type); - batch_size = bc->num_active_tokens(); - Kernels::Linear::forward_kernel_wrapper(m, - my_input_accessor[0].ptr, - my_output_accessor[0].ptr, - my_weight_accessor[0].ptr, - bias_ptr, - in_dim, - out_dim, - batch_size); - break; - } - case OP_BATCHMATMUL: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - Domain out_domain = my_output_accessor[0].domain; - Domain a_domain = my_input_accessor[0].domain; - Domain b_domain = my_input_accessor[1].domain; - int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; - assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); - int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; - assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); - int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; - assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); - assert(a_domain.get_dim() == b_domain.get_dim()); - assert(a_domain.get_dim() == out_domain.get_dim()); - int batch = 1; - for (int i = 2; i < a_domain.get_dim(); i++) { - int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; - assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); - assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); - batch *= dim_size; + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; + // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; + assert(fused->op_weight_idx[i + woff] < fused->numWeights); + my_weight_accessor[i] = + weight_accessor[fused->op_weight_idx[i + woff]]; } - BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; - Kernels::BatchMatmul::forward_kernel_wrapper( - meta, - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].get_float_ptr(), - my_input_accessor[1].get_float_ptr(), - (float const *)nullptr, - m, - n, - k, - batch, - meta->a_seq_length_dim, - meta->b_seq_length_dim, - fused->iter_config.seq_length); - break; - } - case OP_EW_ADD: - case OP_EW_SUB: - case OP_EW_MUL: - case OP_EW_DIV: - case OP_EW_MAX: - case OP_EW_MIN: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; - Kernels::ElementBinary::forward_kernel_wrapper(m, + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + assert(my_off < fused->numOutputs); + // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; + // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; + my_output_accessor[i] = output_accessor[my_off]; + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + int num_inputs = fused->op_num_inputs[op]; + 
Kernels::Concat::forward_kernel_wrapper(m, + my_output_accessor[0], + my_input_accessor, + num_inputs, + m->legion_axis); + break; + } + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + BatchNorm::forward_kernel(m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_accessor[1].get_float_ptr()); + break; + } + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == + in_dim * batch_size); + void const *bias_ptr = nullptr; + LinearMeta *m = (LinearMeta *)metas->meta[op]; + if (fused->op_num_weights[op] == 2) { + assert(my_weight_accessor[1].domain.get_volume() == out_dim); + if (!m->add_bias_only_once || + task->index_point.point_data[0] == 0) { + bias_ptr = my_weight_accessor[1].ptr; + } + } else { + assert(fused->op_num_weights[op] == 1); + } + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->input_type[0] == my_output_accessor[0].data_type); + batch_size = bc->num_active_tokens(); + Kernels::Linear::forward_kernel_wrapper(m, + my_input_accessor[0].ptr, + my_output_accessor[0].ptr, + my_weight_accessor[0].ptr, + bias_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_accessor[0].domain; + Domain a_domain = my_input_accessor[0].domain; + Domain b_domain = my_input_accessor[1].domain; + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + Kernels::BatchMatmul::forward_kernel_wrapper( + meta, + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_accessor[1].get_float_ptr(), + (float const *)nullptr, + m, + n, + k, + batch, + meta->a_seq_length_dim, + meta->b_seq_length_dim, + fused->iter_config.seq_length); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); 
+ assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + Kernels::ElementBinary::forward_kernel_wrapper( + m, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + } + case OP_EMBEDDING: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; + if (m->aggr == AGGR_MODE_NONE) { + // assert(kernel_domain.get_dim() == 2); + assert(my_input_accessor[0].domain.get_dim() + 1 == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); + i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i + 1]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i + 1]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } else { + assert(my_input_accessor[0].domain.get_dim() == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); + i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } else { + assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM); + in_dim = my_input_accessor[0].domain.hi()[0] - + my_input_accessor[0].domain.lo()[0] + 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } + // use active batch size + effective_batch_size = bc->num_active_tokens(); + + assert(my_input_accessor[0].data_type == DT_INT32 || + my_input_accessor[0].data_type == DT_INT64); + Kernels::Embedding::forward_kernel_wrapper(m, + my_input_accessor[0], + my_output_accessor[0], + my_weight_accessor[0], + in_dim, + out_dim, + effective_batch_size); + break; + } + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + if (m->data_type == DT_HALF) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_FLOAT) { + ElementUnary::forward_kernel_wrapper( + m, + 
my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type in ElementUnary forward"); + } + break; + } + case OP_RMS_NORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::forward_kernel_wrapper(m, + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); + break; + } + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta const *m = + (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::forward_kernel_wrapper( + m, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1], + bc->num_active_tokens()); + break; + } + case OP_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + IncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); + break; + } + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + TreeIncMultiHeadSelfAttentionMeta *m = + (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + BatchConfig const *verify_bc = + BatchConfig::from_future(task->futures[0]); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + verify_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); + break; + } + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + SpecIncMultiHeadSelfAttentionMeta *m = + (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + BatchConfig const *search_bc = + BatchConfig::from_future(task->futures[0]); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + search_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + if (m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if 
(m->use_bias) { + beta = my_weight_accessor[1]; + } + } + LayerNorm::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + break; + } + case OP_RESIDUAL_LAYERNORM: { + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta const *m = + (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorR residual2; + if (m->use_two_residuals) { + residual2 = my_input_accessor[2]; + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + ResidualLayerNorm::inference_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + residual2, + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta const *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == + 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + if (m->use_bias) { + beta = my_weight_accessor[2]; + } + } + Domain attn_bias_domain = my_weight_accessor[0].domain; + Domain residual_domain = my_input_accessor[1].domain; + int attn_bias_dim = + attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; + int residual_volume = residual_domain.get_volume(); + AddBiasResidualLayerNorm::inference_kernel_wrapper( + m, + attn_bias_dim, + residual_volume, + my_input_accessor[0], + my_output_accessor[0], + my_output_accessor[1], + my_input_accessor[1], + my_weight_accessor[0], + gamma, + beta); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta const *m = + (SigmoidSiluMultiMeta *)metas->meta[op]; + // use active number of tokens + SigmoidSiluMulti::inference_kernel_wrapper(m, my_input_accessor[0], my_input_accessor[1], - my_output_accessor[0]); - break; - } - case OP_EMBEDDING: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 1); - EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; - if (m->aggr == AGGR_MODE_NONE) { - // assert(kernel_domain.get_dim() == 2); - assert(my_input_accessor[0].domain.get_dim() + 1 == - my_output_accessor[0].domain.get_dim()); - for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) { - assert(my_input_accessor[0].domain.hi()[i] == - my_output_accessor[0].domain.hi()[i + 1]); - assert(my_input_accessor[0].domain.lo()[i] == - my_output_accessor[0].domain.lo()[i + 1]); + my_output_accessor[0], + bc->num_active_tokens()); + break; } - assert(my_weight_accessor[0].domain.hi()[0] - - my_weight_accessor[0].domain.lo()[0] == - my_output_accessor[0].domain.hi()[0] - - my_output_accessor[0].domain.lo()[0]); - } else { - 
assert(my_input_accessor[0].domain.get_dim() == - my_output_accessor[0].domain.get_dim()); - for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) { - assert(my_input_accessor[0].domain.hi()[i] == - my_output_accessor[0].domain.hi()[i]); - assert(my_input_accessor[0].domain.lo()[i] == - my_output_accessor[0].domain.lo()[i]); + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (m->input_type == DT_HALF) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr()); + } else if (m->input_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + } + break; } - assert(my_weight_accessor[0].domain.hi()[0] - - my_weight_accessor[0].domain.lo()[0] == - my_output_accessor[0].domain.hi()[0] - - my_output_accessor[0].domain.lo()[0]); - } - int in_dim, out_dim, effective_batch_size; - if (m->aggr == AGGR_MODE_NONE) { - in_dim = 1; - out_dim = my_output_accessor[0].domain.hi()[0] - - my_output_accessor[0].domain.lo()[0] + 1; - effective_batch_size = - my_output_accessor[0].domain.get_volume() / out_dim; - assert(effective_batch_size * in_dim == - my_input_accessor[0].domain.get_volume()); - } else { - assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM); - in_dim = my_input_accessor[0].domain.hi()[0] - - my_input_accessor[0].domain.lo()[0] + 1; - out_dim = my_output_accessor[0].domain.hi()[0] - - my_output_accessor[0].domain.lo()[0] + 1; - effective_batch_size = - my_output_accessor[0].domain.get_volume() / out_dim; - assert(effective_batch_size * in_dim == - my_input_accessor[0].domain.get_volume()); - } - - assert(my_input_accessor[0].data_type == DT_INT32 || - my_input_accessor[0].data_type == DT_INT64); - Kernels::Embedding::forward_kernel_wrapper(m, - my_input_accessor[0], - my_output_accessor[0], - my_weight_accessor[0], - in_dim, - out_dim, - effective_batch_size); - break; - } - case OP_GELU: - case OP_RELU: - case OP_SIGMOID: - case OP_TANH: - case OP_ELU: - case OP_SCALAR_TRUE_DIV: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - if (m->data_type == DT_HALF) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr(), - my_input_accessor[0].domain.get_volume()); - } else if (m->data_type == DT_FLOAT) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - } else { - assert(false && "Unsupported data type in ElementUnary forward"); - } - break; - } - case OP_RMS_NORM: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 1); - RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; - Kernels::RMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); - break; - } - case OP_RESIDUAL_RMS_NORM: { - assert(fused->op_num_inputs[op] == 2); - 
assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 2); - ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::inference_kernel_wrapper(ctx, + runtime, + m, + bc, my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - my_output_accessor[0], - my_output_accessor[1]); - break; - } - case OP_INC_MULTIHEAD_SELF_ATTENTION: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - IncMultiHeadSelfAttentionMeta const *m = - (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - IncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); - break; - } - case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - TreeIncMultiHeadSelfAttentionMeta *m = - (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // TreeVerifyBatchConfig const *tree_bc = - // (TreeVerifyBatchConfig *)task->args; - TreeVerifyBatchConfig const &tree_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - &tree_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); - break; - } - case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - SpecIncMultiHeadSelfAttentionMeta const *m = - (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // BeamSearchBatchConfig const *beam_bc = - // (BeamSearchBatchConfig *)task->args; - BeamSearchBatchConfig const &beam_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - &beam_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); - break; - } - case OP_LAYERNORM: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; - if (m->elementwise_affine) { - assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[0]; - if (m->use_bias) { - beta = my_weight_accessor[1]; + my_output_accessor[0]); + break; } - } - LayerNorm::forward_kernel_wrapper( - m, my_input_accessor[0], my_output_accessor[0], gamma, beta); - break; - } - case OP_RESIDUAL_LAYERNORM: { - 
assert(fused->op_num_outputs[op] == 2); - ResidualLayerNormMeta const *m = - (ResidualLayerNormMeta *)metas->meta[op]; - if (m->use_two_residuals) { - assert(fused->op_num_inputs[op] == 3); - } else { - assert(fused->op_num_inputs[op] == 2); - } - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 0); - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 1); // weight - } else { - assert(fused->op_num_weights[op] == 2); // weight + bias + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); } } - GenericTensorAccessorR residual2; - if (m->use_two_residuals) { - residual2 = my_input_accessor[2]; - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[0]; - if (m->use_bias) { - beta = my_weight_accessor[1]; + if (metas->meta[op]->inference_debugging) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + input_accessors_to_save.push_back(input_accessor[my_off]); + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + input_accessors_to_save.push_back(output_accessor[my_off]); + } else { + assert(false); + } } - } - ResidualLayerNorm::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - residual2, - my_output_accessor[0], - my_output_accessor[1], - gamma, - beta); - break; - } - case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 2); - AddBiasResidualLayerNormMeta const *m = - (AddBiasResidualLayerNormMeta *)metas->meta[op]; - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 1); // attn bias - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 2); // attn bias + weight - } else { - assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + weight_accessors_to_save.push_back( + weight_accessor[fused->op_weight_idx[i + woff]]); } - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[1]; - if (m->use_bias) { - beta = my_weight_accessor[2]; + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(output_accessor[i + ooff]); } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save); } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); - AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - attn_bias_dim, - residual_volume, - my_input_accessor[0], - my_output_accessor[0], - my_output_accessor[1], - my_input_accessor[1], - my_weight_accessor[0], - gamma, - beta); - break; + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; } - case OP_SIGMOID_SILU_MULTI: { - assert(fused->op_num_inputs[op] 
== 2); - assert(fused->op_num_outputs[op] == 1); - SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; - SigmoidSiluMulti::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_output_accessor[0]); - break; - } - case OP_SOFTMAX: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - } - break; - } - case OP_ALLREDUCE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; - Kernels::AllReduce::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); - break; - } - default: { - fprintf(stderr, - "Fusion currently does not support type = %d\n", - fused->op_op_type[op]); - assert(false && "Fusion currently does not support type"); + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_ptr[i], output_domain[i].get_volume(), + // "[Fused:forward:output]"); + if (use_cuda_graph) { + cudaStreamEndCapture(stream, &graph); } } - if (metas->meta[op]->inference_debugging) { - std::vector input_accessors_to_save; - std::vector weight_accessors_to_save; - std::vector output_accessors_to_save; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - input_accessors_to_save.push_back(input_accessor[my_off]); - } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - input_accessors_to_save.push_back(output_accessor[my_off]); - } else { - assert(false); - } - } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - weight_accessors_to_save.push_back( - weight_accessor[fused->op_weight_idx[i + woff]]); - } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - output_accessors_to_save.push_back(output_accessor[i + ooff]); - } - assert(task->index_point.get_dim() == 1); - int shard_id = task->index_point.point_data[0]; - FusedOp::save_inference_tensors_to_file(metas->meta[op], - shard_id, - bc, - input_accessors_to_save, - weight_accessors_to_save, - output_accessors_to_save); + if (use_cuda_graph) { + cudaGraphInstantiate(&instance, graph, NULL, NULL, 0); + metas->graph_collections[graph_params] = instance; + cudaGraphDestroy(graph); } - ioff += fused->op_num_inputs[op]; - woff += fused->op_num_weights[op]; - ooff += fused->op_num_outputs[op]; } - // for (int i = 0; i < fused->numOutputs; i++) - // print_tensor(output_ptr[i], output_domain[i].get_volume(), - // "[Fused:forward:output]"); + + if (use_cuda_graph) { + assert(metas->graph_collections.find(graph_params) != + metas->graph_collections.end()); + cudaGraphLaunch(instance, stream); + } } /* @@ -1255,6 +1309,8 @@ __host__ void FusedOp::backward_task(Task const *task, for (int op = start + 1; op < fused->numOperators; op++) { if (metas->meta[op] != NULL) { assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + 
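Note: the rewritten FusedOp inference path in src/ops/fused.cu above records its per-operator kernel launches into a CUDA graph keyed by GraphParams (num_active_requests, num_active_tokens, prompt_phase) and replays the instantiated graph on later batches with the same shape, so launch overhead is paid only once per shape. The standalone sketch below shows the same capture-or-replay pattern in isolation; it is not FlexFlow code, and GraphKey, graph_cache, dummy_kernel, and run_step are illustrative names only (the cudaGraphInstantiate call mirrors the one used in the patch).

#include <cuda_runtime.h>
#include <map>
#include <tuple>

// Cache of instantiated graphs, keyed by the batch shape that was captured.
using GraphKey = std::tuple<int, int, int>; // (num_requests, num_tokens, prompt_phase)
static std::map<GraphKey, cudaGraphExec_t> graph_cache;

__global__ void dummy_kernel(float *x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    x[i] *= 2.0f;
  }
}

void run_step(cudaStream_t stream, float *x, int n, GraphKey key) {
  auto it = graph_cache.find(key);
  if (it == graph_cache.end()) {
    // First time this shape is seen: record the launches instead of running them.
    cudaGraph_t graph;
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
    dummy_kernel<<<(n + 255) / 256, 256, 0, stream>>>(x, n);
    cudaStreamEndCapture(stream, &graph);
    cudaGraphExec_t instance;
    cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
    cudaGraphDestroy(graph);
    it = graph_cache.emplace(key, instance).first;
  }
  // Replay the captured work on the stream.
  cudaGraphLaunch(it->second, stream);
}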
assert(metas->meta[start]->handle.blasLt == + metas->meta[op]->handle.blasLt); assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); } } diff --git a/src/ops/gumbel_topk.cc b/src/ops/gumbel_topk.cc new file mode 100644 index 000000000..fb7f8a978 --- /dev/null +++ b/src/ops/gumbel_topk.cc @@ -0,0 +1,536 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/gumbel_topk.h" +#include "flexflow/model.h" +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif + +namespace FlexFlow { +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; +using PCG::Node; + +// For an input tensor, computes the top k entries in each row +// (resp. vector along the last dimension) using Gumbel trick +// (https://arxiv.org/abs/1903.06059). Thus, values.shape = indices.shape = +// input.shape[:-1] + [k] +Tensor FFModel::gumbel_top_k(Tensor const input, + int k, + bool sorted, + bool speculative_decoding, + char const *name) { + Layer *li = new Layer(this, + OP_GUMBEL_TOPK, + input->data_type, + name, + 1, + 0, + speculative_decoding ? 
3 : 1 /*outputs*/, + input); + { + int numdims = input->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[i]; + } + dims[0] = k; + // token_ids + li->outputs[0] = create_tensor_legion_ordering( + numdims, dims, DT_INT32, li, 0, false /*create_grad*/); + if (speculative_decoding) { + // log_probs + li->outputs[1] = create_tensor_legion_ordering( + numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/); + // perturbed_log_probs + li->outputs[2] = create_tensor_legion_ordering( + numdims, dims, DT_FLOAT, li, 2, false /*create_grad*/); + } + } + li->add_int_property("k", k); + li->add_int_property("sorted", sorted); + li->add_int_property("speculative_decoding", speculative_decoding); + layers.push_back(li); + return li->outputs[0]; +} + +Op *GumbelTopK::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + long long value; + layer->get_int_property("k", value); + int k = value; + layer->get_int_property("sorted", value); + bool sorted = (bool)value; + layer->get_int_property("speculative_decoding", value); + bool speculative_decoding = (bool)value; + + return new GumbelTopK(model, + layer->layer_guid, + inputs[0], + k, + sorted, + speculative_decoding, + layer->name); +} + +GumbelTopKParams GumbelTopK::get_params() const { + GumbelTopKParams params; + params.k = this->k; + params.sorted = this->sorted; + params.speculative_decoding = this->speculative_decoding; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } + return params; +} + +bool GumbelTopKParams::is_valid(ParallelTensorShape const &) const { + // gumbel_topk is always valid + return true; +} + +bool operator==(GumbelTopKParams const &lhs, GumbelTopKParams const &rhs) { + return lhs.k == rhs.k && lhs.sorted == rhs.sorted && + lhs.speculative_decoding == rhs.speculative_decoding; +} + +GumbelTopK::GumbelTopK(FFModel &model, + LayerID const &_layer_guid, + ParallelTensor const _input, + int _k, + bool _sorted, + bool _speculative_decoding, + char const *name) + : Op(model, + OP_GUMBEL_TOPK, + _input->data_type, + name, + 1 /*inputs*/, + 0 /*weights*/, + _speculative_decoding ? 
3 : 1 /*outputs*/, + _input), + k(_k), sorted(_sorted), speculative_decoding(_speculative_decoding) { + // overwrite layer_guid + layer_guid = _layer_guid; + int numdim = inputs[0]->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = inputs[0]->dims[i]; + } + + dims[0].size = k; + assert(inputs[0]->dims[0].degree == 1); + assert(inputs[0]->dims[0].parallel_idx == -1); + + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_INT32, this, 0 /*owner_idx*/); + if (_speculative_decoding) { + outputs[1] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_FLOAT, this, 1 /*owner_idx*/); + outputs[2] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_FLOAT, this, 2 /*owner_idx*/); + } +} + +GumbelTopK::GumbelTopK(FFModel &model, + LayerID const &layer_guid, + GumbelTopK const &other, + ParallelTensor const input) + : GumbelTopK(model, + layer_guid, + input, + other.k, + other.sorted, + other.speculative_decoding, + other.name) {} + +GumbelTopK::GumbelTopK(FFModel &model, + GumbelTopKParams const ¶ms, + ParallelTensor const input, + char const *name) + : GumbelTopK(model, + params.layer_guid, + input, + params.k, + params.sorted, + params.speculative_decoding, + params.name) {} + +void GumbelTopK::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(GUMBEL_TOPK_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(GumbelTopK)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + // launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + // 0 /*projection id*/, + // WRITE_ONLY, + // EXCLUSIVE, + // batch_outputs[1]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +void GumbelTopK::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(GUMBEL_TOPK_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(GumbelTopK)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + 
outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +OpMeta *GumbelTopK::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + GumbelTopK *gumbel_topk = (GumbelTopK *)task->args; + FFHandler handle = *((FFHandler *)task->local_args); + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + GumbelTopKMeta *m = + new GumbelTopKMeta(handle, gumbel_topk, gpu_mem_allocator); + m->profiling = gumbel_topk->profiling; + m->inference_debugging = gumbel_topk->inference_debugging; + m->sorted = gumbel_topk->sorted; + m->k = gumbel_topk->k; + std::strcpy(m->op_name, gumbel_topk->name); + m->layer_guid = gumbel_topk->layer_guid; + m->speculative_decoding = gumbel_topk->speculative_decoding; + return m; +} + +void GumbelTopK::forward(FFModel const &ff) { + // GumbelTopK does not support forward + assert(false); +} + +FutureMap + GumbelTopK::inference(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "GumbelTopK op machine_view: " << *(MachineView const *)mv + << std::endl; */ + if (speculative_decoding) { + IndexLauncher launcher(GUMBEL_TOPK_INF_SPECULATIVE_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(2, FID_DATA); + + launcher.add_region_requirement( + RegionRequirement(batch_outputs[2]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[2]->region)); + launcher.add_field(3, FID_DATA); + + return runtime->execute_index_space(ctx, launcher); + } else { + IndexLauncher launcher(GUMBEL_TOPK_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + + return runtime->execute_index_space(ctx, launcher); + } +} + +InferenceResult + GumbelTopK::inference_task(Task const *task, + 
std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + // const GumbelTopK* topk = (const GumbelTopK*) task->args; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + // Directly return for empty batch config + InferenceResult ir; + return ir; + } + GumbelTopKMeta *m = *((GumbelTopKMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( + DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW log_probs; + GenericTensorAccessorW perturbed_log_probs; + + int batch_size = bc->num_active_tokens(); + GumbelTopK::forward_kernel_wrapper( + m, input, log_probs, perturbed_log_probs, indices, batch_size, nullptr); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + GumbelTopK::save_inference_tensors_to_file( + m, shard_id, bc, {input}, {}, {indices}); + } + + InferenceResult ir; + ir.num_token_ids = batch_size * m->k; + ir.num_gumbel_logits = batch_size * m->k; + download_tensor( + indices.get_int32_ptr(), ir.token_ids, batch_size); + return ir; +} + +InferenceResult GumbelTopK::inference_speculative_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 4); + assert(task->regions.size() == 4); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + // Directly return for empty batch config + InferenceResult ir; + return ir; + } + GumbelTopKMeta *m = *((GumbelTopKMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( + DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW log_probs = helperGetGenericTensorAccessorWO( + DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorW perturbed_log_probs = helperGetGenericTensorAccessorWO( + DT_FLOAT, regions[3], task->regions[3], FID_DATA, ctx, runtime); + + int batch_size = bc->num_active_tokens(); + GumbelTopK::forward_kernel_wrapper( + m, input, log_probs, perturbed_log_probs, indices, batch_size, bc); + + InferenceResult ir; + ir.num_token_ids = batch_size * m->k; + ir.num_gumbel_logits = batch_size * m->k; + download_tensor( + indices.get_int32_ptr(), ir.token_ids, batch_size * m->k); + download_tensor( + log_probs.get_float_ptr(), ir.probs, batch_size * m->k); + download_tensor( + perturbed_log_probs.get_float_ptr(), ir.gumbel_logits, batch_size * m->k); + return ir; +} + +void GumbelTopK::backward(FFModel const &ff) { + // GumbelTopK does not support backward + assert(false); +} + +void GumbelTopK::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); + sez.serialize(this->k); + sez.serialize(this->sorted); + sez.serialize(this->speculative_decoding); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); +} + +Node GumbelTopK::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int 
num_inputs) { + assert(num_inputs == 1); + size_t id, transformer_layer_id, deserialized_model_id; + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); + int k; + bool sorted; + bool speculative_decoding; + dez.deserialize(k); + dez.deserialize(sorted); + dez.deserialize(speculative_decoding); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); + GumbelTopKParams params; + params.layer_guid = layer_guid; + params.k = k; + params.sorted = sorted; + params.speculative_decoding = speculative_decoding; + strcpy(params.name, name); + return ff.get_or_create_node<GumbelTopK>(inputs[0], params); +} + +Op *GumbelTopK::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + GumbelTopKParams params = get_params(); + return new GumbelTopK(ff, params, inputs[0], this->name); +} + +bool GumbelTopK::measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const { + return false; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash<FlexFlow::GumbelTopKParams>::operator()( + FlexFlow::GumbelTopKParams const &params) const { + size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.k); + hash_combine(key, params.sorted); + hash_combine(key, params.speculative_decoding); + return key; +} +}; // namespace std diff --git a/src/ops/gumbel_topk.cu b/src/ops/gumbel_topk.cu new file mode 100644 index 000000000..1af6c5eab --- /dev/null +++ b/src/ops/gumbel_topk.cu @@ -0,0 +1,618 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
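The new src/ops/gumbel_topk.cu added below implements this operator's sampling step on the GPU. As a standalone illustration only (this sketch is not part of the diff, and the helper name gumbel_top_k is invented), the Gumbel-top-k trick the kernels rely on can be written on the host in a few lines: keeping the k largest values of log(p_i) - log(-log(U_i)), with U_i drawn uniformly from (0, 1), yields k distinct indices distributed as k draws without replacement from the categorical distribution p.

#include <algorithm>
#include <cassert>
#include <cmath>
#include <numeric>
#include <random>
#include <vector>

// Host-side sketch of the Gumbel-top-k trick (illustration, not FlexFlow code).
std::vector<int> gumbel_top_k(std::vector<float> const &probs, int k,
                              std::mt19937 &gen) {
  assert(k <= (int)probs.size());
  std::uniform_real_distribution<float> unif(0.0f, 1.0f);
  std::vector<float> perturbed(probs.size());
  for (size_t i = 0; i < probs.size(); i++) {
    float u = 1.0f - unif(gen); // in (0, 1], mirroring curand_uniform
    perturbed[i] = std::log(probs[i]) - std::log(-std::log(u));
  }
  std::vector<int> idx(probs.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                    [&](int a, int b) { return perturbed[a] > perturbed[b]; });
  idx.resize(k); // indices of the k largest perturbed log-probs
  return idx;
}

Taking k = 1 reduces this to ordinary categorical sampling (the classic Gumbel-max trick).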
+ */ + +#include "flexflow/ops/gumbel_topk.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { +// declare Legion names +using Legion::coord_t; + +enum class HeapType { kMinHeap, kMaxHeap }; +enum class PreferIndices { kLower, kHigher }; + +template +struct GumbelEntry { + int index; + T value; + T perturbed_value; +}; + +template +struct LinearData { + typedef GumbelEntry GumbelEntry; + + __device__ GumbelEntry &operator[](std::size_t i) const { + return data[i]; + } + + __device__ int get_index(int i) const { + return data[i].index; + } + __device__ T get_value(int i) const { + return data[i].value; + } + __device__ T get_perturbed_value(int i) const { + return data[i].perturbed_value; + } + + GumbelEntry *const data; +}; + +template +struct IndirectLinearData { + typedef GumbelEntry GumbelEntry; + + __device__ GumbelEntry &operator[](std::size_t i) const { + return data[i]; + } + + __device__ int get_index(int i) const { + return backing_data[data[i].index].index; + } + __device__ T get_value(int i) const { + return data[i].value; + } + __device__ T get_perturbed_value(int i) const { + return data[i].perturbed_value; + } + + GumbelEntry *const data; + GumbelEntry *const backing_data; +}; + +template +struct StridedData { + typedef GumbelEntry GumbelEntry; + + __device__ GumbelEntry &operator[](std::size_t i) const { + return data[i * blockDim.x + threadIdx.x]; + } + + __device__ int get_index(int i) const { + return (*this)[i].index; + } + __device__ T get_value(int i) const { + return (*this)[i].value; + } + __device__ T get_perturbed_value(int i) const { + return (*this)[i].perturbed_value; + } + + GumbelEntry *const data; +}; + +// A heap of GumbelEntry that can either work as a min-heap or as a max-heap. +template + class Data, + typename T> +struct IndexedHeap { + typedef typename Data::GumbelEntry GumbelEntry; + Data const data; + __device__ IndexedHeap(Data const &d) : data(d) {} + + __device__ bool is_above(int left, int right) { + T left_perturbed_value = data.get_perturbed_value(left); + T right_perturbed_value = data.get_perturbed_value(right); + if (left_perturbed_value == right_perturbed_value) { + if (preferIndices == PreferIndices::kLower) { + return data.get_index(left) < data.get_index(right); + } else { + return data.get_index(left) > data.get_index(right); + } + } + if (heapType == HeapType::kMinHeap) { + return left_perturbed_value < right_perturbed_value; + } else { + return left_perturbed_value > right_perturbed_value; + } + } + + __device__ void assign(int i, GumbelEntry const &entry) { + data[i] = entry; + } + + __device__ void push_up(int i) { + int child = i; + int parent; + for (; child > 0; child = parent) { + parent = (child - 1) / 2; + if (!is_above(child, parent)) { + // Heap property satisfied. 
+ break; + } + swap(child, parent); + } + } + + __device__ void swap(int a, int b) { + auto tmp = data[b]; + data[b] = data[a]; + data[a] = tmp; + } + + __device__ void push_root_down(int k) { + push_down(0, k); + } + + // MAX-HEAPIFY in Cormen + __device__ void push_down(int node, int k) { + while (true) { + int const left = 2 * node + 1; + int const right = left + 1; + int smallest = node; + if (left < k && is_above(left, smallest)) { + smallest = left; + } + if (right < k && is_above(right, smallest)) { + smallest = right; + } + if (smallest == node) { + break; + } + swap(smallest, node); + node = smallest; + } + } + + // BUILD-MAX-HEAPIFY in Cormen + __device__ void build(int k) { + for (int node = (k - 1) / 2; node >= 0; node--) { + push_down(node, k); + } + } + + // HEAP-EXTRACT-MAX in Cormen + __device__ void remove_root(int k) { + data[0] = data[k - 1]; + push_root_down(k - 1); + } + + // in-place HEAPSORT in Cormen + // This method destroys the heap property. + __device__ void sort(int k) { + for (int slot = k - 1; slot > 0; slot--) { + // This is like remove_root but we insert the element at the end. + swap(slot, 0); + // Heap is now an element smaller. + push_root_down(/*k=*/slot); + } + } + + __device__ void replace_root(GumbelEntry const &entry, int k) { + data[0] = entry; + push_root_down(k); + } + + __device__ GumbelEntry const &root() { + return data[0]; + } +}; + +template + class Data, + typename T> +__device__ IndexedHeap + make_indexed_heap(typename Data::GumbelEntry *data) { + return IndexedHeap{Data{data}}; +} + +__global__ void + init_random_state_kernel(curandState *state, int batch_size, long rand) { + CUDA_KERNEL_LOOP(i, batch_size) { + curand_init(rand, i, 0, &state[i]); + } +} + +// Unified log function for float +__device__ inline float unified_log(float x) { + return logf(x); +} + +// Unified log function for half +__device__ inline __half unified_log(__half x) { + return hlog(x); +} + +// heapGumbelTopK walks over [input, input+length) with `step_size` stride +// starting at `start_index`. It builds a top-`k` heap that is stored in +// `heap_entries` using `Accessor` to access elements in `heap_entries`. If +// sorted=true, the elements will be sorted at the end. NOTE that it applies +// Gumbel trick on `input`, which is, input -> log(input) - log(-log(U)), where +// U is a uniform random number in (0, 1). +template class Data = LinearData> +__device__ void heapGumbelTopK(curandState state, + T const *__restrict__ input, + int length, + int k, + GumbelEntry *__restrict__ heap_entries, + bool sorted = false, + int start_index = 0, + int step_size = 1) { + assert(k <= length); + + auto heap = + make_indexed_heap( + heap_entries); + + int heap_end_index = start_index + k * step_size; + if (heap_end_index > length) { + heap_end_index = length; + } + // Initialize the min-heap. + for (int index = start_index, slot = 0; index < heap_end_index; + index += step_size, slot++) { + T value = unified_log(input[index]); + T perturbed_value = + value - unified_log(-unified_log((T)curand_uniform(&state))); + heap.assign(slot, {index, value, perturbed_value}); + } + + heap.build(k); + + // Now iterate over the remaining items. + // If an item is smaller than the min element, it is not amongst the top k. + // Otherwise, replace the min element with it and push upwards. + for (int index = heap_end_index; index < length; index += step_size) { + // We prefer elements with lower indices. This is given here. 
+ // Later elements automatically have higher indices, so can be discarded. + T value = unified_log(input[index]); + T perturbed_value = + value - unified_log(-unified_log((T)curand_uniform(&state))); + if (perturbed_value > heap.root().perturbed_value) { + // This element should replace the min. + heap.replace_root({index, value, perturbed_value}, k); + } + } + + // Sort if wanted. + if (sorted) { + heap.sort(k); + } +} + +// mergeShards performs a top-k merge on `num_shards` many sorted streams that +// are sorted and stored in `entries` in a strided way: +// |s_1 1st|s_2 1st|...s_{num_shards} 1st|s_1 2nd|s_2 2nd|... +// The overall top k elements are written to `top_k_values` and +// `top_k_perturbed_values`, and their indices to `top_k_indices`. `top_k_heap` +// is used as temporary storage for the merge heap. +template +__device__ void mergeShards(int num_shards, + int k, + GumbelEntry *__restrict__ entries, + GumbelEntry *__restrict__ top_k_heap, + float *top_k_values, + float *top_k_perturbed_values, + int *top_k_indices, + bool speculative_decoding) { + // If k < num_shards, we can use a min-heap with k elements to get the top k + // of the sorted blocks. + // If k > num_shards, we can initialize a min-heap with the top element from + // each sorted block. + int const heap_size = k < num_shards ? k : num_shards; + + // Min-heap part. + { + auto min_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; + // Initialize the heap as a min-heap. + for (int slot = 0; slot < heap_size; slot++) { + min_heap.assign( + slot, {slot, entries[slot].value, entries[slot].perturbed_value}); + } + min_heap.build(heap_size); + + // Now perform top k with the remaining shards (if num_shards > heap_size). + for (int shard = heap_size; shard < num_shards; shard++) { + auto const entry = entries[shard]; + auto const root = min_heap.root(); + if (entry.perturbed_value < root.perturbed_value) { + continue; + } + if (entry.perturbed_value == root.perturbed_value && + entry.index > entries[root.index].index) { + continue; + } + // This element should replace the min. + min_heap.replace_root({shard, entry.value, entry.perturbed_value}, + heap_size); + } + } + + // Max-part. + { + // Turn the min-heap into a max-heap in-place. + auto max_heap = IndexedHeap{IndirectLinearData{top_k_heap, entries}}; + // Heapify into a max heap. + max_heap.build(heap_size); + + // Now extract the minimum k-1 times. + // k is treated specially. + int const last_k = k - 1; + for (int rank = 0; rank < last_k; rank++) { + GumbelEntry const &max_element = max_heap.root(); + int shard_index = max_element.index; + top_k_indices[rank] = entries[shard_index].index; + if (speculative_decoding) { + assert(top_k_values != nullptr); + top_k_values[rank] = static_cast(max_element.value); + top_k_perturbed_values[rank] = + static_cast(max_element.perturbed_value); + } + int next_shard_index = shard_index + num_shards; + // For rank < k-1, each top k heap still contains at least 1 element, + // so we can draw a replacement. + max_heap.replace_root({next_shard_index, + entries[next_shard_index].value, + entries[next_shard_index].perturbed_value}, + heap_size); + } + + // rank == last_k. 
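heapGumbelTopK and mergeShards implement the two-stage top-k pattern adopted from TensorFlow's topk_op_gpu.h (credited in forward_kernel further down): each thread keeps the k best perturbed log-probs of its strided slice of the row in a per-thread heap held in shared memory, and thread 0 then merges the num_shards sorted shards into the final top k. A rough CPU analogue of that shard-and-merge structure, purely for illustration (the name sharded_top_k is invented):

#include <algorithm>
#include <functional>
#include <vector>

// CPU analogue of the shard-and-merge top-k used by the kernels here: each
// "worker" keeps the k best values of its strided slice, then the
// num_shards * k candidates are reduced to the global top k.
std::vector<float> sharded_top_k(std::vector<float> const &row, int k,
                                 int num_shards) {
  std::vector<float> candidates;
  for (int s = 0; s < num_shards; s++) {
    std::vector<float> shard;
    for (size_t i = s; i < row.size(); i += (size_t)num_shards) {
      shard.push_back(row[i]); // strided slice owned by worker s
    }
    int kk = std::min(k, (int)shard.size());
    std::partial_sort(shard.begin(), shard.begin() + kk, shard.end(),
                      std::greater<float>());
    candidates.insert(candidates.end(), shard.begin(), shard.begin() + kk);
  }
  int kk = std::min(k, (int)candidates.size());
  std::partial_sort(candidates.begin(), candidates.begin() + kk,
                    candidates.end(), std::greater<float>());
  candidates.resize(kk); // global top k of the whole row
  return candidates;
}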
+ GumbelEntry const &max_element = max_heap.root(); + int shard_index = max_element.index; + top_k_indices[last_k] = entries[shard_index].index; + if (speculative_decoding) { + assert(top_k_values != nullptr); + top_k_values[last_k] = static_cast(max_element.value); + top_k_perturbed_values[last_k] = + static_cast(max_element.perturbed_value); + } + } +} + +template +__global__ void + gumbel_topk_forward_kernel(curandState *state, + T const *__restrict__ input, + size_t shared_memory_size, + int length, + int k, + bool sorted, + float *__restrict__ log_probs_ptr, + float *__restrict__ perturbed_log_probs_ptr, + int *__restrict__ indices, + bool speculative_decoding) { + __shared__ char shared_memory[48 << 10]; // block-wise shared memory + int const batch_index = blockIdx.x; + T const *batch_input = input + batch_index * length; + int const thread_index = threadIdx.x; + int const thread_count = blockDim.x; + GumbelEntry *shared_entries = (GumbelEntry *)shared_memory; + heapGumbelTopK( + state[thread_index + batch_index * thread_count], + batch_input, + length, + k, + shared_entries, + true, + thread_index, + thread_count); + __syncthreads(); + if (thread_index == 0) { + int const offset = batch_index * k; + auto batch_log_probs_ptr = log_probs_ptr + offset; + auto batch_perturbed_log_probs_ptr = perturbed_log_probs_ptr + offset; + auto batch_indices = indices + offset; + GumbelEntry *top_k_heap = shared_entries + thread_count * k; + mergeShards(thread_count, + k, + shared_entries, + top_k_heap, + batch_log_probs_ptr, + batch_perturbed_log_probs_ptr, + batch_indices, + speculative_decoding); + } +} + +/*static*/ +template +void GumbelTopK::forward_kernel(GumbelTopKMeta const *m, + DT const *input_ptr, + float *log_probs_ptr, + float *perturbed_log_probs_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted, + BatchConfig const *bc, + cudaStream_t stream) { + // Adopted from TensorFlow's ArgTopK implementation + // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h + int num_shards = 0; + { + constexpr auto shared_memory_size = 48 << 10; + auto const heap_size = k * sizeof(GumbelEntry
); + // shared_memory_size = (num_shards + 1) * heap_size <=> + num_shards = shared_memory_size / heap_size - 1; + assert(num_shards > 0); + if (num_shards > CUDA_NUM_THREADS) { + num_shards = CUDA_NUM_THREADS; + } + } + // We are limited by the amount of shared memory we have per block. + size_t shared_memory_size = (num_shards + 1) * k * sizeof(GumbelEntry
); + // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; + size_t num_blocks = batch_size; + + // all requests share the same number of branches + if (m->speculative_decoding) { + assert(bc->num_active_requests() >= 0); + assert(num_shards >= (size_t)BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); + num_shards = k; + + int state_length = batch_size * num_shards; + init_random_state_kernel<<>>(m->state, state_length, rand()); + + gumbel_topk_forward_kernel<<>>( + m->state, + input_ptr, + shared_memory_size, + length, + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, + sorted, + log_probs_ptr, + perturbed_log_probs_ptr, + indices_ptr, + m->speculative_decoding); + } else { + assert(num_shards >= (size_t)k); + num_shards = k; + + int state_length = batch_size * num_shards; + init_random_state_kernel<<>>(m->state, state_length, rand()); + + gumbel_topk_forward_kernel<<>>( + m->state, + input_ptr, + shared_memory_size, + length, + k, + sorted, + nullptr, + nullptr, + indices_ptr, + false); + } +} + +/*static*/ +void GumbelTopK::forward_kernel_wrapper( + GumbelTopKMeta const *m, + GenericTensorAccessorR const &input, + // float *output_ptr, + GenericTensorAccessorW const &log_probs, + GenericTensorAccessorW const &perturbed_log_probs, + GenericTensorAccessorW const &indices, + int batch_size, + BatchConfig const *bc) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + // Domain in1_domain = runtime->get_index_space_domain( + // ctx, task->regions[0].region.get_index_space()); + // Domain out1_domain = runtime->get_index_space_domain( + // ctx, task->regions[1].region.get_index_space()); + // Domain out2_domain = runtime->get_index_space_domain( + // ctx, task->regions[1].region.get_index_space()); + int numdims = input.domain.get_dim(); + assert(indices.domain.get_dim() == numdims); + + int in_cols = input.domain.hi()[0] - input.domain.lo()[0] + 1; + // int out1_cols = out1_domain.hi()[0] - out1_domain.lo()[0] + 1; + int out2_cols = indices.domain.hi()[0] - indices.domain.lo()[0] + 1; + + // assert(out1_domain == out2_domain); + for (int i = 1; i < input.domain.get_dim(); i++) { + assert(input.domain.lo()[i] == indices.domain.lo()[i]); + assert(input.domain.hi()[i] == indices.domain.hi()[i]); + } + // float const *in_ptr = helperGetTensorPointerRO( + // regions[0], task->regions[0], FID_DATA, ctx, runtime); + // float *value_ptr = helperGetTensorPointerWO( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + // int *index_ptr = helperGetTensorPointerWO( + // regions[1], task->regions[1], FID_DATA, ctx, runtime); + + int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int k = indices.domain.hi()[0] - indices.domain.lo()[0] + + 1; /*TODO: This prints to 5*/ + + // batch_size = input.domain.get_volume() / length; + // assert(indices.domain.get_volume() / k == batch_size); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (input.data_type == DT_HALF) { + GumbelTopK::forward_kernel( + m, + input.get_half_ptr(), + m->speculative_decoding ? log_probs.get_float_ptr() : nullptr, + m->speculative_decoding ? perturbed_log_probs.get_float_ptr() : nullptr, + indices.get_int32_ptr(), + batch_size, + length, + k, + m->sorted, + m->speculative_decoding ? bc : nullptr, + stream); + } else if (input.data_type == DT_FLOAT) { + GumbelTopK::forward_kernel( + m, + input.get_float_ptr(), + m->speculative_decoding ? 
log_probs.get_float_ptr() : nullptr, + m->speculative_decoding ? perturbed_log_probs.get_float_ptr() : nullptr, + indices.get_int32_ptr(), + batch_size, + length, + k, + m->sorted, + m->speculative_decoding ? bc : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[GumbelTopK] forward time = %.2lfms\n", elapsed); + } +} + +GumbelTopKMeta::GumbelTopKMeta(FFHandler handler, + Op const *op, + MemoryAllocator &gpu_mem_allocator) + : OpMeta(handler, op) { + state_max_length = + BatchConfig::MAX_NUM_TOKENS * + max(BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, CUDA_NUM_THREADS); + gpu_mem_allocator.create_legion_instance( + reserveInst, sizeof(curandState) * state_max_length, "GumbelTopKMeta"); + state = gpu_mem_allocator.allocate_instance(state_max_length); +} + +GumbelTopKMeta::~GumbelTopKMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} +}; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 7aa350377..b819b4936 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -46,7 +46,7 @@ using Legion::TaskArgument; using Legion::TaskLauncher; using PCG::Node; -LegionRuntime::Logger::Category log_inc_mha("IncrementalMHA"); +Legion::Logger log_inc_mha("IncrementalMHA"); bool IncMultiHeadSelfAttentionParams::is_valid( ParallelTensorShape const &input) const { @@ -54,61 +54,66 @@ bool IncMultiHeadSelfAttentionParams::is_valid( return is_valid; } -Tensor FFModel::inc_multihead_self_attention(const Tensor input, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { - return inc_multiquery_self_attention(input, - embed_dim, - num_heads, - num_heads, - kdim, - vdim, - dropout, - qkv_bias, - final_bias, - add_zero_attn, - data_type, - kernel_initializer, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - name); +Tensor FFModel::inc_multihead_self_attention( + const Tensor input, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool qkv_bias, + bool final_bias, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + RotaryEmbeddingMeta rotary_embedding_meta, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + bool streaming_cache, + char const *name) { + return groupquery_self_attention(input, + embed_dim, + num_heads, + num_heads, + kdim, + vdim, + dropout, + qkv_bias, + final_bias, + add_zero_attn, + data_type, + kernel_initializer, + rotary_embedding_meta, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + streaming_cache, + name); } -Tensor FFModel::inc_multiquery_self_attention(const Tensor input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool 
apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { +Tensor FFModel::groupquery_self_attention( + const Tensor input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim, + int vdim, + float dropout, + bool qkv_bias, + bool final_bias, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + RotaryEmbeddingMeta rotary_embedding_meta, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + bool streaming_cache, + char const *name) { if (data_type == DT_NONE) { data_type = input->data_type; } @@ -147,13 +152,12 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, numdims, dims, data_type, li, 0, true /*create_grad*/); } // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); + int qk_dim = kdim, v_dim = kdim, o_dim = embed_dim; + int hidden_size = input->dims[0]; + int qParas = qk_dim * hidden_size; + int kParas = qk_dim * hidden_size; + int vParas = v_dim * hidden_size; + int oParas = o_dim * (v_dim > 0 ? v_dim : hidden_size); // allocate num_q_heads for key, value for replication int weight_size = qParas * num_q_heads + kParas * num_q_heads + @@ -178,10 +182,8 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, } if (qkv_bias || final_bias) { // q, k, v, o - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + - (final_bias ? oProjSize : 0)}; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; + int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
o_dim : 0)}; li->weights[1] = create_weight_legion_ordering(1, dims, data_type, @@ -200,13 +202,24 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, li->add_int_property("final_bias", final_bias); li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); - li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + li->add_int_property("apply_rotary_embedding", + rotary_embedding_meta.apply_rotary_embedding); + li->add_float_property("rope_theta", rotary_embedding_meta.rope_theta); + li->add_string_property("rope_type", rotary_embedding_meta.rope_type); + li->add_float_property("factor", rotary_embedding_meta.factor); + li->add_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + li->add_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + li->add_int_property("original_max_position_embeddings", + rotary_embedding_meta.original_max_position_embeddings); li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("qk_prod_scaling", qk_prod_scaling); li->add_int_property("position_bias", position_bias); li->add_int_property("quantization_type", quantization_type); li->add_int_property("offload", offload); + li->add_int_property("streaming_cache", streaming_cache); li->add_int_property("tensor_parallelism_degree", config.tensor_parallelism_degree); layers.push_back(li); @@ -237,8 +250,18 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( bool final_bias = (bool)value; layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; + RotaryEmbeddingMeta rotary_embedding_meta; layer->get_int_property("apply_rotary_embedding", value); - bool apply_rotary_embedding = (bool)value; + rotary_embedding_meta.apply_rotary_embedding = (bool)value; + layer->get_float_property("rope_theta", rotary_embedding_meta.rope_theta); + layer->get_string_property("rope_type", rotary_embedding_meta.rope_type); + layer->get_float_property("factor", rotary_embedding_meta.factor); + layer->get_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + layer->get_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + layer->get_int_property("original_max_position_embeddings", value); + rotary_embedding_meta.original_max_position_embeddings = (int)value; layer->get_int_property("scaling_query", value); bool scaling_query = (bool)value; float scaling_factor; @@ -252,6 +275,8 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( DataType quantization_type = (DataType)value; layer->get_int_property("offload", value); bool offload = (bool)value; + layer->get_int_property("streaming_cache", value); + bool streaming_cache = (bool)value; layer->get_int_property("tensor_parallelism_degree", value); int tensor_parallelism_degree = (int)value; @@ -267,7 +292,7 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( qkv_bias, final_bias, add_zero_attn, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -275,6 +300,7 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( false /*allocate_weights*/, quantization_type, offload, + streaming_cache, tensor_parallelism_degree, layer->name); } @@ -292,7 +318,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool 
_scaling_query, float _scaling_factor, bool _qk_prod_scaling, @@ -300,6 +326,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool allocate_weights, DataType _quantization_type, bool _offload, + bool _streaming_cache, int _tensor_parallelism_degree, char const *name) // Initializer* _bias_initializer) @@ -314,14 +341,13 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), - scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), - quantization_type(_quantization_type), offload(_offload), + rotary_embedding_meta(_rotary_embedding_meta), + hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim), + o_dim(_embed_dim), qoSeqLength(_input->dims[1].size), + kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), + scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), + position_bias(_position_bias), quantization_type(_quantization_type), + offload(_offload), streaming_cache(_streaming_cache), tensor_parallelism_degree(_tensor_parallelism_degree) { // overwrite layer_guid layer_guid = _layer_guid; @@ -340,11 +366,11 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( // Create weight tensor int num_dims = inputs[0]->num_dims; // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; + int qParas = this->qk_dim * this->hidden_size; + int kParas = this->qk_dim * this->hidden_size; + int vParas = this->v_dim * this->hidden_size; int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); + this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size); ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; @@ -368,10 +394,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
o_dim : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -406,7 +431,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, @@ -414,6 +439,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( bool allocate_weights, DataType _quantization_type, bool _offload, + bool _streaming_cache, int _tensor_parallelism_degree, char const *name) // Initializer* _bias_initializer) @@ -429,14 +455,13 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), - scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), - quantization_type(_quantization_type), offload(_offload), + rotary_embedding_meta(_rotary_embedding_meta), + hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim), + o_dim(_embed_dim), qoSeqLength(_input->dims[1].size), + kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), + scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), + position_bias(_position_bias), quantization_type(_quantization_type), + offload(_offload), streaming_cache(_streaming_cache), tensor_parallelism_degree(_tensor_parallelism_degree) // bias_initializer(_bias_initializer) { @@ -453,11 +478,11 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( // Create weight tensor int num_dims = inputs[0]->num_dims; // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; + int qParas = this->qk_dim * this->hidden_size; + int kParas = this->qk_dim * this->hidden_size; + int vParas = this->v_dim * this->hidden_size; int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); + this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size); ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; @@ -482,10 +507,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
o_dim : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -518,16 +542,16 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( : IncMultiHeadSelfAttention(model, other.layer_guid, input, - other.oProjSize, + other.o_dim, other.num_q_heads, other.num_kv_heads, - other.qProjSize, - other.vProjSize, + other.qk_dim, + other.v_dim, other.dropout, other.qkv_bias, other.final_bias, other.add_zero_attn, - other.apply_rotary_embedding, + other.rotary_embedding_meta, other.scaling_query, other.scaling_factor, other.qk_prod_scaling, @@ -535,6 +559,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( allocate_weights, other.quantization_type, other.offload, + other.streaming_cache, other.tensor_parallelism_degree, other.name) {} @@ -556,7 +581,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( params.qkv_bias, params.final_bias, params.add_zero_attn, - params.apply_rotary_embedding, + params.rotary_embedding_meta, params.scaling_query, params.scaling_factor, params.qk_prod_scaling, @@ -564,6 +589,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( allocate_weights, params.quantization_type, params.offload, + params.streaming_cache, params.tensor_parallelism_degree, params.name) {} @@ -696,7 +722,7 @@ OpMeta *IncMultiHeadSelfAttention::init_task( attn->num_kv_heads / attn->tensor_parallelism_degree + (attn->num_kv_heads % attn->tensor_parallelism_degree != 0); - assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); + assert(attn->o_dim == output.domain.hi()[0] - output.domain.lo()[0] + 1); Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::GPU_FB_MEM) @@ -902,25 +928,38 @@ bool operator==(IncMultiHeadSelfAttentionParams const &lhs, lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && lhs.add_zero_attn == rhs.add_zero_attn && - lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && + lhs.rotary_embedding_meta.apply_rotary_embedding == + rhs.rotary_embedding_meta.apply_rotary_embedding && + lhs.rotary_embedding_meta.rope_theta == + rhs.rotary_embedding_meta.rope_theta && + lhs.rotary_embedding_meta.rope_type == + rhs.rotary_embedding_meta.rope_type && + lhs.rotary_embedding_meta.factor == rhs.rotary_embedding_meta.factor && + lhs.rotary_embedding_meta.low_freq_factor == + rhs.rotary_embedding_meta.low_freq_factor && + lhs.rotary_embedding_meta.high_freq_factor == + rhs.rotary_embedding_meta.high_freq_factor && + lhs.rotary_embedding_meta.original_max_position_embeddings == + rhs.rotary_embedding_meta.original_max_position_embeddings && lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && lhs.qk_prod_scaling == rhs.qk_prod_scaling && - lhs.position_bias == rhs.position_bias; + lhs.position_bias == rhs.position_bias && + lhs.streaming_cache == rhs.streaming_cache; } IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { IncMultiHeadSelfAttentionParams params; params.layer_guid = this->layer_guid; - params.embed_dim = this->oProjSize; + params.embed_dim = this->o_dim; params.num_q_heads = this->num_q_heads; - params.kdim = this->kProjSize; - params.vdim = this->vProjSize; + params.kdim = this->qk_dim; + params.vdim = this->v_dim; params.dropout = this->dropout; params.qkv_bias = this->qkv_bias; params.final_bias = this->final_bias; params.add_zero_attn = this->add_zero_attn; - params.apply_rotary_embedding = 
this->apply_rotary_embedding; + params.rotary_embedding_meta = this->rotary_embedding_meta; params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; @@ -928,6 +967,7 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { params.tensor_parallelism_degree = this->tensor_parallelism_degree, params.quantization_type = this->quantization_type; params.offload = this->offload; + params.streaming_cache = this->streaming_cache; params.num_kv_heads = this->num_kv_heads; if (this->name != nullptr) { strcpy(params.name, this->name); @@ -952,13 +992,21 @@ size_t hash::operator()( hash_combine(key, params.qkv_bias); hash_combine(key, params.final_bias); hash_combine(key, params.add_zero_attn); - hash_combine(key, params.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.rope_theta); + hash_combine(key, params.rotary_embedding_meta.rope_type); + hash_combine(key, params.rotary_embedding_meta.factor); + hash_combine(key, params.rotary_embedding_meta.low_freq_factor); + hash_combine(key, params.rotary_embedding_meta.high_freq_factor); + hash_combine(key, + params.rotary_embedding_meta.original_max_position_embeddings); hash_combine(key, params.scaling_query); hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); hash_combine(key, params.position_bias); hash_combine(key, params.quantization_type); hash_combine(key, params.offload); + hash_combine(key, params.streaming_cache); hash_combine(key, params.tensor_parallelism_degree); return key; } diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index d60386f92..449940155 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/ops/inc_multihead_self_attention.h" +#include "flexflow/ffconst.h" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/utils/hip_helper.h" #include #include +#include namespace FlexFlow { @@ -123,56 +125,17 @@ __global__ void scaling_query_kernel(DT *input_ptr, } } -template -__global__ void - apply_rotary_embedding_native(DT *input_ptr, - hipFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_q_heads, - int num_tokens, - int num_kv_heads, - int q_block_size, - int k_block_size, - int q_array_size) { - CUDA_KERNEL_LOOP( - i, - num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) { - // create complex number - bool q_tensor = i < (q_array_size / 2); - int proj_size = q_tensor ? qProjSize : kProjSize; - int real_i = q_tensor ? i : i - q_array_size / 2; - - int head_idx = real_i / (num_tokens * proj_size / 2); - int idx = real_i % (num_tokens * proj_size / 2); - int real_part_index = idx * 2 + - head_idx * (q_tensor ? q_block_size : k_block_size) + - (q_tensor ? 
0 : q_array_size); - - int complex_part_index = real_part_index + 1; - - complex_input[i] = {input_ptr[real_part_index], - input_ptr[complex_part_index]}; - - int token_idx = - (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); - size_t pos = tokenInfos[token_idx].abs_depth_in_request; - int pos_i = real_i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); - hipFloatComplex complex_pos = {cos(freq), sin(freq)}; - - complex_input[i] = hipCmulf(complex_input[i], complex_pos); - input_ptr[real_part_index] = complex_input[i].x; - input_ptr[complex_part_index] = complex_input[i].y; - } -} - template __global__ void apply_rotary_embedding_hf(DT *input_ptr, hipFloatComplex *complex_input, BatchConfig::PerTokenInfo const *tokenInfos, + float rope_theta, + bool llama3_rope, + float factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, int qProjSize, int kProjSize, int num_tokens, @@ -207,7 +170,29 @@ __global__ void // float before_real = complex_input[i].x, before_complex = int pos_i = real_i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); + + float freq = + pos * (1.0 / pow(rope_theta, (float)2 * pos_i / proj_size)); // θ_i + + if (llama3_rope) { + float pi = CUDART_PI_F; + float wavelen = 2 * pi / freq; + float low_freq_wavelen = + original_max_position_embeddings / low_freq_factor; + float high_freq_wavelen = + original_max_position_embeddings / high_freq_factor; + if (wavelen < high_freq_wavelen) { + } else if (wavelen > low_freq_wavelen) { + freq = freq / factor; + } else { + assert(low_freq_wavelen != high_freq_wavelen); + float smooth = + (original_max_position_embeddings / wavelen - low_freq_factor) / + (high_freq_factor - low_freq_factor); + freq = ((1 - smooth) * freq / factor + smooth * freq); + } + } + hipFloatComplex complex_pos = {cos(freq), sin(freq)}; complex_input[i] = hipCmulf(complex_input[i], complex_pos); @@ -232,7 +217,7 @@ __global__ void store_kv_cache(DT const *devQKVProjArray, DT vVal = devQKVProjArray[val_idx + hidden_size]; int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int const tok_id = tokenInfos[token_idx].abs_index_in_request; // key cache kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + @@ -243,14 +228,14 @@ __global__ void store_kv_cache(DT const *devQKVProjArray, } template -void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT const *input_ptr, - DT const *weight_ptr, - DT *output_ptr, - DT const *bias_ptr, - hipStream_t stream) { +void compute_qkv(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + hipStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); @@ -319,7 +304,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, *m->scaling_query, m->scaling_factor, - m->hidden_size); + m->local_hidden_size); } else if (m->scaling_query) { hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel
), GET_BLOCKS(parallelism), @@ -331,24 +316,31 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, m->qProjSize, m->scaling_factor, - m->hidden_size); + m->local_hidden_size); } - if (*m->apply_rotary_embedding) { + if (m->rotary_embedding_meta->apply_rotary_embedding) { /*q&k*/ parallelism = num_tokens * m->hidden_size; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_hf
), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - m->complex_input, - m->token_infos, - m->qProjSize, - m->kProjSize, - num_tokens, - q_array_size, - m->hidden_size); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(apply_rotary_embedding_hf), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + output_ptr, + m->complex_input, + m->token_infos, + m->rotary_embedding_meta->rope_theta, + (m->rotary_embedding_meta->rope_type == "llama3"), + m->rotary_embedding_meta->factor, + m->rotary_embedding_meta->low_freq_factor, + m->rotary_embedding_meta->high_freq_factor, + m->rotary_embedding_meta->original_max_position_embeddings, + m->qProjSize, + m->kProjSize, + num_tokens, + q_array_size, + m->hidden_size); } } @@ -358,7 +350,7 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, hipStream_t stream) { int num_tokens = bc->num_active_tokens(); if (num_tokens > 0) { - int parallelism = m->hidden_size * num_tokens; + int parallelism = m->local_hidden_size * num_tokens; hipLaunchKernelGGL(HIP_KERNEL_NAME(store_kv_cache
), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), @@ -370,15 +362,15 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, m->token_infos, num_tokens, BatchConfig::max_sequence_length(), - m->hidden_size); + m->local_hidden_size); } } template -void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - hipStream_t stream) { +void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + hipStream_t stream) { // additional processing for weight uploading // Note that we update weight_ptr and bias_ptr when uploading weight and // bias @@ -457,14 +449,14 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, hipMemcpyHostToDevice, stream)); // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
(m->devQKVProjArray), - bias_ptr, - stream); + compute_qkv(m, + bc, + shard_id, + input_ptr, + weight_ptr, + static_cast
(m->devQKVProjArray), + bias_ptr, + stream); // phase 2: Update key/val cache update_kv_cache_kernel
(m, bc, stream); @@ -530,11 +522,11 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (!bc->request_available[i]) { continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + int total_tokens = bc->requestsInfo[i].first_token_index_in_request + bc->requestsInfo[i].num_tokens_in_batch; // bc->token_last_available_idx[i] + 1; // Compute (QK^T/sqrt(d_k)) @@ -773,7 +765,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( if (input.data_type == DT_HALF) { if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); + pre_build_weight(m, weight, input.data_type, stream); } half const *bias_ptr = use_bias ? bias.get_half_ptr() : static_cast(nullptr); @@ -788,7 +780,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( stream); } else if (input.data_type == DT_FLOAT) { if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); + pre_build_weight(m, weight, input.data_type, stream); } float const *bias_ptr = use_bias ? bias.get_float_ptr() : static_cast(nullptr); @@ -838,7 +830,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, + attn->rotary_embedding_meta, attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, @@ -866,7 +858,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( int _kProjSize, int _vProjSize, int _oProjSize, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _qkv_bias, bool _scaling_query, bool _qk_prod_scaling, @@ -906,7 +898,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( global_num_kv_heads = _global_num_kv_heads; num_q_heads = _num_q_heads; num_kv_heads = _num_kv_heads; - hidden_size = num_q_heads * qProjSize; + local_hidden_size = num_q_heads * qProjSize; weightSize = ((qSize * qProjSize + oProjSize * (vProjSize > 0 ? vProjSize : vSize)) * @@ -927,8 +919,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // has_load_weights = (bool *)calloc(1, sizeof(bool)); //*has_load_weights = false; - apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); - *apply_rotary_embedding = _apply_rotary_embedding; + rotary_embedding_meta = + (RotaryEmbeddingMeta *)calloc(1, sizeof(RotaryEmbeddingMeta)); + *rotary_embedding_meta = _rotary_embedding_meta; qkv_bias = (bool *)calloc(1, sizeof(bool)); *qkv_bias = _qkv_bias; scaling_query = (bool *)calloc(1, sizeof(bool)); @@ -949,7 +942,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + int max_tokens_per_batch = std::max( + infer_mode == TREE_SEARCH_MODE ? 
BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()); size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads + kProjSize * num_q_heads + vProjSize * num_q_heads); @@ -965,15 +961,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( BatchConfig::max_sequence_length(); break; } - case BEAM_SEARCH_MODE: { + case TREE_SEARCH_MODE: { key_cache_size = num_q_heads * kProjSize * - BeamSearchBatchConfig::max_requests_per_batch() * + BatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; value_cache_size = num_q_heads * vProjSize * - BeamSearchBatchConfig::max_requests_per_batch() * + BatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; break; } default: @@ -1014,9 +1010,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( assert(gpu_mem_allocator.reserved_total_size - gpu_mem_allocator.reserved_allocated_size >= totalSharedSize); - gpu_mem_allocator.create_legion_instance(reserveInst, instance_size); + gpu_mem_allocator.create_legion_instance( + reserveInst, instance_size, "IncMultiHeadSelfAttentionMeta"); } else { - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "IncMultiHeadSelfAttentionMeta"); } // in tree_verify, enable devQKVProjArray; @@ -1086,13 +1084,13 @@ IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) { } } -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( +template void Kernels::IncMultiHeadAttention::pre_build_weight( IncMultiHeadSelfAttentionMeta const *m, GenericTensorAccessorR const weight, DataType data_type, hipStream_t stream); -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( +template void Kernels::IncMultiHeadAttention::pre_build_weight( IncMultiHeadSelfAttentionMeta const *m, GenericTensorAccessorR const weight, DataType data_type, diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index a0d31bb6e..30c0586a5 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -15,12 +15,16 @@ #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" #endif +#include "flashinfer/decode_attention_decl.cuh" +#include "flashinfer/prefill_attention_decl.cuh" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" +#include "flexflow/page_manager.h" #include "flexflow/utils/cuda_helper.h" +#include namespace FlexFlow { @@ -33,773 +37,198 @@ using Legion::Memory; namespace Kernels { namespace IncMultiHeadAttention { -// gridDim = num_heads -// blockDim = num_tokens/num_request * head_size -// QKV tensor layout: |QKV| * num_new_tokens. 
|Q=K=V=head_size * num_heads| -// one thread process one head_size -template -__global__ void compute_attention_kernel_generation_kernel( - DT const *query, - DT const *key_cache, - DT const *value_cache, - DT *output_ptr, - float const scale, - int max_seq_length, - int per_head_size, - int hidden_size, - BatchConfig::PerRequestInfo *request_infos) { - - // q, k - using Q_vec = typename VEC_K::Type; - using K_vec = typename VEC_K::Type; - using V_vec = typename VEC_V
::Type; - using Out_sum = typename Vec_fp32_::Type; - - constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; - - // eg. if head_size = 128, thread_per_key = 4, with float32 precision - // then K_VEC_SIZE = 1, QK_VEC_SIZE = 4 - // K_ELTS_PER_THREAD = 128 / 4 = 32 - // K_VECS_PER_THREAD = 32 / 1 = 32 - constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); - // constexpr int QK_VEC_SIZE = 16 / sizeof(DT); - // // constexpr int QK_VEC_SIZE = sizeof(Qk_vec_k) / sizeof(DT); - constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; - constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; - // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); - - // thread id - int const tidx = threadIdx.x; - // head id - int const head_idx = blockIdx.x; - // request idx - int const request_idx = blockIdx.y; - - int const batch_config_request_id = - request_infos[request_idx].batch_config_request_id; - - int const first_step = 0; - - int const tlength = - request_infos[batch_config_request_id].first_token_depth_in_request + - request_infos[batch_config_request_id].num_tokens_in_batch; - - // shared memory objects - extern __shared__ char smem_[]; - - float *qk_smem = reinterpret_cast(smem_); - float *out_smem = reinterpret_cast(smem_); - - float qk_max = -FLT_MAX; - - // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum - __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - - const DT *q_ptr = query + request_idx * hidden_size * QKV_WEIGHT_NUM + - head_idx * per_head_size; - __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; - // DT const *q_ptr = - // query + request_idx * Dh * QKV_WEIGHT_NUM + head_idx * per_head_size; - - // q tensor in this thread - // if THREADS_PER_KEY is 4, first thread load 0, 4, 8, 12..., total - // K_VECS_PER_THREAD elements - // QK_vec_k: 32->1, 64->2, 128->4... head_size - // K_vec_k: 4->1, 2->2, 1->4 threads_per_key - - // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE - int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; - int ki_o = tidx % THREADS_PER_KEY; - // the first key's offset for this thread - // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... - int ko = tidx / THREADS_PER_KEY; - // load q tensor - Q_vec q_vec[K_VECS_PER_THREAD]; -#pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - q_vecs[ki_o][ii] = *reinterpret_cast( - q_ptr + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); - } - __syncthreads(); - // first iter = 128 / 4 = 32 - // K_VECS_PER_THREAD = 32 - // K_PER_ITER how many keys in this loop - // The number of timesteps loaded per iteration. - constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; - // // The number of keys per warp. - constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; - - DT const *k_cache_batch = - key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; - - int ti_end = - div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; - // get k, perform qk proj - - for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { - K_vec k[K_VECS_PER_THREAD]; - int const ti_circ = ti % max_seq_length; -#pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; - if (ti < tlength) { - k[ii] = *reinterpret_cast(k_cache_batch + - ti_circ * hidden_size + - head_idx * per_head_size + jj); - } - // Compute dot product. - // This includes a reduction across the threads in the same thread group. 
- } - float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); - // // todo add positional embedding to the qk production - // // Store the product to shared memory. There's one qk value per - // timestep. - // // Update the max. - if (ti < tlength && tidx % THREADS_PER_KEY == 0) { - // todo add alobi here - bool const mask = ti_circ >= tlength; - if (mask) { - assert(false); - } - qk_max = mask ? qk_max : fmaxf(qk_max, qk); - qk_smem[ti - first_step] = mask ? 0.f : qk; - } - } - - __syncthreads(); - -#pragma unroll - for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Decompose the thread index into warp and lane. - int const warp = tidx / WARP_SIZE; - int const lane = tidx % WARP_SIZE; - - // The warp leader writes the max to shared memory. - if (lane == 0) { - red_smem[warp] = qk_max; - } - - // Make sure the products are in shared memory. - __syncthreads(); - - // The warps finalize the reduction. - qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; -#pragma unroll - for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Broadcast to all the threads in the warp. - qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - - float exp_sum = 0.f; - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - float logit = __expf(qk_smem[ti - first_step] - qk_max); - exp_sum += logit; - qk_smem[ti - first_step] = logit; - } - - // Compute the sum. - exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); - - // softmax - float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - qk_smem[ti - first_step] *= inv_sum; - } - - __syncthreads(); - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("softmax %.10f\n", qk_smem[0]); - // } - - // value projection - constexpr int V_VEC_SIZE = 16 / sizeof(DT); - // A vector of V elements for the current timestep. - // using V_vec_k = typename V_vec_k_::Type; - // using V_vec_acum = typename V_vec_acum_fp32_::Type; - - // The value computed by this thread. - int vo = tidx / THREADS_PER_VALUE; - // The hidden dimensions computed by this particular thread. - int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; - constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; - - Out_sum out; - zero(out); - - // The base pointer for the value in the cache buffer. - DT const *v_cache_batch = - value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; - - if (Dh == Dh_MAX || vi < Dh) { - for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { - // Load the values from the cache. - int const ti_circ = ti % max_seq_length; - - V_vec v = *reinterpret_cast( - v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); - float logit = qk_smem[ti - first_step]; - out = FlexFlow::fma(logit, cast_to_float(v), out); - } - } - - // // Make sure we can start writing to shared memory. - __syncthreads(); - - // Run the final reduction amongst the different groups computing different - // partial outputs. - if (Dh == Dh_MAX || vi < Dh) { -#pragma unroll - for (int active_groups = V_PER_ITER; active_groups >= 2; - active_groups /= 2) { - - // The midpoint in the number of active groups. - int midpoint = active_groups / 2; - - // The upper part of active threads store to shared memory. 
- if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { - *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = - out; - } - __syncthreads(); - - // The bottom warps update their values. - if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { - out = add(*reinterpret_cast(out_smem + vo * Dh + vi), - out); - } - __syncthreads(); - } - } - - // Output the final values. - if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { - convert_from_float( - *reinterpret_cast(output_ptr + request_idx * hidden_size + - head_idx * per_head_size + vi), - out); - } -} - -// only used by MPT model. https://arxiv.org/abs/2108.12409 -template -__global__ void apply_position_bias_qkprd(DT *input_ptr, - int num_tokens, - int num_total_tokens, - int num_heads, - int global_num_q_heads, - int shard_id) { - CUDA_KERNEL_LOOP(i, num_tokens * num_total_tokens * num_heads) { - // get head_idx, - int head_idx = i / (num_tokens * num_total_tokens) + (num_heads * shard_id); - int position_idx = (i / num_tokens) % num_total_tokens; - position_idx = position_idx + 1 - num_total_tokens; - // 8 is alibi_bias_max in - // https://huggingface.co/mosaicml/mpt-30b/blob/main/config.json - float base = (float)(head_idx + 1) * 8 / global_num_q_heads; - float slopes = 1.0 / pow(2, base); - // if(i == 0){ - // printf("see position: %d, %f, %f, %f\n", position_idx, base, slopes, - // position_idx * slopes); - // } - input_ptr[i] += static_cast
(position_idx * slopes); - } -} - -template -__global__ void apply_proj_bias_w(DT *input_ptr, - DT const *bias_ptr, - int num_tokens, - int qkv_weight_size, - int oProjSize) { - CUDA_KERNEL_LOOP(i, num_tokens * oProjSize) { - int bias_idx = qkv_weight_size + i % oProjSize; - input_ptr[i] += bias_ptr[bias_idx]; - } -} - -template -__global__ void apply_proj_bias_qkv(DT *input_ptr, - DT const *bias_ptr, - int shard_id, - int num_tokens, - int qProjSize, - int kProjSize, - int vProjSize, - int global_num_q_heads, - int num_q_heads, - bool scaling_query, - float scaling_factor, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * QKV_WEIGHT_NUM) { - // for simplicity, assume q, k, v is in same shape - // 0->q, 1->k, 2->v - // int qkv_index = i / (num_tokens * qProjSize) % 3; - - int token_idx = i / (hidden_size * QKV_WEIGHT_NUM); - size_t in_token_idx = i - token_idx * hidden_size * QKV_WEIGHT_NUM; - - int qkv_index = in_token_idx / hidden_size; - - int proj_size = qkv_index == 0 ? qProjSize : kProjSize; - - int head_idx = - (in_token_idx - qkv_index * num_q_heads * proj_size) / proj_size; - int global_head_idx = head_idx + shard_id * num_q_heads; - - size_t pre_length = - qkv_index == 0 - ? 0 - : (qkv_index == 1 ? qProjSize * global_num_q_heads - : qProjSize * global_num_q_heads * KV_WEIGHT_NUM); - - size_t bias_idx = pre_length + global_head_idx * proj_size + i % proj_size; - - input_ptr[i] += bias_ptr[bias_idx]; - - if (scaling_query && qkv_index == 0) { - input_ptr[i] *= scaling_factor; - } - } -} - -template -__global__ void scaling_query_kernel(DT *input_ptr, - int qProjSize, - int num_tokens, - int num_q_heads, - float scaling_factor, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - input_ptr[i % hidden_size + token_idx * hidden_size * QKV_WEIGHT_NUM] *= - scaling_factor; - } -} - -template -__global__ void - apply_rotary_embedding_native(DT *input_ptr, - cuFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_q_heads, - int num_tokens, - int num_kv_heads, - int q_block_size, - int k_block_size, - int q_array_size) { - CUDA_KERNEL_LOOP( - i, - num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) { - // create complex number - bool q_tensor = i < (q_array_size / 2); - int proj_size = q_tensor ? qProjSize : kProjSize; - int real_i = q_tensor ? i : i - q_array_size / 2; - - int head_idx = real_i / (num_tokens * proj_size / 2); - int idx = real_i % (num_tokens * proj_size / 2); - int real_part_index = idx * 2 + - head_idx * (q_tensor ? q_block_size : k_block_size) + - (q_tensor ? 
0 : q_array_size); - - int complex_part_index = real_part_index + 1; - - complex_input[i] = {input_ptr[real_part_index], - input_ptr[complex_part_index]}; - - int token_idx = - (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); - size_t pos = tokenInfos[token_idx].abs_depth_in_request; - - // float before_real = complex_input[i].x, before_complex = - // complex_input[i].y; - - int pos_i = real_i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); - cuFloatComplex complex_pos = {cos(freq), sin(freq)}; - - complex_input[i] = cuCmulf(complex_input[i], complex_pos); - input_ptr[real_part_index] = complex_input[i].x; - input_ptr[complex_part_index] = complex_input[i].y; - } -} +using flashinfer::BatchDecodeHandler; +using flashinfer::BatchDecodeWithPagedKVCacheWrapperDispatched; +using flashinfer::BatchPrefillHandler; +using flashinfer::BatchPrefillWithPagedKVCacheWrapperDispatched; +using flashinfer::LogitsPostHook; +using flashinfer::MaskMode; +using flashinfer::paged_kv_t; +using flashinfer::PageStorage; +using flashinfer::PosEncodingMode; +using flashinfer::QKVLayout; template -__global__ void - apply_rotary_embedding_hf(DT *input_ptr, - cuFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_tokens, - size_t q_array_size, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - // create complex number - bool q_tensor = i < (q_array_size / 2); - int proj_size = q_tensor ? qProjSize : kProjSize; - int real_i = q_tensor ? i : i - q_array_size / 2; - - int token_idx = real_i / (hidden_size / 2); - int idx = real_i % (proj_size / 2); - int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2); - - int real_part_index = idx + head_idx * proj_size + - token_idx * hidden_size * QKV_WEIGHT_NUM + - hidden_size * (q_tensor ? 
0 : 1); - int complex_part_index = real_part_index + (proj_size / 2); - - complex_input[i] = {input_ptr[real_part_index], - input_ptr[complex_part_index]}; - - // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64 - // apply a Cartesian coordinate transformation - // multiple with input & /copy back to q/k - - // get position of token - - // size_t pos = id_map[token_idx].token_position; - size_t pos = tokenInfos[token_idx].abs_depth_in_request; - - // float before_real = complex_input[i].x, before_complex = - int pos_i = real_i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); - cuFloatComplex complex_pos = {cos(freq), sin(freq)}; - - complex_input[i] = cuCmulf(complex_input[i], complex_pos); - input_ptr[real_part_index] = complex_input[i].x; - input_ptr[complex_part_index] = complex_input[i].y; - } -} - -template -void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT const *input_ptr, - DT const *weight_ptr, - DT *output_ptr, - DT const *bias_ptr, - cudaStream_t stream) { - - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - assert(m->qSize == m->vSize && m->qSize == m->kSize); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif - - // Step 1: Compute QKV projections - { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_q = m->qProjSize * m->num_q_heads; - int m_k = m->kProjSize * m->num_q_heads; - int m_v = m->vProjSize * m->num_q_heads; - assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); - int k = m->qSize; - int m_ = m_q * QKV_WEIGHT_NUM; - // before transpositions - int lda = k, ldb = k, ldc = m_; - // matrix A: QKV weights - // matrix A's layout: [qSize (hidden_dim), qProjSize, num_heads, 3] - // matrix B: input - // matrix B's layout: [qSize (hidden_dim), num_new_tokens] - // matrix C: devQKVProjArray - // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens] - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - weight_ptr, - cublas_data_type, - lda, - input_ptr, - cublas_data_type, - ldb, - &beta, - output_ptr, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - - int num_tokens = bc->num_active_tokens(); - int parallelism = m->kProjSize * num_tokens * m->num_q_heads; - size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; - - // Step 2: apply bias for QKV, or scale the query - if (*m->qkv_bias) { - apply_proj_bias_qkv<<>>(output_ptr, - bias_ptr, - shard_id, - num_tokens, - m->qProjSize, - m->kProjSize, - m->vProjSize, - m->global_num_q_heads, - m->num_q_heads, - *m->scaling_query, - m->scaling_factor, - m->hidden_size); - } else if (m->scaling_query) { - scaling_query_kernel<<>>(output_ptr, - num_tokens, - m->num_q_heads, - m->qProjSize, - m->scaling_factor, - m->hidden_size); - } - - // Step 3: apply rotary embedding if needed - if (*m->apply_rotary_embedding) { - /*q&k*/ - parallelism = num_tokens * m->hidden_size; - 
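For reference, a minimal host-side sketch of the rotation that the apply_rotary_embedding_hf kernel above applies (and which the launch just below invokes): each head is treated as proj_size/2 pairs (x[j], x[j + proj_size/2]) and rotated by the angle pos / 10000^(2j / proj_size). This is an editor's illustration, not part of the patch; rope_rotate_pair and the concrete values in main are assumed for the example.

#include <cmath>
#include <cstdio>

// Rotate one (x[j], x[j + d/2]) pair of a head of width proj_size for the
// token at absolute position pos, using the same frequency formula as the
// kernel: freq = pos / 10000^(2j / proj_size).
static void rope_rotate_pair(
    float &x_real, float &x_imag, int pos, int j, int proj_size) {
  float freq = pos * (1.0f / std::pow(10000.0f, 2.0f * j / proj_size));
  float c = std::cos(freq), s = std::sin(freq);
  float rotated_real = x_real * c - x_imag * s; // complex multiply by (cos, sin)
  float rotated_imag = x_real * s + x_imag * c;
  x_real = rotated_real;
  x_imag = rotated_imag;
}

int main() {
  float q_lo = 1.0f, q_hi = 0.0f; // toy query pair (assumed values)
  rope_rotate_pair(q_lo, q_hi, /*pos=*/3, /*j=*/0, /*proj_size=*/128);
  std::printf("rotated pair: (%.4f, %.4f)\n", q_lo, q_hi);
  return 0;
}

The kernel performs the same multiplication with cuCmulf on the device and writes the rotated values back into the packed QKV buffer in place.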
apply_rotary_embedding_hf<<>>(output_ptr, - m->complex_input, - m->token_infos, - m->qProjSize, - m->kProjSize, - num_tokens, - q_array_size, - m->hidden_size); - } -} - -template -void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); - if (num_tokens > 0) { - int parallelism = m->hidden_size * num_tokens; - store_kv_cache<<>>(static_cast
<DT *>(m->devQKVProjArray), - static_cast<DT *>
<DT *>(m->keyCache), - static_cast<DT *>
(m->valueCache), - m->token_infos, - num_tokens, - BatchConfig::max_sequence_length(), - m->hidden_size); - } -} - -template -void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *weight_ptr, - DT const *bias_ptr, - int num_tokens, - cudaStream_t stream) { - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = cublas_data_type; -#endif - // Project to output, save result directly on output tensor - { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_ = m->oProjSize; - int k = m->vProjSize * m->num_q_heads; - int n = num_tokens; - // before transpositions - int lda = k, ldb = k, ldc = m_; - // matrix A: output projection weight - // matrix A's layout: [vProjSize * num_heads, oProjSize] - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - // matrix B: attn heads - // matrix B's layout: [vProjSize * num_heads, num_new_tokens] - DT const *B = static_cast
<DT *>(m->attn_heads); - // matrix B: output - // matrix B's layout: [oProjSize, num_new_tokens] - DT *C = static_cast<DT *>
(output_ptr); - - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - // Add final output bias - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - apply_proj_bias_w<<>>( - output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize); - } -} - -#define LAUNCH_ATTENTION_SCORE_KERNEL( \ - DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ - smem_sz = smem_size_in_bytes
<DT>(m->qProjSize, \ - BatchConfig::max_sequence_length(), \ - THREADS_PER_VALUE, \ - THDS_PER_BLOCK); \ - compute_attention_kernel_generation_kernel \ - <<>>( \ - static_cast<DT *>
<DT *>(m->devQKVProjArray), \ - static_cast<DT *>
<DT *>(m->keyCache), \ - static_cast<DT *>
(m->valueCache), \ - output_ptr, \ - scale, \ - BatchConfig::max_sequence_length(), \ - m->qProjSize, \ - m->hidden_size, \ - m->request_infos) - -template -void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - DT *output_ptr, - cudaStream_t stream) { - dim3 grid(m->num_q_heads, bc->num_generation_tokens); - int const per_head_size = m->qProjSize; - float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; - size_t smem_sz; - if (per_head_size == 64) { - constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; - LAUNCH_ATTENTION_SCORE_KERNEL( - DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); - } else if (per_head_size == 128) { - constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; - LAUNCH_ATTENTION_SCORE_KERNEL( - DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); +void incr_attention(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + // int device; + // checkCUDA(cudaGetDevice(&device)); + // cudaEvent_t t_start, t_end; + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + // global constant parameters + uint32_t const num_q_heads = m->num_q_heads; + uint32_t const num_kv_heads = m->num_kv_heads; + uint32_t const head_dim = m->qk_dim; + uint32_t const batch_size = bc->num_active_requests(); + float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->qk_dim) : 1.0f; + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "Update custom mask time: " << elapsed << " ms\n"; + // } + + half *q = static_cast(m->queryTmp), + *kv = static_cast(m->kvCache), + *o = static_cast(m->outputTmp); + paged_kv_t paged_kv( + num_kv_heads, + kPagesize, + head_dim, + batch_size, + QKVLayout::kNHD, + kv, + m->handle.incr_attention_metadata->kv_indices, + m->handle.incr_attention_metadata->kv_indptr, + m->handle.incr_attention_metadata->kv_last_page_len); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" attn prep time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + void *handler = nullptr; + + if (!bc->prompt_phase) { + assert(m->handle.incr_attention_metadata->decode_handler_collections.count( + batch_size) != 0 && + "Handler is not initialized"); + handler = m->handle.incr_attention_metadata + ->decode_handler_collections[batch_size]; } else { - assert(false && "a unsupported head size"); - } -} - -template -void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - cudaStream_t stream) { - // additional processing for weight uploading - // Note that we update weight_ptr and bias_ptr when uploading weight and - // bias - if (m->quantization_type != DT_NONE) { - // copy weight_ptr to quantized_weight_ptr, do compression and store in - // m->weight_ptr - cudaMemcpyAsync(m->quantized_weight_ptr, - weight.get_byte_ptr(), - 
m->quantized_weightSize, - cudaMemcpyHostToDevice, - stream); - - if (m->quantization_type == DT_INT4) { - int parallelism = m->qProjSize * m->qSize * m->num_q_heads / 2; - decompress_int4_attention_weights<<>>( - m->quantized_weight_ptr, - static_cast<DT *>
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); + assert(m->handle.incr_attention_metadata->prompt_handler_collections.count( + batch_size) != 0 && + "Handler is not initialized"); + handler = m->handle.incr_attention_metadata + ->prompt_handler_collections[batch_size]; + } + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" BeginForward time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + DISPATCH_HEADDIM(head_dim, HEAD_DIM, { + cudaError_t result; + if (bc->prompt_phase) { + result = + BatchPrefillWithPagedKVCacheWrapperDispatched( + static_cast(handler), + q, + m->handle.incr_attention_metadata->q_indptr, + /*q_offset=*/nullptr, + paged_kv, + /*custom_mask=*/nullptr, + /*qk_indptr=*/nullptr, + o, + /*lse=*/nullptr, + num_q_heads, + /*window_left=*/-1, + /*logits_soft_cap=*/0.f, + sm_scale, + /*rope_scale=*/1.f, + /*rope_theta=*/static_cast(1e4), + stream); } else { - assert(m->quantization_type == DT_INT8); - int parallelism = m->qProjSize * m->qSize * m->num_q_heads; - decompress_int8_attention_weights<<>>( - m->quantized_weight_ptr, - static_cast
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); + result = + BatchDecodeWithPagedKVCacheWrapperDispatched( + static_cast(handler), + q, + /*q_offset=*/nullptr, + paged_kv, + o, + /*lse=*/nullptr, + num_q_heads, + /*window_left=*/-1, + /*logits_soft_cap=*/0.f, + sm_scale, + /*rope_scale=*/1.f, + /*rope_theta=*/static_cast(1e4), + stream); } - } else { - if (data_type == DT_FLOAT) { - cudaMemcpyAsync(m->weight_ptr, - weight.get_float_ptr(), - m->weightSize, - cudaMemcpyHostToDevice, - stream); - } else if (data_type == DT_HALF) { - cudaMemcpyAsync(m->weight_ptr, - weight.get_half_ptr(), - m->weightSize, - cudaMemcpyHostToDevice, - stream); - } else { - assert(false); + if (result != cudaSuccess) { + throw std::runtime_error("Failed to run " + "IncrementalDecodingAttentionForwardKernel: " + + std::string(cudaGetErrorString(result))); } - } + }); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" actual attn time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + produce_output(m, bc, output_ptr, stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" produce_output_kernel time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); } template -void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, +void inference_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *input_ptr, @@ -808,36 +237,51 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *bias_ptr, cudaStream_t stream) { + // cudaEvent_t t_start, t_end; + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + if (m->offload && m->biasSize > 0) { cudaMemcpyAsync( m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); bias_ptr = static_cast
<DT *>(m->bias_ptr); - } - // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast<DT *>
<DT *>(m->devQKVProjArray), - bias_ptr, - stream); - update_kv_cache_kernel<DT>
(m, bc, stream); - - if (bc->num_generation_tokens > 0) { - // phase 3: Compute attention score for generation tokens - compute_attention_kernel_generation<DT>
( - m, bc, static_cast<DT *>
<DT *>(m->attn_heads), stream); + // phase 1: Compute QKV Projections of the batch + compute_qkv(m, + bc, + shard_id, + input_ptr, + weight_ptr, + static_cast<DT *>
(m->devQKVProjArray), + bias_ptr, + stream); + + // phase 2: First maintain the streaming cache, because it need + // pre-pos-encoding values + if (m->streaming_cache) { + // Move pre-pos-encoding cache to where took by attention + update_kv_in_streaming_cache<DT>
(m, bc, stream); + // Apply pos-encoding to those k values + apply_pos_encoding_to_streaming_proj<DT>
(m, bc, stream); + // Commit to the streaming cache + commit_kv<DT>
(m, bc, stream); + } + + // phase 3: Take care of the batch + { + // Apply pos-encoding to the batch + apply_pos_encoding_to_tokens_in_batch( + m, bc, static_cast<DT *>
(m->devQKVProjArray), stream); + // Move the batch qkv values to where took by attention + update_qkv_in_batch_paged<DT>
(m, bc, stream, false); + } - if (bc->num_tokens > bc->num_generation_tokens) { - // phase 4: Compute attention score for prompt tokens; - compute_attention_kernel_prompt( - m, bc, shard_id, bias_ptr, weight_ptr, stream); - } + // phase 4: Attention computation + incr_attention<DT>
(m, bc, static_cast<DT *>
(m->attn_heads), stream); - // compute output production and bias together for all tokens + // phase 5: Compute output production and bias together for all tokens int num_tokens = bc->num_active_tokens(); compute_o_prod_bias( m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); @@ -848,295 +292,9 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, using namespace Kernels::IncMultiHeadAttention; -template -__global__ void store_kv_cache(DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - BatchConfig::PerTokenInfo const *tokenInfos, - int num_tokens, - int max_seq_len, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - int offset = i % hidden_size; - - size_t val_idx = - token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; - - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - - // key cache - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; - } -} - -template -__global__ void fill_entries_above_diagonal(DT *matrix, - size_t num_rows, - size_t num_cols, - size_t num_q_heads, - size_t entries_above_diagonal, - DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; - } -} - -template -void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif - // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); - int tokens_previous_requests = 0; - int q_block_size = m->qProjSize; - int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); - int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); - assert(m->qProjSize == m->kProjSize); - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase)) { - continue; - } - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - 
bc->requestsInfo[i].num_tokens_in_batch; - // Step 1: compute query-key product QK.T/sqrt(d_k) - { - // Scale by sqrt(d_k) as per the original attention paper - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); - } - // after transpositions - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - // before transpositions - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - // N.B. strides are applied before transpose operations - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - - // matrix A: devQKVProjArray - // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] - // To get query projection, skip over Q entries from previous requests - DT const *A = static_cast
<DT *>(m->devQKVProjArray) + - bc->requestsInfo[i].first_token_offset_in_batch * - m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // matrix B: key cache - // matrix B's layout: [kProjSize * num_heads, total_tokens] - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast<DT *>
<DT *>(m->keyCache) + i * kt_req_block_size; - // matrix C: qk_prods - // matrix C's layout: [num_new_tokens, total_tokens, num_heads] - // To get C, skip over QK.T products from previous requests - DT *C = static_cast<DT *>
(m->qk_prods); - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - // Step 2: Add alibi position bias to qk production - // matrix C: qk_prods - // matrix C's layout: [num_new_tokens, total_tokens, num_heads] - // To get C, skip over QK.T products from previous requests - DT *C = static_cast
(m->qk_prods); - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - - // Step 3: Apply causal mask. Fill all elements above diagonal in qk prods - // with -inf to force causal attention. - assert(num_new_tokens <= total_tokens); - size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; - if (entries_above_diagonal > 0) { - size_t parallelism = m->num_q_heads * entries_above_diagonal; - fill_entries_above_diagonal<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - entries_above_diagonal, - static_cast
(-INFINITY)); - } - - // Step 4: Compute Softmax(QK.T/sqrt(d_k)) - { - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); - } - // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ - // softmax(QK.T/sqrt(d_k)).T - { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_ = m->vProjSize; - int n = num_new_tokens; - int k = total_tokens; - // before transpositions - int lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - // N.B. strides are applied before transpose operations - int strideA = vt_block_size; - int strideB = num_new_tokens * total_tokens; - int strideC = m->vProjSize; - // matrix A: value cache - // matrix A's layout: [vProjSize, num_heads, total_tokens] - // To get A, skip over V.T entries from previous requests (all heads + - // padding) - DT *A = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size; - // matrix B: qk_prods_softmax - // matrix B's layout: [num_new_tokens, total_tokens, num_heads] - // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous - // requests (all heads) - DT *B = static_cast<DT *>
<DT *>(m->qk_prods_softmax); - ; - // matrix C: attn heads - // matrix C's layout: [vProjSize, num_heads, num_new_tokens] - // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous - // requests - // store the result attn heads, also skip the genration tokens - DT *C = static_cast<DT *>
(m->attn_heads) + - (bc->requestsInfo[i].first_token_offset_in_batch) * - m->num_q_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - tokens_previous_requests += num_new_tokens; - } - if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { - bc->print(); - printf("tokens_previous_requests: %i\n", tokens_previous_requests); - printf("num_tokens: %i\n", num_tokens); - printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); - } - assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); -} - /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( - IncMultiHeadSelfAttentionMeta const *m, + IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, @@ -1147,12 +305,10 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(get_legion_stream(&stream)); bool use_bias = *m->qkv_bias || *m->final_bias; - cudaEvent_t t_start, t_end; - if (m->profiling) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start, stream); - } + // cudaEvent_t t_start, t_end; + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); @@ -1162,11 +318,11 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( if (input.data_type == DT_HALF) { if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); + pre_build_weight(m, weight, input.data_type, stream); } half const *bias_ptr = use_bias ? bias.get_half_ptr() : static_cast(nullptr); - Kernels::IncMultiHeadAttention::inference_kernel( + Kernels::IncMultiHeadAttention::inference_kernel( m, bc, shard_id, @@ -1177,11 +333,11 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( stream); } else if (input.data_type == DT_FLOAT) { if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); + pre_build_weight(m, weight, input.data_type, stream); } float const *bias_ptr = use_bias ? 
bias.get_float_ptr() : static_cast(nullptr); - Kernels::IncMultiHeadAttention::inference_kernel( + Kernels::IncMultiHeadAttention::inference_kernel( m, bc, shard_id, @@ -1195,15 +351,13 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( assert(false && "Unspported data type"); } - if (m->profiling) { - cudaEventRecord(t_end, stream); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - printf("IncMultiHeadSelfAttention forward time = %.9fms\n", elapsed); - } + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // printf("IncMultiHeadSelfAttention forward time = %.9fms\n", elapsed); } IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( @@ -1217,14 +371,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( : IncMultiHeadSelfAttentionMeta(handler, INC_DECODING_MODE, attn, - attn->qSize, - attn->kSize, - attn->vSize, - attn->qProjSize, - attn->kProjSize, - attn->vProjSize, - attn->oProjSize, - attn->apply_rotary_embedding, + attn->hidden_size, + attn->qk_dim, + attn->v_dim, + attn->o_dim, + attn->rotary_embedding_meta, attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, @@ -1239,20 +390,18 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( _num_q_heads, _num_kv_heads, attn->quantization_type, - attn->offload) {} + attn->offload, + attn->streaming_cache) {} IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, InferenceMode infer_mode, Op const *attn, - int _qSize, - int _kSize, - int _vSize, - int _qProjSize, - int _kProjSize, - int _vProjSize, - int _oProjSize, - bool _apply_rotary_embedding, + int _hidden_size, + int _qk_dim, + int _v_dim, + int _o_dim, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _qkv_bias, bool _scaling_query, bool _qk_prod_scaling, @@ -1267,54 +416,49 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( int _num_q_heads, int _num_kv_heads, DataType _quantization_type, - bool _offload) + bool _offload, + bool _streaming_cache) : OpMeta(handler, attn), weight_ptr(nullptr), bias_ptr(nullptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); checkCUDNN(cudnnCreateTensorDescriptor(&qk_tensor)); - qSize = _qSize; - kSize = _kSize; - vSize = _vSize; - // assume dimensions match for now - assert(qSize == kSize); - assert(kSize == vSize); - qProjSize = _qProjSize; - kProjSize = _kProjSize; - assert(qProjSize == kProjSize); // required for attention QK.T matmul - vProjSize = _vProjSize; - oProjSize = _oProjSize; + hidden_size = _hidden_size; + qk_dim = _qk_dim; + v_dim = _v_dim; + o_dim = _o_dim; size_t size_of_dt = data_type_size(attn->data_type); quantization_type = _quantization_type; offload = _offload; + streaming_cache = _streaming_cache; global_num_q_heads = _global_num_q_heads; global_num_kv_heads = _global_num_kv_heads; num_q_heads = _num_q_heads; num_kv_heads = _num_kv_heads; - hidden_size = num_q_heads * qProjSize; + local_hidden_size = num_q_heads * qk_dim; weightSize = - ((qSize * qProjSize + oProjSize * (vProjSize > 0 ? vProjSize : vSize)) * + ((hidden_size * qk_dim + o_dim * (v_dim > 0 ? 
v_dim : hidden_size)) * num_q_heads + - (kSize * kProjSize + vSize * vProjSize) * num_q_heads) * + (hidden_size * qk_dim + hidden_size * v_dim) * num_q_heads) * size_of_dt; if (quantization_type != DT_NONE) { quantized_weightSize = get_quantization_to_byte_size( attn->data_type, quantization_type, weightSize); } - // biasSize = _bias ? oProjSize * size_of_dt * 4 : 0; + // biasSize = _bias ? o_dim * size_of_dt * 4 : 0; - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int final_bias_size = oProjSize; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; + int final_bias_size = o_dim; biasSize = (_qkv_bias ? qkv_bias_size : 0) + (final_bias ? final_bias_size : 0); // has_load_weights = (bool *)calloc(1, sizeof(bool)); //*has_load_weights = false; - apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); - *apply_rotary_embedding = _apply_rotary_embedding; + rotary_embedding_meta = + (RotaryEmbeddingMeta *)calloc(1, sizeof(RotaryEmbeddingMeta)); + *rotary_embedding_meta = _rotary_embedding_meta; qkv_bias = (bool *)calloc(1, sizeof(bool)); *qkv_bias = _qkv_bias; scaling_query = (bool *)calloc(1, sizeof(bool)); @@ -1335,51 +479,83 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = infer_mode == TREE_VERIFY_MODE - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(); - size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads + - kProjSize * num_q_heads + - vProjSize * num_q_heads); - size_t key_cache_size = 0, value_cache_size = 0; + int max_tokens_per_batch = std::max( + infer_mode == TREE_SEARCH_MODE ? BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()); + size_t qkv_max_proj_size = + max_tokens_per_batch * + (qk_dim * num_q_heads + qk_dim * num_q_heads + v_dim * num_q_heads); + size_t query_tmp_size = 0, key_cache_size = 0, value_cache_size = 0; + size_t streaming_pre_pos_enc_size = 0; + // assert((BatchConfig::max_sequence_length() + + // BatchConfig::max_spec_tree_token_num()) % + // kPagesize == + // 0); + size_t max_num_pages = + round_up_pages(BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); + PageManager *pm = PageManager::get_page_manager(); + size_t total_kv_cache_size_per_layer = pm->get_kv_cache_size_per_layer(); switch (infer_mode) { - case INC_DECODING_MODE: { - key_cache_size = num_q_heads * kProjSize * - BatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length(); - value_cache_size = num_q_heads * vProjSize * + case INC_DECODING_MODE: + case TREE_VERIFY_MODE: { + query_tmp_size = num_q_heads * qk_dim * max_tokens_per_batch; + // a K-ary tree max node is (k^n - 1) / 2 + if (total_kv_cache_size_per_layer == 0) { + key_cache_size = num_kv_heads * qk_dim * BatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length(); + max_num_pages * kPagesize; + value_cache_size = num_kv_heads * v_dim * + BatchConfig::max_requests_per_batch() * + max_num_pages * kPagesize; + } else { + key_cache_size = total_kv_cache_size_per_layer / 2 / size_of_dt; + value_cache_size = total_kv_cache_size_per_layer / 2 / size_of_dt; + } break; } - case BEAM_SEARCH_MODE: - case TREE_VERIFY_MODE: { + case TREE_SEARCH_MODE: { + query_tmp_size = num_q_heads * qk_dim * max_tokens_per_batch; // a K-ary tree max node is (k^n - 1) / 2 - key_cache_size = num_q_heads * 
kProjSize * - BeamSearchBatchConfig::max_requests_per_batch() * - (BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num()); - value_cache_size = num_q_heads * vProjSize * - BeamSearchBatchConfig::max_requests_per_batch() * - (BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num()); + key_cache_size = num_kv_heads * qk_dim * + BatchConfig::max_requests_per_batch() * max_num_pages * + kPagesize; + value_cache_size = num_kv_heads * v_dim * + BatchConfig::max_requests_per_batch() * + max_num_pages * kPagesize; break; } default: assert(false && "Unkown inference mode"); } - size_t requestinfo_size = BatchConfig::max_requests_per_batch(); - // size_t tokeninfo_size = max_tokens_per_batch; - size_t qk_prod_size = - max_tokens_per_batch * BatchConfig::max_sequence_length() * num_q_heads; - size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize; - size_t complex_size = (max_tokens_per_batch * (qProjSize * num_q_heads + - kProjSize * num_q_heads)) / - 2; + if (streaming_cache) { + size_t max_post_pos_enc_pages = round_up_pages( + BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth() + + max(max_tokens_per_batch, BatchConfig::max_spec_tree_token_num())); + key_cache_size = num_kv_heads * qk_dim * + BatchConfig::max_requests_per_batch() * + max_post_pos_enc_pages * kPagesize; + value_cache_size = num_kv_heads * v_dim * + BatchConfig::max_requests_per_batch() * + max_post_pos_enc_pages * kPagesize; + streaming_pre_pos_enc_size = + num_kv_heads * (qk_dim + v_dim) * + BatchConfig::max_requests_per_batch() * + round_up_pages(BatchConfig::MAX_STREAMING_POS - + BatchConfig::get_max_tree_depth()) * + kPagesize; + } + size_t attn_heads_size = max_tokens_per_batch * num_q_heads * v_dim; + size_t output_tmp_size = max_tokens_per_batch * num_q_heads * v_dim; + size_t complex_size = + (max_tokens_per_batch * (qk_dim * num_q_heads + qk_dim * num_q_heads)) / + 2; size_t totalSize = - (qkv_max_proj_size + key_cache_size + value_cache_size + - 2 * qk_prod_size + attn_heads_size) * + (qkv_max_proj_size + query_tmp_size + key_cache_size + + value_cache_size + streaming_pre_pos_enc_size + attn_heads_size) * size_of_dt + + output_tmp_size * data_type_size(DT_HALF) + complex_size * sizeof(cuFloatComplex); // more components will // be added here later if (offload) { @@ -1387,15 +563,20 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_t totalSharedSize = infer_mode == TREE_VERIFY_MODE ? totalSize - - (key_cache_size + value_cache_size + qkv_max_proj_size) * + (query_tmp_size + key_cache_size + value_cache_size + + streaming_pre_pos_enc_size + qkv_max_proj_size) * size_of_dt - : totalSize - (key_cache_size + value_cache_size) * size_of_dt; + : totalSize - (query_tmp_size + key_cache_size + + value_cache_size + streaming_pre_pos_enc_size) * + size_of_dt; size_t instance_size = size_of_dt * (infer_mode == TREE_VERIFY_MODE - ? key_cache_size + value_cache_size + qkv_max_proj_size - : key_cache_size + value_cache_size); + ? 
query_tmp_size + key_cache_size + value_cache_size + + streaming_pre_pos_enc_size + qkv_max_proj_size + : query_tmp_size + key_cache_size + value_cache_size + + streaming_pre_pos_enc_size); if (quantization_type != DT_NONE) { totalSharedSize += quantized_weightSize; @@ -1403,44 +584,54 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( assert(gpu_mem_allocator.reserved_total_size - gpu_mem_allocator.reserved_allocated_size >= totalSharedSize); - gpu_mem_allocator.create_legion_instance(reserveInst, instance_size); + gpu_mem_allocator.create_legion_instance( + reserveInst, instance_size, "IncMultiHeadSelfAttentionMeta"); } else { - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "IncMultiHeadSelfAttentionMeta"); } // in tree_verify, enable devQKVProjArray; - if (!offload || infer_mode == TREE_VERIFY_MODE) { - devQKVProjArray = gpu_mem_allocator.allocate_instance_untyped( + if (offload) { + devQKVProjArray = gpu_mem_allocator.allocate_reserved_untyped( qkv_max_proj_size * size_of_dt); } else { - devQKVProjArray = gpu_mem_allocator.allocate_reserved_untyped( + devQKVProjArray = gpu_mem_allocator.allocate_instance_untyped( qkv_max_proj_size * size_of_dt); - // offset += qkv_max_proj_size * size_of_dt; } // use key value cache in all mode. - keyCache = gpu_mem_allocator.allocate_instance_untyped(key_cache_size * - size_of_dt); - valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * + if (query_tmp_size > 0) { + queryTmp = gpu_mem_allocator.allocate_instance_untyped(query_tmp_size * size_of_dt); + } + kvCache = gpu_mem_allocator.allocate_instance_untyped( + (key_cache_size + value_cache_size) * size_of_dt); + if (streaming_pre_pos_enc_size > 0) { + streamingPrePosEncBuf = gpu_mem_allocator.allocate_instance_untyped( + streaming_pre_pos_enc_size * size_of_dt); + } + outputTmp = gpu_mem_allocator.allocate_instance(output_tmp_size); token_infos = static_cast(handler.batch_config_metadata); request_infos = reinterpret_cast( reinterpret_cast(handler.batch_config_metadata) + sizeof(BatchConfig::tokensInfo)); + request_available = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo)); + streaming_cache_infos = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::request_available) + + sizeof(BatchConfig::causalMask)); if (offload) { // token_infos = // gpu_mem_allocator.allocate_reserved( // tokeninfo_size); // offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; - qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size * - size_of_dt); - // offset += qk_prod_size * size_of_dt; - qk_prods_softmax = gpu_mem_allocator.allocate_reserved_untyped( - qk_prod_size * size_of_dt); - // offset += qk_prod_size * size_of_dt; attn_heads = gpu_mem_allocator.allocate_reserved_untyped(attn_heads_size * size_of_dt); // offset += attn_heads_size * size_of_dt; @@ -1454,10 +645,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // token_infos = // gpu_mem_allocator.allocate_instance( // tokeninfo_size); - qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size * - size_of_dt); - qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( - qk_prod_size * size_of_dt); attn_heads = gpu_mem_allocator.allocate_instance_untyped(attn_heads_size * size_of_dt); 
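To make the paged KV-cache sizing in this constructor concrete, here is a small standalone sketch that mirrors the key_cache_size / value_cache_size products above. It is an editor's illustration, not part of the patch: the head counts, page size, and batch limits are assumed example values, and round_up_pages is re-derived inline.

#include <cstddef>
#include <cstdio>

int main() {
  // All values below are assumed examples, not the patch's defaults.
  const std::size_t kPagesize = 64;        // tokens stored per KV page
  const std::size_t max_seq_len = 1024;    // BatchConfig::max_sequence_length()
  const std::size_t spec_tree_tokens = 64; // BatchConfig::max_spec_tree_token_num()
  const std::size_t num_kv_heads = 8, qk_dim = 128, v_dim = 128;
  const std::size_t max_requests = 16;     // BatchConfig::max_requests_per_batch()
  const std::size_t bytes_per_elem = 2;    // half-precision cache entries

  // round_up_pages(): round the per-request token budget up to whole pages.
  std::size_t max_num_pages =
      (max_seq_len + spec_tree_tokens + kPagesize - 1) / kPagesize;

  // Same products as key_cache_size and value_cache_size above.
  std::size_t key_elems =
      num_kv_heads * qk_dim * max_requests * max_num_pages * kPagesize;
  std::size_t value_elems =
      num_kv_heads * v_dim * max_requests * max_num_pages * kPagesize;

  std::printf("pages per request: %zu, KV cache per layer: %zu MiB\n",
              max_num_pages,
              (key_elems + value_elems) * bytes_per_elem / (1024 * 1024));
  return 0;
}

In the patch itself the key and value regions are carved out of a single kvCache allocation, and a nonzero total_kv_cache_size_per_layer reported by the PageManager overrides this per-request bound.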
complex_input = @@ -1478,6 +665,13 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( gpu_mem_allocator.reserved_allocated_size); } } + + // set attention constants + handler.incr_attention_metadata->set_enabled(true); + handler.incr_attention_metadata->set_num_q_heads(num_q_heads); + handler.incr_attention_metadata->set_num_kv_heads(num_kv_heads); + handler.incr_attention_metadata->set_head_dim(qk_dim); + cudaStreamSynchronize(stream); } @@ -1487,49 +681,4 @@ IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) { } } -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( - IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - cudaStream_t stream); - -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( - IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - cudaStream_t stream); - -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - float *output_ptr, - float const *weight_ptr, - float const *bias_ptr, - int num_tokens, - cudaStream_t stream); - -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - half *output_ptr, - half const *weight_ptr, - half const *bias_ptr, - int num_tokens, - cudaStream_t stream); - -template void - Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - float *output_ptr, - cudaStream_t stream); - -template void - Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - half *output_ptr, - cudaStream_t stream); }; // namespace FlexFlow diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu index 22d8161ff..f58d2dde9 100644 --- a/src/ops/kernels/embedding_kernels.cu +++ b/src/ops/kernels/embedding_kernels.cu @@ -48,7 +48,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else if (weight.data_type == DT_FLOAT) { Internal::forward_kernel(input.get_int32_ptr(), @@ -58,7 +58,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else if (weight.data_type == DT_DOUBLE) { Internal::forward_kernel(input.get_int32_ptr(), @@ -68,7 +68,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else { assert(false && "Unsupported DataType in Embedding"); @@ -82,7 +82,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else if (weight.data_type == DT_FLOAT) { Internal::forward_kernel(input.get_int64_ptr(), @@ -92,7 +92,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else if (weight.data_type == DT_DOUBLE) { Internal::forward_kernel(input.get_int64_ptr(), @@ -102,7 +102,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else { assert(false && 
"Unsupported DataType in Embedding"); @@ -139,7 +139,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else if (m->output_type[0] == DT_FLOAT) { Internal::backward_kernel(input.get_int32_ptr(), @@ -149,7 +149,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else if (m->output_type[0] == DT_DOUBLE) { Internal::backward_kernel(input.get_int32_ptr(), @@ -159,7 +159,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else { assert(false && "Unsupported DataType in Embedding"); @@ -173,7 +173,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else if (m->output_type[0] == DT_FLOAT) { Internal::backward_kernel(input.get_int64_ptr(), @@ -183,7 +183,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else if (m->output_type[0] == DT_DOUBLE) { Internal::backward_kernel(input.get_int64_ptr(), @@ -193,7 +193,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m, out_dim, batch_size, m->aggr, - output.domain.get_volume(), + out_dim * batch_size, stream); } else { assert(false && "Unsupported DataType in Embedding"); diff --git a/src/ops/kernels/gemm_impl.cu b/src/ops/kernels/gemm_impl.cu new file mode 100644 index 000000000..939eaeb3b --- /dev/null +++ b/src/ops/kernels/gemm_impl.cu @@ -0,0 +1,559 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/kernels/gemm_impl.h" +#include "flexflow/utils/cuda_helper.h" +#include + +namespace Internal { + +GemmEngine::GemmEngine(cublasHandle_t blas_, + cublasLtHandle_t blasLt_, + cudaDeviceProp *device_prop_, + size_t workspace_size_) { + blas = blas_; + blasLt = blasLt_; + if (device_prop_ == nullptr) { + device_prop = new cudaDeviceProp; + } else { + device_prop = device_prop_; + } + workspace_size = workspace_size_; + workspace = nullptr; +} + +void GemmEngine::assign_workspace(void *workspace_, size_t workspace_size_) { + assert(workspace_size_ >= workspace_size); + workspace = workspace_; +} + +template +void GemmEngine::gemm_internal(cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + Dtype alpha, + Dtype const *a, + int64_t lda, + Dtype const *b, + int64_t ldb, + Dtype beta, + Dtype *c, + int64_t ldc, + cudaStream_t stream) { + static_assert(false && sizeof(Dtype), "gemm_internal: not implemented"); +} + +#ifdef USE_CUBLASLT +/* Implementations for gemm_internal_cublaslt */ +template +struct CuBlasLtDeleter { + void operator()(T *x) { + if (x != nullptr) { + checkCUDA(destructor(x)); + } + } +}; + +template +class CuBlasLtDescriptor { +public: + T *descriptor() const { + return descriptor_.get(); + } + T *descriptor() { + return descriptor_.get(); + } + +protected: + std::unique_ptr> descriptor_; +}; + +class CuBlasLtMatmulDescriptor + : public CuBlasLtDescriptor { +public: + CuBlasLtMatmulDescriptor(cublasComputeType_t compute_type, + cudaDataType_t scale_type) { + cublasLtMatmulDesc_t raw_descriptor = nullptr; + checkCUDA( + cublasLtMatmulDescCreate(&raw_descriptor, compute_type, scale_type)); + descriptor_.reset(raw_descriptor); + } + template + inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) { + // NOLINTNEXTLINE(bugprone-sizeof-expression) + checkCUDA(::cublasLtMatmulDescSetAttribute( + descriptor(), attr, &value, sizeof(T))); + } +}; + +class CuBlasLtMatrixLayout + : public CuBlasLtDescriptor { +public: + CuBlasLtMatrixLayout(cudaDataType_t type, + uint64_t rows, + uint64_t cols, + int64_t ld, + bool t = false) { + cublasLtMatrixLayout_t raw_descriptor = nullptr; + checkCUDA(cublasLtMatrixLayoutCreate( + &raw_descriptor, type, t ? cols : rows, t ? 
rows : cols, ld)); + descriptor_.reset(raw_descriptor); + } + template + inline void setAttribute(cublasLtMatrixLayoutAttribute_t attr, + const T value) { + checkCUDA(::cublasLtMatrixLayoutSetAttribute( + descriptor(), attr, &value, sizeof(T))); + } +}; + +class CuBlasLtMatmulPreference + : public CuBlasLtDescriptor { +public: + CuBlasLtMatmulPreference() { + cublasLtMatmulPreference_t raw_descriptor = nullptr; + checkCUDA(cublasLtMatmulPreferenceCreate(&raw_descriptor)); + descriptor_.reset(raw_descriptor); + } + template + inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, + const T value) { + checkCUDA(::cublasLtMatmulPreferenceSetAttribute( + descriptor(), attr, &value, sizeof(T))); + } +}; + +inline uint32_t _getAlignment(uintptr_t address) { + // alignment are in bytes + uint32_t alignment = 256; + for (;; alignment /= 2) { + if (!(address % alignment)) { + return alignment; + } + } +} + +template +inline void gemm_internal_cublaslt(cublasLtHandle_t handle, + cudaDeviceProp *prop, + void *workspace, + size_t workspace_size, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + Dtype alpha, + Dtype const *a, + int64_t lda, + Dtype const *b, + int64_t ldb, + Dtype beta, + Dtype *c, + int64_t ldc, + cudaStream_t stream) { + assert(workspace != nullptr && "workspace must be provided."); + cudaDataType_t abcType = CUDA_R_32F; + cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; + cudaDataType_t scaleType = CUDA_R_32F; + if constexpr (std::is_same_v) { + abcType = CUDA_R_64F; + computeType = CUBLAS_COMPUTE_64F; + scaleType = CUDA_R_64F; + } else if constexpr (std::is_same_v) { + computeType = CUBLAS_COMPUTE_32F_FAST_TF32; + } else if constexpr (std::is_same_v) { + abcType = CUDA_R_16F; + computeType = CUBLAS_COMPUTE_16F; + } else { + static_assert(false && sizeof(Dtype), + "bgemm_internal_cublaslt: not implemented"); + } + + CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, transa); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, transb); + CuBlasLtMatrixLayout Adesc(abcType, m, k, lda, transa == CUBLAS_OP_T); + CuBlasLtMatrixLayout Bdesc(abcType, k, n, ldb, transb == CUBLAS_OP_T); + CuBlasLtMatrixLayout Cdesc(abcType, m, n, ldc); + + CuBlasLtMatmulPreference preference; + preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + workspace_size); + + uint32_t a_alignment = _getAlignment(reinterpret_cast(a)); + uint32_t b_alignment = _getAlignment(reinterpret_cast(b)); + uint32_t c_alignment = _getAlignment(reinterpret_cast(c)); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES, + a_alignment); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES, + b_alignment); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES, + c_alignment); + + cublasLtMatmulHeuristicResult_t heuristicResult = {}; + int returnedResult = 0; + checkCUDA(cublasLtMatmulAlgoGetHeuristic(handle, + computeDesc.descriptor(), + Adesc.descriptor(), + Bdesc.descriptor(), + Cdesc.descriptor(), + Cdesc.descriptor(), + preference.descriptor(), + 1, + &heuristicResult, + &returnedResult)); + if (returnedResult == 0) { + assert(false && "cuBLASLt failed to find a valid algorithm."); + } + + checkCUDA(cublasLtMatmul(handle, + computeDesc.descriptor(), + &alpha, + a, + Adesc.descriptor(), + b, + Bdesc.descriptor(), + &beta, + c, + Cdesc.descriptor(), + c, + Cdesc.descriptor(), + &heuristicResult.algo, + workspace, + workspace_size, + 
stream)); +} +#else +/* Implementations for gemm_internal_cublas */ +template +inline void gemm_internal_cublas(cublasHandle_t handle, + cudaDeviceProp *prop, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + Dtype alpha, + Dtype const *a, + int64_t lda, + Dtype const *b, + int64_t ldb, + Dtype beta, + Dtype *c, + int64_t ldc, + cudaStream_t stream) { + static_assert(false && sizeof(Dtype), + "gemm_internal_cublas: not implemented"); +} + +template <> +void gemm_internal_cublas(cublasHandle_t handle, + cudaDeviceProp *prop, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + double alpha, + double const *a, + int64_t lda, + double const *b, + int64_t ldb, + double beta, + double *c, + int64_t ldc, + cudaStream_t stream) { + checkCUDA(cublasDgemm( + handle, transa, transb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)); +} + +template <> +void gemm_internal_cublas(cublasHandle_t handle, + cudaDeviceProp *prop, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + float const *a, + int64_t lda, + float const *b, + int64_t ldb, + float beta, + float *c, + int64_t ldc, + cudaStream_t stream) { + checkCUDA(cublasSgemm( + handle, transa, transb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)); + // checkCUDA(cublasGemmEx( + // handle, + // transa, + // transb, + // m, + // n, + // k, + // &alpha, + // a, + // CUDA_R_32F, + // lda, + // b, + // CUDA_R_32F, + // ldb, + // &beta, + // c, + // CUDA_R_32F, + // ldc, + // CUBLAS_COMPUTE_32F_FAST_16F, + // CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +} + +template <> +void gemm_internal_cublas(cublasHandle_t handle, + cudaDeviceProp *prop, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + half alpha, + half const *a, + int64_t lda, + half const *b, + int64_t ldb, + half beta, + half *c, + int64_t ldc, + cudaStream_t stream) { + if (prop->major >= 5) { + // Disallow fp16 reductions that could lead to unexpected overflow issues. 
+ // cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH; + // if (!at::globalContext().allowFP16ReductionCuBLAS()) { + // cublas_flags = static_cast(cublas_flags | + // CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); + // } + // checkCUDA(cublasSetMathMode(handle, cublas_flags)); + checkCUDA(cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + &alpha, + a, + CUDA_R_16F, + lda, + b, + CUDA_R_16F, + ldb, + &beta, + c, + CUDA_R_16F, + ldc, + CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // checkCUDA(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); + } else { + float falpha = alpha; + float fbeta = beta; + checkCUDA(cublasSgemmEx(handle, + transa, + transb, + m, + n, + k, + &falpha, + a, + CUDA_R_16F, + lda, + b, + CUDA_R_16F, + ldb, + &fbeta, + c, + CUDA_R_16F, + ldc)); + } +} +#endif + +template <> +void GemmEngine::gemm_internal(cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + double alpha, + double const *a, + int64_t lda, + double const *b, + int64_t ldb, + double beta, + double *c, + int64_t ldc, + cudaStream_t stream) { +#ifdef USE_CUBLASLT + gemm_internal_cublaslt(blasLt, + device_prop, + workspace, + workspace_size, + transa, + transb, + m, + n, + k, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc, + stream); +#else + gemm_internal_cublas(blas, + device_prop, + transa, + transb, + m, + n, + k, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc, + stream); +#endif +} + +template <> +void GemmEngine::gemm_internal(cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + float const *a, + int64_t lda, + float const *b, + int64_t ldb, + float beta, + float *c, + int64_t ldc, + cudaStream_t stream) { +#ifdef USE_CUBLASLT + gemm_internal_cublaslt(blasLt, + device_prop, + workspace, + workspace_size, + transa, + transb, + m, + n, + k, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc, + stream); +#else + gemm_internal_cublas(blas, + device_prop, + transa, + transb, + m, + n, + k, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc, + stream); +#endif +} + +template <> +void GemmEngine::gemm_internal(cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + half alpha, + half const *a, + int64_t lda, + half const *b, + int64_t ldb, + half beta, + half *c, + int64_t ldc, + cudaStream_t stream) { +#ifdef USE_CUBLASLT + gemm_internal_cublaslt(blasLt, + device_prop, + workspace, + workspace_size, + transa, + transb, + m, + n, + k, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc, + stream); +#else + gemm_internal_cublas(blas, + device_prop, + transa, + transb, + m, + n, + k, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc, + stream); +#endif +} +} // namespace Internal diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu new file mode 100644 index 000000000..9bb58794a --- /dev/null +++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu @@ -0,0 +1,1118 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "flexflow/batch_config.h" +#include +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "cuComplex.h" +#endif +#include "flashinfer/pos_enc.cuh" +#include "flexflow/attention_config.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/inc_multihead_self_attention.h" +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" +#include "flexflow/utils/cuda_helper.h" +#include + +namespace FlexFlow { + +// declare Legion names +using Legion::coord_t; +using Legion::Memory; + +using flashinfer::BatchQKApplyLlama31Rotary; +using flashinfer::BatchQKApplyRotary; + +#define WARP_SIZE 32 + +namespace Kernels { +namespace IncMultiHeadAttention { + +// only used by MPT model. https://arxiv.org/abs/2108.12409 +template +__global__ void apply_position_bias_qkprd(DT *input_ptr, + int num_tokens, + int num_total_tokens, + int num_heads, + int global_num_q_heads, + int shard_id) { + CUDA_KERNEL_LOOP(i, num_tokens * num_total_tokens * num_heads) { + // get head_idx, + int head_idx = i / (num_tokens * num_total_tokens) + (num_heads * shard_id); + int position_idx = (i / num_tokens) % num_total_tokens; + position_idx = position_idx + 1 - num_total_tokens; + // 8 is alibi_bias_max in + // https://huggingface.co/mosaicml/mpt-30b/blob/main/config.json + float base = (float)(head_idx + 1) * 8 / global_num_q_heads; + float slopes = 1.0 / pow(2, base); + // if(i == 0){ + // printf("see position: %d, %f, %f, %f\n", position_idx, base, slopes, + // position_idx * slopes); + // } + input_ptr[i] += static_cast
(position_idx * slopes); + } +} + +template +__global__ void apply_proj_bias_w(DT *input_ptr, + DT const *bias_ptr, + int num_tokens, + int qkv_weight_size, + int o_dim) { + CUDA_KERNEL_LOOP(i, num_tokens * o_dim) { + int bias_idx = qkv_weight_size + i % o_dim; + input_ptr[i] += bias_ptr[bias_idx]; + } +} + +template +__global__ void apply_proj_bias_qkv(DT *input_ptr, + DT const *bias_ptr, + int shard_id, + int num_tokens, + int qk_dim, + int v_dim, + int global_num_q_heads, + int num_q_heads, + bool scaling_query, + float scaling_factor, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * QKV_WEIGHT_NUM) { + // for simplicity, assume q, k, v is in same shape + // 0->q, 1->k, 2->v + // int qkv_index = i / (num_tokens * qk_dim) % 3; + + int token_idx = i / (hidden_size * QKV_WEIGHT_NUM); + size_t in_token_idx = i - token_idx * hidden_size * QKV_WEIGHT_NUM; + + int qkv_index = in_token_idx / hidden_size; + + int proj_size = qkv_index == 0 ? qk_dim : qk_dim; + + int head_idx = + (in_token_idx - qkv_index * num_q_heads * proj_size) / proj_size; + int global_head_idx = head_idx + shard_id * num_q_heads; + + size_t pre_length = + qkv_index == 0 + ? 0 + : (qkv_index == 1 ? qk_dim * global_num_q_heads + : qk_dim * global_num_q_heads * KV_WEIGHT_NUM); + + size_t bias_idx = pre_length + global_head_idx * proj_size + i % proj_size; + + input_ptr[i] += bias_ptr[bias_idx]; + + if (scaling_query && qkv_index == 0) { + input_ptr[i] *= scaling_factor; + } + } +} + +template +__global__ void scaling_query_kernel(DT *input_ptr, + int qk_dim, + int num_tokens, + int num_q_heads, + float scaling_factor, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + input_ptr[i % hidden_size + token_idx * hidden_size * QKV_WEIGHT_NUM] *= + scaling_factor; + } +} + +template +void compute_qkv(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + cudaStream_t stream) { + + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + // int device; + // checkCUDA(cudaGetDevice(&device)); + // cudaEvent_t t_start, t_end; + // checkCUDA(cudaEventCreate(&t_start)); + // checkCUDA(cudaEventCreate(&t_end)); + // checkCUDA(cudaEventRecord(t_start, stream)); + + // Step 1: Compute QKV projections + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_q = m->qk_dim * m->num_q_heads; + int m_k = m->qk_dim * m->num_q_heads; + int m_v = m->v_dim * m->num_q_heads; + assert(m_q == m_k && m_k == m_v); // keep things simple for now + int n = bc->num_active_tokens(); + int k = m->hidden_size; + int m_ = m_q * QKV_WEIGHT_NUM; + // before transpositions + int lda = k, ldb = k, ldc = m_; + // matrix A: QKV weights + // matrix A's layout: [hidden_size (hidden_dim), qk_dim, num_heads, 3] + // matrix B: input + // matrix B's layout: [hidden_size (hidden_dim), num_new_tokens] + // matrix C: devQKVProjArray + // matrix B's layout: [qk_dim, num_heads, 3, num_new_tokens] + m->handle.gemm_engine->gemm_internal(CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + alpha, + weight_ptr, + lda, + input_ptr, + ldb, + beta, + output_ptr, + ldc, + stream); + } + + // checkCUDA(cudaEventRecord(t_end, stream)); + // checkCUDA(cudaEventSynchronize(t_end)); + // float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // 
cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (bc->inference_mode == TREE_VERIFY_MODE and device == 0) { + // std::cout << "GEMM time: " << elapsed << " ms\n"; + // } + + int num_tokens = bc->num_active_tokens(); + if (num_tokens == 0) { + return; + } + int parallelism = m->qk_dim * num_tokens * m->num_q_heads; + + // Step 2: apply bias for QKV, or scale the query + if (*m->qkv_bias) { + apply_proj_bias_qkv<<>>(output_ptr, + bias_ptr, + shard_id, + num_tokens, + m->qk_dim, + m->v_dim, + m->global_num_q_heads, + m->num_q_heads, + *m->scaling_query, + m->scaling_factor, + m->local_hidden_size); + } else if (m->scaling_query) { + scaling_query_kernel<<>>(output_ptr, + num_tokens, + m->num_q_heads, + m->qk_dim, + m->scaling_factor, + m->local_hidden_size); + } +} + +template +__global__ void apply_pos_encoding_to_tokens_in_batch_kernel( + DT *input_ptr, + BatchConfig::PerTokenInfo const *tokenInfos, + float rope_theta, + bool llama3_rope, + float factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, + int qk_dim, + int num_tokens, + size_t q_array_size, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + // create complex number + bool q_tensor = i < (q_array_size / 2); + int proj_size = q_tensor ? qk_dim : qk_dim; + int real_i = q_tensor ? i : i - q_array_size / 2; + + int token_idx = real_i / (hidden_size / 2); + int idx = real_i % (proj_size / 2); + int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2); + + int real_part_index = idx + head_idx * proj_size + + token_idx * hidden_size * QKV_WEIGHT_NUM + + hidden_size * (q_tensor ? 0 : 1); + int complex_part_index = real_part_index + (proj_size / 2); + + cuFloatComplex cii = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + // get the freq_cis: shape 1 * (qk_dim/2) = 1 * 64 + // apply a Cartesian coordinate transformation + // multiple with input & /copy back to q/k + + // get position of token + + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + float freq = pos * (1.0 / pow(rope_theta, (float)2 * idx / proj_size)); + + if (llama3_rope) { + float pi = CUDART_PI_F; + float wavelen = 2 * pi / freq; + float low_freq_wavelen = + original_max_position_embeddings / low_freq_factor; + float high_freq_wavelen = + original_max_position_embeddings / high_freq_factor; + if (wavelen < high_freq_wavelen) { + } else if (wavelen > low_freq_wavelen) { + freq = freq / factor; + } else { + assert(low_freq_wavelen != high_freq_wavelen); + float smooth = + (original_max_position_embeddings / wavelen - low_freq_factor) / + (high_freq_factor - low_freq_factor); + freq = ((1 - smooth) * freq / factor + smooth * freq); + } + } + + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + cii = cuCmulf(cii, complex_pos); + input_ptr[real_part_index] = cii.x; + input_ptr[complex_part_index] = cii.y; + } +} + +template +void apply_pos_encoding_to_tokens_in_batch( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + // apply rotary embedding if needed + if (!m->rotary_embedding_meta->apply_rotary_embedding) { + return; + } + int num_tokens = bc->num_active_tokens(); + if (num_tokens == 0) { + return; + } + int parallelism = num_tokens * m->local_hidden_size; + size_t q_array_size = m->qk_dim * num_tokens * m->num_q_heads; + bool llama3_rope = (m->rotary_embedding_meta->rope_type == "llama3"); + apply_pos_encoding_to_tokens_in_batch_kernel<<>>( + output_ptr, + m->token_infos, + 
m->rotary_embedding_meta->rope_theta, + llama3_rope, + m->rotary_embedding_meta->factor, + m->rotary_embedding_meta->low_freq_factor, + m->rotary_embedding_meta->high_freq_factor, + m->rotary_embedding_meta->original_max_position_embeddings, + m->qk_dim, + num_tokens, + q_array_size, + m->local_hidden_size); +} + +__global__ void apply_pos_encoding_to_streaming_proj_kernel( + half *kv_cache, + BatchConfig::PerRequestInfo const *requestInfos, + bool const *request_available, + int const max_num_pages, + int num_kv_heads, + int head_dim, + float rope_theta, + bool llama3_rope, + float factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, + StreamingCacheInfo const *streaming_cache_infos, + uint32_t const max_num_requests) { + int const kv_hidden_size = num_kv_heads * head_dim; + int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + int token_idx = thread_idx / (kv_hidden_size / 2); + // Each complex is consist of (i, i + head_dim / 2) wuthin the same head. + int const head_idx = (thread_idx % (kv_hidden_size / 2)) / (head_dim / 2); + int const offset_in_head = thread_idx % (head_dim / 2); + // Get the corresponding request index and token index in the request. + int request_idx = 0; + while (token_idx >= 0 && request_idx < max_num_requests) { + if (request_available[request_idx]) { + token_idx -= streaming_cache_infos[request_idx].commit_len; + } + request_idx++; + } + if (token_idx >= 0) { + return; + } + request_idx--; + token_idx += streaming_cache_infos[request_idx].commit_len; + + // Get the real and complex part index for the current complex. + int const real_part_idx = + get_k_entry_offset( + request_idx, token_idx, max_num_pages, num_kv_heads, head_dim) + + head_idx * head_dim + offset_in_head; + int const complex_part_idx = real_part_idx + head_dim / 2; + + // Apply the rotary position encoding. 
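+  // The pair (kv_cache[real_part_idx], kv_cache[complex_part_idx]) is treated
+  // as one complex number x + i*y; the cuCmulf call below rotates it by the
+  // angle `freq`, i.e. x' = x*cos(freq) - y*sin(freq) and
+  // y' = x*sin(freq) + y*cos(freq).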
+ cuFloatComplex cii = {kv_cache[real_part_idx], kv_cache[complex_part_idx]}; + size_t pos = token_idx; + float freq = + pos * (1.0 / pow(rope_theta, (float)2 * offset_in_head / head_dim)); + + if (llama3_rope) { + float pi = CUDART_PI_F; + float wavelen = 2 * pi / freq; + float low_freq_wavelen = original_max_position_embeddings / low_freq_factor; + float high_freq_wavelen = + original_max_position_embeddings / high_freq_factor; + if (wavelen < high_freq_wavelen) { + } else if (wavelen > low_freq_wavelen) { + freq = freq / factor; + } else { + assert(low_freq_wavelen != high_freq_wavelen); + float smooth = + (original_max_position_embeddings / wavelen - low_freq_factor) / + (high_freq_factor - low_freq_factor); + freq = ((1 - smooth) * freq / factor + smooth * freq); + } + } + + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + cii = cuCmulf(cii, complex_pos); + kv_cache[real_part_idx] = cii.x; + kv_cache[complex_part_idx] = cii.y; +} + +template +void apply_pos_encoding_to_streaming_proj( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream) { + assert(m->streaming_cache); + // apply rotary embedding if needed + if (!m->rotary_embedding_meta->apply_rotary_embedding) { + return; + } + int const kv_hidden_size = m->num_kv_heads * m->qk_dim; + int num_tokens = 0; + for (int req_idx = 0; req_idx < BatchConfig::max_requests_per_batch(); + req_idx++) { + if (!bc->request_available[req_idx]) { + continue; + } + num_tokens += bc->streamingCacheInfo[req_idx].commit_len; + } + if (num_tokens == 0) { + return; + } + int parallelism = num_tokens * kv_hidden_size / 2; + int const max_num_pages = round_up_pages( + BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth() + + BatchConfig::max_spec_tree_token_num()); + bool llama3_rope = (m->rotary_embedding_meta->rope_type == "llama3"); + apply_pos_encoding_to_streaming_proj_kernel<<>>( + static_cast(m->kvCache), + m->request_infos, + m->request_available, + max_num_pages, + m->num_kv_heads, + m->qk_dim, + m->rotary_embedding_meta->rope_theta, + llama3_rope, + m->rotary_embedding_meta->factor, + m->rotary_embedding_meta->low_freq_factor, + m->rotary_embedding_meta->high_freq_factor, + m->rotary_embedding_meta->original_max_position_embeddings, + m->streaming_cache_infos, + bc->max_requests_per_batch()); +} + +template +__global__ void + update_qkv_in_batch_kernel(DT *qkv_proj_array, + half *qTmp_ptr, + half *kvCache_ptr, + BatchConfig::PerTokenInfo const *tokenInfos, + int const max_num_pages, + int num_q_heads, + int num_kv_heads, + int head_dim, + int num_new_tokens) { + int const q_hidden_size = num_q_heads * head_dim; + int const temp_kv_hidden_size = num_q_heads * head_dim; // temporary hard code + int const kv_hidden_size = num_kv_heads * head_dim; + int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + int const token_idx = thread_idx / q_hidden_size; + int const offset = thread_idx % q_hidden_size; + if (token_idx >= num_new_tokens) { + return; + } + + int const req_idx = tokenInfos[token_idx].request_index; + int token_abs_idx = tokenInfos[token_idx].abs_index_in_request; + + size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2); + qTmp_ptr[token_idx * q_hidden_size + offset] = + static_cast(qkv_proj_array[from_idx + offset]); + + if (offset < kv_hidden_size) { + size_t to_k_idx = get_k_entry_offset( + req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim), + to_v_idx = get_v_entry_offset( + req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim); + // 
key and value cache should be stored interleaved + int const stride = num_q_heads / num_kv_heads; + int const kv_offset = + offset / head_dim * stride * head_dim + offset % head_dim; + kvCache_ptr[to_k_idx + offset] = + static_cast(qkv_proj_array[from_idx + q_hidden_size + kv_offset]); + kvCache_ptr[to_v_idx + offset] = + static_cast(qkv_proj_array[from_idx + q_hidden_size + + temp_kv_hidden_size + kv_offset]); + } +} + +template +void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream) { + int num_new_tokens = bc->num_active_tokens(); + if (num_new_tokens == 0) { + return; + } + int parallelism = m->local_hidden_size * num_new_tokens; + int const max_num_pages = + round_up_pages(BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); + update_qkv_in_batch_kernel<<>>(static_cast
(m->devQKVProjArray), + static_cast(m->queryTmp), + static_cast(m->kvCache), + m->token_infos, + max_num_pages, + m->num_q_heads, + m->num_kv_heads, + m->qk_dim, + num_new_tokens); +} + +template +__global__ void update_qkv_in_batch_paged_kernel( + DT *qkv_proj_array, + half *qTmp_ptr, + half *kvCache_ptr, + int32_t *kv_indptr, + int32_t *kv_page_indices, + bool const *request_available, + BatchConfig::PerTokenInfo const *tokenInfos, + int num_q_heads, + int num_kv_heads, + int head_dim, + int num_new_tokens) { + int const q_hidden_size = num_q_heads * head_dim; + int const temp_kv_hidden_size = num_q_heads * head_dim; // temporary hard code + int const kv_hidden_size = num_kv_heads * head_dim; + int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + int const token_idx = thread_idx / q_hidden_size; + int const offset = thread_idx % q_hidden_size; + + if (token_idx >= num_new_tokens) { + return; + } + + int const req_idx = tokenInfos[token_idx].request_index; + int token_abs_idx = tokenInfos[token_idx].abs_index_in_request; + + // calculate the compact request index in the easiest way + // TODO: recheck + int req_idx_compact = -1; + int cnt = 0; + while (cnt < req_idx + 1) { + if (request_available[cnt]) { + req_idx_compact++; + } + cnt++; + } + + assert(req_idx_compact >= 0 && "Invalid request index"); + + size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2); + qTmp_ptr[token_idx * q_hidden_size + offset] = + static_cast(qkv_proj_array[from_idx + offset]); + + if (offset < kv_hidden_size) { + int start = kv_indptr[req_idx_compact]; + int end = kv_indptr[req_idx_compact + 1] - 1; + assert(start <= end && "Invalid kv_indptr"); + assert(start + (token_abs_idx / kPagesize) <= end && "Invalid page index"); + int page_idx = kv_page_indices[start + (token_abs_idx / kPagesize)]; + size_t to_k_idx = get_k_entry_offset_verify( + token_abs_idx, page_idx, num_kv_heads, head_dim), + to_v_idx = get_v_entry_offset_verify( + token_abs_idx, page_idx, num_kv_heads, head_dim); + // key and value cache should be stored interleaved + int const stride = num_q_heads / num_kv_heads; + int const kv_offset = + offset / head_dim * stride * head_dim + offset % head_dim; + kvCache_ptr[to_k_idx + offset] = + static_cast(qkv_proj_array[from_idx + q_hidden_size + kv_offset]); + kvCache_ptr[to_v_idx + offset] = + static_cast(qkv_proj_array[from_idx + q_hidden_size + + temp_kv_hidden_size + kv_offset]); + } +} + +template +void update_qkv_in_batch_paged(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream, + bool is_spec) { + // printf("entered update_qkv_in_batch_verify\n"); + int num_new_tokens = bc->num_active_tokens(); + if (num_new_tokens == 0) { + return; + } + int parallelism = m->local_hidden_size * num_new_tokens; + int32_t *kv_indptr = is_spec + ? m->handle.tree_verify_attention_metadata->kv_indptr + : m->handle.incr_attention_metadata->kv_indptr; + int32_t *kv_indices = + is_spec ? m->handle.tree_verify_attention_metadata->kv_indices + : m->handle.incr_attention_metadata->kv_indices; + update_qkv_in_batch_paged_kernel<<>>( + static_cast
(m->devQKVProjArray), + static_cast(m->queryTmp), + static_cast(m->kvCache), + kv_indptr, + kv_indices, + m->request_available, + m->token_infos, + m->num_q_heads, + m->num_kv_heads, + m->qk_dim, + num_new_tokens); +} + +__global__ void update_kv_in_streaming_cache_kernel( + half *pre_pos_enc_buf, + half *kv_cache, + BatchConfig::PerRequestInfo const *requestInfos, + bool const *request_available, + int const max_num_pages_pre_pos_enc_buf, + int const max_num_pages_kv_cache, + int num_kv_heads, + int head_dim, + StreamingCacheInfo const *streaming_cache_infos, + uint32_t const max_num_requests) { + int const kv_hidden_size = num_kv_heads * head_dim; + int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + int token_idx = thread_idx / kv_hidden_size; + int const offset = thread_idx % kv_hidden_size; + int request_idx = 0; + while (token_idx >= 0 && request_idx < max_num_requests) { + if (request_available[request_idx]) { + token_idx -= streaming_cache_infos[request_idx].commit_len; + } + request_idx++; + } + if (token_idx >= 0) { + return; + } + request_idx--; + token_idx += streaming_cache_infos[request_idx].commit_len; + + size_t from_k_idx = get_k_entry_offset(request_idx, + token_idx, + max_num_pages_pre_pos_enc_buf, + num_kv_heads, + head_dim), + from_v_idx = get_v_entry_offset(request_idx, + token_idx, + max_num_pages_pre_pos_enc_buf, + num_kv_heads, + head_dim); + + // to_idx should consider the rolling property of the window cache + int to_idx = token_idx; + StreamingCacheInfo const &info = streaming_cache_infos[request_idx]; + if (info.commit_len >= info.sink_cache_size + info.window_cache_size && + to_idx >= info.sink_cache_size) { + to_idx -= info.sink_cache_size; + to_idx = (to_idx + info.window_cache_size - info.window_back) % + info.window_cache_size; + to_idx += info.sink_cache_size; + } + + size_t to_k_idx = get_k_entry_offset(request_idx, + to_idx, + max_num_pages_kv_cache, + num_kv_heads, + head_dim), + to_v_idx = get_v_entry_offset(request_idx, + to_idx, + max_num_pages_kv_cache, + num_kv_heads, + head_dim); + + kv_cache[to_k_idx + offset] = pre_pos_enc_buf[from_k_idx + offset]; + kv_cache[to_v_idx + offset] = pre_pos_enc_buf[from_v_idx + offset]; +} + +template +void update_kv_in_streaming_cache(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream) { + assert(m->streaming_cache); + int const kv_hidden_size = m->num_kv_heads * m->qk_dim; + int num_tokens = 0; + for (int req_idx = 0; req_idx < BatchConfig::max_requests_per_batch(); + req_idx++) { + if (!bc->request_available[req_idx]) { + continue; + } + num_tokens += bc->streamingCacheInfo[req_idx].commit_len; + } + if (num_tokens == 0) { + return; + } + int parallelism = kv_hidden_size * num_tokens; + int const max_num_pages_pre_pos_enc_buf = round_up_pages( + BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth()); + int const max_num_pages_kv_cache = round_up_pages( + BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth() + + BatchConfig::max_spec_tree_token_num()); + + update_kv_in_streaming_cache_kernel<<>>( + static_cast(m->streamingPrePosEncBuf), + static_cast(m->kvCache), + m->request_infos, + m->request_available, + max_num_pages_pre_pos_enc_buf, + max_num_pages_kv_cache, + m->num_kv_heads, + m->qk_dim, + m->streaming_cache_infos, + bc->max_requests_per_batch()); +} + +template +__global__ void + commit_kv_kernel(DT const *qkv_proj_array, + half *pre_pos_enc_buf, + BatchConfig::PerTokenInfo const *tokenInfos, + BatchConfig::PerRequestInfo const 
*requestInfos, + int const max_num_pages, + int num_q_heads, + int num_kv_heads, + int head_dim, + StreamingCacheInfo const *streaming_cache_infos, + int num_new_tokens) { + int const q_hidden_size = num_q_heads * head_dim; + int const temp_kv_hidden_size = num_q_heads * head_dim; // temporary hard code + int const kv_hidden_size = num_kv_heads * head_dim; + int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + int const token_idx = thread_idx / kv_hidden_size; + int const offset = thread_idx % kv_hidden_size; + if (token_idx >= num_new_tokens) { + return; + } + int const request_idx = tokenInfos[token_idx].request_index; + + StreamingCacheInfo const &info = streaming_cache_infos[request_idx]; + int to_idx = tokenInfos[token_idx].abs_index_in_request; + // cases that get over the boundary: + // 1. commit_len < sink_cache_size: commit to sink, window, window_back is + // after commit_len. + // 2. sink_cache_size <= commit_len < sink_cache_size + window_cache_size: + // commit to window, window_back + sink_cache_size = commit_len, pointing to + // the same position. + // 3. commit_len >= sink_cache_size + window_cache_size: commit to window, + // window is full before this commit, window_back is pointing to the real + // position. + if (to_idx >= info.sink_cache_size + info.window_cache_size) { + to_idx = to_idx - info.commit_len + info.window_back; + if (info.commit_len < info.sink_cache_size) { + // For case 1, compensating for sink offset, because window_back is + // someway back from commit_len. + to_idx -= info.sink_cache_size - info.commit_len; + } + to_idx = info.sink_cache_size + to_idx % info.window_cache_size; + } + // TODO: For now don't consider the case that the commit tokens roll over the + // for more than once. In this case, we should only count the last tokens in + // the same window position. + + size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2); + size_t to_k_idx = get_k_entry_offset( + request_idx, to_idx, max_num_pages, num_kv_heads, head_dim), + to_v_idx = get_v_entry_offset( + request_idx, to_idx, max_num_pages, num_kv_heads, head_dim); + + int const stride = num_q_heads / num_kv_heads; + int const kv_offset = + offset / head_dim * stride * head_dim + offset % head_dim; + + pre_pos_enc_buf[to_k_idx + offset] = + static_cast(qkv_proj_array[from_idx + q_hidden_size + kv_offset]); + pre_pos_enc_buf[to_v_idx + offset] = + static_cast(qkv_proj_array[from_idx + q_hidden_size + + temp_kv_hidden_size + kv_offset]); +} + +template +void commit_kv(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream) { + assert(m->streaming_cache); + int const kv_hidden_size = m->num_kv_heads * m->qk_dim; + int const num_new_tokens = bc->num_active_tokens(); + if (num_new_tokens == 0) { + return; + } + int parallelism = kv_hidden_size * num_new_tokens; + int const max_num_pages = round_up_pages(BatchConfig::MAX_STREAMING_POS - + BatchConfig::get_max_tree_depth()); + + commit_kv_kernel<<>>(static_cast
(m->devQKVProjArray), + static_cast<half *>(m->streamingPrePosEncBuf), + m->token_infos, + m->request_infos, + max_num_pages, + m->num_q_heads, + m->num_kv_heads, + m->qk_dim, + m->streaming_cache_infos, + num_new_tokens); +} + +template <typename DT> +__global__ void produce_output_kernel(half const *input_ptr, + DT *output_ptr, + int parallelism) { + CUDA_KERNEL_LOOP(idx, parallelism) { + output_ptr[idx] = static_cast<DT>
(input_ptr[idx]); + } +} + +template +void produce_output(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + int const num_tokens = bc->num_active_tokens(); + if (num_tokens == 0) { + return; + } + int parallelism = m->v_dim * m->num_q_heads * num_tokens; + produce_output_kernel<<>>(m->outputTmp, output_ptr, parallelism); +} + +template +void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *weight_ptr, + DT const *bias_ptr, + int num_tokens, + cudaStream_t stream) { + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); + // Project to output, save result directly on output tensor + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->o_dim; + int k = m->v_dim * m->num_q_heads; + int n = num_tokens; + // before transpositions + int lda = k, ldb = k, ldc = m_; + // matrix A: output projection weight + // matrix A's layout: [v_dim * num_heads, o_dim] + DT const *A = weight_ptr + m->hidden_size * (m->qk_dim * m->num_q_heads + + m->qk_dim * m->num_q_heads + + m->v_dim * m->num_q_heads); + // matrix B: attn heads + // matrix B's layout: [v_dim * num_heads, num_new_tokens] + DT const *B = static_cast
(m->attn_heads); + // matrix C: output + // matrix C's layout: [o_dim, num_new_tokens] + DT *C = static_cast<DT *>
(output_ptr); + + m->handle.gemm_engine->gemm_internal(CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc, + stream); + } + // Add final output bias + if (*m->final_bias && shard_id == 0) { + int parallelism = m->o_dim * num_tokens; + int qkv_weight_size = m->qk_dim * m->global_num_q_heads + + m->qk_dim * m->global_num_q_heads + + m->v_dim * m->global_num_q_heads; + apply_proj_bias_w<<>>( + output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->o_dim); + } +} + +template +void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + cudaStream_t stream) { + // additional processing for weight uploading + // Note that we update weight_ptr and bias_ptr when uploading weight and + // bias + if (m->quantization_type != DT_NONE) { + // copy weight_ptr to quantized_weight_ptr, do compression and store in + // m->weight_ptr + cudaMemcpyAsync(m->quantized_weight_ptr, + weight.get_byte_ptr(), + m->quantized_weightSize, + cudaMemcpyHostToDevice, + stream); + + if (m->quantization_type == DT_INT4) { + int parallelism = m->qk_dim * m->hidden_size * m->num_q_heads / 2; + decompress_int4_attention_weights<<>>( + m->quantized_weight_ptr, + static_cast
(m->weight_ptr), + m->qk_dim, + m->hidden_size, + m->num_q_heads); + } else { + assert(m->quantization_type == DT_INT8); + int parallelism = m->qk_dim * m->hidden_size * m->num_q_heads; + decompress_int8_attention_weights<<>>( + m->quantized_weight_ptr, + static_cast<DT *>
(m->weight_ptr), + m->qk_dim, + m->hidden_size, + m->num_q_heads); + } + } else { + if (data_type == DT_FLOAT) { + cudaMemcpyAsync(m->weight_ptr, + weight.get_float_ptr(), + m->weightSize, + cudaMemcpyHostToDevice, + stream); + } else if (data_type == DT_HALF) { + cudaMemcpyAsync(m->weight_ptr, + weight.get_half_ptr(), + m->weightSize, + cudaMemcpyHostToDevice, + stream); + } else { + assert(false); + } + } +} + +} // namespace IncMultiHeadAttention +} // namespace Kernels + +using namespace Kernels::IncMultiHeadAttention; + +template void Kernels::IncMultiHeadAttention::pre_build_weight( + IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::pre_build_weight( + IncMultiHeadSelfAttentionMeta const *m, + GenericTensorAccessorR const weight, + DataType data_type, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_qkv( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + float const *input_ptr, + float const *weight_ptr, + float *output_ptr, + float const *bias_ptr, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_qkv( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + half const *input_ptr, + half const *weight_ptr, + half *output_ptr, + half const *bias_ptr, + cudaStream_t stream); + +template void + Kernels::IncMultiHeadAttention::apply_pos_encoding_to_tokens_in_batch< + float>(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float *output_ptr, + cudaStream_t stream); + +template void + Kernels::IncMultiHeadAttention::apply_pos_encoding_to_tokens_in_batch( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + half *output_ptr, + cudaStream_t stream); + +template void + Kernels::IncMultiHeadAttention::apply_pos_encoding_to_streaming_proj( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +template void + Kernels::IncMultiHeadAttention::apply_pos_encoding_to_streaming_proj( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::update_qkv_in_batch( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::update_qkv_in_batch( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::update_qkv_in_batch_paged( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream, + bool is_spec); + +template void Kernels::IncMultiHeadAttention::update_qkv_in_batch_paged( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream, + bool is_spec); + +template void + Kernels::IncMultiHeadAttention::update_kv_in_streaming_cache( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +template void + Kernels::IncMultiHeadAttention::update_kv_in_streaming_cache( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::commit_kv( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::commit_kv( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream); + 
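The llama3-style frequency scaling implemented in apply_pos_encoding_to_tokens_in_batch_kernel and apply_pos_encoding_to_streaming_proj_kernel above uses the same wavelength-based branching in both kernels. The sketch below restates that branch as a standalone host-side helper for reference; the helper name is hypothetical, and the parameter names mirror the rotary_embedding_meta fields used by the kernels.

// Minimal host-side sketch of the llama3 RoPE frequency scaling used by the
// two kernels above (illustrative only; not part of this patch).
inline float llama3_scale_freq(float freq,
                               float factor,
                               float low_freq_factor,
                               float high_freq_factor,
                               int original_max_position_embeddings) {
  float const pi = 3.14159265358979323846f;
  float const wavelen = 2.0f * pi / freq;
  float const low_freq_wavelen =
      original_max_position_embeddings / low_freq_factor;
  float const high_freq_wavelen =
      original_max_position_embeddings / high_freq_factor;
  if (wavelen < high_freq_wavelen) {
    return freq; // high-frequency band: keep the original frequency
  } else if (wavelen > low_freq_wavelen) {
    return freq / factor; // low-frequency band: fully rescale
  }
  // transition band: interpolate between the rescaled and original frequency
  float const smooth =
      (original_max_position_embeddings / wavelen - low_freq_factor) /
      (high_freq_factor - low_freq_factor);
  return (1.0f - smooth) * freq / factor + smooth * freq;
}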
+template void Kernels::IncMultiHeadAttention::produce_output( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float *output_ptr, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::produce_output( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + half *output_ptr, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, + int num_tokens, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + half *output_ptr, + half const *weight_ptr, + half const *bias_ptr, + int num_tokens, + cudaStream_t stream); +}; // namespace FlexFlow diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index c30c9f71c..2c049be68 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -40,7 +40,7 @@ LinearMeta::LinearMeta(FFHandler handler, } // Allocate an all-one's vector gpu_mem_allocator.create_legion_instance( - reserveInst, data_type_size(data_type) * batch_size); + reserveInst, data_type_size(data_type) * batch_size, "LinearMeta"); one_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * batch_size); int parallelism = batch_size; @@ -323,6 +323,10 @@ void forward_kernel(LinearMeta const *m, : ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == weight_type && weight_type == output_type); + DT const *input_p = static_cast
(input_ptr), + *weight_p = + static_cast<DT const *>
(m->offload ? m->weight_ptr : weight_ptr); + DT *output_p = static_cast<DT *>
(output_ptr); #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; #else @@ -334,25 +338,20 @@ void forward_kernel(LinearMeta const *m, compute_type = CUBLAS_COMPUTE_32F_FAST_16F; } #endif - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - out_dim, - batch_size, - in_dim, - &alpha, - m->offload ? m->weight_ptr : weight_ptr, - weight_type, - in_dim, - input_ptr, - input_type, - in_dim, - &beta, - output_ptr, - output_type, - out_dim, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + m->handle.gemm_engine->gemm_internal(CUBLAS_OP_T, + CUBLAS_OP_N, + out_dim, + batch_size, + in_dim, + alpha, + weight_p, + in_dim, + input_p, + in_dim, + beta, + output_p, + out_dim, + stream); // use_bias = True if (bias_ptr != NULL) { // fuse bias and relu diff --git a/src/ops/kernels/residual_rms_norm_kernels.cpp b/src/ops/kernels/residual_rms_norm_kernels.cpp index 690655645..ed0b0f9a5 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cpp +++ b/src/ops/kernels/residual_rms_norm_kernels.cpp @@ -42,7 +42,8 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, size_t rms_ptr_size = batch_size; size_t norm_ptr_size = num_elements; size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "ResidualRMSNormMeta"); rms_ptr = gpu_mem_allocator.allocate_instance_untyped( rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 17ac14449..7530c179e 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -13,6 +13,11 @@ * limitations under the License. 
*/ +#include "flashinfer/utils.cuh" +#include + +#include "flashinfer/math.cuh" +#include "flashinfer/vec_dtypes.cuh" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/residual_rms_norm_kernels.h" #include "flexflow/ops/residual_rms_norm.h" @@ -43,7 +48,8 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, size_t rms_ptr_size = batch_size; size_t norm_ptr_size = num_elements; size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "ResidualRMSNormMeta"); rms_ptr = gpu_mem_allocator.allocate_instance_untyped( rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( @@ -55,89 +61,133 @@ ResidualRMSNormMeta::~ResidualRMSNormMeta(void) { } } -namespace Kernels { -namespace ResidualRMSNorm { +// Adopted from flashinfer +// (https://github.com/flashinfer-ai/flashinfer/blob/main/include/flashinfer/norm.cuh) +// Main modification is for non-inplace computation +template +__global__ void FusedAddRMSNormKernel(T const *__restrict__ input, + T const *__restrict__ residual, + T const *__restrict__ weight, + T *__restrict__ output, + T *__restrict__ residual_output, + const uint32_t d, + float eps) { + const uint32_t bx = blockIdx.x; + const uint32_t tx = threadIdx.x, ty = threadIdx.y; + constexpr uint32_t warp_size = 32; + const uint32_t num_warps = blockDim.y; + const uint32_t thread_id = tx + ty * warp_size; + const uint32_t num_threads = num_warps * warp_size; + const uint32_t rounds = flashinfer::ceil_div(d, VEC_SIZE * num_threads); + extern __shared__ float smem[]; -template -__device__ __forceinline__ T WARP_SHFL_DOWN(T value, - unsigned int delta, - int width = warpSize, - unsigned int mask = 0xffffffff) { -#ifndef __HIP_PLATFORM_HCC__ - return __shfl_down_sync(mask, value, delta, width); -#else - return __shfl_down(value, delta, width); -#endif -} + float sum_sq = 0.f; -template -__inline__ __device__ T WarpReduceSum(T val) { + for (uint32_t i = 0; i < rounds; i++) { + flashinfer::vec_t input_vec; + flashinfer::vec_t residual_vec; + flashinfer::vec_t residual_output_vec; + input_vec.fill(0); + residual_vec.fill(0); + residual_output_vec.fill(0); + if ((i * num_threads + thread_id) * VEC_SIZE < d) { + input_vec.load(input + bx * d + i * num_threads * VEC_SIZE + + thread_id * VEC_SIZE); + residual_vec.load(residual + bx * d + i * num_threads * VEC_SIZE + + thread_id * VEC_SIZE); + } #pragma unroll - for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { - val += WARP_SHFL_DOWN(val, offset); + for (uint32_t j = 0; j < VEC_SIZE; j++) { + float x = float(input_vec[j]); + x += float(residual_vec[j]); + sum_sq += x * x; + residual_output_vec[j] = (T)x; + } + if ((i * num_threads + thread_id) * VEC_SIZE < d) { + residual_output_vec.store(residual_output + bx * d + + i * num_threads * VEC_SIZE + + thread_id * VEC_SIZE); + } } - return val; -} -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); + // first, warp reduce sum +#pragma unroll + for (uint32_t offset = warp_size / 2; offset > 0; offset /= 2) { + sum_sq += flashinfer::math::shfl_xor_sync(sum_sq, offset); + } + + smem[ty] = sum_sq; __syncthreads(); - if (lid == 0) { - shared[wid] = val; + // then, cross warp reduce sum using only the first warp + 
if (ty == 0) { + sum_sq = (tx < num_warps) ? smem[tx] : 0.f; +#pragma unroll + for (uint32_t offset = warp_size / 2; offset > 0; offset /= 2) { + sum_sq += flashinfer::math::shfl_xor_sync(sum_sq, offset); + } + smem[0] = sum_sq; } __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); + + float rms_rcp = flashinfer::math::rsqrt(smem[0] / float(d) + eps); + + for (uint32_t i = 0; i < rounds; i++) { + flashinfer::vec_t weight_vec; + flashinfer::vec_t residual_output_vec; + flashinfer::vec_t output_vec; + weight_vec.fill(0); + residual_output_vec.fill(0); + output_vec.fill(0); + if ((i * num_threads + thread_id) * VEC_SIZE < d) { + weight_vec.load(weight + i * num_threads * VEC_SIZE + + thread_id * VEC_SIZE); + residual_output_vec.load(residual_output + bx * d + + i * num_threads * VEC_SIZE + + thread_id * VEC_SIZE); + } +#pragma unroll + for (uint32_t j = 0; j < VEC_SIZE; j++) { + output_vec[j] = + float(residual_output_vec[j]) * rms_rcp * float(weight_vec[j]); + } + if ((i * num_threads + thread_id) * VEC_SIZE < d) { + output_vec.store(output + bx * d + i * num_threads * VEC_SIZE + + thread_id * VEC_SIZE); + } } - return val; } template -__global__ void ResidualRMSNormFusedForwardKernel(int64_t N, - float eps, - T const *X1, - T const *X2, - T *X_out, - T *rms, - T *Y, - T const *weights, - T *output) { - __shared__ float v_shared[C10_WARP_SIZE]; - int64_t const i = blockIdx.x; - float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { - int64_t const index = i * N + j; - X_out[index] = X1[index] + X2[index]; - sum += - (static_cast(X_out[index]) * static_cast(X_out[index])); - } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 - - if (threadIdx.x == 0) { - rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); - } +cudaError_t FusedAddRMSNorm(T const *input, + T const *residual, + T const *weight, + T *output, + T *residual_output, + uint32_t batch_size, + uint32_t d, + float eps = 1e-5, + cudaStream_t stream = 0) { + const uint32_t vec_size = std::gcd(16 / sizeof(T), d); - __syncthreads(); + const uint32_t block_size = std::min(1024, d / vec_size); + const uint32_t num_warps = flashinfer::ceil_div(block_size, 32); + dim3 nblks(batch_size); + dim3 nthrs(32, num_warps); + const uint32_t smem_size = num_warps * sizeof(float); + void *args[] = { + &input, &residual, &weight, &output, &residual_output, &d, &eps}; - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { - const int64_t index = i * N + j; - Y[index] = static_cast(X_out[index]) * static_cast(rms[i]); - output[index] = Y[index] * weights[index % N]; - } + DISPATCH_ALIGNED_VEC_SIZE(vec_size, VEC_SIZE, { + auto kernel = FusedAddRMSNormKernel; + FLASHINFER_CUDA_CALL(cudaLaunchKernel( + (void *)kernel, nblks, nthrs, args, smem_size, stream)); + }); + + return cudaSuccess; } +namespace Kernels { +namespace ResidualRMSNorm { template void forward_kernel(ResidualRMSNormMeta const *m, T const *input1_ptr, @@ -145,28 +195,29 @@ void forward_kernel(ResidualRMSNormMeta const *m, T const *weight_ptr, T *residual_output_ptr, T *output_ptr, + int batch_size, cudaStream_t stream) { - + assert(batch_size <= m->batch_size); + // use active batch size std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); + 
std::make_pair(batch_size, kCUDABlockReduceNumThreads); std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); + std::make_pair(batch_size, kCUDANumThreads); int num_blocks = std::max(kernel1_parallelism.first, kernel2_parallelism.first); int num_threads = std::max(kernel1_parallelism.second, kernel2_parallelism.second); - ResidualRMSNormFusedForwardKernel - <<>>(m->in_dim, - m->eps, - input1_ptr, - input2_ptr, - residual_output_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - weight_ptr, - output_ptr); + checkCUDA(FusedAddRMSNorm(input1_ptr, + input2_ptr, + weight_ptr, + output_ptr, + residual_output_ptr, + batch_size, + m->in_dim, + m->eps, + stream)); } void forward_kernel_wrapper(ResidualRMSNormMeta const *m, @@ -174,7 +225,8 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, GenericTensorAccessorR const &input2, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &residual_output, - GenericTensorAccessorW const &output) { + GenericTensorAccessorW const &output, + int batch_size) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -195,6 +247,7 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, weight.get_half_ptr(), residual_output.get_half_ptr(), output.get_half_ptr(), + batch_size, stream); } else if (output.data_type == DT_FLOAT) { forward_kernel(m, @@ -203,6 +256,7 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, weight.get_float_ptr(), residual_output.get_float_ptr(), output.get_float_ptr(), + batch_size, stream); } else { assert(false && "Unsupported data type"); diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index 24ab7051e..9636929d9 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -42,7 +42,8 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, size_t rms_ptr_size = batch_size; size_t norm_ptr_size = num_elements; size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "RMSNormMeta"); rms_ptr = gpu_mem_allocator.allocate_instance_untyped( rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 7c9f4a9f9..8555e58be 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -43,7 +43,8 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, size_t rms_ptr_size = batch_size; size_t norm_ptr_size = num_elements; size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "RMSNormMeta"); rms_ptr = gpu_mem_allocator.allocate_instance_untyped( rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 44979c48f..4289a9236 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -37,7 +37,8 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, eps = ln->eps; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 6; - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, 
totalSize, "LayerNormMeta"); mean_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index f1b7a537b..046a4bc25 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -38,7 +38,8 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, eps = ln->eps; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "ResidualLayerNormMeta"); mean_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index e5ebdce6e..05e66db02 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -37,7 +37,8 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, eps = ln->eps; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "ResidualLayerNormMeta"); mean_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index f4f5bb72d..713486268 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -447,7 +447,8 @@ void ResidualRMSNorm::inference_task(Task const *task, m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); - forward_kernel_wrapper(m, input1, input2, weight, residual_output, output); + forward_kernel_wrapper( + m, input1, input2, weight, residual_output, output, bc->num_tokens); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index 9fc2316f9..92db9a958 100644 --- a/src/ops/sampling.cc +++ b/src/ops/sampling.cc @@ -47,7 +47,7 @@ using PCG::Node; // For an input tensor, computes the top k entries in each row // (resp. vector along the last dimension). 
Thus, // values.shape = indices.shape = input.shape[:-1] + [k] -Tensor FFModel::sampling(const Tensor input, float top_p, char const *name) { +Tensor FFModel::sampling(Tensor const input, float top_p, char const *name) { Layer *li = new Layer(this, OP_SAMPLING, input->data_type, @@ -103,7 +103,7 @@ bool operator==(SamplingParams const &lhs, SamplingParams const &rhs) { } Sampling::Sampling(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, float _top_p, char const *name) : Op(model, @@ -132,12 +132,12 @@ Sampling::Sampling(FFModel &model, Sampling::Sampling(FFModel &model, Sampling const &other, - const ParallelTensor input) + ParallelTensor const input) : Sampling(model, input, other.top_p, other.name) {} Sampling::Sampling(FFModel &model, SamplingParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) : Sampling(model, input, params.top_p, params.name) {} @@ -316,6 +316,7 @@ InferenceResult } InferenceResult ir; + ir.num_token_ids = batch_size; download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; diff --git a/src/ops/sampling.cpp b/src/ops/sampling.cpp index 3d8f10352..03e37333e 100644 --- a/src/ops/sampling.cpp +++ b/src/ops/sampling.cpp @@ -204,7 +204,8 @@ SamplingMeta::SamplingMeta(FFHandler handler, idx_size + sorted_idx_size) + data_type_size(data_type) * sorted_logits_size + sizeof(hiprandState) * state_size; - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "SamplingMeta"); begin_offset = gpu_mem_allocator.allocate_instance(begin_offset_size); end_offset = gpu_mem_allocator.allocate_instance(end_offset_size); idx = gpu_mem_allocator.allocate_instance(idx_size); @@ -262,7 +263,8 @@ SamplingMeta::SamplingMeta(FFHandler handler, // assert(false && "input type in float and half"); // } - gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes); + gpu_mem_allocator.create_legion_instance( + reserveInst, temp_storage_bytes, "SamplingMeta"); d_temp_storage = gpu_mem_allocator.allocate_instance_untyped(temp_storage_bytes); } diff --git a/src/ops/sampling.cu b/src/ops/sampling.cu index 461d72ec7..686817096 100644 --- a/src/ops/sampling.cu +++ b/src/ops/sampling.cu @@ -45,8 +45,12 @@ __global__ void init_idxs(int batch_size, int *idx, int *begin_offset, int *end_offset) { - CUDA_KERNEL_LOOP(i, total_eles) { - idx[i] = i % vocab_size; + // +1 to include the upper boundary + CUDA_KERNEL_LOOP(i, total_eles + 1) { + if (i < total_eles) { + // Exclude the last element + idx[i] = i % vocab_size; + } if (i % vocab_size == 0) { begin_offset[i / vocab_size] = i; end_offset[i / vocab_size] = i; @@ -55,9 +59,9 @@ __global__ void init_idxs(int batch_size, } __global__ void - init_random_kernel(curandState *state, int batch_size, long rand) { + init_random_kernel(curandState *state, int batch_size, long seed) { CUDA_KERNEL_LOOP(i, batch_size) { - curand_init(rand, i, 0, &state[i]); + curand_init(seed, i, 0, &state[i]); } } @@ -74,11 +78,14 @@ __global__ void sampling_topp_kernel(int batch_size, int const batch_idx = blockIdx.x; __shared__ float random_n; __shared__ long long result_idx; + __shared__ bool is_end; // random num if (threadIdx.x == 0) { // number must < topp random_n = curand_uniform(state + batch_idx) * topp; + is_end = false; + result_idx = vocab_size - 1; // printf("batch idx: %d, random num%f\n", batch_idx, random_n); } @@ -91,14 +98,19 @@ __global__ void sampling_topp_kernel(int batch_size, 
int offset = batch_idx * vocab_size; float prefix_sum = 0.0f; BlockPrefixCallbackOp prefix_op(0); - result_idx = vocab_size - 1; for (long long j = threadIdx.x; j < vocab_size; j += blockDim.x) { float logit = (float)(sorted_logits[offset + j]); BlockScan(temp_storage).InclusiveSum(logit, prefix_sum, prefix_op); - prefix_sum /= topp; + __syncthreads(); if (prefix_sum >= random_n) { atomicMin(&result_idx, j); + is_end = true; + } + // Synchronize to make sure all threads see the updated flag + __syncthreads(); + if (is_end) { + break; } } indices_ptr[batch_idx] = sorted_idx[offset + result_idx]; @@ -216,7 +228,8 @@ SamplingMeta::SamplingMeta(FFHandler handler, idx_size + sorted_idx_size) + data_type_size(data_type) * sorted_logits_size + sizeof(curandState) * state_size; - gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "SamplingMeta"); begin_offset = gpu_mem_allocator.allocate_instance(begin_offset_size); end_offset = gpu_mem_allocator.allocate_instance(end_offset_size); idx = gpu_mem_allocator.allocate_instance(idx_size); @@ -274,7 +287,8 @@ SamplingMeta::SamplingMeta(FFHandler handler, assert(false && "input type in float and half"); } - gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes); + gpu_mem_allocator.create_legion_instance( + reserveInst, temp_storage_bytes, "SamplingMeta"); d_temp_storage = gpu_mem_allocator.allocate_instance_untyped(temp_storage_bytes); } diff --git a/src/ops/select_k_impl.cu b/src/ops/select_k_impl.cu new file mode 100644 index 000000000..9fcdbb719 --- /dev/null +++ b/src/ops/select_k_impl.cu @@ -0,0 +1,35 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
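The sampling.cu hunk above reworks sampling_topp_kernel: thread 0 draws random_n = curand_uniform(...) * topp and resets result_idx to vocab_size - 1, then the block scans the sorted logits with an inclusive prefix sum, latches the first position whose running sum reaches the threshold via atomicMin, and uses the shared is_end flag plus __syncthreads so every thread exits the loop together. A host-side sketch of the same selection rule, for illustration only (not FlexFlow code); it assumes the probabilities are already sorted in descending order and sum to one:

#include <cstdio>
#include <random>
#include <vector>

// Nucleus (top-p) sampling over probabilities sorted in descending order:
// draw r in [0, top_p) and return the first index whose inclusive prefix sum
// reaches r -- the same scan-and-latch rule as the kernel above.
int sample_top_p(std::vector<float> const &sorted_probs,
                 float top_p,
                 std::mt19937 &rng) {
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
  float r = dist(rng) * top_p;
  float prefix_sum = 0.0f;
  for (size_t j = 0; j < sorted_probs.size(); ++j) {
    prefix_sum += sorted_probs[j];
    if (prefix_sum >= r) {
      return static_cast<int>(j); // first index crossing the threshold
    }
  }
  // Fallback mirrors the kernel's result_idx = vocab_size - 1 initialization.
  return static_cast<int>(sorted_probs.size()) - 1;
}

int main() {
  std::mt19937 rng(0);
  std::vector<float> probs = {0.5f, 0.2f, 0.1f, 0.1f, 0.05f, 0.05f};
  std::printf("sampled sorted-index: %d\n", sample_top_p(probs, 0.9f, rng));
  return 0;
}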
+ */ + +#include "raft/matrix/detail/select_k-inl.cuh" + +#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ + template void raft::matrix::detail::select_k(raft::resources const &handle, \ + const T *in_val, \ + const IdxT *in_idx, \ + size_t batch_size, \ + size_t len, \ + int k, \ + T *out_val, \ + IdxT *out_idx, \ + bool select_min, \ + bool sorted, \ + raft::matrix::SelectAlgo algo, \ + const IdxT *len_i) + +instantiate_raft_matrix_detail_select_k(half, int); +instantiate_raft_matrix_detail_select_k(float, int); + +#undef instantiate_raft_matrix_detail_select_k diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index 3ddd6b8d6..b39a424c6 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -41,7 +41,9 @@ using Legion::TaskLauncher; bool operator==(SigmoidSiluMultiParams const &lhs, SigmoidSiluMultiParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid; + return lhs.layer_guid == rhs.layer_guid && + lhs.intermediate_size == rhs.intermediate_size && + lhs.tensor_parallelism_degree == rhs.tensor_parallelism_degree; } bool SigmoidSiluMultiParams::is_valid( @@ -52,6 +54,8 @@ bool SigmoidSiluMultiParams::is_valid( SigmoidSiluMultiParams SigmoidSiluMulti::get_params() const { SigmoidSiluMultiParams params; params.layer_guid = this->layer_guid; + params.intermediate_size = this->intermediate_size; + params.tensor_parallelism_degree = this->tensor_parallelism_degree; if (this->name != nullptr) { strcpy(params.name, this->name); } @@ -60,6 +64,7 @@ SigmoidSiluMultiParams SigmoidSiluMulti::get_params() const { Tensor FFModel::sigmoid_silu_multi(const Tensor input1, const Tensor input2, + int intermediate_size, DataType data_type, char const *name) { @@ -94,6 +99,9 @@ Tensor FFModel::sigmoid_silu_multi(const Tensor input1, casted_input2); ssm->outputs[0] = create_tensor_legion_ordering( input1->num_dims, input1->dims, data_type, ssm, 0, false /*create_grad*/); + ssm->add_int_property("intermediate_size", intermediate_size); + ssm->add_int_property("tensor_parallelism_degree", + config.tensor_parallelism_degree); layers.push_back(ssm); return ssm->outputs[0]; } @@ -102,9 +110,18 @@ Op *SigmoidSiluMulti::create_operator_from_layer( FFModel &model, Layer const *layer, std::vector const &inputs) { - - return new SigmoidSiluMulti( - model, layer->layer_guid, inputs[0], inputs[1], layer->name); + long long value; + layer->get_int_property("intermediate_size", value); + int intermediate_size = value; + layer->get_int_property("tensor_parallelism_degree", value); + int tensor_parallelism_degree = value; + return new SigmoidSiluMulti(model, + layer->layer_guid, + inputs[0], + inputs[1], + intermediate_size, + tensor_parallelism_degree, + layer->name); } SigmoidSiluMulti::SigmoidSiluMulti( @@ -112,13 +129,20 @@ SigmoidSiluMulti::SigmoidSiluMulti( SigmoidSiluMultiParams const ¶ms, std::pair const &inputs, char const *name) - : SigmoidSiluMulti( - model, params.layer_guid, inputs.first, inputs.second, params.name) {} + : SigmoidSiluMulti(model, + params.layer_guid, + inputs.first, + inputs.second, + params.intermediate_size, + params.tensor_parallelism_degree, + params.name) {} SigmoidSiluMulti::SigmoidSiluMulti(FFModel &model, LayerID const &_layer_guid, const ParallelTensor _input1, const ParallelTensor _input2, + int _intermediate_size, + int _tensor_parallelism_degree, char const *name) : Op(model, OP_SIGMOID_SILU_MULTI, @@ -128,7 +152,9 @@ SigmoidSiluMulti::SigmoidSiluMulti(FFModel &model, 0 /*weights*/, 1 /*outputs*/, _input1, - 
_input2) { + _input2), + intermediate_size(_intermediate_size), + tensor_parallelism_degree(_tensor_parallelism_degree) { // overwrite layer_guid layer_guid = _layer_guid; outputs[0] = model.create_parallel_tensor_legion_ordering(_input1->num_dims, @@ -242,8 +268,13 @@ OpMeta *SigmoidSiluMulti::init_task(Task const *task, .best_affinity_to(task->target_proc) .first(); MemoryAllocator gpu_mem_allocator(gpu_mem); - SigmoidSiluMultiMeta *meta = - new SigmoidSiluMultiMeta(handle, ssm, gpu_mem_allocator); + int intermediate_size = + ssm->intermediate_size / ssm->tensor_parallelism_degree; + SigmoidSiluMultiMeta *meta = new SigmoidSiluMultiMeta(handle, + ssm, + gpu_mem_allocator, + ssm->intermediate_size, + intermediate_size); meta->input_type[0] = ssm->inputs[0]->data_type; meta->input_type[1] = ssm->inputs[1]->data_type; meta->output_type[0] = ssm->outputs[0]->data_type; @@ -350,7 +381,9 @@ void SigmoidSiluMulti::inference_task( assert(input1_domain == input2_domain); assert(input1_domain == output_domain); - SigmoidSiluMulti::inference_kernel_wrapper(m, input1, input2, output); + // use active number of tokens + SigmoidSiluMulti::inference_kernel_wrapper( + m, input1, input2, output, bc->num_active_tokens()); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -369,6 +402,8 @@ void SigmoidSiluMulti::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(this->intermediate_size); + sez.serialize(this->tensor_parallelism_degree); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -381,9 +416,12 @@ Node SigmoidSiluMulti::deserialize(FFModel &ff, int num_inputs) { assert(num_inputs == 2); size_t id, transformer_layer_id, deserialized_model_id; + int intermediate_size, tensor_parallelism_degree; dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); + dez.deserialize(intermediate_size); + dez.deserialize(tensor_parallelism_degree); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -392,6 +430,8 @@ Node SigmoidSiluMulti::deserialize(FFModel &ff, SigmoidSiluMultiParams params; params.layer_guid = layer_guid; + params.intermediate_size = intermediate_size; + params.tensor_parallelism_degree = tensor_parallelism_degree; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); @@ -406,6 +446,8 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.layer_guid.transformer_layer_id); hash_combine(key, params.layer_guid.model_id); + hash_combine(key, params.intermediate_size); + hash_combine(key, params.tensor_parallelism_degree); return key; } }; // namespace std diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 590b641b5..962777ff3 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -21,10 +21,14 @@ namespace FlexFlow { SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, SigmoidSiluMulti const *ssm, - MemoryAllocator &gpu_mem_allocator) + MemoryAllocator &gpu_mem_allocator, + int _global_intermediate_size, + int _intermediate_size) : OpMeta(handle) { profiling = ssm->profiling; inference_debugging = ssm->inference_debugging; + global_intermediate_size = _global_intermediate_size; + intermediate_size = _intermediate_size; } 
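The SigmoidSiluMulti changes above thread an intermediate_size and tensor_parallelism_degree through the layer, its params, and the serializer, and init_task hands the meta both the global width and the per-shard slice (intermediate_size / tensor_parallelism_degree); the kernel wrapper that follows then sizes its launch by the number of active tokens times that per-shard width rather than by the full tensor volume. A small standalone sketch of that arithmetic, for illustration only (the helper names are invented here, and it assumes the global width divides evenly across shards):

#include <cassert>
#include <cstdio>

// Each tensor-parallel shard owns an equal slice of the FFN intermediate width.
int shard_intermediate_size(int global_intermediate_size, int tp_degree) {
  assert(tp_degree > 0 && global_intermediate_size % tp_degree == 0);
  return global_intermediate_size / tp_degree;
}

// Kernel work scales with the tokens that are actually active in the batch,
// not with the padded tensor volume.
int active_elements(int num_active_tokens, int per_shard_intermediate_size) {
  return num_active_tokens * per_shard_intermediate_size;
}

int main() {
  int global = 11008; // e.g. a LLaMA-7B-sized FFN width (illustrative)
  int tp_degree = 4;
  int local = shard_intermediate_size(global, tp_degree); // 2752 per shard
  std::printf("elements touched: %d\n", active_elements(7, local));
  return 0;
}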
SigmoidSiluMultiMeta::~SigmoidSiluMultiMeta(void) { @@ -50,13 +54,18 @@ void SigmoidSiluMulti::inference_kernel_wrapper( SigmoidSiluMultiMeta const *m, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, - GenericTensorAccessorW const &output) { + GenericTensorAccessorW const &output, + int token_size) { + if (token_size == 0) { + return; + } cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - int num_elements = input1.domain.get_volume(); - assert(input2.domain.get_volume() == num_elements); - assert(output.domain.get_volume() == num_elements); + assert(input2.domain.get_volume() == input1.domain.get_volume()); + assert(output.domain.get_volume() == input1.domain.get_volume()); + + int num_elements = token_size * m->intermediate_size; cudaEvent_t t_start, t_end; if (m->profiling) { @@ -68,7 +77,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( SigmoidSiluMultiKernel<<>>(input1.domain.get_volume(), + stream>>>(num_elements, input1.get_float_ptr(), input2.get_float_ptr(), output.get_float_ptr()); @@ -76,7 +85,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( SigmoidSiluMultiKernel<<>>(input1.domain.get_volume(), + stream>>>(num_elements, input1.get_half_ptr(), input2.get_half_ptr(), output.get_half_ptr()); diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 03618423b..4c94f3e5a 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -92,7 +92,7 @@ SoftmaxParams Softmax::get_params() const { return params; } -Tensor FFModel::softmax(const Tensor _input, +Tensor FFModel::softmax(Tensor const _input, int dim, DataType data_type, char const *name) { @@ -135,7 +135,7 @@ Op *Softmax::create_operator_from_layer( Softmax::Softmax(FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int _dim, char const *name) : Op(model, @@ -160,7 +160,7 @@ Softmax::Softmax(FFModel &model, Softmax::Softmax(FFModel &model, SoftmaxParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) : Softmax(model, params.layer_guid, input, params.dim, params.name) {} diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 9c6ed0e0b..421780dd4 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -52,24 +52,25 @@ bool SpecIncMultiHeadSelfAttentionParams::is_valid( return is_valid; } -Tensor - FFModel::spec_inc_multihead_self_attention(Tensor const input, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { +Tensor FFModel::spec_inc_multihead_self_attention( + Tensor const input, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool qkv_bias, + bool final_bias, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + RotaryEmbeddingMeta rotary_embedding_meta, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + bool streaming_cache, + char const *name) { return spec_inc_multiquery_self_attention(input, embed_dim, num_heads, @@ -82,33 +83,35 @@ Tensor add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, position_bias, + 
streaming_cache, name); } -Tensor - FFModel::spec_inc_multiquery_self_attention(Tensor const input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { +Tensor FFModel::spec_inc_multiquery_self_attention( + Tensor const input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim, + int vdim, + float dropout, + bool qkv_bias, + bool final_bias, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + RotaryEmbeddingMeta rotary_embedding_meta, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + bool streaming_cache, + char const *name) { if (data_type == DT_NONE) { data_type = input->data_type; } @@ -145,13 +148,12 @@ Tensor numdims, dims, data_type, li, 0, true /*create_grad*/); } // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); + int qk_dim = kdim, v_dim = kdim, o_dim = embed_dim; + int hidden_size = input->dims[0]; + int qParas = qk_dim * hidden_size; + int kParas = qk_dim * hidden_size; + int vParas = v_dim * hidden_size; + int oParas = o_dim * (v_dim > 0 ? v_dim : hidden_size); int weight_size = qParas * num_q_heads + kParas * num_q_heads + vParas * num_q_heads + oParas * num_q_heads; { @@ -166,10 +168,8 @@ Tensor } if (qkv_bias || final_bias) { // q, k, v, o - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + - (final_bias ? oProjSize : 0)}; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; + int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
o_dim : 0)}; li->weights[1] = create_weight_legion_ordering(1, dims, data_type, @@ -188,11 +188,24 @@ Tensor li->add_int_property("final_bias", final_bias); li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); - li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + li->add_int_property("apply_rotary_embedding", + rotary_embedding_meta.apply_rotary_embedding); + li->add_float_property("rope_theta", rotary_embedding_meta.rope_theta); + li->add_string_property("rope_type", rotary_embedding_meta.rope_type); + li->add_float_property("factor", rotary_embedding_meta.factor); + li->add_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + li->add_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + li->add_int_property("original_max_position_embeddings", + rotary_embedding_meta.original_max_position_embeddings); li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("qk_prod_scaling", qk_prod_scaling); li->add_int_property("position_bias", position_bias); + li->add_int_property("streaming_cache", streaming_cache); + li->add_int_property("tensor_parallelism_degree", + config.tensor_parallelism_degree); layers.push_back(li); return li->outputs[0]; } @@ -222,8 +235,18 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( bool final_bias = (bool)value; layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; + RotaryEmbeddingMeta rotary_embedding_meta; layer->get_int_property("apply_rotary_embedding", value); - bool apply_rotary_embedding = (bool)value; + rotary_embedding_meta.apply_rotary_embedding = (bool)value; + layer->get_float_property("rope_theta", rotary_embedding_meta.rope_theta); + layer->get_string_property("rope_type", rotary_embedding_meta.rope_type); + layer->get_float_property("factor", rotary_embedding_meta.factor); + layer->get_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + layer->get_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + layer->get_int_property("original_max_position_embeddings", value); + rotary_embedding_meta.original_max_position_embeddings = (int)value; layer->get_int_property("scaling_query", value); bool scaling_query = (bool)value; float scaling_factor; @@ -232,6 +255,10 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( bool qk_prod_scaling = (bool)value; layer->get_int_property("position_bias", value); bool position_bias = (bool)value; + layer->get_int_property("streaming_cache", value); + bool streaming_cache = (bool)value; + layer->get_int_property("tensor_parallelism_degree", value); + int tensor_parallelism_degree = (int)value; return new SpecIncMultiHeadSelfAttention(model, layer->layer_guid, @@ -245,12 +272,14 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( qkv_bias, final_bias, add_zero_attn, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, position_bias, false /*allocate_weights*/, + streaming_cache, + tensor_parallelism_degree, layer->name); } @@ -267,12 +296,14 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, bool allocate_weights, + bool 
_streaming_cache, + int _tensor_parallelism_degree, char const *name) // Initializer* _bias_initializer) : Op(model, @@ -286,13 +317,13 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), - scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) { + rotary_embedding_meta(_rotary_embedding_meta), + hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim), + o_dim(_embed_dim), qoSeqLength(_input->dims[1].size), + kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), + scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), + position_bias(_position_bias), streaming_cache(_streaming_cache), + tensor_parallelism_degree(_tensor_parallelism_degree) { // overwrite layer_guid layer_guid = _layer_guid; @@ -309,11 +340,11 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( // Create weight tensor int num_dims = inputs[0]->num_dims; // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; + int qParas = this->qk_dim * this->hidden_size; + int kParas = this->qk_dim * this->hidden_size; + int vParas = this->v_dim * this->hidden_size; int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); + this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size); ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; @@ -331,10 +362,9 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
o_dim : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -369,12 +399,14 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, bool allocate_weights, + bool _streaming_cache, + int _tensor_parallelism_degree, char const *name) // Initializer* _bias_initializer) : Op(model, @@ -389,13 +421,13 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), - scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) + rotary_embedding_meta(_rotary_embedding_meta), + hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim), + o_dim(_embed_dim), qoSeqLength(_input->dims[1].size), + kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), + scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), + position_bias(_position_bias), streaming_cache(_streaming_cache), + tensor_parallelism_degree(_tensor_parallelism_degree) // bias_initializer(_bias_initializer) { numOutputs = 1; @@ -411,11 +443,11 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( // Create weight tensor int num_dims = inputs[0]->num_dims; // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; + int qParas = this->qk_dim * this->hidden_size; + int kParas = this->qk_dim * this->hidden_size; + int vParas = this->v_dim * this->hidden_size; int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); + this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size); ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; @@ -434,10 +466,9 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
o_dim : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -470,21 +501,23 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( : SpecIncMultiHeadSelfAttention(model, other.layer_guid, input, - other.oProjSize, + other.o_dim, other.num_q_heads, other.num_kv_heads, - other.qProjSize, - other.vProjSize, + other.qk_dim, + other.v_dim, other.dropout, other.qkv_bias, other.final_bias, other.add_zero_attn, - other.apply_rotary_embedding, + other.rotary_embedding_meta, other.scaling_query, other.scaling_factor, other.qk_prod_scaling, other.position_bias, allocate_weights, + other.streaming_cache, + other.tensor_parallelism_degree, other.name) {} SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( @@ -505,12 +538,14 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( params.qkv_bias, params.final_bias, params.add_zero_attn, - params.apply_rotary_embedding, + params.rotary_embedding_meta, params.scaling_query, params.scaling_factor, params.qk_prod_scaling, params.position_bias, allocate_weights, + params.streaming_cache, + params.tensor_parallelism_degree, params.name) {} void SpecIncMultiHeadSelfAttention::init_inference( @@ -636,9 +671,11 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); - int num_q_heads = attn->num_q_heads; - int num_kv_heads = attn->num_kv_heads; - assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); + int num_q_heads = attn->num_q_heads / attn->tensor_parallelism_degree; + int num_kv_heads = + attn->num_kv_heads / attn->tensor_parallelism_degree + + (attn->num_kv_heads % attn->tensor_parallelism_degree != 0); + assert(attn->o_dim == output.domain.hi()[0] - output.domain.lo()[0] + 1); Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::GPU_FB_MEM) @@ -736,9 +773,9 @@ void SpecIncMultiHeadSelfAttention::inference_task( Runtime *runtime) { assert(task->regions.size() == regions.size()); - BeamSearchBatchConfig const &bc = - Future(task->futures[0]).get_result(); - if (bc.num_tokens == 0) { + // BatchConfig const &bc = Future(task->futures[0]).get_result(); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { return; } @@ -778,7 +815,7 @@ void SpecIncMultiHeadSelfAttention::inference_task( assert(task->index_point.get_dim() == 1); SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, &bc, task->index_point.point_data[0], input, weight, output, biases); + m, bc, task->index_point.point_data[0], input, weight, output, biases); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -788,7 +825,7 @@ void SpecIncMultiHeadSelfAttention::inference_task( weights_accessors.push_back(biases); } SpecIncMultiHeadSelfAttention::save_inference_tensors_to_file( - m, shard_id, &bc, {input}, weights_accessors, {output}); + m, shard_id, bc, {input}, weights_accessors, {output}); } } @@ -828,31 +865,46 @@ bool operator==(SpecIncMultiHeadSelfAttentionParams const &lhs, lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && lhs.add_zero_attn == rhs.add_zero_attn && - lhs.apply_rotary_embedding == 
rhs.apply_rotary_embedding && + lhs.rotary_embedding_meta.apply_rotary_embedding == + rhs.rotary_embedding_meta.apply_rotary_embedding && + lhs.rotary_embedding_meta.rope_theta == + rhs.rotary_embedding_meta.rope_theta && + lhs.rotary_embedding_meta.rope_type == + rhs.rotary_embedding_meta.rope_type && + lhs.rotary_embedding_meta.factor == rhs.rotary_embedding_meta.factor && + lhs.rotary_embedding_meta.low_freq_factor == + rhs.rotary_embedding_meta.low_freq_factor && + lhs.rotary_embedding_meta.high_freq_factor == + rhs.rotary_embedding_meta.high_freq_factor && + lhs.rotary_embedding_meta.original_max_position_embeddings == + rhs.rotary_embedding_meta.original_max_position_embeddings && lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && lhs.qk_prod_scaling == rhs.qk_prod_scaling && - lhs.position_bias == rhs.position_bias; + lhs.position_bias == rhs.position_bias && + lhs.streaming_cache == rhs.streaming_cache; } SpecIncMultiHeadSelfAttentionParams SpecIncMultiHeadSelfAttention::get_params() const { SpecIncMultiHeadSelfAttentionParams params; params.layer_guid = this->layer_guid; - params.embed_dim = this->oProjSize; + params.embed_dim = this->o_dim; params.num_q_heads = this->num_q_heads; params.num_kv_heads = this->num_kv_heads; - params.kdim = this->kProjSize; - params.vdim = this->vProjSize; + params.kdim = this->qk_dim; + params.vdim = this->v_dim; params.dropout = this->dropout; params.qkv_bias = this->qkv_bias; params.final_bias = this->final_bias; params.add_zero_attn = this->add_zero_attn; - params.apply_rotary_embedding = this->apply_rotary_embedding; + params.rotary_embedding_meta = this->rotary_embedding_meta; params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; + params.streaming_cache = this->streaming_cache; + params.tensor_parallelism_degree = this->tensor_parallelism_degree; if (this->name != nullptr) { strcpy(params.name, this->name); } @@ -876,11 +928,20 @@ size_t hash::operator()( hash_combine(key, params.qkv_bias); hash_combine(key, params.final_bias); hash_combine(key, params.add_zero_attn); - hash_combine(key, params.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.rope_theta); + hash_combine(key, params.rotary_embedding_meta.rope_type); + hash_combine(key, params.rotary_embedding_meta.factor); + hash_combine(key, params.rotary_embedding_meta.low_freq_factor); + hash_combine(key, params.rotary_embedding_meta.high_freq_factor); + hash_combine(key, + params.rotary_embedding_meta.original_max_position_embeddings); hash_combine(key, params.scaling_query); hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); hash_combine(key, params.position_bias); + hash_combine(key, params.streaming_cache); + hash_combine(key, params.tensor_parallelism_degree); return key; } }; // namespace std diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index b1687d12a..92bcbc546 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -36,10 +36,11 @@ __global__ void spec_store_kv_cache( DT const *devQKVProjArray, DT *kCache_ptr, DT *vCache_ptr, + /* Reserved: BatchConfig Updated, leave HIP code to be updated */ BatchConfig::PerTokenInfo *tokenInfos, BatchConfig::PerRequestInfo *requestInfo, - 
BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, - BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, + TreeSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, + TreeSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, int qProjSize, int kProjSize, int vProjSize, @@ -67,7 +68,7 @@ __global__ void spec_store_kv_cache( // int const beam_width = id_map[token_idx].beam_width; int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int const tok_id = tokenInfos[token_idx].abs_index_in_request; int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; int const parent_id = beamRequestInfos[req_id].parent_id[sub_req_id]; int const beam_depth = beamRequestInfos[req_id].current_depth; @@ -139,14 +140,14 @@ __global__ void spec_store_kv_cache( template void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, + TreeSearchBatchConfig const *bc, hipStream_t stream) { int num_tokens = bc->num_active_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; // printf("curr depth: %d\n", curr_depth); // assert(curr_depth < 3); if (num_tokens > 0) { - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; + int parallelism = m->local_hidden_size * KV_WEIGHT_NUM * num_tokens; hipLaunchKernelGGL(HIP_KERNEL_NAME(spec_store_kv_cache
), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), @@ -164,9 +165,9 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_tokens, BatchConfig::max_sequence_length(), - BeamSearchBatchConfig::MAX_BEAM_WIDTH, + TreeSearchBatchConfig::MAX_BEAM_WIDTH, /*root*/ curr_depth == 0, - m->hidden_size); + m->local_hidden_size); } } @@ -189,7 +190,7 @@ __global__ void spec_fill_entries_above_diagonal(DT *matrix, template void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, + TreeSearchBatchConfig const *bc, int shard_id, DT *output_ptr, DT const *bias_ptr, @@ -223,7 +224,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (bc->request_available[i]) { continue; } for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) { @@ -232,7 +233,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // int total_tokens = bc->token_last_available_idx[i] + 1; int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + int total_tokens = bc->requestsInfo[i].first_token_index_in_request + bc->requestsInfo[i].num_tokens_in_batch; // Compute (QK^T/sqrt(d_k)) int m_ = num_new_tokens; @@ -458,7 +459,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, template void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, + TreeSearchBatchConfig const *bc, int shard_id, DT const *input_ptr, DT const *weight_ptr, @@ -466,7 +467,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, DT const *bias_ptr, hipStream_t stream) { // here because we need postion info in infernece 1 - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + int max_tokens_per_batch = bc->max_tokens_per_ssm_batch(); checkCUDA( hipMemcpyAsync(m->token_infos, &(bc->tokensInfo), @@ -483,25 +484,25 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipMemcpyAsync(m->beam_token_infos, &(bc->beamTokenInfo), max_tokens_per_batch * bc->MAX_BEAM_WIDTH * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo), + sizeof(TreeSearchBatchConfig::BeamSearchPerTokenInfo), hipMemcpyHostToDevice, stream)); checkCUDA(hipMemcpyAsync( m->beam_request_infos, &(bc->beamRequestsInfo), bc->max_requests_per_batch() * - sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo), + sizeof(TreeSearchBatchConfig::BeamSearchPerRequestInfo), hipMemcpyHostToDevice, stream)); // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
<DT *>(m->devQKVProjArray), - bias_ptr, - stream); + compute_qkv(m, + bc, + shard_id, + input_ptr, + weight_ptr, + static_cast<DT *>
(m->devQKVProjArray), + bias_ptr, + stream); // phase 2: Update key/val cache update_kv_cache_kernel<DT>
(m, bc, stream); @@ -517,7 +518,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, /*static*/ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, + TreeSearchBatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, @@ -586,7 +587,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( int _num_q_heads, int _num_kv_heads) : IncMultiHeadSelfAttentionMeta(handler, - BEAM_SEARCH_MODE, + TREE_SEARCH_MODE, attn, attn->qSize, attn->kSize, @@ -595,7 +596,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, + attn->rotary_embedding_meta, attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, @@ -617,27 +618,29 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + int max_tokens_per_batch = BatchConfig::max_tokens_per_ssm_batch(); size_t beam_tokeninfo_size = - max_tokens_per_batch * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - size_t requestinfo_size = BeamSearchBatchConfig::max_requests_per_batch(); + max_tokens_per_batch * TreeSearchBatchConfig::MAX_BEAM_WIDTH; + size_t requestinfo_size = TreeSearchBatchConfig::max_requests_per_batch(); size_t beam_requestinfo_size = - BeamSearchBatchConfig::max_requests_per_batch(); + TreeSearchBatchConfig::max_requests_per_batch(); size_t total_size = requestinfo_size * sizeof(BatchConfig::PerRequestInfo) + beam_tokeninfo_size * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + + sizeof(TreeSearchBatchConfig::BeamSearchPerTokenInfo) + beam_requestinfo_size * - sizeof(BeamSearchBatchConfig:: + sizeof(TreeSearchBatchConfig:: BeamSearchPerRequestInfo); // more components will // be added here later // We always directly allocate memory for small speculative models - gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, - total_size); + gpu_mem_allocator.create_legion_instance( + beam_search_reserve_inst, + total_size, + "SpecIncMultiHeadSelfAttentionMeta"); beam_token_infos = gpu_mem_allocator - .allocate_instance( + .allocate_instance( beam_tokeninfo_size); // offset += beam_tokeninfo_size * // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); @@ -647,7 +650,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // offset += requestinfo_size * sizeof(BatchConfig::PerRequestInfo); beam_request_infos = gpu_mem_allocator - .allocate_instance( + .allocate_instance( beam_requestinfo_size); // offset += beam_requestinfo_size * // sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index a00ea9c95..6d7bf1364 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -15,15 +15,17 @@ #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" #endif +#include "flashinfer/prefill_attention_decl.cuh" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/utils/cuda_helper.h" -namespace FlexFlow { +#include +#include -#define 
WARP_SIZE 32 +namespace FlexFlow { // declare Legion names using Legion::coord_t; @@ -33,711 +35,267 @@ using namespace Kernels::IncMultiHeadAttention; namespace Kernels { namespace SpecIncMultiHeadSelfAttention { -template -__global__ void compute_spec_inc_attention_kernel_generation_kernel( - DT const *query, - DT const *key_cache, - DT const *value_cache, - DT *output_ptr, - float const scale, - int const max_seq_length, - int per_head_size, - int hidden_size, - BatchConfig::PerRequestInfo *request_infos, - BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, - BatchConfig::BitMask *causalMask, - bool *request_completed) { - - // q, k - using Q_vec = typename VEC_K::Type; - using K_vec = typename VEC_K::Type; - using V_vec = typename VEC_V
::Type; - using Out_sum = typename Vec_fp32_::Type; - - constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; - - constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); - constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; - constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; - // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); - - // thread id - int const tidx = threadIdx.x; - // head id - int const head_idx = blockIdx.x; - // nth request idx - int const request_idx = blockIdx.y; - - // request id in batch config - int const batch_config_request_id = - request_infos[request_idx].batch_config_request_id; - - // request_idx = re - - BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; - - int const first_step = 0; - - // int const tlength = - // request_infos[batch_config_request_id].first_token_depth_in_request + - // request_infos[batch_config_request_id].num_tokens_in_batch; - - int const totalCacheSize = - bitmask.non_tree_cache_size + bitmask.tree_size + bitmask.prompt_size - 1; - - int first_token_idx = 0; - for (int r = 0; r < batch_config_request_id; r++) { - first_token_idx += request_completed[r] ? 0 : causalMask[r].this_layer_size; - } - - int const tree_branch_num = - beam_request_infos[batch_config_request_id].sub_request_num; - - // shared memory objects - extern __shared__ char smem_[]; - - float *qk_smem = reinterpret_cast(smem_); - float *out_smem = reinterpret_cast(smem_); - - float qk_max = -FLT_MAX; - - // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum - __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - - const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + - head_idx * per_head_size; - __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; - - // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE - int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; - int ki_o = tidx % THREADS_PER_KEY; - // the first key's offset for this thread - // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... - int ko = tidx / THREADS_PER_KEY; - // load q tensor - Q_vec q_vec[K_VECS_PER_THREAD]; - - constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; - // The number of keys per warp. 
- constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; - - DT const *k_cache_batch = - key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; - - int ti_end = - div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; - - for (int qi = 0; qi < tree_branch_num; qi += 1) { -#pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - q_vecs[ki_o][ii] = *reinterpret_cast( - q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + - ii * THREADS_PER_KEY * K_VEC_SIZE); - } - - int const query_token = - bitmask.prompt_size + bitmask.tree_size - 1 - tree_branch_num + qi; - - __syncthreads(); - for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { - K_vec k[K_VECS_PER_THREAD]; - int const ti_circ = ti % max_seq_length; - - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; - if (ti < totalCacheSize) { - - k[ii] = *reinterpret_cast( - k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + - jj); - } - } - float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); - - if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) { - // todo add alobi here - // bool const mask = ti_circ >= totalCacheSize; - bool const mask = (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << query_token)))); - - // if (head_idx == 0 && ti == 0 && request_idx == 15 && !mask) { - // printf("spec inc attn qkqkqk request id %d, %.10f, %d\n", - // batch_config_request_id, - // ti, - // qk, - // qi); - // } - qk_max = mask ? qk_max : fmaxf(qk_max, qk); - qk_smem[ti - first_step] = mask ? 0.f : qk; - } - } - - __syncthreads(); - -#pragma unroll - for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Decompose the thread index into warp and lane. - int const warp = tidx / WARP_SIZE; - int const lane = tidx % WARP_SIZE; - - // The warp leader writes the max to shared memory. - if (lane == 0) { - red_smem[warp] = qk_max; - } - - // Make sure the products are in shared memory. - __syncthreads(); - - // The warps finalize the reduction. - qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; -#pragma unroll - for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Broadcast to all the threads in the warp. - qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("spec inc attn first token qk_max %.10f\n", qk_max); - // } - - float exp_sum = 0.f; - for (int ti = first_step + tidx; ti < totalCacheSize; - ti += THREADS_PER_BLOCK) { - bool const mask = (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << query_token)))); - float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); - exp_sum += logit; - qk_smem[ti - first_step] = mask ? 0.0f : logit; - } - - // Compute the sum. - exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); - - // softmax - float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); - for (int ti = first_step + tidx; ti < totalCacheSize; - ti += THREADS_PER_BLOCK) { - qk_smem[ti - first_step] *= inv_sum; - } - - __syncthreads(); - - // value projection - constexpr int V_VEC_SIZE = 16 / sizeof(DT); - // A vector of V elements for the current timestep. 
- // using V_vec_k = typename V_vec_k_::Type; - // using V_vec_acum = typename V_vec_acum_fp32_::Type; - - // The value computed by this thread. - int vo = tidx / THREADS_PER_VALUE; - // The hidden dimensions computed by this particular thread. - int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; - constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; - - Out_sum out; - zero(out); - - // The base pointer for the value in the cache buffer. - DT const *v_cache_batch = - value_cache + batch_config_request_id * max_seq_length * hidden_size + - vi; - - if (Dh == Dh_MAX || vi < Dh) { - for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { - // Load the values from the cache. - int const ti_circ = ti % max_seq_length; - V_vec v = *reinterpret_cast( - v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); - - bool const mask = (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << query_token)))); - float logit = mask ? 0.0f : qk_smem[ti - first_step]; - out = FlexFlow::fma(logit, cast_to_float(v), out); - } - } - - // // Make sure we can start writing to shared memory. - __syncthreads(); - - // Run the final reduction amongst the different groups computing different - // partial outputs. - if (Dh == Dh_MAX || vi < Dh) { -#pragma unroll - for (int active_groups = V_PER_ITER; active_groups >= 2; - active_groups /= 2) { - - // The midpoint in the number of active groups. - int midpoint = active_groups / 2; - - // The upper part of active threads store to shared memory. - if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { - *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = - out; - } - __syncthreads(); - - // The bottom warps update their values. - if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { - out = add(*reinterpret_cast(out_smem + vo * Dh + vi), - out); - } - __syncthreads(); - } - } - - // Output the final values. 
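For reference, the generation kernel being removed in this hunk builds a numerically stable softmax by hand: each thread keeps a running qk_max, warps reduce it with XOR (butterfly) shuffles, warp leaders combine their partial maxima through shared memory, and the logits are then exponentiated against that max and normalized by the block-wide sum before the per-thread value vectors are accumulated and reduced into the final output written just below. A stripped-down CUDA example of only the warp-level max step, standalone and illustrative (the kernel name and launch shape are not from FlexFlow):

#include <cstdio>
#include <cuda_runtime.h>

// XOR (butterfly) shuffle reduction: after the loop every lane in the warp
// holds the maximum of all 32 inputs -- the pattern the removed kernel uses
// to reduce qk_max before computing the softmax.
__global__ void warp_max_kernel(float const *in, float *out) {
  float v = in[threadIdx.x];
#pragma unroll
  for (int mask = 16; mask >= 1; mask /= 2) {
    v = fmaxf(v, __shfl_xor_sync(0xffffffffu, v, mask));
  }
  if (threadIdx.x == 0) {
    *out = v;
  }
}

int main() {
  float h_in[32], h_out = 0.f;
  float *d_in = nullptr, *d_out = nullptr;
  for (int i = 0; i < 32; ++i) {
    h_in[i] = static_cast<float>((i * 7) % 32); // a permutation of 0..31
  }
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  warp_max_kernel<<<1, 32>>>(d_in, d_out);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("warp max = %.1f\n", h_out); // expect 31.0
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}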
- if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { - convert_from_float(*reinterpret_cast( - output_ptr + (first_token_idx + qi) * hidden_size + - head_idx * per_head_size + vi), - out); - } - } -} +using flashinfer::BatchPrefillHandler; +using flashinfer::BatchPrefillWithPagedKVCacheWrapperDispatched; +using flashinfer::LogitsPostHook; +using flashinfer::MaskMode; +using flashinfer::paged_kv_t; +using flashinfer::PageStorage; +using flashinfer::PosEncodingMode; +using flashinfer::QKVLayout; template -__global__ void spec_inc_store_kv_cache( - DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - BatchConfig::PerTokenInfo *tokenInfos, - BatchConfig::PerRequestInfo *requestInfo, - BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, - BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, - BatchConfig::BitMask *causalMask, - int qProjSize, - int kProjSize, - int vProjSize, - int num_tokens, - int max_seq_len, - bool is_root, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / (hidden_size); - int offset = i % hidden_size; - - size_t val_idx = - token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; - - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; - - int const req_id = tokenInfos[token_idx].request_index; - // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - - int const request_token_offset = - requestInfo[req_id].first_token_offset_in_batch; - - BatchConfig::BitMask bitmask = causalMask[req_id]; - - // if prompt token -> token id - // if tree token: - - int const cache_idx = bitmask.prompt_size + bitmask.non_tree_cache_size + - bitmask.tree_size - 1 - bitmask.this_layer_size + - token_idx - request_token_offset; - - kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + - offset] = vVal; - } -} - -template -void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); - int curr_depth = bc->beamRequestsInfo[0].current_depth; - if (num_tokens > 0) { - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; - spec_inc_store_kv_cache<<>>( - static_cast
<DT *>(m->devQKVProjArray), - static_cast<DT *>(m->keyCache), - static_cast<DT *>
(m->valueCache), - m->token_infos, - m->request_infos, - m->beam_token_infos, - m->beam_request_infos, - m->causalMask, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens, - BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(), - /*root*/ curr_depth == 0, - m->hidden_size); - } -} - -#define LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( \ - DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ - smem_sz = smem_size_in_bytes
(m->qProjSize, \ - BatchConfig::max_sequence_length() + \ - BatchConfig::max_spec_tree_token_num(), \ - THREADS_PER_VALUE, \ - THDS_PER_BLOCK); \ - compute_spec_inc_attention_kernel_generation_kernel \ - <<>>( \ - static_cast
<DT *>(m->devQKVProjArray), \ - static_cast<DT *>(m->keyCache), \ - static_cast<DT *>
(m->valueCache), \ - output_ptr, \ - scale, \ - BatchConfig::max_sequence_length() + \ - BatchConfig::max_spec_tree_token_num(), \ - m->qProjSize, \ - m->hidden_size, \ - m->request_infos, \ - m->beam_request_infos, \ - m->causalMask, \ - m->request_completed) - -template -void compute_spec_inc_attention_kernel_generation( - SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - DT *output_ptr, - cudaStream_t stream) { - // one block == one head per request - // how many generation requests - dim3 grid(m->num_q_heads, bc->get_speculative_request_num()); - int const per_head_size = m->qProjSize; - float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; - size_t smem_sz; - if (per_head_size == 64) { - constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; - LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( - DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); - } else if (per_head_size == 128) { - constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; - LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( - DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); +void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + // int device; + // checkCUDA(cudaGetDevice(&device)); + // cudaEvent_t t_start, t_end; + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + // global constant parameters + uint32_t const num_q_heads = m->num_q_heads; + uint32_t const num_kv_heads = m->num_kv_heads; + uint32_t const head_dim = m->qk_dim; + uint32_t const batch_size = bc->num_active_requests(); + float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->qk_dim) : 1.0f; + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "Update custom mask time: " << elapsed << " ms\n"; + // } + + half *q = static_cast(m->queryTmp), + *kv = static_cast(m->kvCache), + *o = static_cast(m->outputTmp); + paged_kv_t paged_kv( + num_kv_heads, + kPagesize, + head_dim, + batch_size, + QKVLayout::kNHD, + kv, + m->handle.tree_search_attention_metadata->kv_indices, + m->handle.tree_search_attention_metadata->kv_indptr, + m->handle.tree_search_attention_metadata->kv_last_page_len); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" attn prep time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + BatchPrefillHandler *handler = nullptr; + + if (!bc->prompt_phase) { + assert(m->handle.tree_search_attention_metadata->decode_handler_collections + .count(batch_size) != 0 && + "Handler is not initialized"); + handler = static_cast( + m->handle.tree_search_attention_metadata + ->decode_handler_collections[batch_size]); } else { - assert(false && "a unsupported head size"); - } -} - -template -__global__ void spec_fill_entries_above_diagonal(DT *matrix, - size_t new_tokens, - size_t total_tokens_in_request, - size_t num_q_heads, - DT value) { - CUDA_KERNEL_LOOP(i, 
new_tokens * total_tokens_in_request * num_q_heads) { - // size_t head_idx = i / (new_tokens * total_tokens_in_request); - size_t src_idx = (i / new_tokens) % total_tokens_in_request; - size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens; - // Casual Mask - if (src_idx > dst_idx) { - matrix[i] = value; - } - } -} - -template -void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + assert(m->handle.tree_search_attention_metadata->prompt_handler_collections + .count(batch_size) != 0 && + "Handler is not initialized"); + handler = static_cast( + m->handle.tree_search_attention_metadata + ->prompt_handler_collections[batch_size]); } -#endif - // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); - int tokens_previous_requests = 0; - int tokens_prev_requests_squares = 0; - // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; - int q_block_size = m->qProjSize; - - int kt_block_size = m->kProjSize; - int kt_req_block_size = kt_block_size * m->num_q_heads * - (BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num()); - int vt_block_size = m->vProjSize; - int vt_req_block_size = vt_block_size * m->num_q_heads * - (BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num()); - assert(m->qProjSize == m->kProjSize); - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase) || - (bc->requestsInfo[i].num_tokens_in_batch == 0)) { - continue; - } else if (tokens_previous_requests < bc->num_generation_tokens) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; - continue; - } - - // all requests in prompt phase should only have one sub requests; - assert(bc->sub_requests[i] == 1); - // int num_new_tokens = bc->num_processing_tokens[i]; - // int total_tokens = bc->token_last_available_idx[i] + 1; - - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - bc->requestsInfo[i].num_tokens_in_batch; - if (num_new_tokens <= 0) { - continue; + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" BeginForward time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + DISPATCH_HEADDIM(head_dim, HEAD_DIM, { + cudaError_t result; + if 
(bc->prompt_phase) { + result = + BatchPrefillWithPagedKVCacheWrapperDispatched( + handler, + q, + m->handle.tree_search_attention_metadata->q_indptr, + /*q_offset=*/nullptr, + paged_kv, + /*custom_mask=*/nullptr, + /*qk_indptr=*/nullptr, + o, + /*lse=*/nullptr, + num_q_heads, + /*window_left=*/-1, + /*logits_soft_cap=*/0.f, + sm_scale, + /*rope_scale=*/1.f, + /*rope_theta=*/static_cast(1e4), + stream); + } else { + result = + BatchPrefillWithPagedKVCacheWrapperDispatched( + handler, + q, + m->handle.tree_search_attention_metadata->q_indptr, + /*q_offset=*/nullptr, + paged_kv, + m->handle.tree_search_attention_metadata->custom_mask, + m->handle.tree_search_attention_metadata->qk_indptr, + o, + /*lse=*/nullptr, + num_q_heads, + /*window_left=*/-1, + /*logits_soft_cap=*/0.f, + sm_scale, + /*rope_scale=*/1.f, + /*rope_theta=*/static_cast(1e4), + stream); } - - // Compute (QK^T/sqrt(d_k)) - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - - // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); - } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
(m->devQKVProjArray) + - bc->requestsInfo[i].first_token_offset_in_batch * - m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - - // print_tensor((float*)A, 32, "A"); - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - - // if (i == 0 && sub_req_id == 0 && - // bc->beam_slots.at(0).current_depth == 1) { - // int offset = (float *)B - m->keyCache; - // printf("key cache offset %d\n", kt_req_block_size); - // } - // To get C, skip over QK^T products from previous requests - DT *C = static_cast
(m->qk_prods) + - m->num_q_heads * tokens_prev_requests_squares; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // print_tensor((float*)C, 32, "C"); - // add alibi position bias to qk production - // add alibi position bias to qk production - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. - assert(num_new_tokens <= total_tokens); - if (num_new_tokens > 1) { - size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; - spec_fill_entries_above_diagonal<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - static_cast
(-INFINITY)); + if (result != cudaSuccess) { + throw std::runtime_error("Failed to run " + "TreeSearchAttentionForwardKernel: " + + std::string(cudaGetErrorString(result))); } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax) + - m->num_q_heads * tokens_prev_requests_squares; - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = m->vProjSize; - n = num_new_tokens; - k = total_tokens; - lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - strideA = vt_block_size; - strideB = num_new_tokens * total_tokens; - strideC = m->vProjSize; - // To get A, skip over V^T entries from previous requests (all heads + - // padding) - A = static_cast
(m->valueCache) + i * vt_req_block_size; - // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - B = C_softmax; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - - int token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - - C = static_cast
(m->attn_heads) + - (token_offset)*m->num_q_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - tokens_previous_requests += num_new_tokens; - tokens_prev_requests_squares += num_new_tokens * total_tokens; - } - - if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { - bc->print(); - printf("tokens_previous_requests: %i\n", tokens_previous_requests); - printf("num_tokens: %i\n", num_tokens); - printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); - } - assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); + }); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" actual attn time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + produce_output(m, bc, output_ptr, stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" produce_output_kernel time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); } template -void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, +void inference_kernel(SpecIncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, int shard_id, DT const *input_ptr, DT const *weight_ptr, DT *output_ptr, DT const *bias_ptr, cudaStream_t stream) { - // phase 1: Implement kernel to compute KQV for input tokens - - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
<DT *>(m->devQKVProjArray),
-                     bias_ptr,
-                     stream);
-  // phase 2: Update key/val cache
-  update_kv_cache_kernel<DT>(m, bc, stream);
-  if (bc->num_generation_tokens > 0) {
-    compute_spec_inc_attention_kernel_generation<DT>(
-        m, bc, static_cast<DT *>(m->attn_heads), stream);
+  // phase 1: Compute QKV Projections of the batch
+  compute_qkv(m,
+              bc,
+              shard_id,
+              input_ptr,
+              weight_ptr,
+              static_cast<DT *>(m->devQKVProjArray),
+              bias_ptr,
+              stream);
+
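+  // Note: the streaming cache holds pre-RoPE keys/values, so it has to be
+  // maintained before position encoding is applied to the batch in phase 3.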
+  // phase 2: First maintain the streaming cache, because it needs the
+  // pre-pos-encoding values
+  if (m->streaming_cache) {
+    // Move the pre-pos-encoding cache to where the attention kernel reads it
+    update_kv_in_streaming_cache<DT>(m, bc, stream);
+    // Apply pos-encoding to those k values
+    apply_pos_encoding_to_streaming_proj<DT>(m, bc, stream);
+    // Commit to the streaming cache
+    if (bc->prompt_phase) {
+      commit_kv<DT>(m, bc, stream);
+    }
   }
-  // phase 3: Compute attention score
-  // 3 kernels for pahse 3: matmul1 - softmax - matmal2
-  if (bc->num_tokens > bc->num_generation_tokens) {
-    compute_attention_kernel_prompt(
-        m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream);
+
+  // phase 3: Take care of the batch
+  {
+    // Apply pos-encoding to the batch
+    apply_pos_encoding_to_tokens_in_batch(
+        m, bc, static_cast<DT *>(m->devQKVProjArray), stream);
+    // Move the batch qkv values to where the attention kernel reads them
+    update_qkv_in_batch<DT>(m, bc, stream);
   }
-  // compute output production and bias together for all tokens
-  int num_tokens = bc->num_active_tokens();
+  // phase 4: Attention computation
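+  // tree_search_attention picks the flashinfer BatchPrefillHandler that was
+  // planned for this batch size (prompt or decode collection), runs paged-KV
+  // prefill attention (using the tree custom mask outside the prompt phase),
+  // and writes the result into m->attn_heads via produce_output.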
+  tree_search_attention<DT>(m, bc, static_cast<DT *>
(m->attn_heads), stream); + + // Debug output: + // int size = m->local_hidden_size * BatchConfig::max_tokens_per_batch(); + // float *temp_output = new float[size]; + // cudaDeviceSynchronize(); + // cudaMemcpy( + // temp_output, m->attn_heads, size * sizeof(float), + // cudaMemcpyDeviceToHost); + + // printf("Output: "); + // for (int i = 0; i < bc->num_tokens; ++i) { + // float temp = 0; + // for (int j = 0; j < m->local_hidden_size; ++j) { + // temp += temp_output[i * m->local_hidden_size + j]; + // } + // printf("%.6f ", temp); + // } + // printf("\n"); + + // delete[] temp_output; + + // phase 5: Compute output production and bias together for all tokens + int num_tokens = bc->num_active_tokens(); compute_o_prod_bias( m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } @@ -747,8 +305,8 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, /*static*/ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( - SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, + SpecIncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, @@ -774,7 +332,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( if (input.data_type == DT_HALF) { half const *bias_ptr = use_bias ? bias.get_half_ptr() : static_cast(nullptr); - Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( m, bc, shard_id, @@ -786,7 +344,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( } else if (input.data_type == DT_FLOAT) { float const *bias_ptr = use_bias ? bias.get_float_ptr() : static_cast(nullptr); - Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( m, bc, shard_id, @@ -822,16 +380,13 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( int _num_q_heads, int _num_kv_heads) : IncMultiHeadSelfAttentionMeta(handler, - BEAM_SEARCH_MODE, + TREE_SEARCH_MODE, attn, - attn->qSize, - attn->kSize, - attn->vSize, - attn->qProjSize, - attn->kProjSize, - attn->vProjSize, - attn->oProjSize, - attn->apply_rotary_embedding, + attn->hidden_size, + attn->qk_dim, + attn->v_dim, + attn->o_dim, + attn->rotary_embedding_meta, attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, @@ -846,46 +401,30 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( _num_q_heads, _num_kv_heads, DT_NONE, - false) { + false, + attn->streaming_cache) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); - // allocate memory for the seqArray and reserve space - { - beam_token_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo)); - - beam_request_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo)); - causalMask = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo)); - - request_completed = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + 
sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask)); - } + // set attention constants + handler.tree_search_attention_metadata->set_enabled(true); + handler.tree_search_attention_metadata->set_num_q_heads(num_q_heads); + handler.tree_search_attention_metadata->set_num_kv_heads(num_kv_heads); + handler.tree_search_attention_metadata->set_head_dim(qk_dim); cudaStreamSynchronize(stream); } SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta(void) { - if (beam_search_reserve_inst != Realm::RegionInstance::NO_INST) { - beam_search_reserve_inst.destroy(); - } + // for (auto &decode_handler: decode_handler_collections) { + // delete static_cast(decode_handler.second); + // } + // for (auto &prompt_handler: prompt_handler_collections) { + // delete static_cast(prompt_handler.second); + // } } }; // namespace FlexFlow diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index d0efb01d5..a69bf61b1 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -46,7 +46,7 @@ using Legion::TaskArgument; using Legion::TaskLauncher; using PCG::Node; -LegionRuntime::Logger::Category log_tree_verify("TreeVerifyIncMHA"); +Legion::Logger log_tree_verify("TreeVerifyIncMHA"); bool TreeIncMultiHeadSelfAttentionParams::is_valid( ParallelTensorShape const &input) const { @@ -55,7 +55,7 @@ bool TreeIncMultiHeadSelfAttentionParams::is_valid( } Tensor FFModel::inc_multihead_self_attention_verify( - const Tensor input, + Tensor const input, int embed_dim, int num_heads, int kdim, @@ -66,7 +66,7 @@ Tensor FFModel::inc_multihead_self_attention_verify( bool add_zero_attn, DataType data_type, Initializer *kernel_initializer, - bool apply_rotary_embedding, + RotaryEmbeddingMeta rotary_embedding_meta, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -84,7 +84,7 @@ Tensor FFModel::inc_multihead_self_attention_verify( add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -93,7 +93,7 @@ Tensor FFModel::inc_multihead_self_attention_verify( } Tensor FFModel::inc_multiquery_self_attention_verify( - const Tensor input, + Tensor const input, int embed_dim, int num_q_heads, int num_kv_heads, @@ -105,7 +105,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify( bool add_zero_attn, DataType data_type, Initializer *kernel_initializer, - bool apply_rotary_embedding, + RotaryEmbeddingMeta rotary_embedding_meta, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -149,13 +149,12 @@ Tensor FFModel::inc_multiquery_self_attention_verify( numdims, dims, data_type, li, 0, true /*create_grad*/); } // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); + int qk_dim = kdim, v_dim = kdim, o_dim = embed_dim; + int hidden_size = input->dims[0]; + int qParas = qk_dim * hidden_size; + int kParas = qk_dim * hidden_size; + int vParas = v_dim * hidden_size; + int oParas = o_dim * (v_dim > 0 ? 
v_dim : hidden_size); int one_head_size = qParas + kParas + vParas + oParas; int weight_size = qParas * num_q_heads + kParas * num_q_heads + vParas * num_q_heads + oParas * num_q_heads; @@ -178,10 +177,8 @@ Tensor FFModel::inc_multiquery_self_attention_verify( } if (qkv_bias || final_bias) { // q, k, v, o - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + - (final_bias ? oProjSize : 0)}; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; + int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + (final_bias ? o_dim : 0)}; li->weights[1] = create_weight_legion_ordering(1, dims, data_type, @@ -200,10 +197,20 @@ Tensor FFModel::inc_multiquery_self_attention_verify( li->add_int_property("final_bias", final_bias); li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); - li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + li->add_int_property("apply_rotary_embedding", + rotary_embedding_meta.apply_rotary_embedding); + li->add_float_property("rope_theta", rotary_embedding_meta.rope_theta); + li->add_string_property("rope_type", rotary_embedding_meta.rope_type); + li->add_float_property("factor", rotary_embedding_meta.factor); + li->add_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + li->add_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + li->add_int_property("original_max_position_embeddings", + rotary_embedding_meta.original_max_position_embeddings); li->add_int_property("scaling_query", scaling_query); - li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("qk_prod_scaling", qk_prod_scaling); + li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("position_bias", position_bias); li->add_int_property("quantization_type", quantization_type); li->add_int_property("offload", offload); @@ -236,9 +243,18 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( bool final_bias = (bool)value; layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; + RotaryEmbeddingMeta rotary_embedding_meta; layer->get_int_property("apply_rotary_embedding", value); - bool apply_rotary_embedding = (bool)value; - layer->get_int_property("scaling_query", value); + rotary_embedding_meta.apply_rotary_embedding = (bool)value; + layer->get_float_property("rope_theta", rotary_embedding_meta.rope_theta); + layer->get_string_property("rope_type", rotary_embedding_meta.rope_type); + layer->get_float_property("factor", rotary_embedding_meta.factor); + layer->get_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + layer->get_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + layer->get_int_property("original_max_position_embeddings", value); + rotary_embedding_meta.original_max_position_embeddings = (int)value; bool scaling_query = (bool)value; float scaling_factor; layer->get_float_property("scaling_factor", scaling_factor); @@ -264,7 +280,7 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( qkv_bias, final_bias, add_zero_attn, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -279,7 +295,7 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + 
ParallelTensor const _input, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -289,7 +305,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, @@ -311,15 +327,13 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), - scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), - quantization_type(_quantization_type), offload(_offload), - tensor_parallelism_degree(_tensor_parallelism_degree) { + rotary_embedding_meta(_rotary_embedding_meta), + hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim), + o_dim(_embed_dim), qoSeqLength(_input->dims[1].size), + kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), + scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), + position_bias(_position_bias), quantization_type(_quantization_type), + offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree) { // overwrite layer_guid layer_guid = _layer_guid; @@ -336,11 +350,11 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( // Create weight tensor int num_dims = inputs[0]->num_dims; // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; + int qParas = this->qk_dim * this->hidden_size; + int kParas = this->qk_dim * this->hidden_size; + int vParas = this->v_dim * this->hidden_size; int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); + this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size); ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; @@ -366,10 +380,9 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
o_dim : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -393,8 +406,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( FFModel &model, - const ParallelTensor _input, - const ParallelTensor _weight, + ParallelTensor const _input, + ParallelTensor const _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -404,7 +417,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( bool _qkv_bias, bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, @@ -427,15 +440,13 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), - scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), - quantization_type(_quantization_type), offload(_offload), - tensor_parallelism_degree(_tensor_parallelism_degree) + rotary_embedding_meta(_rotary_embedding_meta), + hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim), + o_dim(_embed_dim), qoSeqLength(_input->dims[1].size), + kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), + scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), + position_bias(_position_bias), quantization_type(_quantization_type), + offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree) // bias_initializer(_bias_initializer) { numOutputs = 1; @@ -451,11 +462,11 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( // Create weight tensor int num_dims = inputs[0]->num_dims; // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; + int qParas = this->qk_dim * this->hidden_size; + int kParas = this->qk_dim * this->hidden_size; + int vParas = this->v_dim * this->hidden_size; int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); + this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size); ParallelDim dims[2]; dims[0] = inputs[0]->dims[num_dims - 2]; dims[0].size = dims[0].degree; @@ -479,10 +490,9 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads; bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
o_dim : 0); bias_shape.dims[1].size = bias_shape.dims[2].size = 1; weights[1] = model.create_parallel_weight_legion_ordering(bias_shape.num_dims, @@ -510,21 +520,21 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( FFModel &model, TreeIncMultiHeadSelfAttention const &other, - const ParallelTensor input, + ParallelTensor const input, bool allocate_weights) : TreeIncMultiHeadSelfAttention(model, other.layer_guid, input, - other.oProjSize, + other.o_dim, other.num_q_heads, other.num_kv_heads, - other.qProjSize, - other.vProjSize, + other.qk_dim, + other.v_dim, other.dropout, other.qkv_bias, other.final_bias, other.add_zero_attn, - other.apply_rotary_embedding, + other.rotary_embedding_meta, other.scaling_query, other.scaling_factor, other.qk_prod_scaling, @@ -553,7 +563,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( params.qkv_bias, params.final_bias, params.add_zero_attn, - params.apply_rotary_embedding, + params.rotary_embedding_meta, params.scaling_query, params.scaling_factor, params.qk_prod_scaling, @@ -695,7 +705,7 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( attn->num_kv_heads / attn->tensor_parallelism_degree + (attn->num_kv_heads % attn->tensor_parallelism_degree != 0); - assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); + assert(attn->o_dim == output.domain.hi()[0] - output.domain.lo()[0] + 1); Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::GPU_FB_MEM) @@ -806,13 +816,12 @@ void TreeIncMultiHeadSelfAttention::inference_task( Runtime *runtime) { assert(task->regions.size() == regions.size()); - TreeVerifyBatchConfig const &bc = - Future(task->futures[0]).get_result(); - log_tree_verify.debug( - "TreeVerifyBatchConfig, num_tokens: %d, num_requests: %d", - bc.num_tokens, - bc.num_active_requests()); - if (bc.num_tokens == 0) { + // BatchConfig const &bc = Future(task->futures[0]).get_result(); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + log_tree_verify.debug("BatchConfig, num_tokens: %d, num_requests: %d", + bc->num_tokens, + bc->num_active_requests()); + if (bc->num_tokens == 0) { return; } @@ -858,7 +867,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( assert(task->index_point.get_dim() == 1); TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, &bc, task->index_point.point_data[0], input, weight, output, biases); + m, bc, task->index_point.point_data[0], input, weight, output, biases); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -869,7 +878,7 @@ void TreeIncMultiHeadSelfAttention::inference_task( weights_accessors.push_back(biases); } TreeIncMultiHeadSelfAttention::save_inference_tensors_to_file( - m, shard_id, &bc, {input}, weights_accessors, {output}); + m, shard_id, bc, {input}, weights_accessors, {output}); } } @@ -901,7 +910,19 @@ bool operator==(TreeIncMultiHeadSelfAttentionParams const &lhs, lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && lhs.add_zero_attn == rhs.add_zero_attn && - lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && + lhs.rotary_embedding_meta.apply_rotary_embedding == + rhs.rotary_embedding_meta.apply_rotary_embedding && + lhs.rotary_embedding_meta.rope_theta == + rhs.rotary_embedding_meta.rope_theta && + lhs.rotary_embedding_meta.rope_type == + rhs.rotary_embedding_meta.rope_type && + lhs.rotary_embedding_meta.factor == 
rhs.rotary_embedding_meta.factor && + lhs.rotary_embedding_meta.low_freq_factor == + rhs.rotary_embedding_meta.low_freq_factor && + lhs.rotary_embedding_meta.high_freq_factor == + rhs.rotary_embedding_meta.high_freq_factor && + lhs.rotary_embedding_meta.original_max_position_embeddings == + rhs.rotary_embedding_meta.original_max_position_embeddings && lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && lhs.qk_prod_scaling == rhs.qk_prod_scaling && @@ -912,16 +933,16 @@ TreeIncMultiHeadSelfAttentionParams TreeIncMultiHeadSelfAttention::get_params() const { TreeIncMultiHeadSelfAttentionParams params; params.layer_guid = this->layer_guid; - params.embed_dim = this->oProjSize; + params.embed_dim = this->o_dim; params.num_q_heads = this->num_q_heads; params.num_kv_heads = this->num_kv_heads; - params.kdim = this->kProjSize; - params.vdim = this->vProjSize; + params.kdim = this->qk_dim; + params.vdim = this->v_dim; params.dropout = this->dropout; params.qkv_bias = this->qkv_bias; params.final_bias = this->final_bias; params.add_zero_attn = this->add_zero_attn; - params.apply_rotary_embedding = this->apply_rotary_embedding; + params.rotary_embedding_meta = this->rotary_embedding_meta; params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; @@ -949,7 +970,14 @@ size_t hash::operator()( hash_combine(key, params.qkv_bias); hash_combine(key, params.final_bias); hash_combine(key, params.add_zero_attn); - hash_combine(key, params.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.rope_theta); + hash_combine(key, params.rotary_embedding_meta.rope_type); + hash_combine(key, params.rotary_embedding_meta.factor); + hash_combine(key, params.rotary_embedding_meta.low_freq_factor); + hash_combine(key, params.rotary_embedding_meta.high_freq_factor); + hash_combine(key, + params.rotary_embedding_meta.original_max_position_embeddings); hash_combine(key, params.scaling_query); hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 26291fb3b..cf3426b3e 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -36,7 +36,8 @@ __global__ void commit_tokens_kernel( DT const *devQKVProjArray, DT *kCache_ptr, DT *vCache_ptr, - TreeVerifyBatchConfig::CommittedTokensInfo const *committedTokenInfos, + /* Reserved: BatchConfig Updated, leave HIP code to be updated */ + BatchConfig::CommittedTokensInfo const *committedTokenInfos, int qProjSize, int kProjSize, int vProjSize, @@ -70,11 +71,12 @@ __global__ void commit_tokens_kernel( template void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, hipStream_t stream) { int num_tokens_to_commit = bc->num_tokens_to_commit; if (num_tokens_to_commit > 0) { - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens_to_commit; + int parallelism = + m->local_hidden_size * KV_WEIGHT_NUM * num_tokens_to_commit; hipLaunchKernelGGL( HIP_KERNEL_NAME(commit_tokens_kernel
), GET_BLOCKS(parallelism), @@ -91,24 +93,24 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, num_tokens_to_commit, m->num_active_tokens, // number of active tokens in previous batch BatchConfig::max_sequence_length(), - m->hidden_size); + m->local_hidden_size); } } template -__global__ void update_tree_branch_kv_cache( - DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int vProjSize, - int num_tokens_in_branch, - int processed_tokens_in_batch, - int total_tokens_in_batch, - int max_seq_len, - int hidden_size) { +__global__ void + update_tree_branch_kv_cache(DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + BatchConfig::PerTokenInfo const *tokenInfos, + int qProjSize, + int kProjSize, + int vProjSize, + int num_tokens_in_branch, + int processed_tokens_in_batch, + int total_tokens_in_batch, + int max_seq_len, + int hidden_size) { CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size * 2) { int token_idx = i / (hidden_size * KV_WEIGHT_NUM); int offset = i % hidden_size; @@ -119,7 +121,7 @@ __global__ void update_tree_branch_kv_cache( DT vVal = devQKVProjArray[val_idx + hidden_size]; int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int const tok_id = tokenInfos[token_idx].abs_index_in_request; kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + offset] = kVal; vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + @@ -146,7 +148,7 @@ __global__ void tree_fill_entries_above_diagonal(DT *matrix, template void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, int shard_id, DT *output_ptr, DT const *bias_ptr, @@ -178,7 +180,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (bc->request_available[i]) { continue; } int last_token_idx_of_the_request = @@ -187,17 +189,17 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int num_new_tokens = 1; int j = processed_tokens_in_batch; while ((j + 1 <= last_token_idx_of_the_request) && - (bc->tokensInfo[j].abs_depth_in_request + 1 == - bc->tokensInfo[j + 1].abs_depth_in_request)) { + (bc->tokensInfo[j].abs_index_in_request + 1 == + bc->tokensInfo[j + 1].abs_index_in_request)) { j++; num_new_tokens++; } - int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1; + int total_tokens_in_request = bc->tokensInfo[j].abs_index_in_request + 1; assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); { // update K-V cache - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_new_tokens; + int parallelism = m->local_hidden_size * KV_WEIGHT_NUM * num_new_tokens; hipLaunchKernelGGL( HIP_KERNEL_NAME(update_tree_branch_kv_cache
), GET_BLOCKS(parallelism), @@ -215,7 +217,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, processed_tokens_in_batch, // num_processed_tokens_in_batch m->num_active_tokens, // total_tokens_in_batch BatchConfig::max_sequence_length(), - m->hidden_size); + m->local_hidden_size); } // bc->token_last_available_idx[i] + 1; @@ -437,7 +439,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, template void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, int shard_id, DT const *input_ptr, DT const *weight_ptr, @@ -464,13 +466,12 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // Note that m->num_active_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache - checkCUDA( - hipMemcpyAsync(m->committed_token_infos, - &(bc->committed_tokens), - bc->num_tokens_to_commit * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), - hipMemcpyHostToDevice, - stream)); + checkCUDA(hipMemcpyAsync(m->committed_token_infos, + &(bc->committed_tokens), + bc->num_tokens_to_commit * + sizeof(BatchConfig::CommittedTokensInfo), + hipMemcpyHostToDevice, + stream)); commit_tokens
(m, bc, stream); // After commit we update m->num_active_tokens to be the number of active @@ -486,18 +487,18 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, checkCUDA(hipMemcpyAsync(m->token_infos, &(bc->tokensInfo), bc->num_active_tokens() * - sizeof(TreeVerifyBatchConfig::PerTokenInfo), + sizeof(BatchConfig::PerTokenInfo), hipMemcpyHostToDevice, stream)); // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
(m->devQKVProjArray), - bias_ptr, - stream); + compute_qkv(m, + bc, + shard_id, + input_ptr, + weight_ptr, + static_cast
(m->devQKVProjArray), + bias_ptr, + stream); // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( @@ -515,7 +516,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, /*static*/ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeIncMultiHeadSelfAttentionMeta *m, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, @@ -540,7 +541,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( if (input.data_type == DT_HALF) { if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); + pre_build_weight(m, weight, input.data_type, stream); } half const *bias_ptr = @@ -556,7 +557,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( stream); } else if (input.data_type == DT_FLOAT) { if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); + pre_build_weight(m, weight, input.data_type, stream); } float const *bias_ptr = use_bias ? bias.get_float_ptr() : static_cast(nullptr); @@ -606,7 +607,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, + attn->rotary_embedding_meta, attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, @@ -631,24 +632,24 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( { int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); size_t committed_tokeninfo_size = max_tokens_per_batch; - size_t total_size = committed_tokeninfo_size * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); + size_t total_size = + committed_tokeninfo_size * sizeof(BatchConfig::CommittedTokensInfo); if (offload) { // assert that we have enough reserved work space left assert(gpu_mem_allocator.reserved_total_size - gpu_mem_allocator.reserved_allocated_size >= total_size); committed_token_infos = - gpu_mem_allocator - .allocate_reserved( - committed_tokeninfo_size); + gpu_mem_allocator.allocate_reserved( + committed_tokeninfo_size); } else { - gpu_mem_allocator.create_legion_instance(committed_token_reserve_inst, - total_size); + gpu_mem_allocator.create_legion_instance( + committed_token_reserve_inst, + total_size, + "TreeIncMultiHeadSelfAttentionMeta"); committed_token_infos = - gpu_mem_allocator - .allocate_instance( - committed_tokeninfo_size); + gpu_mem_allocator.allocate_instance( + committed_tokeninfo_size); } } diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 50c056c81..058e223c4 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -15,12 +15,16 @@ #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" #endif +#include "flashinfer/prefill_attention_decl.cuh" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/utils/cuda_helper.h" +#include +#include + namespace FlexFlow { // declare Legion names @@ -34,850 +38,311 @@ using namespace Kernels::IncMultiHeadAttention; namespace Kernels { namespace TreeIncMultiHeadAttention { -template -__global__ void compute_attention_kernel_fused_kernel( - DT const *query, - DT const *key_cache, - DT const *value_cache, - DT *output_ptr, - float const 
scale, - int const max_seq_length, - int const max_token_per_batch, - int per_head_size, - int hidden_size, - BatchConfig::PerRequestInfo *request_infos, - int num_heads, - int num_requests, - BatchConfig::BitMask *causalMask, - bool *request_completed, - int qk_smem_sz) { - - // q, k - using Q_vec = typename VEC_K::Type; - using K_vec = typename VEC_K::Type; - using V_vec = typename VEC_V
::Type; - using Out_sum = typename Vec_fp32_::Type; - - constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; - - constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); - constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; - constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; - // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); - - // thread id - int const tidx = threadIdx.x; - // head id - int const head_idx = blockIdx.x; - // request idx - int const request_idx = blockIdx.y; - - int const batch_config_request_id = - request_infos[request_idx].batch_config_request_id; - - int const first_step = 0; - - int const tlength = - request_infos[batch_config_request_id].first_token_depth_in_request + - request_infos[batch_config_request_id].num_tokens_in_batch; - int const qlength = - request_infos[batch_config_request_id].num_tokens_in_batch; - - BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; - - int first_token_idx = 0; - for (int r = 0; r < batch_config_request_id; r++) { - first_token_idx += - request_completed[r] ? 0 : request_infos[r].num_tokens_in_batch; - } - - bool prompt_phase = request_infos[batch_config_request_id].prompt_phase; - int q_start = - request_infos[batch_config_request_id].first_token_depth_in_request; - - // shared memory objects - extern __shared__ char smem_[]; - - float *qk_smem = reinterpret_cast(smem_); - float *out_smem = reinterpret_cast(smem_ + qk_smem_sz); - - float qk_max = -FLT_MAX; - - // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum - __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - - const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + - head_idx * per_head_size; - __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; - - // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE - int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; - int ki_o = tidx % THREADS_PER_KEY; - // the first key's offset for this thread - // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... - int ko = tidx / THREADS_PER_KEY; - // load q tensor - Q_vec q_vec[K_VECS_PER_THREAD]; - - constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; - // The number of keys per warp. - constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; - - DT const *k_cache_batch = - key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; - - int ti_end = - div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; - - for (int qi = 0; qi < qlength; qi += 1) { -#pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - q_vecs[ki_o][ii] = *reinterpret_cast( - q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + - ii * THREADS_PER_KEY * K_VEC_SIZE); - - // if (head_idx == 0 && request_idx == 1 && tidx == 0) { - // printf("laod q %d, %d %.10f\n", - // request_idx, - // qi,q_vecs[ki_o][ii].x); - // } - } - - __syncthreads(); - for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { - K_vec k[K_VECS_PER_THREAD]; - int const ti_circ = ti % max_seq_length; - - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; - if (ti < tlength) { - k[ii] = *reinterpret_cast( - k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + - jj); - } - } - float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); - - if (ti < tlength && tidx % THREADS_PER_KEY == 0) { - bool const mask = - prompt_phase ? (qi + q_start < ti) - : (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << qi)))); - - qk_max = mask ? 
qk_max : fmaxf(qk_max, qk); - - // if (head_idx == 0 && !mask) { - // printf("tree attn qkqkqkqk request id %d qi%d, ti %d, %.10f, %.10f, - // %.10f, %d\n", - // request_idx, - // qi, - // ti, - // qk, - // q_vecs[ki_o][0].x, - // k[0].x, - // bitmask.non_tree_cache_size); - // } - qk_smem[ti - first_step] = mask ? 0.0f : qk; - } - } - - __syncthreads(); - -#pragma unroll - for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Decompose the thread index into warp and lane. - int const warp = tidx / WARP_SIZE; - int const lane = tidx % WARP_SIZE; - - // The warp leader writes the max to shared memory. - if (lane == 0) { - red_smem[warp] = qk_max; - } - - // Make sure the products are in shared memory. - __syncthreads(); - - // The warps finalize the reduction. - qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; -#pragma unroll - for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Broadcast to all the threads in the warp. - qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - - // if (head_idx == 0 && qi == 9 && tidx == 0) { - // printf("tree attn first token qk_max %f\n", qk_max); - // } - - float exp_sum = 0.f; - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - bool const mask = - prompt_phase ? (q_start + qi < ti) - : (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << qi)))); - float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); - exp_sum += logit; - qk_smem[ti - first_step] = mask ? 0.0f : logit; - } - - // Compute the sum. - exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); - - // softmax - float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - qk_smem[ti - first_step] *= inv_sum; - } +using flashinfer::BatchPrefillHandler; +using flashinfer::BatchPrefillWithPagedKVCacheWrapperDispatched; +using flashinfer::LogitsPostHook; +using flashinfer::MaskMode; +using flashinfer::paged_kv_t; +using flashinfer::PageStorage; +using flashinfer::PosEncodingMode; +using flashinfer::QKVLayout; - __syncthreads(); - - // value projection - constexpr int V_VEC_SIZE = 16 / sizeof(DT); - // A vector of V elements for the current timestep. - // using V_vec_k = typename V_vec_k_::Type; - // using V_vec_acum = typename V_vec_acum_fp32_::Type; - - // The value computed by this thread. - int vo = tidx / THREADS_PER_VALUE; - // The hidden dimensions computed by this particular thread. - int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; - constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; - - Out_sum out; - zero(out); - - // The base pointer for the value in the cache buffer. - DT const *v_cache_batch = - value_cache + batch_config_request_id * max_seq_length * hidden_size + - vi; - - if (Dh == Dh_MAX || vi < Dh) { - for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { - // Load the values from the cache. - int const ti_circ = ti % max_seq_length; - // int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; - V_vec v = *reinterpret_cast( - v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); - - if (ti < tlength) { - bool const mask = - prompt_phase - ? 
(q_start + qi < ti) - : (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << qi)))); - float logit = mask ? 0.0f : qk_smem[ti - first_step]; - out = FlexFlow::fma(logit, cast_to_float(v), out); - } - } +__global__ void commit_tokens_kernel( + half *kCache_ptr, + BatchConfig::CommittedTokensInfo const *committedTokenInfos, + bool const *request_available, + int num_requests, + int num_kv_heads, + int head_dim, + int const *num_committed_tokens, + int const max_num_pages) { + int const kv_hidden_size = num_kv_heads * head_dim; + int const idx = blockIdx.x * blockDim.x + threadIdx.x; + int const request_compact_idx = idx / kv_hidden_size; + int const offset = idx % kv_hidden_size; + // request id in batch config + int requext_idx_in_batch = -1; + int cnt_1 = 0; + while (cnt_1 < request_compact_idx + 1) { + requext_idx_in_batch++; + if (request_available[requext_idx_in_batch]) { + cnt_1++; } + } - // // Make sure we can start writing to shared memory. - __syncthreads(); - - // Run the final reduction amongst the different groups computing different - // partial outputs. - if (Dh == Dh_MAX || vi < Dh) { -#pragma unroll - for (int active_groups = V_PER_ITER; active_groups >= 2; - active_groups /= 2) { - - // The midpoint in the number of active groups. - int midpoint = active_groups / 2; - - // The upper part of active threads store to shared memory. - if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { - *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = - out; - } - __syncthreads(); - - // The bottom warps update their values. - if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { - out = add(*reinterpret_cast(out_smem + vo * Dh + vi), - out); - } - __syncthreads(); + for (int i = 0; i < *num_committed_tokens; i++) { + if (committedTokenInfos[i].request_index == requext_idx_in_batch) { + int const index_in_kv_cache = committedTokenInfos[i].index_in_kv_cache; + if (index_in_kv_cache == -1) { + continue; } - } - // Output the final values. 
- if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { - convert_from_float(*reinterpret_cast( - output_ptr + (first_token_idx + qi) * hidden_size + - head_idx * per_head_size + vi), - out); - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { - // printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", - // out.x, - // out.y, - // out.z, - // out.w, - // vi, - // (first_token_idx + qi) * hidden_size + head_idx * - // per_head_size + - // vi); - // } + int const page_to_idx = committedTokenInfos[i].token_depth / kPagesize; + int const page_from_idx = + committedTokenInfos[i].index_in_kv_cache / kPagesize; + + size_t from_k_idx = get_k_entry_offset_verify( + committedTokenInfos[i].index_in_kv_cache, + page_from_idx, + num_kv_heads, + head_dim), + from_v_idx = get_v_entry_offset_verify( + committedTokenInfos[i].index_in_kv_cache, + page_from_idx, + num_kv_heads, + head_dim); + size_t to_k_idx = + get_k_entry_offset_verify(committedTokenInfos[i].token_depth, + page_to_idx, + num_kv_heads, + head_dim), + to_v_idx = + get_v_entry_offset_verify(committedTokenInfos[i].token_depth, + page_to_idx, + num_kv_heads, + head_dim); + + kCache_ptr[to_k_idx + offset] = kCache_ptr[from_k_idx + offset]; + kCache_ptr[to_v_idx + offset] = kCache_ptr[from_v_idx + offset]; } } } -template -__global__ void commit_tokens_kernel( - DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - TreeVerifyBatchConfig::CommittedTokensInfo const *committedTokenInfos, - int qProjSize, - int kProjSize, - int vProjSize, - int num_tokens_to_commit, - int num_active_tokens_in_last_batch, - int max_seq_len, - int hidden_size) { - - CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size) { - - int token_pos = i / (hidden_size); - int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; - int offset = i % hidden_size; - assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); - - size_t val_idx = token_idx_in_last_batch * QKV_WEIGHT_NUM * hidden_size + - hidden_size + offset; - - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; - - int const req_id = committedTokenInfos[token_pos].request_index; - int const tok_id = committedTokenInfos[token_pos].token_depth; - - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; - } -} - -template void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, cudaStream_t stream) { - int num_tokens_to_commit = bc->num_tokens_to_commit; - if (num_tokens_to_commit > 0) { - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens_to_commit; - commit_tokens_kernel<<>>( - static_cast
(m->devQKVProjArray), - static_cast
(m->keyCache), - static_cast
(m->valueCache), - m->committed_token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens_to_commit, - m->num_active_tokens, // number of active tokens in previous batch - BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(), - m->hidden_size); - } -} - -template -__global__ void update_tree_branch_kv_cache( - DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int vProjSize, - int num_tokens_in_branch, - int processed_tokens_in_batch, - int total_tokens_in_batch, - int max_seq_len, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size) { - - int token_idx = i / (hidden_size); - int offset = i % hidden_size; - - token_idx += processed_tokens_in_batch; // get index in the whole batch - size_t val_idx = - token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; - - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; - - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; - } + // cudaEvent_t t_start, t_end; + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + int const max_num_pages = + round_up_pages(BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); + int const num_requests = bc->num_active_requests(); + int parallelism = m->num_kv_heads * m->qk_dim * num_requests; + commit_tokens_kernel<<>>(static_cast(m->kvCache), + m->committed_token_infos, + m->request_available, + num_requests, + m->num_kv_heads, + m->qk_dim, + m->num_tokens_to_commit, + max_num_pages); + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // printf("Commit token time: %.2f ms\n", elapsed); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); } template -__global__ void update_tree_branch_kv_cache_fused( - DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, - BatchConfig::PerRequestInfo *request_infos, - int qProjSize, - int kProjSize, - int vProjSize, - int num_new_tokens, - int max_seq_len, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_new_tokens * hidden_size) { - - int token_idx = i / hidden_size; - int offset = i % hidden_size; - size_t val_idx = - token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; - - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; - - int const req_id = tokenInfos[token_idx].request_index; - // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - - int const request_token_offset = - request_infos[req_id].first_token_offset_in_batch; - int const first_token_depth = - request_infos[req_id].first_token_depth_in_request; - - // if(i % hidden_size == 0){ - // printf("update token request id: %d, %d, %d real id %d, value%.10f\n", - // req_id, token_idx, request_token_offset,(token_idx + first_token_depth - // - request_token_offset), kVal); - // } - kCache_ptr[req_id * (hidden_size * max_seq_len) + - (token_idx + first_token_depth - request_token_offset) * - hidden_size + - offset] = kVal; - 
vCache_ptr[req_id * (hidden_size * max_seq_len) + - (token_idx + first_token_depth - request_token_offset) * - hidden_size + - offset] = vVal; - } -} - -template -__global__ void tree_fill_entries_above_diagonal(DT *matrix, - size_t new_tokens, - size_t total_tokens_in_request, - size_t num_q_heads, - DT value) { - CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_q_heads) { - // size_t head_idx = i / (new_tokens * total_tokens_in_request); - size_t src_idx = (i / new_tokens) % total_tokens_in_request; - size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens; - // Casual Mask - if (src_idx > dst_idx) { - matrix[i] = value; - } +void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + // int device; + // checkCUDA(cudaGetDevice(&device)); + // cudaEvent_t t_start, t_end; + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + // global constant parameters + uint32_t const num_q_heads = m->num_q_heads; + uint32_t const num_kv_heads = m->num_kv_heads; + uint32_t const head_dim = m->qk_dim; + uint32_t const batch_size = bc->num_active_requests(); + float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->qk_dim) : 1.0f; + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "Update custom mask time: " << elapsed << " ms\n"; + // } + + half *q = static_cast(m->queryTmp), + *kv = static_cast(m->kvCache), + *o = static_cast(m->outputTmp); + paged_kv_t paged_kv( + num_kv_heads, + kPagesize, + head_dim, + batch_size, + QKVLayout::kNHD, + kv, + m->handle.tree_verify_attention_metadata->kv_indices, + m->handle.tree_verify_attention_metadata->kv_indptr, + m->handle.tree_verify_attention_metadata->kv_last_page_len); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" attn prep time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + BatchPrefillHandler *handler = nullptr; + + if (!bc->prompt_phase) { + assert(m->handle.tree_verify_attention_metadata->decode_handler_collections + .count(batch_size) != 0 && + "Handler is not initialized"); + handler = static_cast( + m->handle.tree_verify_attention_metadata + ->decode_handler_collections[batch_size]); + } else { + assert(m->handle.tree_verify_attention_metadata->prompt_handler_collections + .count(batch_size) != 0 && + "Handler is not initialized"); + handler = static_cast( + m->handle.tree_verify_attention_metadata + ->prompt_handler_collections[batch_size]); } -} -template -void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, - TreeVerifyBatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudnnDataType_t 
cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif - // int num_requests = bc->num_active_requests(); - int processed_tokens_in_batch = 0; - // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); - int q_block_size = m->qProjSize; - int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(); - int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(); - assert(m->qProjSize == m->kProjSize); - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" BeginForward time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + DISPATCH_HEADDIM(head_dim, HEAD_DIM, { + cudaError_t result; + if (bc->prompt_phase) { + result = + BatchPrefillWithPagedKVCacheWrapperDispatched( + handler, + q, + m->handle.tree_verify_attention_metadata->q_indptr, + /*q_offset=*/nullptr, + paged_kv, + /*custom_mask=*/nullptr, + /*qk_indptr=*/nullptr, + o, + /*lse=*/nullptr, + num_q_heads, + /*window_left=*/-1, + /*logits_soft_cap=*/0.f, + sm_scale, + /*rope_scale=*/1.f, + /*rope_theta=*/static_cast(1e4), + stream); + } else { + result = + BatchPrefillWithPagedKVCacheWrapperDispatched( + handler, + q, + m->handle.tree_verify_attention_metadata->q_indptr, + /*q_offset=*/nullptr, + paged_kv, + m->handle.tree_verify_attention_metadata->custom_mask, + m->handle.tree_verify_attention_metadata->qk_indptr, + o, + /*lse=*/nullptr, + num_q_heads, + /*window_left=*/-1, + /*logits_soft_cap=*/0.f, + sm_scale, + /*rope_scale=*/1.f, + /*rope_theta=*/static_cast(1e4), + stream); } - assert(processed_tokens_in_batch == - bc->requestsInfo[i].first_token_offset_in_batch); - int last_token_idx_of_the_request = - processed_tokens_in_batch + bc->requestsInfo[i].num_tokens_in_batch - 1; - while (processed_tokens_in_batch <= last_token_idx_of_the_request) { - int num_new_tokens = 1; - int j = processed_tokens_in_batch; - while ((j + 1 <= last_token_idx_of_the_request) && - (bc->tokensInfo[j].abs_depth_in_request + 1 == - bc->tokensInfo[j + 1].abs_depth_in_request)) { - j++; - num_new_tokens++; - } - - int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1; - assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); - { - // update K-V cache - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_new_tokens; - update_tree_branch_kv_cache<<>>( - static_cast
(m->devQKVProjArray), - static_cast
(m->keyCache), - static_cast
(m->valueCache), - m->token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_new_tokens, // num_tokens_in_branch - processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_tokens, // total_tokens_in_batch - BatchConfig::max_sequence_length(), - m->hidden_size); - } - - // bc->token_last_available_idx[i] + 1; - // Compute (QK^T/sqrt(d_k)) - int m_ = num_new_tokens; - int n = total_tokens_in_request; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens_in_request; - - // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); - } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
(m->devQKVProjArray) + - processed_tokens_in_batch * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - // To get C, skip over QK^T products from previous requests - DT *C = static_cast
(m->qk_prods); - - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // add alibi position bias to qk production - // add alibi position bias to qk production - if (*m->position_bias) { - size_t parallelism = - m->num_q_heads * total_tokens_in_request * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens_in_request, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. - assert(num_new_tokens <= total_tokens_in_request); - if (num_new_tokens > 1) { - size_t parallelism = - m->num_q_heads * num_new_tokens * total_tokens_in_request; - tree_fill_entries_above_diagonal<<>>( - C, - num_new_tokens, - total_tokens_in_request, - m->num_q_heads, - static_cast
(-INFINITY)); - } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens_in_request; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = m->vProjSize; - n = num_new_tokens; - k = total_tokens_in_request; - lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - strideA = vt_block_size; - strideB = num_new_tokens * total_tokens_in_request; - strideC = m->vProjSize; - // To get A, skip over V^T entries from previous requests (all heads + - // padding) - A = static_cast
(m->valueCache) + i * vt_req_block_size; - // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - B = C_softmax; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - C = static_cast
(m->attn_heads) + - processed_tokens_in_batch * m->num_q_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - processed_tokens_in_batch += num_new_tokens; + if (result != cudaSuccess) { + throw std::runtime_error("Failed to run " + "TreeVerifyAttentionKernel: " + + std::string(cudaGetErrorString(result))); } - // Before moving to the next request - // check that we have finished all tokens of the request - assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch); - } - // Project to output, save result directly on output tensor - DT alpha = 1.0f, beta = 0.0f; - int m_ = m->oProjSize; - int k = m->vProjSize * m->num_q_heads; - int n = processed_tokens_in_batch; - int lda = k, ldb = k, ldc = m_; - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - DT const *B = static_cast
(m->attn_heads); - DT *C = static_cast
(output_ptr); - - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * processed_tokens_in_batch; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - apply_proj_bias_w<<>>(output_ptr, - bias_ptr, - processed_tokens_in_batch, - qkv_weight_size, - m->oProjSize); - } - - assert(processed_tokens_in_batch == bc->num_active_tokens()); -} - -#define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ - DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ - smem_size_in_bytes_tree
(m->qProjSize, \ - BatchConfig::max_sequence_length() + \ - BatchConfig::max_spec_tree_token_num(), \ - THDS_PER_VALUE, \ - THDS_PER_BLOCK, \ - bc, \ - smem_sz); \ - compute_attention_kernel_fused_kernel \ - <<>>( \ - static_cast
(m->devQKVProjArray), \ - static_cast
(m->keyCache), \ - static_cast
(m->valueCache), \ - output_ptr, \ - scale, \ - BatchConfig::max_sequence_length() + \ - BatchConfig::BatchConfig::max_spec_tree_token_num(), \ - BatchConfig::max_tokens_per_batch(), \ - m->qProjSize, \ - m->hidden_size, \ - m->request_infos, \ - m->num_q_heads, \ - bc->num_active_requests(), \ - m->causalMask, \ - m->request_completed, \ - smem_sz[0]) - -template -void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, - TreeVerifyBatchConfig const *bc, - DT *output_ptr, - cudaStream_t stream) { - - // update the kv cache - // update K-V cache - int num_new_tokens = bc->num_active_tokens(); - int parallelism = m->hidden_size * num_new_tokens; - update_tree_branch_kv_cache_fused<<>>( - static_cast
(m->devQKVProjArray), - static_cast
(m->keyCache), - static_cast
(m->valueCache), - m->token_infos, - m->request_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_new_tokens, - BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(), - m->hidden_size); - - dim3 grid(m->num_q_heads, bc->num_active_requests()); - int const per_head_size = m->qProjSize; - float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; - // 0->qk production size, 1->total shared size - int smem_sz[2]; - if (per_head_size == 64) { - constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; - LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( - DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); - } else if (per_head_size == 128) { - constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; - LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( - DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); - } else { - assert(false && "a unsupported head size"); - } + }); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" actual attn time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + produce_output(m, bc, output_ptr, stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // if (device == 0) { + // printf(" produce_output_kernel time: %.4f ms\n", elapsed); + // } + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); } template void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, int shard_id, DT const *input_ptr, DT const *weight_ptr, DT *output_ptr, DT const *bias_ptr, cudaStream_t stream) { + // int device; + // checkCUDA(cudaGetDevice(&device)); + // cudaEvent_t t_start, t_end; + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + // additional processing for weight uploading if (m->handle.offload_reserve_space != nullptr) { // Note that we update weight_ptr and bias_ptr when uploading weight and @@ -902,7 +367,23 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << // "\n"; - commit_tokens
(m, bc, stream); + if (!bc->prompt_phase) { + commit_tokens(m, bc, stream); + } + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "Commit tokens time: " << elapsed << " ms\n"; + // } + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); // After commit we update m->num_active_tokens to be the number of active // tokens for the current batch @@ -914,22 +395,86 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); bias_ptr = static_cast
(m->bias_ptr); } - // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
(m->devQKVProjArray), - bias_ptr, - stream); - - // phase 2: No need to update key/val cache - // IncMultiHeadSelfAttention::update_kv_cache_kernel( - // m, bc, stream); - // use the new kernel - compute_attention_kernel_fused
( - m, bc, static_cast
(m->attn_heads), stream); + // Implement kernel to compute KQV for input tokens + compute_qkv(m, + bc, + shard_id, + input_ptr, + weight_ptr, + static_cast
<DT *>(m->devQKVProjArray), + bias_ptr, + stream); + + apply_pos_encoding_to_tokens_in_batch( + m, bc, static_cast<DT *>
(m->devQKVProjArray), stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "Compute qkv time: " << elapsed << " ms\n"; + // } + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + // Update key-val cache, compact q array + update_qkv_in_batch_paged
(m, bc, stream, true); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "Update qkv time: " << elapsed << " ms\n"; + // } + + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + + // Compute attention + tree_verify_attention
<DT>(m, bc, static_cast<DT *>
(m->attn_heads), stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "Attn time: " << elapsed << " ms\n"; + // } + + // Debug output: + // { + // int size = m->local_hidden_size * bc->num_active_tokens(); + // float *temp_output = new float[size]; + // cudaDeviceSynchronize(); + // cudaMemcpy( + // temp_output, m->attn_heads, size * sizeof(float), + // cudaMemcpyDeviceToHost); + // printf("Output (flashinfer attention) :"); + // for (int i = 0; i < 1; ++i) { + // float temp = 0; + // for (int j = 0; j < m->local_hidden_size; ++j) { + // temp += temp_output[i * m->local_hidden_size + j]; + // } + // printf("%.6f ", temp); + // } + // printf("\n"); + + // delete[] temp_output; + // } + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); int processed_tokens_in_batch = bc->num_active_tokens(); @@ -941,6 +486,31 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, bias_ptr, processed_tokens_in_batch, stream); + + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "Compute output proj time: " << elapsed << " ms\n"; + // } + // { + // int size = m->o_dim; + // DT *temp_output = new DT[size]; + // cudaDeviceSynchronize(); + // cudaMemcpy( + // temp_output, output_ptr + m->o_dim * (bc->num_active_tokens() - + // 1), size * sizeof(DT), cudaMemcpyDeviceToHost); + // printf("Output :"); + // for (int i = 0; i < size; ++i) { + // printf("%.6f ", static_cast(temp_output[i])); + // } + // printf("\n"); + + // delete[] temp_output; + // } } } // namespace TreeIncMultiHeadAttention @@ -949,7 +519,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, /*static*/ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeIncMultiHeadSelfAttentionMeta *m, - TreeVerifyBatchConfig const *bc, + BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, @@ -959,12 +529,12 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(get_legion_stream(&stream)); bool use_bias = *m->qkv_bias || *m->final_bias; - cudaEvent_t t_start, t_end; - if (m->profiling) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start, stream); - } + // int device; + // checkCUDA(cudaGetDevice(&device)); + // cudaEvent_t t_start, t_end; + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); @@ -974,12 +544,12 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( if (input.data_type == DT_HALF) { if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); + pre_build_weight(m, weight, input.data_type, stream); } half const *bias_ptr = use_bias ? 
bias.get_half_ptr() : static_cast(nullptr); - Kernels::TreeIncMultiHeadAttention::inference_kernel( + Kernels::TreeIncMultiHeadAttention::inference_kernel( m, bc, shard_id, @@ -990,11 +560,11 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( stream); } else if (input.data_type == DT_FLOAT) { if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); + pre_build_weight(m, weight, input.data_type, stream); } float const *bias_ptr = use_bias ? bias.get_float_ptr() : static_cast(nullptr); - Kernels::TreeIncMultiHeadAttention::inference_kernel( + Kernels::TreeIncMultiHeadAttention::inference_kernel( m, bc, shard_id, @@ -1008,14 +578,16 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( assert(false && "Unspported data type"); } - if (m->profiling) { - cudaEventRecord(t_end, stream); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - } + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // if (device == 0) { + // std::cout << "TreeIncMultiHeadSelfAttention time: " << elapsed << " + // ms\n"; + // } } TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( @@ -1029,14 +601,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( : IncMultiHeadSelfAttentionMeta(handler, TREE_VERIFY_MODE, attn, - attn->qSize, - attn->kSize, - attn->vSize, - attn->qProjSize, - attn->kProjSize, - attn->vProjSize, - attn->oProjSize, - attn->apply_rotary_embedding, + attn->hidden_size, + attn->qk_dim, + attn->v_dim, + attn->o_dim, + attn->rotary_embedding_meta, attn->qkv_bias, attn->scaling_query, attn->qk_prod_scaling, @@ -1051,39 +620,40 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( _num_q_heads, _num_kv_heads, attn->quantization_type, - attn->offload), + attn->offload, + false), num_active_tokens(0) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); + // set attention constants + handler.tree_verify_attention_metadata->set_enabled(true); + handler.tree_verify_attention_metadata->set_num_q_heads(num_q_heads); + handler.tree_verify_attention_metadata->set_num_kv_heads(num_kv_heads); + handler.tree_verify_attention_metadata->set_head_dim(qk_dim); + // allocate memory for the seqArray and reserve space { - - causalMask = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo)); committed_token_infos = - reinterpret_cast( + reinterpret_cast( reinterpret_cast(handler.batch_config_metadata) + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BatchConfig::causalMask)); - - request_completed = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens)); + sizeof(BatchConfig::request_available) + + sizeof(BatchConfig::causalMask) + + sizeof(BatchConfig::streamingCacheInfo)); + num_tokens_to_commit = reinterpret_cast( + reinterpret_cast(committed_token_infos) + + sizeof(BatchConfig::committed_tokens)); } cudaStreamSynchronize(stream); } 
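Editorial note: the new tree_verify_attention path above describes each request's KV cache to flashinfer through a paged_kv_t built from kv_indices, kv_indptr and kv_last_page_len. The following is a minimal host-side sketch of how such CSR-style page tables are conventionally laid out; the page size, sequence lengths and helper names are illustrative assumptions, not code from this patch.

#include <cstdint>
#include <vector>

// Illustrative only: CSR-style page-table layout for a paged KV cache.
// kv_indptr[i]..kv_indptr[i+1] delimits request i's entries in kv_indices,
// and kv_last_page_len[i] is how many tokens occupy its final page.
struct PagedKVLayout {
  std::vector<int32_t> kv_indices;       // physical page ids, all requests
  std::vector<int32_t> kv_indptr;        // size = num_requests + 1
  std::vector<int32_t> kv_last_page_len; // size = num_requests
};

PagedKVLayout build_layout(std::vector<int> const &seq_lens, int page_size) {
  PagedKVLayout layout;
  layout.kv_indptr.push_back(0);
  int32_t next_page = 0; // assume pages are handed out consecutively
  for (int len : seq_lens) {
    // same rounding as a round_up_pages-style helper: ceil(len / page_size)
    int num_pages = (len + page_size - 1) / page_size;
    for (int p = 0; p < num_pages; p++) {
      layout.kv_indices.push_back(next_page++);
    }
    layout.kv_indptr.push_back(static_cast<int32_t>(layout.kv_indices.size()));
    int last = len % page_size;
    layout.kv_last_page_len.push_back(last == 0 ? page_size : last);
  }
  return layout;
}

For example, with page_size = 16 and sequence lengths {5, 17}, the first request gets one page with last_page_len 5 and the second gets two pages with last_page_len 1, giving kv_indptr = {0, 1, 3}.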
TreeIncMultiHeadSelfAttentionMeta::~TreeIncMultiHeadSelfAttentionMeta(void) { - if (committed_token_reserve_inst != Realm::RegionInstance::NO_INST) { - committed_token_reserve_inst.destroy(); - } + // delete static_cast(batch_prefill_handler); } }; // namespace FlexFlow diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 5d38e2890..7f38e2714 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -106,7 +106,12 @@ OpMeta *AllReduce::init_task(Task const *task, Runtime *runtime) { AllReduce *ar = (AllReduce *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - AllReduceMeta *meta = new AllReduceMeta(handle, ar); + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + AllReduceMeta *meta = new AllReduceMeta(handle, ar, gpu_mem_allocator); meta->input_type[0] = ar->inputs[0]->data_type; meta->output_type[0] = ar->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); @@ -129,6 +134,7 @@ void AllReduce::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + // launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -167,6 +173,7 @@ void AllReduce::init_inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + // launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -208,6 +215,7 @@ FutureMap AllReduce::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, @@ -240,6 +248,7 @@ void AllReduce::forward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -269,6 +278,7 @@ void AllReduce::backward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, inputs[0]->machine_view.hash()); + // launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, @@ -326,7 +336,7 @@ void AllReduce::inference_task(Task const *task, assert(regions.size() == 2); assert(task->regions.size() == 2); - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + AllReduceMeta *m = *((AllReduceMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( @@ -335,7 +345,7 @@ void AllReduce::inference_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(input.data_type == output.data_type); - inference_kernel_wrapper(m, bc, input, output); + inference_kernel_wrapper(ctx, runtime, m, bc, input, output); } /*static*/ @@ -354,7 +364,7 @@ void AllReduce::forward_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(input.data_type == output.data_type); - forward_kernel_wrapper(m, input, output); + forward_kernel_wrapper(ctx, runtime, m, input, output); } void AllReduce::backward_task(Task const *task, diff --git a/src/parallel_ops/kernels/allreduce_kernels.cpp 
b/src/parallel_ops/kernels/allreduce_kernels.cpp index 8d7e20e39..1e60728fa 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cpp +++ b/src/parallel_ops/kernels/allreduce_kernels.cpp @@ -25,7 +25,9 @@ AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, +void inference_kernel_wrapper(Legion::Context ctx, + Legion::Runtime *runtime, + AllReduceMeta const *m, BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { @@ -37,6 +39,7 @@ void inference_kernel_wrapper(AllReduceMeta const *m, size_t num_elements = bc->num_tokens * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(input.ptr, output.ptr, num_elements, @@ -44,12 +47,15 @@ void inference_kernel_wrapper(AllReduceMeta const *m, ncclSum, m->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); #else assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); #endif } -void forward_kernel_wrapper(AllReduceMeta const *m, +void forward_kernel_wrapper(Legion::Context ctx, + Legion::Runtime *runtime, + AllReduceMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { hipStream_t stream; @@ -59,6 +65,7 @@ void forward_kernel_wrapper(AllReduceMeta const *m, size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(input.ptr, output.ptr, input.domain.get_volume(), @@ -66,6 +73,7 @@ void forward_kernel_wrapper(AllReduceMeta const *m, ncclSum, m->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); #else assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); #endif diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu index 2c000137a..879be72b8 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cu +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -15,40 +15,199 @@ #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/utils/cuda_helper.h" +#include "tensorrt_llm/custom_allreduce_kernels.h" +#include namespace FlexFlow { -AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) - : OpMeta(handle) {} +AllReduceMeta::AllReduceMeta(FFHandler handle, + AllReduce const *reduct, + MemoryAllocator &gpu_mem_allocator) + : OpMeta(handle) { + barrier_ptr_size = sizeof(uint32_t) * + (tensorrt_llm::MAX_ALL_REDUCE_BLOCKS + 2) * + tensorrt_llm::MAX_RANKS_PER_NODE; + gpu_mem_allocator.create_legion_instance( + reserveInst, + sizeof(void *) * (handle.num_devices + 1) + barrier_ptr_size * 2, + "AllReduceMeta"); + allgather_src = gpu_mem_allocator.allocate_instance_untyped(sizeof(void *)); + allgather_dst = gpu_mem_allocator.allocate_instance_untyped( + sizeof(void *) * handle.num_devices); + // Create barrier helpers for all-reduce. + barrier_in_ptr = + gpu_mem_allocator.allocate_instance_untyped(barrier_ptr_size); + barrier_out_ptr = + gpu_mem_allocator.allocate_instance_untyped(barrier_ptr_size); + checkCUDA(cudaMemset(barrier_in_ptr, 0, barrier_ptr_size)); + checkCUDA(cudaMemset(barrier_out_ptr, 0, barrier_ptr_size)); + // Reset allocated memory to zero. 
+ // We explicitly synchronize after memset, to make sure memset finishes + // before using all-gather to exchange peer pointers. + // This is important to ensure the memory reset get ordered + // before any other peers read the memory. + checkCUDA(cudaDeviceSynchronize()); +} + +AllReduceMeta::~AllReduceMeta() { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, +CommunicationBuffer *get_or_create_comm_buffer(Context ctx, + Runtime *runtime, + AllReduceMeta *m, + int num_devices, + int device_id, + ncclComm_t ncclComm, + void *local_ptr, + cudaStream_t stream) { + auto iter = m->comm_bufs.find(local_ptr); + if (iter != m->comm_bufs.end()) { + return iter->second; + } else { + CommunicationBuffer *comm_buffer = + create_comm_buf_with_local_ptr(ctx, + runtime, + num_devices, + device_id, + ncclComm, + m->allgather_src, + m->allgather_dst, + local_ptr, + m->barrier_in_ptr, + m->barrier_out_ptr, + &(m->barrier_flag), + stream); + m->comm_bufs[local_ptr] = comm_buffer; + return comm_buffer; + } +} + +// Get the number of bits for a given data type. +inline int get_bits(DataType dtype) { + switch (dtype) { + case DataType::DT_INT64: + case DataType::DT_DOUBLE: + return 64; + case DataType::DT_INT32: + case DataType::DT_FLOAT: + return 32; + case DataType::DT_HALF: + return 16; + case DataType::DT_INT8: + return 8; + case DataType::DT_INT4: + return 4; + default: + assert(false && "Unsupported data type"); + } +} + +// Check if customized all-reduce kernels can be applied. +inline bool CanApplyCustomAllReduce(int64_t num_elements, DataType dtype) { + // The customized all-reduce kernel has the following requirement(s). + return num_elements % (16 / ((get_bits(dtype) + 7) / 8)) == 0; +} + +// Check if the two-shot customized all-reduce kernel can be applied. +inline bool CanApplyTwoShotAllReduce(int64_t num_elements, + DataType dtype, + int num_workers) { + // The two-shot customized all-reduce kernel has the following requirement(s). + return (num_elements / num_workers) % (16 / ((get_bits(dtype) + 7) / 8)) == 0; +} + +// Customized all-reduce kernel backed by CUDA Peer memory. 
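Editorial note: CanApplyCustomAllReduce and CanApplyTwoShotAllReduce above both reduce to a 16-byte alignment rule on the flattened tensor. Below is a small standalone sketch of that arithmetic for half precision; the token and hidden sizes are made-up illustrative numbers, not defaults from this patch.

#include <cassert>
#include <cstdint>

// For DT_HALF, get_bits() is 16, so 16 / ((16 + 7) / 8) = 16 / 2 = 8:
// the element count must be a multiple of 8 halves (one 16-byte vector).
inline bool one_shot_ok_half(int64_t num_elements) {
  return num_elements % 8 == 0;
}
// The two-shot variant splits the tensor across ranks, so each rank's
// slice must satisfy the same 16-byte rule.
inline bool two_shot_ok_half(int64_t num_elements, int num_workers) {
  return (num_elements / num_workers) % 8 == 0;
}

int main() {
  // e.g. 7 tokens * 4096 hidden = 28672 halves (illustrative sizes):
  assert(one_shot_ok_half(28672));    // 28672 % 8 == 0
  assert(two_shot_ok_half(28672, 8)); // 3584 halves per rank, also % 8 == 0
  return 0;
}

When either check fails, the wrapper falls back to ncclAllReduce (or, for the two-shot case only, to the one-shot strategy), as the dispatch logic above shows.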
+void inference_kernel_wrapper(Context ctx, + Runtime *runtime, + AllReduceMeta *m, BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { +#ifndef FF_USE_NCCL + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; size_t num_elements = bc->num_tokens * hidden_dim_size; -#ifdef FF_USE_NCCL - ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); - checkNCCL(ncclAllReduce(input.ptr, - output.ptr, - num_elements, - nccl_data_type, - ncclSum, - m->handle.ncclComm, - stream)); -#else - assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); -#endif + int num_devices = m->handle.num_devices; + int device_id = m->handle.device_id; + ncclComm_t ncclComm = m->handle.ncclComm; + DataType dtype = input.data_type; + if (num_elements == 0) { + return; + } + + tensorrt_llm::AllReduceStrategyType strategy = + tensorrt_llm::SelectImplementation( + num_elements * ((get_bits(dtype) + 7) / 8), num_devices); + + if (strategy == tensorrt_llm::AllReduceStrategyType::RING || + !CanApplyCustomAllReduce(num_elements, dtype)) { + // Dispatch to nccl AllReduce if the customized all-reduce cannot apply. + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(dtype); + runtime->concurrent_task_barrier(ctx); + checkNCCL(ncclAllReduce(input.ptr, + output.ptr, + num_elements, + nccl_data_type, + ncclSum, + ncclComm, + stream)); + runtime->concurrent_task_barrier(ctx); + return; + } + + // Initialize the all-reduce kernel arguments. + tensorrt_llm::AllReduceParams params; + params.ranks_per_node = num_devices; + params.rank = device_id; + params.local_rank = device_id; + CommunicationBuffer *comm_buffer = + get_or_create_comm_buffer(ctx, + runtime, + m, + num_devices, + device_id, + ncclComm, + const_cast(input.ptr), + stream); + params.barrier_flag = ++(*comm_buffer->barrier_flag); + for (int i = 0; i < num_devices; ++i) { + params.peer_comm_buffer_ptrs[i] = comm_buffer->comm_ptrs[i]; + } + for (int i = 0; i < num_devices; ++i) { + params.peer_barrier_ptrs_in[i] = + reinterpret_cast(comm_buffer->barrier_in[i]); + } + for (int i = 0; i < num_devices; ++i) { + params.peer_barrier_ptrs_out[i] = + reinterpret_cast(comm_buffer->barrier_out[i]); + } + + if (!CanApplyTwoShotAllReduce(num_elements, dtype, num_devices)) { + // Two-shot all-reduce does not support this case. + // So we fallback to the one-shot strategy. 
+ strategy = tensorrt_llm::AllReduceStrategyType::ONESHOT; + } + + // runtime->concurrent_task_barrier(ctx); + tensorrt_llm::customAllReduce( + params, output.ptr, num_elements, dtype, strategy, stream); + // runtime->concurrent_task_barrier(ctx); } -void forward_kernel_wrapper(AllReduceMeta const *m, +void forward_kernel_wrapper(Context ctx, + Runtime *runtime, + AllReduceMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { cudaStream_t stream; @@ -57,6 +216,7 @@ void forward_kernel_wrapper(AllReduceMeta const *m, assert(input.domain == output.domain); #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(input.ptr, output.ptr, input.domain.get_volume(), @@ -64,6 +224,7 @@ void forward_kernel_wrapper(AllReduceMeta const *m, ncclSum, m->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); #else assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); #endif diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index bd96dbb14..0073093d8 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -16,58 +16,62 @@ #include "flexflow/batch_config.h" #include "flexflow/request_manager.h" #include "legion.h" +#include #include #include namespace FlexFlow { -LegionRuntime::Logger::Category log_bc("BatchConfig"); +Legion::Logger log_bc("BatchConfig"); using Legion::Future; using Legion::Memory; -BatchConfig::BatchConfig() : num_tokens(0) { - for (int i = 0; i < MAX_NUM_REQUESTS; i++) { - requestsInfo[i].first_token_depth_in_request = 0; - requestsInfo[i].first_token_offset_in_batch = 0; - requestsInfo[i].num_tokens_in_batch = 0; - request_completed[i] = true; +BatchConfig::BatchConfig(InferenceMode inference_mode_, int model_id_) + : model_id(model_id_), inference_mode(inference_mode_) { + std::fill(std::begin(request_available), std::end(request_available), 0); + // Don't need to initialize requestInfo ,tokensInfo, causalMask and + // committed_tokens here because they initialize themselves. + // Other fields are already initialized to proper value. 
+} + +BatchConfig::BatchConfig(BatchConfig const &rhs) { + model_id = rhs.model_id; + inference_mode = rhs.inference_mode; + num_available_requests = rhs.num_available_requests; + num_tokens = rhs.num_tokens; + prompt_phase = rhs.prompt_phase; + num_tokens_to_commit = rhs.num_tokens_to_commit; + for (int token_idx = 0; token_idx < num_tokens; token_idx++) { + tokensInfo[token_idx] = rhs.tokensInfo[token_idx]; } - for (int i = 0; i < MAX_NUM_TOKENS; i++) { - tokensInfo[i].abs_depth_in_request = 0; - tokensInfo[i].request_index = 0; - tokensInfo[i].token_id = 0; + for (int request_idx = 0; request_idx < max_requests_per_batch(); + request_idx++) { + if (rhs.request_available[request_idx]) { + request_available[request_idx] = true; + requestsInfo[request_idx] = rhs.requestsInfo[request_idx]; + streamingCacheInfo[request_idx] = rhs.streamingCacheInfo[request_idx]; + causalMask[request_idx] = rhs.causalMask[request_idx]; + } + } + for (int committed_token_idx = 0; committed_token_idx < num_tokens_to_commit; + committed_token_idx++) { + committed_tokens[committed_token_idx] = + rhs.committed_tokens[committed_token_idx]; } } /*static*/ BatchConfig const *BatchConfig::from_future(BatchConfigFuture const &future) { - BatchConfig const *bc = static_cast( + return static_cast( Future(future).get_buffer(Memory::SYSTEM_MEM)); - // Check future size - if (bc->get_mode() == INC_DECODING_MODE) { - assert(Future(future).get_untyped_size() == sizeof(BatchConfig)); - } else if (bc->get_mode() == BEAM_SEARCH_MODE) { - assert(Future(future).get_untyped_size() == sizeof(BeamSearchBatchConfig)); - } else if (bc->get_mode() == TREE_VERIFY_MODE) { - assert(Future(future).get_untyped_size() == sizeof(TreeVerifyBatchConfig)); - } else { - assert(false && "Unsupported inference mode"); - } - return bc; } InferenceMode BatchConfig::get_mode() const { - return INC_DECODING_MODE; + return inference_mode; } int BatchConfig::num_active_requests() const { - int num_requests = 0; - for (int i = 0; i < max_requests_per_batch(); i++) { - if (!request_completed[i]) { - num_requests++; - } - } - return num_requests; + return num_available_requests; } int BatchConfig::num_active_tokens() const { @@ -85,9 +89,14 @@ int BatchConfig::max_tokens_per_batch() { } /*static*/ -int BatchConfig::max_verify_tokens_per_batch() { +int BatchConfig::max_tokens_per_ssm_batch() { + return RequestManager::get_request_manager()->get_max_tokens_per_ssm_batch(); +} + +/*static*/ +int BatchConfig::max_tokens_per_prefilling_batch() { return RequestManager::get_request_manager() - ->get_max_verify_tokens_per_batch(); + ->get_max_tokens_per_prefilling_batch(); } /*static*/ @@ -95,37 +104,102 @@ int BatchConfig::max_sequence_length() { return RequestManager::get_request_manager()->get_max_sequence_length(); } +int BatchConfig::max_output_length() { + return RequestManager::get_request_manager()->get_max_output_length(); +} + +size_t BatchConfig::max_kv_cache_size() { + return RequestManager::get_request_manager()->get_max_kv_cache_size(); +} +bool BatchConfig::streaming_cache() { + return RequestManager::get_request_manager()->get_streaming_cache(); +} + int BatchConfig::max_spec_tree_token_num() { return RequestManager::get_request_manager()->get_max_spec_tree_token_num(); } +int BatchConfig::get_max_tree_depth() { + return RequestManager::get_request_manager()->get_max_tree_depth(); +} + +// Overloading the << operator for the Bitset class +std::ostream &operator<<(std::ostream &os, + BatchConfig::BitMask::Bitset const &bitset) { + for (size_t i = 0; i 
< BatchConfig::max_spec_tree_token_num(); i++) { + os << (bitset.test_bit(i) ? '1' : '0'); + } + return os; +} + +std::ostream &operator<<(std::ostream &os, BatchConfig::BitMask const &bm) { + os << "BitMask {\n" + << " non_tree_cache_size: " << bm.non_tree_cache_size << "\n" + << " tree_or_prompt_size: " << bm.tree_or_prompt_size << "\n" + << " current_layer_size: " << bm.current_layer_size << "\n" + << " bit_mask: [" << bm.bit_mask << "]\n"; + os << "}"; + return os; +} + std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { os << "@@@@@@@@@@@@@@ Batch Config (mode " << bc.get_mode() << ") @@@@@@@@@@@@@@" << std::endl; - // Max values - os << "Max number of requests: " << bc.max_requests_per_batch() << std::endl; - os << "Max number of tokens: " << bc.max_tokens_per_batch() << std::endl; - os << "Max sequence length: " << bc.max_sequence_length() << std::endl; // Current values os << "Number of tokens: " << bc.num_active_tokens() << std::endl; os << "Number of requests: " << bc.num_active_requests() << std::endl; + os << "Prompt phase: " << bc.prompt_phase << std::endl; + os << "Inference mode: "; + switch (bc.inference_mode) { + case INC_DECODING_MODE: + os << "Incremental decoding"; + break; + case TREE_SEARCH_MODE: + os << "Tree search"; + break; + case TREE_VERIFY_MODE: + os << "Tree verify"; + break; + default: + os << "Unknown"; + } + os << std::endl; + if (bc.inference_mode == TREE_VERIFY_MODE) { + os << "Number of tokens to commit: " << bc.num_tokens_to_commit + << std::endl; + } + if (bc.inference_mode == TREE_SEARCH_MODE) { + os << "Model id: " << bc.model_id << std::endl; + } // Per-request info os << "Per-request info:\n"; for (int i = 0; i < bc.max_requests_per_batch(); i++) { - if (!bc.request_completed[i]) { + if (bc.request_available[i]) { os << " Request " << i << ":\n"; os << " First token depth in request: " - << bc.requestsInfo[i].first_token_depth_in_request << std::endl; + << bc.requestsInfo[i].first_token_index_in_request << std::endl; os << " First token offset in batch: " << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; - os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; - os << " Request completed: " << bc.request_completed[i] << std::endl; - os << " Request running: " << bc.request_running[i] << std::endl; + os << " Request available: " << bc.request_available[i] << std::endl; + } + } + + // Streaming cache info + os << "Streaming cache info:\n"; + for (int i = 0; i < bc.max_requests_per_batch(); i++) { + if (bc.request_available[i]) { + os << " Request " << i << ":\n"; + os << " Sink cache size: " << bc.streamingCacheInfo[i].sink_cache_size + << std::endl; + os << " Window cache size: " + << bc.streamingCacheInfo[i].window_cache_size << std::endl; + os << " Window back: " << bc.streamingCacheInfo[i].window_back + << std::endl; + os << " Commit len: " << bc.streamingCacheInfo[i].commit_len + << std::endl; } } @@ -133,15 +207,85 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { os << "Per-token info:\n"; for (int i = 0; i < bc.num_tokens; i++) { os << " Token " << i << ":\n"; + os << " Absolute index in request: " + << bc.tokensInfo[i].abs_index_in_request << std::endl; os << " Absolute depth in request: " << bc.tokensInfo[i].abs_depth_in_request << std::endl; os << " Request index: " << bc.tokensInfo[i].request_index 
<< std::endl; os << " Token id: " << bc.tokensInfo[i].token_id << std::endl; } + + if (bc.inference_mode == TREE_VERIFY_MODE) { + os << "Committed tokens info:\n"; + for (int i = 0; i < bc.num_tokens_to_commit; i++) { + os << " Token " << i << ":\n"; + os << " Index in kv cache: " + << bc.committed_tokens[i].index_in_kv_cache << std::endl; + os << " Request index: " << bc.committed_tokens[i].request_index + << std::endl; + os << " Token depth: " << bc.committed_tokens[i].token_depth + << std::endl; + } + } + + if (bc.inference_mode == TREE_SEARCH_MODE || + bc.inference_mode == TREE_VERIFY_MODE) { + os << "Causal mask:\n"; + for (int i = 0; i < bc.max_requests_per_batch(); i++) { + if (bc.request_available[i]) { + os << " Request " << i << ":\n"; + os << " Non tree cache size: " + << bc.causalMask[i].non_tree_cache_size << std::endl; + os << " Tree or prompt size: " + << bc.causalMask[i].tree_or_prompt_size + + << std::endl; + os << " Current layer size: " << bc.causalMask[i].current_layer_size + << std::endl; + os << " Bit mask: " << std::endl; + for (int j = 0; j < BatchConfig::max_spec_tree_token_num(); j++) { + os << " " << bc.causalMask[i].bit_mask[j] << std::endl; + } + } + } + } + os << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" << std::endl; return os; } +std::ostream &operator<<(std::ostream &os, InferenceResult const &ir) { + os << "InferenceResult {\n" + << " num_token_ids: " << ir.num_token_ids << "\n" + << " num_gumbel_logits: " << ir.num_gumbel_logits << "\n" + << " token_ids: ["; + for (int i = 0; i < ir.num_token_ids; i++) { + os << ir.token_ids[i]; + if (i < ir.num_token_ids - 1) { + os << ", "; + } + } + os << "]\n" + << " probs: ["; + for (int i = 0; i < ir.num_token_ids; i++) { + os << ir.probs[i]; + if (i < ir.num_token_ids - 1) { + os << ", "; + } + } + os << "]\n" + << " gumbel_logits: ["; + for (int i = 0; i < ir.num_gumbel_logits; i++) { + os << ir.gumbel_logits[i]; + if (i < ir.num_gumbel_logits - 1) { + os << ", "; + } + } + os << "]\n" + << "}"; + return os; +} + void BatchConfig::print() const { std::cout << *this << std::endl; } @@ -158,4 +302,75 @@ void BatchConfig::save_to_file(std::string const &filename) const { } } +InferenceResult::InferenceResult(InferenceResult const &other) { + num_token_ids = other.num_token_ids; + num_gumbel_logits = other.num_gumbel_logits; + std::copy(other.token_ids, other.token_ids + num_token_ids, token_ids); + std::copy(other.probs, other.probs + num_token_ids, probs); + std::copy(other.gumbel_logits, + other.gumbel_logits + num_gumbel_logits, + gumbel_logits); +} + +StreamingCacheInfo::StreamingCacheInfo() : StreamingCacheInfo(0, 0) {} + +StreamingCacheInfo::StreamingCacheInfo(int sink_cache_size, + int window_cache_size) + : sink_cache_size(sink_cache_size), window_cache_size(window_cache_size), + window_back(0), commit_len(0) {} + +StreamingCacheInfo::StreamingCacheInfo(StreamingCacheInfo const &other) + : sink_cache_size(other.sink_cache_size), + window_cache_size(other.window_cache_size), + window_back(other.window_back), commit_len(other.commit_len) {} + +StreamingCacheInfo & + StreamingCacheInfo::operator=(StreamingCacheInfo const &other) { + sink_cache_size = other.sink_cache_size; + window_cache_size = other.window_cache_size; + window_back = other.window_back; + commit_len = other.commit_len; + return *this; +} + +// For draft model, we only update the cache when prefill or +// commit the verified result from target model; +// For incremental decoding, we update the cache both in prefill and decoding 
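Editorial note: the StreamingCacheInfo methods defined a little further below keep a fixed number of attention-sink tokens and treat the rest of the cache as a ring buffer. A worked sketch of the index mapping implemented by global_2_cache_index follows; the sink and window sizes are illustrative assumptions, not the request manager's defaults.

#include <cassert>

// Editorial sketch of the sink + sliding-window mapping.
int global_to_cache(int global_index, int sink_cache_size, int window_cache_size) {
  if (global_index < sink_cache_size) {
    return global_index; // sink tokens are never evicted
  }
  // everything after the sink reuses the window slots modulo the window size
  return (global_index - sink_cache_size) % window_cache_size + sink_cache_size;
}

int main() {
  int const sink = 4, window = 8; // made-up sizes
  assert(global_to_cache(2, sink, window) == 2);  // still in the sink region
  assert(global_to_cache(9, sink, window) == 9);  // fits inside the window
  assert(global_to_cache(15, sink, window) == 7); // wraps, reusing slot 7
  return 0;
}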
+void StreamingCacheInfo::commit_cache(int len) { + total_len += len; + commit_len += len; + if (commit_len <= sink_cache_size + window_cache_size) { + window_back = std::max(0, commit_len - sink_cache_size); + } else { + commit_len = sink_cache_size + window_cache_size; + window_back = (window_back + len - 1) % window_cache_size + 1; + } +} + +void StreamingCacheInfo::reset_cache() { + window_back = 0; + commit_len = 0; + total_len = 0; +} + +// page attention: TODO: I think we just need to change the index + +int StreamingCacheInfo::global_2_cache_index(int global_index) { + if (global_index < sink_cache_size) { + return global_index; + } + return (global_index - sink_cache_size) % window_cache_size + sink_cache_size; +} + +int StreamingCacheInfo::cache_2_global_index(int cache_index) { + if (cache_index < sink_cache_size) { + return cache_index; + } + // cache = (global-sink) % window + sink + cache_index -= sink_cache_size; + int num_window = (total_len - sink_cache_size) / window_cache_size - + (window_back <= cache_index); + return sink_cache_size + cache_index + num_window * window_cache_size; +} + }; // namespace FlexFlow diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc deleted file mode 100644 index ff7bf1a81..000000000 --- a/src/runtime/beam_search_batch_config.cc +++ /dev/null @@ -1,200 +0,0 @@ -/* Copyright 2023 CMU, Stanford, Facebook, LANL - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "flexflow/batch_config.h" -#include "flexflow/request_manager.h" -#include "legion.h" -#include -#include - -#define DEFAULT_BEAM_WIDTH 1 -#define DEFAULT_TARGET_ITERATIONS 3 - -namespace FlexFlow { - -LegionRuntime::Logger::Category log_beam_bc("BeamSearchBatchConfig"); - -BeamSearchBatchConfig::BeamSearchBatchConfig() : BatchConfig() { - this->beam_width = DEFAULT_BEAM_WIDTH; - this->target_iterations = DEFAULT_TARGET_ITERATIONS; - current_iteration = 0; -} - -BeamSearchBatchConfig::BeamSearchBatchConfig(int model_id) : BatchConfig() { - this->model_id = model_id; - std::cout << "==================\n" - << "Register Batch Config with Model " << this->model_id - << std::endl; - current_iteration = 0; -} - -BeamSearchBatchConfig::BeamSearchBatchConfig(size_t beam_width, - size_t target_iterations) - : BatchConfig() { - this->beam_width = beam_width; - this->target_iterations = target_iterations; - current_iteration = 0; -} - -BeamSearchBatchConfig::BeamSearchBatchConfig(BeamSearchBatchConfig const &other, - int model_id) - : BatchConfig() { - this->beam_width = other.beam_width; - this->target_iterations = other.target_iterations; - this->model_id = model_id; - current_iteration = 0; -} - -BeamSearchBatchConfig::~BeamSearchBatchConfig() {} - -InferenceMode BeamSearchBatchConfig::get_mode() const { - return BEAM_SEARCH_MODE; -} - -bool BeamSearchBatchConfig::done() const { - assert(current_iteration <= target_iterations); - return current_iteration == target_iterations; -} - -int BeamSearchBatchConfig::max_beam_depth_all_requests() const { - int max_depth_all_requests = 0; - for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { - if (!request_completed[i] && - beamRequestsInfo[i].max_depth > max_depth_all_requests) { - /* printf("\treq %i has max_depth=%i. Increasing max_depth_all_requests " - "from %i\n", - i, - beamRequestsInfo[i].max_depth, - max_depth_all_requests); */ - max_depth_all_requests = beamRequestsInfo[i].max_depth; - } - } - assert(max_depth_all_requests <= BeamSearchBatchConfig::MAX_BEAM_DEPTH); - return max_depth_all_requests; -} - -int BeamSearchBatchConfig::get_speculative_request_num() const { - return speculative_request_num; -} - -int BeamSearchBatchConfig::current_depth_all_requests() const { - int current_depth = 0; - for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { - if (!request_completed[i] && - beamRequestsInfo[i].current_depth > current_depth) { - /* printf("\treq %i has current_depth=%i. 
Increasing " - "current_depth_all_requests from %i\n", - i, - beamRequestsInfo[i].current_depth, - current_depth); */ - current_depth = beamRequestsInfo[i].current_depth; - } - } - assert(current_depth <= BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1); - return current_depth; -} - -std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { - os << "@@@@@@@@@@@@@@ BeamSearchBatchConfig (mode " << bc.get_mode() - << ") @@@@@@@@@@@@@@" << std::endl; - // Max values - os << "Max number of requests: " << bc.max_requests_per_batch() << std::endl; - os << "Max number of tokens: " << bc.max_tokens_per_batch() << std::endl; - os << "Max sequence length: " << bc.max_sequence_length() << std::endl; - // Current values - os << "Number of tokens: " << bc.num_active_tokens() << std::endl; - os << "Number of requests: " << bc.num_active_requests() << std::endl; - // BeamSearch-specific - os << "Model ID: " << bc.model_id << std::endl; - os << "Max Beam Depth (all requests): " << bc.max_beam_depth_all_requests() - << std::endl; - os << "Current depth (all requests): " << bc.current_depth_all_requests() - << std::endl; - os << "Beam width: " << bc.beam_width << std::endl; - os << "Target Iterations: " << bc.target_iterations << std::endl; - os << "Current Iterations: " << bc.current_iteration << std::endl; - - os << "Per-request info:\n"; - for (int i = 0; i < bc.max_requests_per_batch(); i++) { - if (!bc.request_completed[i]) { - os << " Request " << i << ":\n"; - os << " First token depth in request: " - << bc.requestsInfo[i].first_token_depth_in_request << std::endl; - os << " First token offset in batch: " - << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; - os << " Number of tokens in batch: " - << bc.requestsInfo[i].num_tokens_in_batch << std::endl; - os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; - os << " Request completed: " << bc.request_completed[i] << std::endl; - os << " Request running: " << bc.request_running[i] << std::endl; - os << " Beam Search Specific: " << std::endl; - os << " beam_size: " << bc.beamRequestsInfo[i].beam_size - << std::endl; - os << " current_depth: " << bc.beamRequestsInfo[i].current_depth - << std::endl; - os << " max_depth: " << bc.beamRequestsInfo[i].max_depth - << std::endl; - os << " tokens: "; - for (int j = 0; j < bc.MAX_BEAM_WIDTH; j++) { - os << bc.beamRequestsInfo[i].tokens[j] << ", "; - } - os << std::endl; - os << " probs: "; - for (int j = 0; j < bc.MAX_BEAM_WIDTH; j++) { - os << bc.beamRequestsInfo[i].probs[j] << ", "; - } - os << std::endl; - os << " parent_id: "; - for (int j = 0; j < bc.MAX_BEAM_WIDTH; j++) { - os << bc.beamRequestsInfo[i].parent_id[j] << ", "; - } - os << std::endl; - } - } - - os << "Per-token info:\n"; - for (int i = 0; i < bc.num_tokens; i++) { - os << " Token " << i << ":\n"; - os << " Absolute depth in request: " - << bc.tokensInfo[i].abs_depth_in_request << std::endl; - os << " Request index: " << bc.tokensInfo[i].request_index << std::endl; - os << " Token id: " << bc.tokensInfo[i].token_id << std::endl; - os << " Beam Search Specific: " << std::endl; - os << " beam_size: " << bc.beamTokenInfo[i].sub_request_index - << std::endl; - } - os << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" << std::endl; - return os; -} - -void BeamSearchBatchConfig::print() const { - std::cout << *this << std::endl; -} - -void BeamSearchBatchConfig::save_to_file(std::string const &filename) const { 
- std::ofstream outputFile(filename); - if (outputFile.is_open()) { - outputFile << *this << std::endl; - outputFile.close(); - } else { - std::cerr << "Error: Unable to open the batch config output file: " - << filename << std::endl; - assert(false); - } -} - -}; // namespace FlexFlow diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index c7b6e1257..36c68c836 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -114,10 +114,12 @@ std::string get_operator_type_name(OperatorType type) { return "Size"; case OP_TOPK: return "TopK"; + case OP_GUMBEL_TOPK: + return "GumbelTopK"; case OP_ARG_TOPK: return "ArgTopK"; - case OP_BEAM_TOPK: - return "BeamTopK"; + // case OP_BEAM_TOPK: + // return "BeamTopK"; case OP_WHERE: return "Where"; case OP_CEIL: diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 43ce9d700..14e806d49 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -16,6 +16,7 @@ #include "flexflow/utils/file_loader.h" #include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" +#include "flexflow/model.h" #include using namespace std; @@ -27,12 +28,12 @@ FileDataLoader::FileDataLoader(std::string _prompts_filepath, int _num_heads, int _num_kv_heads, size_t _hidden_dim, - size_t _qkv_inner_dim, + size_t _head_dim, int _tensor_parallelism_degree, bool _use_full_precision) : prompts_filepath(_prompts_filepath), weights_folder(_weights_folder), num_heads(_num_heads), num_kv_heads(_num_kv_heads), - hidden_dim(_hidden_dim), qkv_inner_dim(_qkv_inner_dim), + hidden_dim(_hidden_dim), head_dim(_head_dim), tensor_parallelism_degree(_tensor_parallelism_degree), use_full_precision(_use_full_precision){}; @@ -124,6 +125,7 @@ void load_attention_weights_multi_query(DT *ptr, ptr[data_index++] = host_array.at(i); } file_index++; + in.close(); } } @@ -132,16 +134,16 @@ void load_attention_bias_v2(DT *ptr, int num_heads, int num_kv_heads, size_t hidden_dim, - size_t qkv_inner_dim, + size_t head_dim, bool final_bias, std::string layer_name, std::string weights_folder) { - std::string q_file = layer_name + "_wq_bias"; - std::string k_file = layer_name + "_wk_bias"; - std::string v_file = layer_name + "_wv_bias"; + std::string q_file = layer_name + ".q_proj.bias"; + std::string k_file = layer_name + ".k_proj.bias"; + std::string v_file = layer_name + ".v_proj.bias"; std::vector bias_files = {q_file, k_file, v_file}; if (final_bias) { - std::string o_file = layer_name + "_wo_bias"; + std::string o_file = layer_name + ".o_proj.bias"; bias_files.push_back(o_file); } @@ -159,8 +161,8 @@ void load_attention_bias_v2(DT *ptr, int replicate_num = num_heads / num_kv_heads; - size_t qkv_partial_size = qkv_inner_dim * n_heads; - size_t qkv_replicate_size = qkv_inner_dim * num_heads; + size_t qkv_partial_size = head_dim * n_heads; + size_t qkv_replicate_size = head_dim * num_heads; size_t out_partial_size = hidden_dim; size_t partial_size = (file_index < 3) ? 
qkv_partial_size : out_partial_size; @@ -212,24 +214,22 @@ void load_attention_weights_v2(DT *ptr, int num_heads, int num_kv_heads, size_t hidden_dim, - size_t qkv_inner_dim, + size_t head_dim, std::string layer_name, std::string weights_folder, size_t volume, int tensor_parallelism_degree) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file}; int file_index = 0; int base_index = 0; size_t single_proj_size = hidden_dim * - qkv_inner_dim; // size of each of Q,K,V,O weights for a single head + head_dim; // size of each of Q,K,V,O weights for a single head size_t one_weight_file_size = num_heads * single_proj_size; // size of each of Q/K/V/O for all heads @@ -323,8 +323,7 @@ void load_attention_weights_v2(DT *ptr, assert(one_weight_file_size == host_array.size()); int data_index = 0; - int one_partition_size = - qkv_inner_dim * (num_heads / tensor_parallelism_degree); + int one_partition_size = head_dim * (num_heads / tensor_parallelism_degree); for (int i = 0; i < one_weight_file_size; i++) { int part_idx = (i / one_partition_size) % tensor_parallelism_degree; int block_num = (i / one_partition_size); @@ -392,6 +391,7 @@ void FileDataLoader::load_positions(FFModel *ff, // ff->get_parallel_tensor_from_tensor(pt, position_pt); position_pt->set_tensor(ff, dims_vec, data); + free(data); } //--------------------- quantization functions ---------------------- @@ -402,24 +402,22 @@ void FileDataLoader::load_positions(FFModel *ff, void load_attention_weights_quantized(char *ptr, int num_heads, size_t hidden_dim, - size_t qkv_inner_dim, + size_t head_dim, std::string layer_name, std::string weights_folder, DataType data_type, bool use_full_precision) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file, o_file}; int file_index = 0; size_t single_proj_size = hidden_dim * - qkv_inner_dim; // size of each of Q,K,V,O weights for a single head + head_dim; // size of each of Q,K,V,O weights for a single head size_t one_weight_file_size = num_heads * single_proj_size; // size of each of Q/K/V/O for all heads @@ -652,14 +650,21 @@ void load_from_quantized_file(char *ptr, void FileDataLoader::load_quantization_weight(FFModel *ff, Layer *l, - int weight_idx) { - Tensor weight = l->weights[weight_idx]; - size_t volume = 1; + int weight_idx, + size_t volume, + size_t num_replicas, + char *weight, + DataType data_type, + Domain weight_domain) { + // Tensor weight = l->weights[weight_idx]; + size_t volume_ = 1; std::vector dims_vec; - for (int i = 0; i < weight->num_dims; i++) { - dims_vec.push_back(weight->dims[i]); - volume *= weight->dims[i]; + 
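Note on the file_loader.cc hunks above: qkv_inner_dim is renamed to head_dim throughout, weight files move to HuggingFace-style names (layer.q_proj.weight and friends), and each Q/K/V/O file is scattered across tensor-parallel shards in contiguous blocks of head_dim * (num_heads / tensor_parallelism_degree) elements. A minimal standalone sketch of that shard mapping (illustration only, not the loader itself):

#include <cstddef>

// Which tensor-parallel shard element i of one Q/K/V weight file lands in,
// mirroring part_idx = (i / one_partition_size) % tp_degree in
// load_attention_weights_v2. Assumes num_heads is divisible by tp_degree.
int shard_of_element(std::size_t i, std::size_t head_dim, int num_heads,
                     int tp_degree) {
  std::size_t one_partition_size =
      head_dim * static_cast<std::size_t>(num_heads / tp_degree);
  return static_cast<int>((i / one_partition_size) % tp_degree);
}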
for (int i = 0; i < weight_domain.get_dim(); i++) { + int dim_i = weight_domain.hi()[i] - weight_domain.lo()[i] + 1; + dims_vec.push_back(dim_i); + volume_ *= dim_i; } + assert(volume_ == volume * num_replicas); char *data = (char *)malloc(sizeof(char) * volume); std::string weight_filename = removeGuidOperatorName(std::string(l->name)); @@ -671,17 +676,17 @@ void FileDataLoader::load_quantization_weight(FFModel *ff, load_attention_weights_quantized(data, num_heads, hidden_dim, - qkv_inner_dim, + head_dim, weight_filename, weights_folder, - weight->data_type, + data_type, use_full_precision); } // else { // load_attention_bias_quantized(data, // num_heads, // hidden_dim, - // qkv_inner_dim, + // head_dim, // weight_filename, // weights_folder); // } @@ -690,37 +695,47 @@ void FileDataLoader::load_quantization_weight(FFModel *ff, if (weight_idx > 0) { assert(weight_idx == 0 || weight_idx == 1); if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } } load_from_quantized_file(data, volume, join_path({weights_folder, weight_filename}), - weight->data_type, + data_type, use_full_precision); } - ParallelTensor weight_pt; - ff->get_parallel_tensor_from_tensor(weight, weight_pt); - weight_pt->set_tensor(ff, dims_vec, data); + // ParallelTensor weight_pt; + // ff->get_parallel_tensor_from_tensor(weight, weight_pt); + // weight_pt->set_tensor(ff, dims_vec, data); + char *ptr = weight; + for (size_t i = 0; i < num_replicas; i++) { + memcpy(ptr, data, volume * sizeof(char)); + ptr += volume; + } - delete data; + free(data); } template void FileDataLoader::load_single_weight_tensor(FFModel *ff, Layer *l, - int weight_idx) { - Tensor weight = l->weights[weight_idx]; + int weight_idx, + size_t volume, + size_t num_replicas, + DT *weight, + Domain weight_domain) { // Create a buffer to store weight data from the file - size_t volume = 1; + size_t volume_ = 1; std::vector dims_vec; - for (int i = 0; i < weight->num_dims; i++) { - dims_vec.push_back(weight->dims[i]); - volume *= weight->dims[i]; + for (int i = 0; i < weight_domain.get_dim(); i++) { + int dim_i = weight_domain.hi()[i] - weight_domain.lo()[i] + 1; + dims_vec.push_back(dim_i); + volume_ *= dim_i; } - assert(data_type_size(weight->data_type) == sizeof(DT)); + assert(volume_ == volume * num_replicas); + // assert(data_type_size(weight->data_type) == sizeof(DT)); DT *data = (DT *)malloc(sizeof(DT) * volume); std::string weight_filename = removeGuidOperatorName(std::string(l->name)); @@ -734,44 +749,34 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { - if (weight_filename.find("self_attention") != std::string::npos) { - load_attention_weights_multi_query( - data, weight_filename, weights_folder, hidden_dim, num_heads); - } else if (weight_filename.find("attention") != std::string::npos && - weight_filename.rfind("attention") == - weight_filename.length() - strlen("attention")) { - if (weight_idx == 0) { - load_attention_weights_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - weight_filename, - weights_folder, - volume, - tensor_parallelism_degree); - } else { - long long value; - l->get_int_property("final_bias", value); - bool final_bias = (bool)value; - load_attention_bias_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - 
qkv_inner_dim, - final_bias, - weight_filename, - weights_folder); - } - + if (weight_idx == 0) { + load_attention_weights_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + head_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree); } else { - assert(false); + long long value; + l->get_int_property("final_bias", value); + bool final_bias = (bool)value; + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + head_dim, + final_bias, + weight_filename, + weights_folder); } } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { assert(weight_idx >= 0 || weight_idx <= 2); weight_filename += (weight_idx == 0) - ? "_attn_bias" - : ((weight_idx == 1) ? "_weight" : "_bias"); + ? ".attn_bias" + : ((weight_idx == 1) ? ".weight" : ".bias"); std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = join_path({weights_folder, weight_filename}); @@ -781,7 +786,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, assert(weight_idx == 0 || weight_idx == 1); // handle exception if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = @@ -790,40 +795,123 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, } } - // Copy the weight data from the buffer to the weight's ParallelTensor - ParallelTensor weight_pt; - ff->get_parallel_tensor_from_tensor(weight, weight_pt); - weight_pt->set_tensor
(ff, dims_vec, data); + // Copy the weight data from the buffer to the weight + DT *ptr = weight; + for (size_t i = 0; i < num_replicas; i++) { + memcpy(ptr, data, volume * sizeof(DT)); + ptr += volume; + } // Free buffer memory - delete data; + free(data); } -void FileDataLoader::load_weights(FFModel *ff) { +void FileDataLoader::load_weight_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime) { + WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; + + assert(task->regions.size() == regions.size()); + assert(regions.size() == 1); // one weight only + GenericTensorAccessorW weight = helperGetGenericTensorAccessorWO( + args->data_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + Domain weight_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + + switch (args->data_type) { + case DT_HALF: { + args->loader->load_single_weight_tensor(args->ff, + args->layer, + args->weight_idx, + args->volume, + args->num_replicas, + weight.get_half_ptr(), + weight_domain); + break; + } + case DT_FLOAT: { + args->loader->load_single_weight_tensor(args->ff, + args->layer, + args->weight_idx, + args->volume, + args->num_replicas, + weight.get_float_ptr(), + weight_domain); + break; + } + case DT_INT4: + case DT_INT8: { + args->loader->load_quantization_weight(args->ff, + args->layer, + args->weight_idx, + args->volume, + args->num_replicas, + weight.get_byte_ptr(), + args->data_type, + weight_domain); + break; + } + default: + assert(false && "Unsupported data type"); + } +} + +void FileDataLoader::load_weights_parallel(FFModel *ff, + Context ctx, + Runtime *runtime) { + std::vector futures; + for (Layer *l : ff->layers) { if (l->numWeights < 1 || l->name == NULL || strlen(l->name) < 1) { continue; } + for (int i = 0; i < l->numWeights; i++) { Tensor weight = l->weights[i]; if (weight == NULL) { continue; } - switch (weight->data_type) { - case DT_HALF: - load_single_weight_tensor(ff, l, i); - break; - case DT_FLOAT: - load_single_weight_tensor(ff, l, i); - break; - case DT_INT4: - case DT_INT8: - // load weights in quantization - load_quantization_weight(ff, l, i); - break; - default: - assert(false && "Unsupported data type"); + + if (weight->data_type != DT_FLOAT && weight->data_type != DT_HALF && + weight->data_type != DT_INT4 && weight->data_type != DT_INT8) { + assert(false && "Unsupported data type"); + } + + ParallelTensor weight_pt; + ff->get_parallel_tensor_from_tensor(weight, weight_pt); + + // Create task arguments + size_t volume = 1, num_replicas = 1; + if (weight_pt->sync_type == ParameterSyncType::NCCL) { + for (int i = 0; i < weight_pt->num_dims; i++) { + if (weight_pt->dims[i].is_replica_dim) { + num_replicas *= weight_pt->dims[i].size; + } + } + } else if (weight_pt->sync_type == ParameterSyncType::PS) { + num_replicas = 1; + } else { + num_replicas = 1; + } + for (int i = 0; i < weight->num_dims; i++) { + volume *= weight->dims[i]; } + WeightLoadTaskArgs args( + ff, this, l, i, volume, num_replicas, weight->data_type); + // launch task asynchronously + TaskLauncher launcher(LOAD_WEIGHT_TASK_ID, + TaskArgument(&args, sizeof(WeightLoadTaskArgs))); + launcher.add_region_requirement(RegionRequirement( + weight_pt->region, WRITE_ONLY, EXCLUSIVE, weight_pt->region)); + launcher.add_field(0, FID_DATA); + futures.push_back(runtime->execute_task(ctx, launcher)); } } + + // Wait for all tasks to complete + for (Future &f : futures) { + f.get_void_result(); + } } diff 
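The load_weights_parallel rework above replaces the old serial load_weights loop: each weight gets its own LOAD_WEIGHT_TASK_ID task with a WRITE_ONLY region requirement, the task fills volume * num_replicas elements directly into the mapped region, and all launches are synchronized at the end through their futures. A reduced sketch of the launch-then-wait pattern (placeholder task id, task arguments, and field id standing in for LOAD_WEIGHT_TASK_ID, WeightLoadTaskArgs, and FID_DATA):

#include <vector>
#include "legion.h"

using namespace Legion;

// Launch one loader task per weight region, then block until every task
// has finished so all weights are in place before serving starts.
void launch_weight_loaders(Context ctx, Runtime *runtime, TaskID task_id,
                           std::vector<LogicalRegion> const &weight_regions) {
  std::vector<Future> futures;
  for (LogicalRegion const &region : weight_regions) {
    TaskLauncher launcher(task_id, TaskArgument(nullptr, 0));
    launcher.add_region_requirement(
        RegionRequirement(region, WRITE_ONLY, EXCLUSIVE, region));
    launcher.add_field(0, /*FieldID*/ 0);
    futures.push_back(runtime->execute_task(ctx, launcher));
  }
  for (Future &f : futures) {
    f.get_void_result();
  }
}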
--git a/src/runtime/graph.cc b/src/runtime/graph.cc index f8e8240cc..30f42327f 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -21,7 +21,7 @@ #include "flexflow/ops/argmax.h" #include "flexflow/ops/attention.h" #include "flexflow/ops/batch_matmul.h" -#include "flexflow/ops/beam_topk.h" +// #include "flexflow/ops/beam_topk.h" #include "flexflow/ops/cast.h" #include "flexflow/ops/concat.h" #include "flexflow/ops/conv_2d.h" @@ -33,6 +33,7 @@ #include "flexflow/ops/flat.h" #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" +#include "flexflow/ops/gumbel_topk.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" @@ -66,10 +67,10 @@ namespace FlexFlow::PCG { using namespace Legion; using FlexFlow::MachineView; -LegionRuntime::Logger::Category log_graph("graph"); -LegionRuntime::Logger::Category log_simplify("graph_simplify"); +Legion::Logger log_graph("graph"); +Legion::Logger log_simplify("graph_simplify"); -const Node Node::INVALID_NODE = Node(); +Node const Node::INVALID_NODE = Node(); Node::Node(void) : guid(0), ptr(NULL) {} @@ -2326,21 +2327,31 @@ GraphOptimalViewSerialized sez.serialize(attn->layer_guid.id); sez.serialize(attn->layer_guid.transformer_layer_id); sez.serialize(attn->layer_guid.model_id); - sez.serialize(attn->oProjSize); + sez.serialize(attn->o_dim); sez.serialize(attn->num_q_heads); - sez.serialize(attn->qProjSize); - sez.serialize(attn->vProjSize); + sez.serialize(attn->qk_dim); + sez.serialize(attn->v_dim); sez.serialize(attn->dropout); sez.serialize(attn->qkv_bias); sez.serialize(attn->final_bias); sez.serialize(attn->add_zero_attn); - sez.serialize(attn->apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.rope_theta); + sez.serialize(attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.rope_type.c_str(), + attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.factor); + sez.serialize(attn->rotary_embedding_meta.low_freq_factor); + sez.serialize(attn->rotary_embedding_meta.high_freq_factor); + sez.serialize( + attn->rotary_embedding_meta.original_max_position_embeddings); sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); sez.serialize(attn->position_bias); sez.serialize(attn->quantization_type); sez.serialize(attn->offload); + sez.serialize(attn->streaming_cache); sez.serialize(attn->num_kv_heads); sez.serialize(attn->tensor_parallelism_degree); sez.serialize(strlen(attn->name)); @@ -2353,20 +2364,31 @@ GraphOptimalViewSerialized sez.serialize(attn->layer_guid.id); sez.serialize(attn->layer_guid.transformer_layer_id); sez.serialize(attn->layer_guid.model_id); - sez.serialize(attn->oProjSize); + sez.serialize(attn->o_dim); sez.serialize(attn->num_q_heads); - sez.serialize(attn->qProjSize); - sez.serialize(attn->vProjSize); + sez.serialize(attn->qk_dim); + sez.serialize(attn->v_dim); sez.serialize(attn->dropout); sez.serialize(attn->qkv_bias); sez.serialize(attn->final_bias); sez.serialize(attn->add_zero_attn); - sez.serialize(attn->apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.rope_theta); + sez.serialize(attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.rope_type.c_str(), + 
attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.factor); + sez.serialize(attn->rotary_embedding_meta.low_freq_factor); + sez.serialize(attn->rotary_embedding_meta.high_freq_factor); + sez.serialize( + attn->rotary_embedding_meta.original_max_position_embeddings); sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); sez.serialize(attn->position_bias); + sez.serialize(attn->streaming_cache); sez.serialize(attn->num_kv_heads); + sez.serialize(attn->tensor_parallelism_degree); sez.serialize(strlen(attn->name)); sez.serialize(attn->name, strlen(attn->name)); break; @@ -2377,15 +2399,24 @@ GraphOptimalViewSerialized sez.serialize(attn->layer_guid.id); sez.serialize(attn->layer_guid.transformer_layer_id); sez.serialize(attn->layer_guid.model_id); - sez.serialize(attn->oProjSize); + sez.serialize(attn->o_dim); sez.serialize(attn->num_q_heads); - sez.serialize(attn->qProjSize); - sez.serialize(attn->vProjSize); + sez.serialize(attn->qk_dim); + sez.serialize(attn->v_dim); sez.serialize(attn->dropout); sez.serialize(attn->qkv_bias); sez.serialize(attn->final_bias); sez.serialize(attn->add_zero_attn); - sez.serialize(attn->apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.rope_theta); + sez.serialize(attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.rope_type.c_str(), + attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.factor); + sez.serialize(attn->rotary_embedding_meta.low_freq_factor); + sez.serialize(attn->rotary_embedding_meta.high_freq_factor); + sez.serialize( + attn->rotary_embedding_meta.original_max_position_embeddings); sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); @@ -2805,8 +2836,9 @@ void FFModel::deserialize_graph_optimal_view( int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, offload, position_bias; + bool qkv_bias, final_bias, add_zero_attn, scaling_query, + qk_prod_scaling, offload, streaming_cache, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; DataType quantization_type; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); @@ -2821,13 +2853,24 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(qkv_bias); dez.deserialize(final_bias); dez.deserialize(add_zero_attn); - dez.deserialize(apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.rope_theta); + size_t rope_type_len; + char rope_type[1024] = {0}; + dez.deserialize(rope_type_len); + dez.deserialize(rope_type, rope_type_len); + rotary_embedding_meta.rope_type = std::string(rope_type); + dez.deserialize(rotary_embedding_meta.factor); + dez.deserialize(rotary_embedding_meta.low_freq_factor); + dez.deserialize(rotary_embedding_meta.high_freq_factor); + dez.deserialize(rotary_embedding_meta.original_max_position_embeddings); dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); dez.deserialize(position_bias); dez.deserialize(quantization_type); dez.deserialize(offload); + dez.deserialize(streaming_cache); dez.deserialize(num_kv_heads); 
dez.deserialize(tensor_parallelism_degree); size_t name_len; @@ -2845,13 +2888,14 @@ void FFModel::deserialize_graph_optimal_view( params.final_bias = final_bias; params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; - params.apply_rotary_embedding = apply_rotary_embedding; + params.rotary_embedding_meta = rotary_embedding_meta; params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; params.position_bias = position_bias; params.quantization_type = quantization_type; params.offload = offload; + params.streaming_cache = streaming_cache; params.num_kv_heads = num_kv_heads; params.tensor_parallelism_degree = tensor_parallelism_degree; strcpy(params.name, name); @@ -2860,10 +2904,12 @@ void FFModel::deserialize_graph_optimal_view( } case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(num_inputs == 1); - int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads; + int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads, + tensor_parallelism_degree; float dropout, scaling_factor; bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; + scaling_query, qk_prod_scaling, position_bias, streaming_cache; + RotaryEmbeddingMeta rotary_embedding_meta; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); @@ -2877,12 +2923,24 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(qkv_bias); dez.deserialize(final_bias); dez.deserialize(add_zero_attn); - dez.deserialize(apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.rope_theta); + size_t rope_type_len; + char rope_type[1024] = {0}; + dez.deserialize(rope_type_len); + dez.deserialize(rope_type, rope_type_len); + rotary_embedding_meta.rope_type = std::string(rope_type); + dez.deserialize(rotary_embedding_meta.factor); + dez.deserialize(rotary_embedding_meta.low_freq_factor); + dez.deserialize(rotary_embedding_meta.high_freq_factor); + dez.deserialize(rotary_embedding_meta.original_max_position_embeddings); dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); dez.deserialize(position_bias); + dez.deserialize(streaming_cache); dez.deserialize(num_kv_heads); + dez.deserialize(tensor_parallelism_degree); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -2898,12 +2956,14 @@ void FFModel::deserialize_graph_optimal_view( params.final_bias = final_bias; params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; - params.apply_rotary_embedding = apply_rotary_embedding; + params.rotary_embedding_meta = rotary_embedding_meta; params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; params.position_bias = position_bias; + params.streaming_cache = streaming_cache; params.num_kv_heads = num_kv_heads; + params.tensor_parallelism_degree = tensor_parallelism_degree; strcpy(params.name, name); node = get_or_create_node(inputs[0], params); @@ -2914,8 +2974,9 @@ void FFModel::deserialize_graph_optimal_view( int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, offload, position_bias; + bool qkv_bias, final_bias, add_zero_attn, scaling_query, + qk_prod_scaling, offload, position_bias; + 
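A note on the graph.cc changes around here: the single apply_rotary_embedding flag is replaced by the full RotaryEmbeddingMeta, and its rope_type string is serialized as a length followed by the raw bytes, so the deserializer must read the same two fields in the same order into a bounded buffer (1024 bytes in this patch). A compact sketch of that symmetric pattern, assuming a serializer object with the serialize(size_t) / serialize(void const *, size_t) style interface used by sez and dez:

#include <cstddef>
#include <string>

// Write a std::string as (length, bytes) and read it back the same way.
// SEZ/DEZ are stand-ins for the serializer/deserializer types in graph.cc.
template <typename SEZ>
void serialize_string(SEZ &sez, std::string const &s) {
  sez.serialize(s.size());
  sez.serialize(s.c_str(), s.size());
}

template <typename DEZ>
std::string deserialize_string(DEZ &dez) {
  std::size_t len = 0;
  dez.deserialize(len);
  char buf[1024] = {0}; // bounded, matching the rope_type buffer above
  dez.deserialize(buf, len);
  return std::string(buf, len);
}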
RotaryEmbeddingMeta rotary_embedding_meta; DataType quantization_type; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); @@ -2930,7 +2991,17 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(qkv_bias); dez.deserialize(final_bias); dez.deserialize(add_zero_attn); - dez.deserialize(apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.rope_theta); + size_t rope_type_len; + char rope_type[1024] = {0}; + dez.deserialize(rope_type_len); + dez.deserialize(rope_type, rope_type_len); + rotary_embedding_meta.rope_type = std::string(rope_type); + dez.deserialize(rotary_embedding_meta.factor); + dez.deserialize(rotary_embedding_meta.low_freq_factor); + dez.deserialize(rotary_embedding_meta.high_freq_factor); + dez.deserialize(rotary_embedding_meta.original_max_position_embeddings); dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); @@ -2954,7 +3025,7 @@ void FFModel::deserialize_graph_optimal_view( params.final_bias = final_bias; params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; - params.apply_rotary_embedding = apply_rotary_embedding; + params.rotary_embedding_meta = rotary_embedding_meta; params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; @@ -2972,14 +3043,18 @@ void FFModel::deserialize_graph_optimal_view( node = TopK::deserialize(*this, dez, inputs, num_inputs); break; } - case OP_ARG_TOPK: { - node = ArgTopK::deserialize(*this, dez, inputs, num_inputs); + case OP_GUMBEL_TOPK: { + node = GumbelTopK::deserialize(*this, dez, inputs, num_inputs); break; } - case OP_BEAM_TOPK: { - node = BeamTopK::deserialize(*this, dez, inputs, num_inputs); + case OP_ARG_TOPK: { + node = ArgTopK::deserialize(*this, dez, inputs, num_inputs); break; } + // case OP_BEAM_TOPK: { + // node = BeamTopK::deserialize(*this, dez, inputs, num_inputs); + // break; + // } case OP_SAMPLING: { node = Sampling::deserialize(*this, dez, inputs, num_inputs); break; @@ -3152,21 +3227,21 @@ void FFModel::deserialize_graph_optimal_view( optimal_views[guid_to_nodes[guid]] = view; } assert(dez.get_remaining_bytes() == 0); - printf("Deserialized Views...\n"); - for (auto const &it : optimal_views) { - printf("node[%zu]: type(%s) view(%d %d %d) ", - it.first.guid, - it.first.to_string().c_str(), - it.second.ndims, - it.second.dim[0], - it.second.start_device_id); - auto const &list = graph->inEdges.at(it.first); - for (auto const &it2 : list) { - Edge e = it2; - printf(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx); - } - printf("\n"); - } + // printf("Deserialized Views...\n"); + // for (auto const &it : optimal_views) { + // printf("node[%zu]: type(%s) view(%d %d %d) ", + // it.first.guid, + // it.first.to_string().c_str(), + // it.second.ndims, + // it.second.dim[0], + // it.second.start_device_id); + // auto const &list = graph->inEdges.at(it.first); + // for (auto const &it2 : list) { + // Edge e = it2; + // printf(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx); + // } + // printf("\n"); + // } } }; // namespace FlexFlow diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 2a94df8b4..ed0c2ed69 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -13,20 +13,24 @@ * limitations under the License. 
*/ +#include "flexflow/batch_config.h" #include "flexflow/ffconst_utils.h" #include "flexflow/graph.h" +#include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/ops/fused.h" #include "flexflow/ops/noop.h" #include "flexflow/parallel_ops/parallel_op.h" #include "flexflow/request_manager.h" +#include +#include namespace FlexFlow { using namespace Legion; -LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); -LegionRuntime::Logger::Category log_offload("Offloading"); +Legion::Logger log_inf_mgr("InferenceManager"); +Legion::Logger log_offload("Offloading"); InferenceManager::InferenceManager() {} @@ -53,11 +57,15 @@ bool parallel_tensor_list_overlaps(std::vector const &list1, return false; } -void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { +void InferenceManager::compile_model_and_allocate_buffer(FFModel *model, + bool is_llm) { // TODO: currently assume there is a single data-parallel pipeline // (i.e., data-parallel-degree == 1) assert(model->config.data_parallelism_degree == 1); - model->config.batchSize = BatchConfig::max_tokens_per_batch(); + model->config.batchSize = + std::max(is_llm ? BatchConfig::max_tokens_per_batch() + : BatchConfig::max_tokens_per_ssm_batch(), + BatchConfig::max_tokens_per_prefilling_batch()); model->compile_inference(); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; @@ -230,41 +238,41 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { } // print optimized graph - for (size_t i = 0; i < model->operators.size(); i++) { - Op *op = model->operators[i]; - if (op->op_type == OP_INPUT || op->op_type == OP_WEIGHT) { - continue; - } - printf("operator[%zu]: type(%s) guid(%lu)\n", - i, - get_operator_type_name(model->operators[i]->op_type).c_str(), - model->operators[i]->op_guid); - for (int j = 0; j < op->numInputs; j++) { - assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); - LogicalRegion handle = tensor_buffer[op->inputs[j]][0]->region; - printf("\tinputs[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); - } - for (int j = 0; j < op->numOutputs; j++) { - LogicalRegion handle = tensor_buffer[op->outputs[j]][0]->region; - printf("\toutputs[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); - } - for (int j = 0; j < op->numWeights; j++) { - LogicalRegion handle = op->weights[j]->region; - printf("\tweights[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); - } - } + // for (size_t i = 0; i < model->operators.size(); i++) { + // Op *op = model->operators[i]; + // if (op->op_type == OP_INPUT || op->op_type == OP_WEIGHT) { + // continue; + // } + // printf("operator[%zu]: type(%s) guid(%lu)\n", + // i, + // get_operator_type_name(model->operators[i]->op_type).c_str(), + // model->operators[i]->op_guid); + // for (int j = 0; j < op->numInputs; j++) { + // assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); + // LogicalRegion handle = tensor_buffer[op->inputs[j]][0]->region; + // printf("\tinputs[%d] mapped_region(%d,%d,%d)\n", + // j, + // handle.get_index_space().get_id(), + // handle.get_field_space().get_id(), + // handle.get_tree_id()); + // } + // for (int j = 0; j < op->numOutputs; j++) { + // LogicalRegion handle = tensor_buffer[op->outputs[j]][0]->region; + // 
printf("\toutputs[%d] mapped_region(%d,%d,%d)\n", + // j, + // handle.get_index_space().get_id(), + // handle.get_field_space().get_id(), + // handle.get_tree_id()); + // } + // for (int j = 0; j < op->numWeights; j++) { + // LogicalRegion handle = op->weights[j]->region; + // printf("\tweights[%d] mapped_region(%d,%d,%d)\n", + // j, + // handle.get_index_space().get_id(), + // handle.get_field_space().get_id(), + // handle.get_tree_id()); + // } + // } } void InferenceManager::init_operators_inference(FFModel *model) { @@ -307,26 +315,8 @@ void InferenceManager::init_operators_inference(FFModel *model) { FutureMap InferenceManager::inference(FFModel *model, int index, BatchConfig const &bc) { - if (bc.get_mode() == INC_DECODING_MODE) { - BatchConfigFuture bcf = Future::from_value(bc); - return inference(model, index, bcf); - } else if (bc.get_mode() == BEAM_SEARCH_MODE) { - BatchConfig const *bc_ptr = &bc; - BeamSearchBatchConfig const *bsbc_ptr = - static_cast(bc_ptr); - BeamSearchBatchConfigFuture bcf = - Future::from_value(*bsbc_ptr); - return inference(model, index, bcf); - } else if (bc.get_mode() == TREE_VERIFY_MODE) { - BatchConfig const *bc_ptr = &bc; - TreeVerifyBatchConfig const *tvbc_ptr = - static_cast(bc_ptr); - TreeVerifyBatchConfigFuture bcf = - Future::from_value(*tvbc_ptr); - return inference(model, index, bcf); - } else { - assert(false && "Unsupported inference mode"); - } + BatchConfigFuture bcf = Future::from_value(bc); + return inference(model, index, bcf); } FutureMap InferenceManager::inference(FFModel *model, @@ -503,6 +493,23 @@ void FFModel::set_transformer_layer_id(int id) { assert(id < MAX_NUM_TRANSFORMER_LAYERS); } +void FFModel::set_num_transformer_layers(int num_layers) { + num_transformer_layers = num_layers; +} + +void FFModel::set_num_kv_heads(int num_heads) { + num_kv_heads = num_heads; +} + +void FFModel::set_qkv_dim(int qkv) { + qkv_dim = qkv; +} + +void FFModel::set_size_dt(int dt) { + printf("Setting size_dt to %d\n", dt); + size_dt = dt; +} + void FFModel::set_position_offset(int offset) { assert(offset == 0 || offset == 2); position_offset = offset; @@ -535,7 +542,7 @@ void FFModel::compile_inference() { deserialize_graph_optimal_view(dez, best_graph, optimal_views); operators.clear(); convert_graph_to_operators(best_graph, optimal_views); - best_graph->print_dot(); + // best_graph->print_dot(); delete best_graph; for (auto const &layer : layers) { // map inputs to parallel tensor @@ -656,6 +663,7 @@ void FFModel::compile_inference() { false /*must*/, 0 /*mapper_id*/, view.hash() /*MappingTagID*/); + index_launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, index_launcher); fm.wait_all_results(); int idx = 0; @@ -691,4 +699,87 @@ std::string join_path(std::vector const &paths) { return joined; } +void EmissionMachine::wait_until_next_request() { + // use last_request_time to determine the next request time + // and sleep until then + if (last_request_time_ms == 0) { + last_request_time_ms = Realm::Clock::current_time_in_microseconds() * 1e-3; + } + double current_time = Realm::Clock::current_time_in_microseconds() * 1e-3; + double time_to_sleep = + get_next_interval_ms() - (current_time - last_request_time_ms); + if (time_to_sleep > 0) { + usleep(static_cast(time_to_sleep * 1e3)); + elapsed_time_ms += time_to_sleep; + } + last_request_time_ms = Realm::Clock::current_time_in_microseconds() * 1e-3; +} + +double EmissionMachine::get_elapsed_time_ms() { + return elapsed_time_ms; +} + +EmissionTrace::EmissionTrace(json const 
&json_obj) { + prompt = json_obj["prompt"].get(); + input_length = json_obj["input_length"].get(); + output_length = json_obj["output_length"].get(); + slo_ratio = json_obj["slo_ratio"].get(); + emission_time_ms = json_obj["emission_time_ms"].get(); +} + +json EmissionTrace::to_json() const { + json json_obj; + json_obj["prompt"] = prompt; + json_obj["input_length"] = input_length; + json_obj["output_length"] = output_length; + json_obj["slo_ratio"] = slo_ratio; + json_obj["emission_time_ms"] = emission_time_ms; + return json_obj; +} + +double ConstantEmissionMachine::get_next_interval_ms() { + return interval_ms; +} + +double PoissonEmissionMachine::get_next_interval_ms() { + // Note that these are static so multiple instances will share the same + // generator and distribution. + static std::default_random_engine generator( + std::chrono::system_clock::now().time_since_epoch().count()); + static std::exponential_distribution distribution(lambda); + return distribution(generator) * 1e3; +} + +double TraceEmissionMachine::get_next_interval_ms() { + if (timestamps.empty()) { + return 0; + } + double next_interval = timestamps[idx] - elapsed_time_ms; + idx++; + return next_interval; +} + +double EmissionMachine::sample_slo_ratio() { + assert(!slo_ratios.empty()); + static std::default_random_engine generator( + std::chrono::system_clock::now().time_since_epoch().count()); + static std::uniform_real_distribution distribution(0.0, 1.0); + double r = distribution(generator); + + for (auto const &pair : slo_ratios) { + if (r < pair.second) { + return pair.first; + } + } + return slo_ratios.back().first; +} + +double TraceEmissionMachine::sample_slo_ratio() { + // NOTE: Should be called before wait_until_next_request. + if (ratios.empty()) { + return 1.0; + } + double next_slo_ratio = ratios[idx]; + return next_slo_ratio; +} }; // namespace FlexFlow diff --git a/src/runtime/layer.cc b/src/runtime/layer.cc index 8f33f6db8..72e71688c 100644 --- a/src/runtime/layer.cc +++ b/src/runtime/layer.cc @@ -87,6 +87,11 @@ void Layer::add_int_vector_property(std::string const &key, int_vector_properties[key] = value; } +void Layer::add_string_property(std::string const &key, + std::string const &value) { + string_properties[key] = value; +} + void Layer::add_initializer(std::string const &key, Initializer *initializer) { initializers[key] = initializer; } @@ -125,6 +130,18 @@ bool Layer::get_int_vector_property(std::string const &key, } } +bool Layer::get_string_property(std::string const &key, + std::string &value) const { + auto const &it = string_properties.find(key); + if (it == string_properties.end()) { + assert(false); + return false; + } else { + value = it->second; + return true; + } +} + bool Layer::get_initializer(std::string const &key, Initializer *&initializer) const { auto const &it = initializers.find(key); diff --git a/src/runtime/memory_allocator.cc b/src/runtime/memory_allocator.cc index 06a7c468a..46bef18c8 100644 --- a/src/runtime/memory_allocator.cc +++ b/src/runtime/memory_allocator.cc @@ -14,6 +14,7 @@ */ #include "flexflow/utils/memory_allocator.h" +#include "flexflow/mapper.h" namespace FlexFlow { @@ -21,14 +22,30 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; using Realm::RegionInstance; +using namespace Legion; +using namespace Mapping; + +Legion::Logger log_ff_mem_allocator("MemoryAllocator"); MemoryAllocator::MemoryAllocator(Memory _memory) : memory(_memory), reserved_ptr(nullptr), instance_ptr(nullptr), reserved_total_size(0), reserved_allocated_size(0), - 
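The EmissionMachine classes above drive request arrival for the trace generator: ConstantEmissionMachine waits a fixed interval, PoissonEmissionMachine draws exponentially distributed inter-arrival gaps (in seconds, scaled to milliseconds), TraceEmissionMachine replays recorded timestamps, and SLO ratios are drawn from a table of (ratio, cumulative threshold) pairs. A small self-contained sketch of the two sampling ideas (illustration only):

#include <random>
#include <utility>
#include <vector>

// Exponential inter-arrival gaps: with rate lambda (requests per second),
// the next gap is Exp(lambda) seconds; scale by 1e3 for milliseconds, as in
// PoissonEmissionMachine::get_next_interval_ms.
double next_interval_ms(double lambda, std::mt19937 &rng) {
  std::exponential_distribution<double> gap(lambda);
  return gap(rng) * 1e3;
}

// SLO ratio sampling: draw u uniformly in [0, 1) and return the first ratio
// whose threshold exceeds u, falling back to the last entry.
double sample_slo_ratio(std::vector<std::pair<double, double>> const &slo_ratios,
                        std::mt19937 &rng) {
  std::uniform_real_distribution<double> unif(0.0, 1.0);
  double u = unif(rng);
  for (auto const &p : slo_ratios) {
    if (u < p.second) {
      return p.first;
    }
  }
  return slo_ratios.back().first;
}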
instance_total_size(0), instance_allocated_size(0) {} + instance_total_size(0), instance_allocated_size(0), + log_instance_creation(false) { + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + for (int i = 1; i < argc; i++) { + if (!strcmp(argv[i], "--log-instance-creation")) { + log_instance_creation = true; + break; + } + } +} void MemoryAllocator::create_legion_instance(RegionInstance &inst, - size_t size) { + size_t size, + char const *task_name) { // Assert that we have used up previously created region instance assert(instance_total_size == instance_allocated_size); Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), @@ -38,6 +55,16 @@ void MemoryAllocator::create_legion_instance(RegionInstance &inst, Realm::RegionInstance::create_instance( inst, memory, bounds, field_sizes, 0, Realm::ProfilingRequestSet()) .wait(); + if (log_instance_creation) { + log_ff_mem_allocator.print( + "Created instance in memory_kind: %s memory_id: %llx size: %zu " + "(capacity %lu) task_name: %s", + Legion::Mapping::Utilities::to_string(memory.kind()), + memory.id, + size, + memory.capacity(), + ((task_name != NULL) ? task_name : "unknown")); + } instance_ptr = inst.pointer_untyped(0, 0); instance_total_size = size; instance_allocated_size = 0; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 1fa281777..2a72029c5 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -30,7 +30,7 @@ #include "flexflow/ops/attention.h" #include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/batch_norm.h" -#include "flexflow/ops/beam_topk.h" +// #include "flexflow/ops/beam_topk.h" #include "flexflow/ops/cache.h" #include "flexflow/ops/cast.h" #include "flexflow/ops/concat.h" @@ -44,6 +44,7 @@ #include "flexflow/ops/fused.h" #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" +#include "flexflow/ops/gumbel_topk.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" @@ -82,8 +83,8 @@ namespace FlexFlow { using namespace Legion; -LegionRuntime::Logger::Category log_model("Model"); -LegionRuntime::Logger::Category log_measure("measure"); +Legion::Logger log_model("Model"); +Legion::Logger log_measure("measure"); Op::Op(FFModel &model, OperatorType otype, @@ -93,10 +94,10 @@ Op::Op(FFModel &model, int numWeights, bool allocate_weights, int numOutputs, - const ParallelTensor input1, - const ParallelTensor input2, - const ParallelTensor input3, - const ParallelTensor input4) + ParallelTensor const input1, + ParallelTensor const input2, + ParallelTensor const input3, + ParallelTensor const input4) : Op(model, otype, dtype, @@ -116,10 +117,10 @@ Op::Op(FFModel &model, int _numInputs, int _numWeights, int _numOutputs, - const ParallelTensor _input1, - const ParallelTensor _input2, - const ParallelTensor _input3, - const ParallelTensor _input4) + ParallelTensor const _input1, + ParallelTensor const _input2, + ParallelTensor const _input3, + ParallelTensor const _input4) : op_type(_otype), data_type(_dtype), op_guid(model.op_global_guid++), numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs), profiling(model.config.profiling), @@ -604,6 +605,29 @@ ncclComm_t Op::init_nccl_comms_task(Task const *task, checkNCCL(ncclCommInitRank(&ncclComm, allRanks, ncclId, myRank)); // fprintf(stderr, "ncclComm(%p) allRanks(%d) myRank(%d) ncclId(%p)\n", // ncclComm, allRanks, myRank, ncclId); + + // Double check that we already 
enabled P2P access between all GPUs + for (int i = 0; i < allRanks; i++) { + if (i == myRank) { + continue; + } + cudaError_t err = cudaDeviceEnablePeerAccess(i, 0); + if (err == cudaSuccess) { + printf("P2P access successfully enabled between GPU %d and GPU %d\n", + myRank, + i); + } else if (err == cudaErrorPeerAccessAlreadyEnabled) { + printf("P2P access is already enabled between GPU %d and GPU %d\n", + myRank, + i); + } else { + printf("Failed to enable P2P access between GPU %d and GPU %d: %s\n", + myRank, + i, + cudaGetErrorString(err)); + assert(false && "Failed to enable P2P access"); + } + } return ncclComm; } @@ -1035,9 +1059,9 @@ void Op::register_output_parallel_dims( operation); } -int Op::get_output_to_input_dim_mapping(const ParallelTensor output, +int Op::get_output_to_input_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor input) { + ParallelTensor const input) { int output_idx = -1, input_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -1070,9 +1094,9 @@ int Op::get_output_to_input_dim_mapping(const ParallelTensor output, return -1; } -int Op::get_output_to_weight_dim_mapping(const ParallelTensor output, +int Op::get_output_to_weight_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor weight) { + ParallelTensor const weight) { int output_idx = -1, weight_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -1242,12 +1266,15 @@ void Op::set_argumentmap_for_init_inference(FFModel const &ff, #define DIMFUNC(DIM) \ case DIM: { \ Rect rect = domain; \ - int idx = 0; \ + int idx = 0, num_devices = rect.volume(); \ for (PointInRectIterator it(rect); it(); it++) { \ FFHandler handle = ff.handlers[view.get_device_id(*it)]; \ if (op_type == OP_ALLREDUCE) { \ ncclComm_t *nccl_comms = ff.find_nccl_comms(view); \ - handle.ncclComm = nccl_comms[idx++]; \ + handle.ncclComm = nccl_comms[idx]; \ + handle.num_devices = num_devices; \ + handle.device_id = idx; \ + idx++; \ } \ argmap.set_point(*it, TaskArgument(&handle, sizeof(FFHandler))); \ } \ @@ -1589,41 +1616,47 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) model_id = model_counter++; } +#ifdef FF_USE_NCCL +void FFModel::finish_nccl_comms() { + Context ctx = config.lg_ctx; + Runtime *runtime = config.lg_hlr; + for (auto const &comm : view_hash_to_nccl_comms) { + // Find the machine view that has the hash + MachineView view; + for (size_t l = 0; l < operators.size(); l++) { + view = operators[l]->outputs[0]->machine_view; + if (view.hash() == comm.first) { + break; + } + } + assert(view.hash() == comm.first && "Cannot find the machine view"); + IndexSpace task_is = get_or_create_task_is(view); + Domain domain = runtime->get_index_space_domain(ctx, task_is); + ArgumentMap argmap; + int idx = 0; + for (Domain::DomainPointIterator it(domain); it; it++, idx++) { + argmap.set_point(*it, + TaskArgument(&comm.second[idx], sizeof(ncclComm_t))); + } + IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, + task_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + comm.first); + FutureMap fm = runtime->execute_index_space(ctx, index_launcher); + fm.wait_all_results(); + } +} +#endif + FFModel::~FFModel() { // Destroy nccl communication groups #ifdef FF_USE_NCCL if (config.computationMode == COMP_MODE_TRAINING) { - Context ctx = config.lg_ctx; - Runtime *runtime = config.lg_hlr; - for (auto const &comm : view_hash_to_nccl_comms) { - // Find the machine view that has 
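The NCCL initialization above now force-enables peer-to-peer access between every GPU pair and asserts if enabling fails. A defensive variant of the same idea (a sketch, not the patch's code) first asks the driver whether P2P is possible so unsupported pairs are reported instead of aborting:

#include <cassert>
#include <cstdio>
#include <cuda_runtime.h>

// Enable P2P from the current device to every other visible device,
// tolerating the already-enabled case and reporting pairs that cannot peer.
void enable_peer_access(int my_device, int num_devices) {
  for (int peer = 0; peer < num_devices; peer++) {
    if (peer == my_device) {
      continue;
    }
    int can_access = 0;
    cudaDeviceCanAccessPeer(&can_access, my_device, peer);
    if (!can_access) {
      printf("GPU %d cannot access GPU %d peer-to-peer\n", my_device, peer);
      continue;
    }
    cudaError_t err = cudaDeviceEnablePeerAccess(peer, 0);
    assert(err == cudaSuccess || err == cudaErrorPeerAccessAlreadyEnabled);
  }
}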
the hash - MachineView view; - for (size_t l = 0; l < operators.size(); l++) { - view = operators[l]->outputs[0]->machine_view; - if (view.hash() == comm.first) { - break; - } - } - assert(view.hash() == comm.first && "Cannot find the machine view"); - IndexSpace task_is = get_or_create_task_is(view); - Domain domain = runtime->get_index_space_domain(ctx, task_is); - ArgumentMap argmap; - int idx = 0; - for (Domain::DomainPointIterator it(domain); it; it++, idx++) { - argmap.set_point(*it, - TaskArgument(&comm.second[idx], sizeof(ncclComm_t))); - } - IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, - task_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - comm.first); - FutureMap fm = runtime->execute_index_space(ctx, index_launcher); - fm.wait_all_results(); - } + finish_nccl_comms(); } #endif } @@ -1706,7 +1739,7 @@ Tensor FFModel::create_tensor(int numdim, } ParallelTensor FFModel::create_parallel_tensor(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1739,7 +1772,7 @@ Tensor FFModel::create_tensor_legion_ordering(int numdim, ParallelTensor FFModel::create_parallel_tensor_legion_ordering(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1789,7 +1822,7 @@ Tensor FFModel::create_tensor(int const dims[], } template -ParallelTensor FFModel::create_parallel_tensor(const ParallelDim dims[], +ParallelTensor FFModel::create_parallel_tensor(ParallelDim const dims[], DataType data_type, Op const *owner_op, int owner_idx, @@ -1870,7 +1903,7 @@ Parameter FFModel::create_weight(int numdim, } template -ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], +ParallelParameter FFModel::create_parallel_weight(ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1901,7 +1934,7 @@ ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], } ParallelParameter FFModel::create_parallel_weight(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1921,7 +1954,7 @@ ParallelParameter FFModel::create_parallel_weight(int numdim, ParallelParameter FFModel::create_parallel_weight_legion_ordering( int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -2135,7 +2168,7 @@ void FFModel::map_weight_with_dim(ParallelTensor weight, } bool FFModel::get_parallel_tensor_from_tensor( - const Tensor tensor, ParallelTensor ¶llel_tensor) const { + Tensor const tensor, ParallelTensor ¶llel_tensor) const { // check if tensor->parallel_tensor is already set if (tensor->parallel_tensor != nullptr) { parallel_tensor = tensor->parallel_tensor; @@ -2172,7 +2205,7 @@ bool FFModel::get_parallel_tensor_from_tensor( } void FFModel::create_disjoint_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], IndexSpace const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -2195,7 +2228,7 @@ void FFModel::create_disjoint_partition(int num_dims, template void FFModel::create_disjoint_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], IndexSpaceT const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -2228,7 +2261,7 @@ void FFModel::create_disjoint_partition_with_dim2( } void FFModel::create_aliased_partition(int num_dims, - const ParallelDim dims[], + 
ParallelDim const dims[], int aliased_dim, IndexSpace const &part_is, LogicalRegion const ®ion, @@ -2252,7 +2285,7 @@ void FFModel::create_aliased_partition(int num_dims, template void FFModel::create_aliased_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, IndexSpaceT const &part_is, LogicalRegion const ®ion, @@ -2289,7 +2322,7 @@ void FFModel::create_aliased_partition_with_dim2( } template -void FFModel::create_disjoint_partition(const ParallelTensor tensor, +void FFModel::create_disjoint_partition(ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2337,7 +2370,7 @@ void FFModel::create_disjoint_partition(const ParallelTensor tensor, template void FFModel::create_data_parallel_partition_with_diff_dims( - const ParallelTensor tensor, + ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2719,7 +2752,7 @@ IndexSpace FFModel::get_task_is(ParallelConfig const &pc) const { return get_task_is(view); } -IndexSpace FFModel::get_or_create_task_is(const ParallelTensor tensor) { +IndexSpace FFModel::get_or_create_task_is(ParallelTensor const tensor) { MachineView view; view.ndims = 0; for (int i = 0; i < tensor->num_dims; i++) { @@ -3241,16 +3274,21 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } - case OP_ARG_TOPK: { - Op *op = ArgTopK::create_operator_from_layer(*this, layer, inputs); + case OP_GUMBEL_TOPK: { + Op *op = GumbelTopK::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); return op; } - case OP_BEAM_TOPK: { - Op *op = BeamTopK::create_operator_from_layer(*this, layer, inputs); + case OP_ARG_TOPK: { + Op *op = ArgTopK::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); return op; } + // case OP_BEAM_TOPK: { + // Op *op = BeamTopK::create_operator_from_layer(*this, layer, inputs); + // operators.push_back(op); + // return op; + // } case OP_SAMPLING: { Op *op = Sampling::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -3308,7 +3346,7 @@ bool FFModel::is_mlp_block(int layer_idx) const { } void FFModel::create_operators_from_layers() { - std::map tensors_to_parallel_tensors; + std::map tensors_to_parallel_tensors; // for (auto const &l : layers) { for (int layer_idx = 0; layer_idx < layers.size(); layer_idx++) { auto const &l = layers[layer_idx]; @@ -3321,10 +3359,14 @@ void FFModel::create_operators_from_layers() { } Op *op = nullptr; // add a combine before arg_topk + // if (config.computationMode == COMP_MODE_INFERENCE && + // config.tensor_parallelism_degree > 1 && + // (l->op_type == OP_ARG_TOPK || l->op_type == OP_SOFTMAX || + // l->op_type == OP_ARGMAX || l->op_type == OP_GUMBEL_TOPK)) { if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && - (l->op_type == OP_ARG_TOPK || l->op_type == OP_SOFTMAX || - l->op_type == OP_ARGMAX)) { + (l->op_type == OP_SOFTMAX || l->op_type == OP_ARGMAX || + l->op_type == OP_GUMBEL_TOPK)) { std::vector partitioned_inputs; assert(inputs.size() == 1); Combine *comb = new Combine(*this, @@ -3351,6 +3393,7 @@ void FFModel::create_operators_from_layers() { config.tensor_parallelism_degree > 1 && (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || // mlp layer is_mlp_block(layer_idx) || // llama mlp layer @@ -4061,39 +4104,40 @@ 
void FFIterationConfig::reset() { // Default Config Parameters struct DefaultConfig { - const static int epochs = 1; + static int const epochs = 1; // const static int iterations = 1; - const static int batchSize = 64; - const static bool profiling = false; - const static bool benchmarking = false; - const static bool inference_debugging = false; + static int const batchSize = 64; + static bool const log_instance_creation = false; + static bool const profiling = false; + static bool const benchmarking = false; + static bool const inference_debugging = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; - const static size_t workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB - const static int numNodes = 1; - const static int workersPerNode = 0; - const static int cpusPerNode = 0; - const static size_t searchBudget = -1; - const static size_t simulatorWorkSpaceSize = + static size_t const workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB + static int const numNodes = 1; + static int const workersPerNode = 0; + static int const cpusPerNode = 0; + static size_t const searchBudget = -1; + static size_t const simulatorWorkSpaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2 GB constexpr static float searchAlpha = 1.2f; - const static bool searchOverlapBackwardUpdate = false; - const static size_t offloadReserveSpaceSize = + static bool const searchOverlapBackwardUpdate = false; + static size_t const offloadReserveSpaceSize = (size_t)8 * 1024 * 1024 * 1024; // 8 GB - const static bool cpuOffload = false; - const static bool onlyDataParallel = true; - const static bool enableSampleParallel = true; - const static bool enableParameterParallel = false; - const static bool enableAttributeParallel = false; - const static bool enableInplaceOptimizations = false; - const static bool allowTensorOpMathConversion = false; - const static int machine_model_version = 0; - const static int simulator_segment_size = 16777216; // 16 MB - const static int simulator_max_num_segments = 1; - const static int base_optimize_threshold = 10; - const static bool enable_control_replication = true; + static bool const cpuOffload = false; + static bool const onlyDataParallel = true; + static bool const enableSampleParallel = true; + static bool const enableParameterParallel = false; + static bool const enableAttributeParallel = false; + static bool const enableInplaceOptimizations = false; + static bool const allowTensorOpMathConversion = false; + static int const machine_model_version = 0; + static int const simulator_segment_size = 16777216; // 16 MB + static int const simulator_max_num_segments = 1; + static int const base_optimize_threshold = 10; + static bool const enable_control_replication = true; // The default python data loader type is 2 to enable control replication - const static int python_data_loader_type = 2; + static int const python_data_loader_type = 2; }; FFConfig::FFConfig() { @@ -4101,6 +4145,7 @@ FFConfig::FFConfig() { // iterations = DefaultConfig::iterations; batchSize = DefaultConfig::batchSize; profiling = DefaultConfig::profiling; + log_instance_creation = DefaultConfig::log_instance_creation; benchmarking = DefaultConfig::benchmarking; inference_debugging = DefaultConfig::inference_debugging; learningRate = DefaultConfig::learningRate; @@ -4288,6 +4333,10 @@ void FFConfig::parse_args(char **argv, int argc) { cpusPerNode = atoi(argv[++i]); continue; } + if ((!strcmp(argv[i], "--log-instance-creation"))) { + log_instance_creation = true; + continue; + } 
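The FFConfig changes here add the --log-instance-creation switch; further below, register_flexflow_internal_tasks() is reworked so that the old prepare_next_batch / beam / init / verify registrations collapse into a single get_next_batch_config task, a LOAD_WEIGHT task is added, and GumbelTopK replaces the ArgTopK/BeamTopK speculative variants. Every registration repeats the same boilerplate; a condensed sketch of that pattern for a void CPU task (hypothetical helper, assuming only the Legion registration API already used in these hunks):

#include <vector>
#include "legion.h"

using namespace Legion;

// Register a leaf CPU task either at preregistration time (static API) or
// dynamically; with control replication enabled, dynamic registration must
// stay local to each shard (global_registration = false).
template <void (*TASK_FN)(Task const *,
                          std::vector<PhysicalRegion> const &,
                          Context,
                          Runtime *)>
void register_cpu_task(Runtime *runtime, TaskID task_id, char const *name,
                       bool pre_register, bool enable_control_replication) {
  TaskVariantRegistrar registrar(task_id, name);
  registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
  registrar.set_leaf();
  if (pre_register) {
    Runtime::preregister_task_variant<TASK_FN>(registrar, name);
  } else {
    if (enable_control_replication) {
      registrar.global_registration = false;
    }
    runtime->register_task_variant<TASK_FN>(registrar);
  }
}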
if (!strcmp(argv[i], "--profiling")) { profiling = true; continue; @@ -4452,105 +4501,56 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } - // RequestManager prepare_next_batch + // RequestMang get_next_batch_config { - TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_TASK_ID, - "RequestManager Prepare Next Batch"); + TaskVariantRegistrar registrar(RM_GET_NEXT_BATCH_CONFIG_TASK_ID, + "RequestManager Get Next Batch Config"); registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); registrar.set_leaf(); if (pre_register) { Runtime::preregister_task_variant< BatchConfig, - RequestManager::prepare_next_batch_task>( - registrar, "RequestManager Prepare Next Batch Task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime->register_task_variant( - registrar); - } - } - // RequestManager prepare_next_batch_beam - { - TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, - "RequestManager Prepare Next Batch (Beam)"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - if (pre_register) { - Runtime::preregister_task_variant< - BeamSearchBatchConfig, - RequestManager::prepare_next_batch_beam_task>( - registrar, "RequestManager Prepare Next Batch (Beam) Task"); + RequestManager::get_next_batch_config_task>( + registrar, "RequestManager Get Next Batch Config Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } runtime - ->register_task_variant( + ->register_task_variant( registrar); } } - // RequestManager prepare_next_batch_init - { - TaskVariantRegistrar registrar( - RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, - "RequestManager Prepare Next Batch (Init Beam)"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); - if (pre_register) { - Runtime::preregister_task_variant< - BeamSearchBatchConfig, - RequestManager::prepare_next_batch_init_task>( - registrar, "RequestManager Prepare Next Batch (Init Beam) Task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime - ->register_task_variant( - registrar); - } - } - // RequestManager prepare_next_batch_verify + // RequestManager background serving task { - TaskVariantRegistrar registrar( - RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, - "RequestManager Prepare Next Batch (Verify)"); + TaskVariantRegistrar registrar(RM_BACKGROUND_SERVING_TASK_ID, + "RequestManager Background Serving Task"); registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - registrar.set_leaf(); + // registrar.set_leaf(); if (pre_register) { Runtime::preregister_task_variant< - TreeVerifyBatchConfig, - RequestManager::prepare_next_batch_verify_task>( - registrar, "RequestManager Prepare Next Batch (Verify) Task"); + RequestManager::background_serving_task>( + registrar, "RequestManager Background Serving Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant< - TreeVerifyBatchConfig, - RequestManager::prepare_next_batch_verify_task>(registrar); + runtime->register_task_variant( + registrar); } } - // RequestManager background serving task { - TaskVariantRegistrar registrar(RM_BACKGROUND_SERVING_TASK_ID, - "RequestManager Background Serving Task"); + TaskVariantRegistrar registrar(LOAD_WEIGHT_TASK_ID, "load_weight_task"); registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - // registrar.set_leaf(); if (pre_register) { - 
Runtime::preregister_task_variant< - RequestManager::background_serving_task>( - registrar, "RequestManager Background Serving Task"); + Runtime::preregister_task_variant( + registrar, "load_weight_task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant( + runtime->register_task_variant( registrar); } } @@ -6007,86 +6007,143 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } - // ArgTopk task + // GumbelTopk task { - TaskVariantRegistrar registrar(ARG_TOPK_INIT_TASK_ID, "ArgTopK Init"); + TaskVariantRegistrar registrar(GUMBEL_TOPK_INIT_TASK_ID, "GumbelTopK Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "ArgTopK Init Task"); + Runtime::preregister_task_variant( + registrar, "GumbelTopK Init Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant( + registrar); } } { - TaskVariantRegistrar registrar(ARG_TOPK_INF_TASK_ID, "ArgTopK Inference"); + TaskVariantRegistrar registrar(GUMBEL_TOPK_INF_TASK_ID, + "GumbelTopK Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { Runtime::preregister_task_variant( - registrar, "ArgTopK Inference Task"); + GumbelTopK::inference_task>( + registrar, "GumbelTopK Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant( - registrar); + runtime + ->register_task_variant( + registrar); } } { - TaskVariantRegistrar registrar(ARG_TOPK_INF_SPECULATIVE_TASK_ID, - "ArgTopK Speculative Inference"); + TaskVariantRegistrar registrar(GUMBEL_TOPK_INF_SPECULATIVE_TASK_ID, + "GumbelTopK Speculative Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "ArgTopK Speculative Inference Task"); + Runtime::preregister_task_variant( + registrar, "GumbelTopK Speculative Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant( + runtime->register_task_variant( registrar); } } - // BeamTopk task + // ArgTopk task { - TaskVariantRegistrar registrar(BEAM_TOPK_INIT_TASK_ID, "BeamTopK Init"); + TaskVariantRegistrar registrar(ARG_TOPK_INIT_TASK_ID, "ArgTopK Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "BeamTopK Init Task"); + Runtime::preregister_task_variant( + registrar, "ArgTopK Init Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); } } { - TaskVariantRegistrar registrar(BEAM_TOPK_INF_TASK_ID, "BeamTopK Inference"); + TaskVariantRegistrar registrar(ARG_TOPK_INF_TASK_ID, "ArgTopK Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "BeamTopK Inference Task"); + Runtime::preregister_task_variant( + registrar, "ArgTopK Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - 
runtime->register_task_variant(registrar); + runtime->register_task_variant( + registrar); } } + { + TaskVariantRegistrar registrar(ARG_TOPK_INF_SPECULATIVE_TASK_ID, + "ArgTopK Speculative Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ArgTopK Speculative Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + // BeamTopk task + // { + // TaskVariantRegistrar registrar(BEAM_TOPK_INIT_TASK_ID, "BeamTopK + // Init"); + // registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + // registrar.set_leaf(); + // if (pre_register) { + // Runtime::preregister_task_variant( + // registrar, "BeamTopK Init Task"); + // } else { + // if (enable_control_replication) { + // registrar.global_registration = false; + // } + // runtime->register_task_variant(registrar); + // } + // } + // { + // TaskVariantRegistrar registrar(BEAM_TOPK_INF_TASK_ID, "BeamTopK + // Inference"); + // registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + // registrar.set_leaf(); + // if (pre_register) { + // Runtime::preregister_task_variant( + // registrar, "BeamTopK Inference Task"); + // } else { + // if (enable_control_replication) { + // registrar.global_registration = false; + // } + // runtime + // ->register_task_variant( + // registrar); + // } + // } // Sampling task { TaskVariantRegistrar registrar(SAMPLING_INIT_TASK_ID, "Sampling Init"); @@ -6139,15 +6196,16 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( registrar, "ArgMax Inference Task Beam"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime + ->register_task_variant( + registrar); } } { @@ -6408,6 +6466,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Forward Task"); @@ -6422,6 +6481,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Inference Task"); @@ -6436,6 +6497,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_BWD_TASK_ID, "FusedOp Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Backward Task"); @@ -6643,6 +6705,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, "AllReduce Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Inference Task"); @@ -6657,6 
+6721,10 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrentluy since they + // use ncclAllReduce internally + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Forward Task"); @@ -6671,6 +6739,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrentluy since they + // use ncclAllReduce internally if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Backward Task"); @@ -6748,6 +6818,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(SGD_UPD_NCCL_TASK_ID, "SGD NCCL Update"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "SGD NCCL Update Task"); @@ -6762,6 +6834,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ADAM_UPD_NCCL_TASK_ID, "Adam NCCL Update"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "Adam NCCL Update Task"); @@ -6898,6 +6972,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, "NCCL Init Communicators"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "NCCL Init Communicators Task"); @@ -6914,6 +6990,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, "NCCL Finish Communicators"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "NCCL Finish Communicators Task"); diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index ad2b78156..2f8631b24 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -155,7 +155,9 @@ FFHandler } else { handle.offload_reserve_space = nullptr; } - if (handle.batch_config_metadata_size > 0) { + if (handle.batch_config_metadata_size + + handle.attention_metadata->mem_size() > + 0) { // allocate memory for offload reserve space Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::GPU_FB_MEM) @@ -163,7 +165,8 @@ FFHandler .first(); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); + Realm::Point<1, coord_t>(handle.batch_config_metadata_size + + handle.attention_metadata->mem_size() - 1)); std::vector field_sizes; field_sizes.push_back(sizeof(char)); Realm::RegionInstance workspaceInst; @@ -176,12 +179,19 @@ FFHandler .wait(); handle.batch_config_metadata = workspaceInst.pointer_untyped(0, sizeof(char)); + handle.attention_metadata->assign_address( + 
static_cast(static_cast(handle.batch_config_metadata) + + handle.batch_config_metadata_size), + handle.attention_metadata->mem_size()); } else { handle.batch_config_metadata = nullptr; + handle.attention_metadata->assign_address(nullptr, 0); } // checkCUDA(hipMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; + handle.num_devices = 0; + handle.device_id = 0; #endif return handle; } diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 23b7f0efb..962d2c345 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -14,6 +14,7 @@ */ #include "flexflow/model.h" #include "flexflow/utils/cuda_helper.h" +#include namespace FlexFlow { // declare Legion names @@ -89,11 +90,30 @@ FFHandler handle.offload_reserve_space_size = info->offload_reserve_space_size; handle.quantization_type = info->quantization_type; handle.allowTensorOpMathConversion = info->allowTensorOpMathConversion; + handle.incr_attention_metadata = new AttentionMetaData(); + handle.tree_search_attention_metadata = new AttentionMetaData(); + handle.tree_verify_attention_metadata = new AttentionMetaData(); + assert(handle.incr_attention_metadata != nullptr && + "Attention metadata must be allocated"); + assert(handle.tree_search_attention_metadata != nullptr && + "Attention metadata must be allocated"); + assert(handle.tree_verify_attention_metadata != nullptr && + "Attention metadata must be allocated"); checkCUDA(cublasCreate(&handle.blas)); + checkCUDA(cublasLtCreate(&handle.blasLt)); if (handle.allowTensorOpMathConversion) { checkCUDA(cublasSetMathMode(handle.blas, CUBLAS_TENSOR_OP_MATH)); } checkCUDNN(cudnnCreate(&handle.dnn)); + handle.num_devices = 0; + handle.device_id = 0; + handle.gemm_engine = new Internal::GemmEngine(handle.blas, handle.blasLt); + // We may not use all devices, physical_device may not be successive, so we + // explicitly get the physical device id + int physical_device; + checkCUDA(cudaGetDevice(&physical_device)); + checkCUDA(cudaGetDeviceProperties(handle.gemm_engine->device_prop, + physical_device)); // #ifdef FF_USE_NCCL // checkNCCL(ncclCommInitRank(&handle.nccl, info->allRanks, info->ncclId, // info->myRank)); fprintf(stderr, "handle.nccl(%p)\n", handle.nccl); @@ -151,7 +171,12 @@ FFHandler } else { handle.offload_reserve_space = nullptr; } - if (handle.batch_config_metadata_size > 0) { + if (handle.batch_config_metadata_size + + handle.incr_attention_metadata->mem_size() + + handle.tree_search_attention_metadata->mem_size() + + handle.tree_verify_attention_metadata->mem_size() + + handle.gemm_engine->workspace_size > + 0) { // allocate memory for offload reserve space Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::GPU_FB_MEM) @@ -159,7 +184,12 @@ FFHandler .first(); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); + Realm::Point<1, coord_t>( + handle.batch_config_metadata_size + + handle.incr_attention_metadata->mem_size() + + handle.tree_search_attention_metadata->mem_size() + + handle.tree_verify_attention_metadata->mem_size() + + handle.gemm_engine->workspace_size - 1)); std::vector field_sizes; field_sizes.push_back(sizeof(char)); Realm::RegionInstance workspaceInst; @@ -172,8 +202,34 @@ FFHandler .wait(); handle.batch_config_metadata = workspaceInst.pointer_untyped(0, sizeof(char)); + handle.incr_attention_metadata->assign_address( + static_cast(static_cast(handle.batch_config_metadata) + + handle.batch_config_metadata_size), + 
handle.incr_attention_metadata->mem_size()); + handle.tree_search_attention_metadata->assign_address( + static_cast(static_cast(handle.batch_config_metadata) + + handle.batch_config_metadata_size + + handle.incr_attention_metadata->mem_size()), + handle.tree_search_attention_metadata->mem_size()); + handle.tree_verify_attention_metadata->assign_address( + static_cast(static_cast(handle.batch_config_metadata) + + handle.batch_config_metadata_size + + handle.incr_attention_metadata->mem_size() + + handle.tree_search_attention_metadata->mem_size()), + handle.tree_verify_attention_metadata->mem_size()); + handle.gemm_engine->assign_workspace( + static_cast(static_cast(handle.batch_config_metadata) + + handle.batch_config_metadata_size + + handle.incr_attention_metadata->mem_size() + + handle.tree_search_attention_metadata->mem_size() + + handle.tree_verify_attention_metadata->mem_size()), + handle.gemm_engine->workspace_size); } else { handle.batch_config_metadata = nullptr; + handle.incr_attention_metadata->assign_address(nullptr, 0); + handle.tree_search_attention_metadata->assign_address(nullptr, 0); + handle.tree_verify_attention_metadata->assign_address(nullptr, 0); + handle.gemm_engine->assign_workspace(nullptr, 0); } // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 6b2d223f5..33e945774 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -7,7 +7,7 @@ #include "flexflow/ops/attention.h" #include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/batch_norm.h" -#include "flexflow/ops/beam_topk.h" +// #include "flexflow/ops/beam_topk.h" #include "flexflow/ops/cache.h" #include "flexflow/ops/cast.h" #include "flexflow/ops/concat.h" @@ -19,6 +19,7 @@ #include "flexflow/ops/flat.h" #include "flexflow/ops/gather.h" #include "flexflow/ops/groupby.h" +#include "flexflow/ops/gumbel_topk.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" @@ -129,6 +130,8 @@ tl::optional get_op_parameters(Op const *op) { return ((Split *)op)->get_params(); case OP_TOPK: return ((TopK *)op)->get_params(); + case OP_GUMBEL_TOPK: + return ((GumbelTopK *)op)->get_params(); case OP_GROUP_BY: return ((Group_by *)op)->get_params(); case OP_AGGREGATE: @@ -141,8 +144,8 @@ tl::optional get_op_parameters(Op const *op) { return ((ResidualRMSNorm *)op)->get_params(); case OP_ARG_TOPK: return ((ArgTopK *)op)->get_params(); - case OP_BEAM_TOPK: - return ((BeamTopK *)op)->get_params(); + // case OP_BEAM_TOPK: + // return ((BeamTopK *)op)->get_params(); case OP_SAMPLING: return ((Sampling *)op)->get_params(); case OP_ARGMAX: diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index c42a0c9aa..96b735803 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -311,7 +311,7 @@ void SGDOptimizer::nccl_update_task(Task const *task, } } - nccl_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr); + nccl_update_task_gpu(ctx, runtime, op, meta, w_grad_ptr, size, w_ptr, v_ptr); } #endif @@ -603,7 +603,8 @@ void AdamOptimizer::nccl_update_task(Task const *task, } } - nccl_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); + nccl_update_task_gpu( + ctx, runtime, op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); } #endif diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index e71adc87a..a33ee35de 100644 --- a/src/runtime/optimizer_kernel.cpp +++ 
b/src/runtime/optimizer_kernel.cpp @@ -21,7 +21,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_optimizer("optimizer"); +Legion::Logger log_optimizer("optimizer"); __global__ void sgd_update(size_t count, float lr, @@ -86,7 +86,9 @@ __host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, } #ifdef FF_USE_NCCL -__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, +__host__ void SGDOptimizer::nccl_update_task_gpu(Legion::Context ctx, + Legion::Runtime *runtime, + SGDOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -96,6 +98,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(w_grad_ptr, (float *)w_grad_ptr, size, @@ -103,6 +106,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, ncclSum, meta->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // Step 2: SGD update @@ -208,7 +212,9 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, } #ifdef FF_USE_NCCL -__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, +__host__ void AdamOptimizer::nccl_update_task_gpu(Legion::Context ctx, + Legion::Runtime *runtime, + AdamOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -218,6 +224,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // Use NCCL to sync gradients hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(w_grad_ptr, (float *)w_grad_ptr, size, @@ -225,6 +232,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, ncclSum, meta->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam update @@ -247,4 +255,4 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, } #endif -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index 5f654fbb5..6bc3d52b2 100644 --- a/src/runtime/optimizer_kernel.cu +++ b/src/runtime/optimizer_kernel.cu @@ -20,7 +20,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_optimizer("optimizer"); +Legion::Logger log_optimizer("optimizer"); __global__ void sgd_update(size_t count, float lr, @@ -75,7 +75,9 @@ __host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, } #ifdef FF_USE_NCCL -__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, +__host__ void SGDOptimizer::nccl_update_task_gpu(Legion::Context ctx, + Legion::Runtime *runtime, + SGDOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -85,6 +87,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(w_grad_ptr, (float *)w_grad_ptr, size, @@ -92,6 +95,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, ncclSum, meta->handle.ncclComm, 
stream)); + runtime->concurrent_task_barrier(ctx); // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); @@ -183,7 +187,9 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, } #ifdef FF_USE_NCCL -__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, +__host__ void AdamOptimizer::nccl_update_task_gpu(Legion::Context ctx, + Legion::Runtime *runtime, + AdamOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -193,6 +199,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // Use NCCL to sync gradients cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(w_grad_ptr, (float *)w_grad_ptr, size, @@ -200,6 +207,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, ncclSum, meta->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam update diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc new file mode 100644 index 000000000..7fbb16bcd --- /dev/null +++ b/src/runtime/page_manager.cc @@ -0,0 +1,246 @@ +/* Copyright 2023 CMU, Stanford, Facebook, LANL + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/page_manager.h" + +namespace FlexFlow { + +// For all runtime functions, they share a single page manager for pages +// information +PageManager *page_manager_singleton = nullptr; + +// the interface of logicaltokenblock +LogicalTokenBlock::LogicalTokenBlock(int block_number, uint32_t block_size) + : block_number(block_number), block_size(block_size), num_tokens(0), + num_commit_tokens(0), num_spec_tokens(0) {} + +bool LogicalTokenBlock::is_empty() const { + assert(num_spec_tokens == 0 && num_commit_tokens == 0); + assert(num_tokens <= block_size); + return num_tokens == 0; +} + +bool LogicalTokenBlock::is_full() const { + assert(num_spec_tokens + num_commit_tokens == num_tokens); + assert(num_tokens <= block_size); + return num_tokens == block_size; +} + +int LogicalTokenBlock::get_num_empty_slots() const { + assert(num_spec_tokens + num_commit_tokens == num_tokens); + assert(num_tokens <= block_size); + return block_size - num_tokens; +} + +int LogicalTokenBlock::get_num_alloc_slots() const { + assert(num_spec_tokens + num_commit_tokens == num_tokens); + assert(num_tokens <= block_size); + return num_tokens; +} + +void LogicalTokenBlock::reset_num_spec_tokens() { + assert(num_spec_tokens + num_commit_tokens == num_tokens); + assert(num_tokens <= block_size); + + num_tokens -= num_spec_tokens; + num_spec_tokens = 0; + + assert(num_spec_tokens + num_commit_tokens == num_tokens); + assert(num_tokens <= block_size); +} + +void LogicalTokenBlock::append_tokens( + std::vector const &token_ids_to_append, bool committed) { + assert(num_spec_tokens + num_commit_tokens == num_tokens); + assert(num_tokens <= block_size); + if (num_tokens + token_ids_to_append.size() > block_size) { + printf("block is full! Cannot append more tokens\n"); + throw std::runtime_error("Block is full! Cannot append more tokens."); + } + token_ids.insert( + token_ids.end(), token_ids_to_append.begin(), token_ids_to_append.end()); + num_tokens += token_ids_to_append.size(); + if (committed) { + num_commit_tokens += token_ids_to_append.size(); + } else { + num_spec_tokens += token_ids_to_append.size(); + } + assert(num_spec_tokens + num_commit_tokens == num_tokens); + assert(num_tokens <= block_size); +} + +std::vector LogicalTokenBlock::get_token_ids() const { + return token_ids; +} + +PhysicalTokenBlock::PhysicalTokenBlock(int block_number, int block_size) + : block_number(block_number), block_size(block_size), ref_count(0) {} + +BlockAllocator::BlockAllocator(int block_size, int num_total_blocks) { + for (int block_number = 0; block_number < num_total_blocks; ++block_number) { + free_blocks.push_back(PhysicalTokenBlock(block_number, block_size)); + } + num_total_blocks = num_total_blocks; +} + +// Allocate a block +PhysicalTokenBlock BlockAllocator::allocate() { + if (free_blocks.empty()) { + printf("no free blocks are available\n"); + throw std::runtime_error("Out of memory! No free blocks are available."); + } + PhysicalTokenBlock block = free_blocks.front(); + free_blocks.pop_front(); + block.incr_ref_count(); + return block; +} + +// Free a block +void BlockAllocator::free(PhysicalTokenBlock &block) { + if (block.ref_count == 0) { + printf("block is already freed\n"); + throw std::runtime_error("Double free! Block is already freed."); + } + block.decr_ref_count(); + if (block.ref_count == 0) { + free_blocks.push_back(block); + } else { + // in current implementation this should not be the case + printf("block is not freed. 
Ref count: %d\n", block.ref_count); + throw std::runtime_error("Block is not freed. Ref count: " + + std::to_string(block.ref_count)); + } +} + +int BlockAllocator::get_num_free_blocks() const { + return free_blocks.size(); +} + +PageManager::PageManager(int block_size, size_t num_total_blocks) + : block_size(block_size), num_total_blocks(num_total_blocks), + block_allocator(block_size, num_total_blocks) {} + +// return the physical number of this block +int PageManager::allocate_one_block(RequestGuid const &request_guid) { + BlockTable &block_table = block_tables[request_guid]; + + PhysicalTokenBlock block = block_allocator.allocate(); + block_table.push_back(block); + block_tables[request_guid] = block_table; + return block.get_block_number(); +} + +void PageManager::free_block_table(BlockTable &block_table) { + // make it reverse order to free the last allocated block first + BlockTable::reverse_iterator rit = block_table.rbegin(); + for (; rit != block_table.rend(); ++rit) { + block_allocator.free(*rit); + } + return; +} + +void PageManager::free_request(RequestGuid const &request_guid) { + // we only free the blocks that are already used + BlockTable block_table = block_tables[request_guid]; + free_block_table(block_table); + block_tables.erase(request_guid); + return; +} + +// delete the last num_blocks in the request_guid +void PageManager::free_multiple_blocks(RequestGuid const &request_guid, + int num_blocks) { + // assert(block_tables.find(request_guid) != block_tables.end()); + auto &block_table = block_tables[request_guid]; + // assert(num_blocks <= block_table.size()); + int num_blocks_allocated = block_table.size(); + for (int i = 0; i < num_blocks; i++) { + block_allocator.free(block_table[num_blocks_allocated - i - 1]); + } + // only keep the first num_blocks_allocated - num_blocks blocks + block_table.erase(block_table.begin() + num_blocks_allocated - num_blocks, + block_table.end()); + block_tables[request_guid] = block_table; + return; +} + +std::vector PageManager::get_block_table_indices( + RequestGuid const &request_guid) const { + std::vector indices; + auto const &it = block_tables.find(request_guid); + if (it == block_tables.end()) { + return indices; + } + auto const &block_table = it->second; + for (auto const &block : block_table) { + indices.push_back(block.get_block_number()); + } + return indices; +} + +int PageManager::get_num_total_free_blocks() const { + return block_allocator.get_num_free_blocks(); +} + +int PageManager::get_num_allocated_blocks( + RequestGuid const &request_guid) const { + auto it = block_tables.find(request_guid); + if (it == block_tables.end()) { + return 0; + } else { + return it->second.size(); + } +} + +PageManager *PageManager::get_page_manager(FFModel *ff, + size_t total_kv_cache_size) { + int num_kv_heads = ff->num_kv_heads; + int size_dt = ff->size_dt; + int qkv_dim = ff->qkv_dim; + int num_transformer_layers = ff->num_transformer_layers; + int pipeline_parallelism_degree = ff->config.pipeline_parallelism_degree; + assert(num_kv_heads > 0 && size_dt > 0 && qkv_dim > 0 && + num_transformer_layers > 0 && + pipeline_parallelism_degree > + 0); // needs to make sure that the model is initialized + if (page_manager_singleton == nullptr) { + size_t num_total_blocks = 0; + if (total_kv_cache_size == 0) { + num_total_blocks = (BatchConfig::max_spec_tree_token_num() + + BatchConfig::max_sequence_length() + kPagesize - 1) / + kPagesize * BatchConfig::max_requests_per_batch(); + } else { + num_total_blocks = total_kv_cache_size * 1024 * 
1024 / size_dt / qkv_dim / + num_kv_heads / num_transformer_layers / kPagesize; + } + printf("page manager singleton is initialized with %d blocks\n", + num_total_blocks); + page_manager_singleton = new PageManager(kPagesize, num_total_blocks); + page_manager_singleton->kv_cache_size_per_layer = + total_kv_cache_size * 1024 * 1024 / num_transformer_layers; + } + return page_manager_singleton; +} + +size_t PageManager::get_kv_cache_size_per_layer() { + return kv_cache_size_per_layer; +} + +PageManager *PageManager::get_page_manager() { + assert(page_manager_singleton != nullptr); + return page_manager_singleton; +} + +}; // namespace FlexFlow diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 8f1be15fd..202983e8f 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -1,4 +1,5 @@ #include "flexflow/ffconst_utils.h" +#include "flexflow/mapper.h" #include "flexflow/model.h" #include "flexflow/ops/attention.h" #include "flexflow/ops/concat.h" @@ -19,6 +20,9 @@ namespace FlexFlow { using namespace Legion; +using namespace Legion; +using namespace Mapping; +Legion::Logger pt_logger("ParallelTensor"); TensorBase::TensorBase(TensorBase const &rhs) { tensor_guid = rhs.tensor_guid; @@ -647,11 +651,41 @@ bool ParallelTensorBase::is_valid_machine_view(MachineView const &view) const { return true; } +size_t get_physical_region_size(PhysicalRegion const &pr, + Context ctx, + Runtime *runtime) { + // Get the logical region + LogicalRegion lr = pr.get_logical_region(); + + // Get the index space domain + Domain domain = runtime->get_index_space_domain(ctx, lr.get_index_space()); + + // Get number of elements in the domain + size_t num_elements = domain.get_volume(); + + // Get the field space + FieldSpace fs = lr.get_field_space(); + + // Get all fields in the field space + std::vector fields; + runtime->get_field_space_fields(ctx, fs, fields); + + // Sum up the size of all fields + size_t total_field_size = 0; + for (FieldID fid : fields) { + size_t field_size = runtime->get_field_size(ctx, fs, fid); + total_field_size += field_size; + } + + // Total size is number of elements times size of each element + return num_elements * total_field_size; +} + template bool ParallelTensorBase::set_tensor(FFModel const *ff, std::vector const &dim_sizes, T const *data) { - Context ctx = ff->config.lg_ctx; + Context ctx = Legion::Runtime::get_context(); Runtime *runtime = ff->config.lg_hlr; // TODO: check data type matches // TODO: Currently we use a task launch, change to index launch for NCCL @@ -678,6 +712,28 @@ bool ParallelTensorBase::set_tensor(FFModel const *ff, InlineLauncher launcher(req); PhysicalRegion pr = runtime->map_region(ctx, launcher); pr.wait_until_valid(); + + if (ff->config.log_instance_creation) { + size_t pr_size = get_physical_region_size(pr, ctx, runtime); + if (pr_size != volume * num_replicas * sizeof(T)) { + std::cout << "Physical region size: " << pr_size << std::endl; + std::cout << "Volume: " << volume << std::endl; + std::cout << "Num replicas: " << num_replicas << std::endl; + std::cout << "Size of T: " << sizeof(T) << std::endl; + } + assert(pr_size == volume * num_replicas * sizeof(T)); + std::set memories; + pr.get_memories(memories); + assert(memories.size() == 1); + Memory memory = *(memories.begin()); + pt_logger.print("Created instance in memory_kind: %s memory_id: %llx size: " + "%zu (capacity %lu) task_name: set_tensor", + Legion::Mapping::Utilities::to_string(memory.kind()), + memory.id, + pr_size, + 
memory.capacity()); + } + switch (num_dims) { #define DIMFUNC(DIM) \ case DIM: { \ @@ -704,7 +760,7 @@ template bool ParallelTensorBase::get_tensor(FFModel const *ff, T *data, bool get_gradients) { - Context ctx = ff->config.lg_ctx; + Context ctx = Legion::Runtime::get_context(); Runtime *runtime = ff->config.lg_hlr; LogicalRegion weight_lr = LogicalRegion::NO_REGION; if (sync_type == ParameterSyncType::PS) { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc old mode 100644 new mode 100755 index 16513e918..47c394f7e --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -14,39 +14,102 @@ */ #include "flexflow/request_manager.h" +#include "flexflow/inference.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" #include +#include +#include #include #include #include #include +#include #include #include +#include +#include +#include namespace FlexFlow { using namespace Legion; using tokenizers::Tokenizer; +using RequestGuid = BatchConfig::RequestGuid; -LegionRuntime::Logger::Category log_req_mgr("RequestManager"); +Legion::Logger log_req_mgr("RequestManager"); + +bool operator<(std::shared_ptr const &lhs, + std::shared_ptr const &rhs) { + if (lhs->gumbel) { + assert(rhs->gumbel); + return lhs->gumbel_logit < rhs->gumbel_logit; + } + return lhs->log_accumulated_prob < rhs->log_accumulated_prob; +} + +bool operator<=(std::shared_ptr const &lhs, + std::shared_ptr const &rhs) { + if (lhs->gumbel) { + assert(rhs->gumbel); + return lhs->gumbel_logit <= rhs->gumbel_logit; + } + return lhs->log_accumulated_prob <= rhs->log_accumulated_prob; +} + +void write_to_output_file(std::string const &output_filepath, + std::string const &str) { + std::ostream *os = &std::cout; + std::ofstream output_file; + if (!output_filepath.empty()) { + output_file.open(output_filepath, std::ios::app); + if (output_file.is_open()) { + os = &output_file; + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); + } + } + *os << str << std::endl; + if (!output_filepath.empty()) { + output_file.close(); + } +} std::string LoadBytesFromFile(std::string const &path) { std::ifstream fs(path, std::ios::in | std::ios::binary); - assert(!fs.fail() && "no such file"); - std::string data; + assert(fs.is_open() && "Failed to open file for reading."); fs.seekg(0, std::ios::end); - size_t size = static_cast(fs.tellg()); + size_t size = fs.tellg(); fs.seekg(0, std::ios::beg); - data.resize(size); - fs.read(data.data(), size); + std::string data(size, '\0'); + fs.read(&data[0], size); + assert(!fs.fail() && "Failed to read data from file."); return data; } +double Request::get_length_weight() { + double coeff_alpha = 128; + return log((double(tokens.size()) + coeff_alpha) / coeff_alpha); +} + +void Request::set_slo_ratio(double slo_ratio_) { + slo_ratio = slo_ratio_; +} +double Request::get_slo_ratio() { + return slo_ratio; +} + +int Request::decode_length() const { + return tokens.size() - llm_prefill_len; +} + RequestManager::RequestManager() - : request_manager_status(INITIALIZED), verbose(false), + : background_server_status(INITIALIZED), verbose(false), next_available_guid(1000000), num_processed_requests(0), - total_request_run_time(0.0f) { + total_request_run_time(0.0f), request_manager_status(PREFILLING), + decoding_mode(INCREMENTAL_DECODING), prefill_model(SSM) { // The following config parameters are set // during ffmodel.compile() // Initialize them to -1 to make sure no one @@ -54,8 +117,18 
@@ RequestManager::RequestManager() // ffmodel.compile() max_requests_per_batch = -1; max_tokens_per_batch = -1; + max_tokens_per_ssm_batch = -1; + max_tokens_per_prefilling_batch = -1; max_spec_tree_token_num = -1; max_sequence_length = -1; + max_output_length = -1; + max_kv_cache_size = 0; + max_tree_depth = -1; + max_tree_width = -1; + k = -1; + std::fill(std::begin(request_available), std::end(request_available), false); + std::fill( + std::begin(guid_of_requests), std::end(guid_of_requests), INVALID_GUID); } void RequestManager::set_max_requests_per_batch(int max_num_requests) { @@ -76,11 +149,19 @@ void RequestManager::set_max_tokens_per_batch(int max_num_tokens) { assert(max_tokens_per_batch <= BatchConfig::MAX_NUM_TOKENS); } -void RequestManager::set_max_spec_tree_token_num(int max_num_tokens) { - assert(max_spec_tree_token_num == -1 || - max_spec_tree_token_num == max_num_tokens); - max_spec_tree_token_num = max_num_tokens; - assert(max_spec_tree_token_num <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); +void RequestManager::set_max_tokens_per_ssm_batch(int max_num_ssm_tokens) { + assert(max_tokens_per_ssm_batch == -1 || + max_tokens_per_ssm_batch == max_num_ssm_tokens); + max_tokens_per_ssm_batch = max_num_ssm_tokens; + assert(max_tokens_per_ssm_batch <= BatchConfig::MAX_NUM_TOKENS); +} + +void RequestManager::set_max_tokens_per_prefilling_batch( + int max_num_prefilling_tokens) { + assert(max_tokens_per_prefilling_batch == -1 || + max_tokens_per_prefilling_batch == max_num_prefilling_tokens); + max_tokens_per_prefilling_batch = max_num_prefilling_tokens; + assert(max_tokens_per_prefilling_batch <= BatchConfig::MAX_NUM_TOKENS); } int RequestManager::get_max_tokens_per_batch() { @@ -88,17 +169,21 @@ int RequestManager::get_max_tokens_per_batch() { return max_tokens_per_batch; } +int RequestManager::get_max_tokens_per_ssm_batch() { + assert(max_tokens_per_ssm_batch > 0); + return max_tokens_per_ssm_batch; +} + +int RequestManager::get_max_tokens_per_prefilling_batch() { + assert(max_tokens_per_prefilling_batch > 0); + return max_tokens_per_prefilling_batch; +} + int RequestManager::get_max_spec_tree_token_num() { assert(max_spec_tree_token_num > 0); return max_spec_tree_token_num; } -int RequestManager::get_max_verify_tokens_per_batch() { - assert(max_tokens_per_batch > 0); - return max_tokens_per_batch + - max_spec_tree_token_num * max_requests_per_batch; -} - void RequestManager::set_max_sequence_length(int max_seq_length) { assert(max_sequence_length == -1 || max_sequence_length == max_seq_length); max_sequence_length = max_seq_length; @@ -109,45 +194,306 @@ int RequestManager::get_max_sequence_length() { return max_sequence_length; } -void RequestManager::push_spec_infer_tree_width(int tree_width) { - assert(tree_width <= BeamSearchBatchConfig::MAX_BEAM_WIDTH); - spec_infer_tree_width.emplace_back(tree_width); +void RequestManager::set_max_output_length(int max_output_length) { + assert(max_output_length > 0); + this->max_output_length = max_output_length; +} + +int RequestManager::get_max_output_length() { + assert(max_output_length > 0); + return max_output_length; +} + +void RequestManager::set_max_kv_cache_size(size_t max_kv_cache_size) { + this->max_kv_cache_size = max_kv_cache_size; +} + +size_t RequestManager::get_max_kv_cache_size() { + return max_kv_cache_size; +} + +void RequestManager::set_decoding_mode(DecodingMode mode) { + assert(mode == INCREMENTAL_DECODING || mode == SPECULATIVE_DECODING); + decoding_mode = mode; +} + +void RequestManager::set_verbose(bool verbose_) { 
+ verbose = verbose_; +} + +int RequestManager::get_k() { + assert(k > 0 and k <= BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES and + "Invalid k"); + return k; +} + +void RequestManager::set_k(int _k) { + assert(_k > 0 and _k <= BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES and + "Invalid k"); + k = _k; +} + +int RequestManager::get_max_tree_depth() { + assert(max_tree_depth > 0 and + max_tree_depth <= BatchConfig::MAX_TREE_DEPTH and + "Invalid max_tree_depth"); + return max_tree_depth; +} + +void RequestManager::set_max_tree_depth(int max_tree_depth) { + assert(max_tree_depth > 0 and + max_tree_depth <= BatchConfig::MAX_TREE_DEPTH and + "Invalid max_tree_depth"); + this->max_tree_depth = max_tree_depth; + if (max_tree_width > 0) { + // 8 is k of topk, if max_tree_width <= k, we will fill the second level + max_spec_tree_token_num = + max_tree_depth * max_tree_width + (max_tree_width <= 8); + assert(max_spec_tree_token_num <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); + } +} + +int RequestManager::get_max_tree_width() { + assert(max_tree_width > 0 and + max_tree_width <= BatchConfig::MAX_TREE_WIDTH and + "Invalid max_tree_width"); + return max_tree_width; +} + +void RequestManager::set_max_tree_width(int max_tree_width) { + assert(max_tree_width > 0 and + max_tree_width <= BatchConfig::MAX_TREE_WIDTH and + "Invalid max_tree_width"); + this->max_tree_width = max_tree_width; + if (max_tree_depth > 0) { + // 8 is k of topk, if max_tree_width <= k, we will fill the second level + max_spec_tree_token_num = + max_tree_depth * max_tree_width + (max_tree_width <= 8); + assert(max_spec_tree_token_num <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); + } +} + +int RequestManager::get_expansion_degree() { + assert(expansion_degree > 0 and + expansion_degree <= BatchConfig::MAX_TREE_WIDTH and + "Invalid expansion_degree"); + return expansion_degree; +} +void RequestManager::set_expansion_degree(int expansion_degree_) { + assert(expansion_degree > 0 and + expansion_degree <= BatchConfig::MAX_TREE_WIDTH and + "Invalid expansion_degree"); + this->expansion_degree = expansion_degree_; +} + +void RequestManager::set_speculative_sampling(bool speculative_sampling_) { + speculative_sampling = speculative_sampling_; +} + +void RequestManager::set_baseline_latency(double baseline_latency_ms_) { + baseline_latency_ms = baseline_latency_ms_; +} + +double RequestManager::get_baseline_latency() { + return baseline_latency_ms; +} + +void RequestManager::set_ssm_spec_latency(double ssm_spec_latency_ms_) { + ssm_spec_latency_ms = ssm_spec_latency_ms_; +} + +double RequestManager::get_ssm_spec_latency() { + return ssm_spec_latency_ms; +} + +void RequestManager::set_llm_verify_latency(double llm_verify_latency_ms_) { + llm_verify_latency_ms = llm_verify_latency_ms_; +} + +double RequestManager::get_llm_verify_latency() { + return llm_verify_latency_ms; +} + +void RequestManager::set_correction_factor(double correction_factor_) { + correction_factor = correction_factor_; +} + +double RequestManager::get_correction_factor() { + return correction_factor; +} + +void RequestManager::set_streaming_cache(bool streaming_cache_) { + streaming_cache = streaming_cache_; +} + +bool RequestManager::get_streaming_cache() { + return streaming_cache; +} + +bool RequestManager::get_memory_occupancy() { + return memory_occupancy; +} + +void RequestManager::set_memory_occupancy(bool memory_occupancy_) { + memory_occupancy = memory_occupancy_; +} + +void RequestManager::set_slo_violation_early_termination( + bool slo_violation_early_termination_) { + 
slo_violation_early_termination = slo_violation_early_termination_; +} + +void RequestManager::set_spec_infer_old_version(bool spec_infer_old_version_) { + spec_infer_old_version = spec_infer_old_version_; +} + +void RequestManager::set_greedy_schedule(bool greedy_schedule_) { + greedy_schedule = greedy_schedule_; +} + +void RequestManager::set_equal_schedule(bool equal_schedule_) { + equal_schedule = equal_schedule_; +} + +void RequestManager::set_fcfs_slo(bool fcfs_slo_) { + fcfs_slo = fcfs_slo_; +} + +void RequestManager::set_stta(bool stta_) { + stta = stta_; +} + +bool RequestManager::get_spec_infer_old_version() { + return spec_infer_old_version; +} + +bool RequestManager::get_greedy_schedule() { + return greedy_schedule; +} + +bool RequestManager::get_equal_schedule() { + return equal_schedule; +} + +bool RequestManager::get_fcfs_slo() { + return fcfs_slo; +} + +bool RequestManager::get_stta() { + return stta; +} + +void RequestManager::set_eval_overhead_breakdown( + bool eval_overhead_breakdown_) { + eval_overhead_breakdown = eval_overhead_breakdown_; +} + +bool RequestManager::get_eval_overhead_breakdown() { + return eval_overhead_breakdown; +} + +inline double RequestManager::get_slo_constraint(Request &request) { + if (request.get_slo_ratio() < 0) { + // we use negative number to specify the absolute slo constraint (ms) + return -request.get_slo_ratio(); + } else { + // relative slo constraint upon the baseline latency + return request.get_slo_ratio() * baseline_latency_ms; + } +} + +double RequestManager::get_request_expected_latency(Request &request) { + return get_slo_constraint(request) * request.decode_length(); +} + +Request &RequestManager::get_request_with_guid(RequestGuid guid) { + assert(all_requests.find(guid) != all_requests.end() && + "Request with the given GUID does not exist."); + return all_requests[guid]; +} + +bool RequestManager::SharedTokenTreeNodePtrRequestGuidWeightedLess::operator()( + std::pair, RequestGuid> const &lhs, + std::pair, RequestGuid> const &rhs) const { + if (lhs.first->gumbel) { + assert(rhs.first->gumbel); + return lhs.first->gumbel_logit * get_request_manager() + ->get_request_with_guid(lhs.second) + .get_length_weight() < + rhs.first->gumbel_logit * get_request_manager() + ->get_request_with_guid(rhs.second) + .get_length_weight(); + } + return lhs.first->log_accumulated_prob * + get_request_manager() + ->get_request_with_guid(lhs.second) + .get_length_weight() < + rhs.first->log_accumulated_prob * + get_request_manager() + ->get_request_with_guid(rhs.second) + .get_length_weight(); +} + +bool RequestManager::SharedTokenTreeNodePtrDoubleRequestGuidLess ::operator()( + std::tuple, double, RequestGuid> const &lhs, + std::tuple, double, RequestGuid> const &rhs) + const { + return std::get<1>(lhs) < std::get<1>(rhs); } void RequestManager::register_tokenizer(ModelType type, int bos_token_id, - int eos_token_id, + std::vector eos_token_ids, std::string const &path) { this->model_type = type; this->bos_token_id = bos_token_id; - this->eos_token_id = eos_token_id; - std::string tokenizer_folder = - (!path.empty() && path.back() != '/') ? path + '/' : path; + this->eos_token_ids = eos_token_ids; + std::filesystem::path tokenizer_folder(path); + if (model_type == ModelType::LLAMA) { - bool path_to_file = !path.empty() && - (path.size() >= strlen("tokenizer.model")) && - path.find("tokenizer.model") == - (path.size() - strlen("tokenizer.model")); - std::string tokenizer_filepath = - path_to_file ? 
path : tokenizer_folder + "tokenizer.model"; - this->tokenizer_ = - Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(tokenizer_filepath)); + // try with tokenizer.json first + std::filesystem::path tokenizer_json_path; + if (std::filesystem::is_directory(tokenizer_folder)) { + tokenizer_json_path = + std::filesystem::path(tokenizer_folder) / "tokenizer.json"; + } else { + tokenizer_json_path = tokenizer_folder; + } + if (std::filesystem::exists(tokenizer_json_path)) { + // load from tokenizer.json + this->tokenizer_ = Tokenizer::FromBlobJSON( + LoadBytesFromFile(tokenizer_json_path.string())); + } else { + // load from tokenizer.model + std::filesystem::path tokenizer_model_path; + if (std::filesystem::is_directory(tokenizer_folder)) { + tokenizer_model_path = + std::filesystem::path(tokenizer_folder) / "tokenizer.model"; + } else { + tokenizer_model_path = tokenizer_folder; + } + if (!std::filesystem::exists(tokenizer_model_path)) { + std::cerr << "Failed to open file: " << tokenizer_model_path + << std::endl; + assert(false); + } + old_llama_tokenizer = true; + this->tokenizer_ = Tokenizer::FromBlobSentencePiece( + LoadBytesFromFile(tokenizer_model_path.string())); + } } else if (model_type == ModelType::OPT) { - std::string vocab_file = tokenizer_folder + "vocab.json"; - std::string merges_file = tokenizer_folder + "merges.txt"; - std::string added_tokens_file = - tokenizer_folder + "special_tokens_map.json"; - std::filesystem::path path1(vocab_file); - std::filesystem::path path2(merges_file); - std::filesystem::path path3(added_tokens_file); - assert(std::filesystem::exists(path1) && + std::filesystem::path vocab_file = tokenizer_folder / "vocab.json"; + std::filesystem::path merges_file = tokenizer_folder / "merges.txt"; + std::filesystem::path added_tokens_file = + tokenizer_folder / "special_tokens_map.json"; + assert(std::filesystem::exists(vocab_file) && "Vocab file vocab.json does not exist at the specified path"); - assert(std::filesystem::exists(path2) && + assert(std::filesystem::exists(merges_file) && "Merge file merges.txt does not exist at the specified path"); // opt_tokenizer = new OptTokenizer(vocab_file, merges_file); - std::string vocab = LoadBytesFromFile(path1.string()); - std::string merges = LoadBytesFromFile(path2.string()); - std::string added_tokens = LoadBytesFromFile(path3.string()); + std::string vocab = LoadBytesFromFile(vocab_file.string()); + std::string merges = LoadBytesFromFile(merges_file.string()); + std::string added_tokens = LoadBytesFromFile(added_tokens_file.string()); this->tokenizer_ = Tokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens); @@ -160,6 +506,10 @@ void RequestManager::register_tokenizer(ModelType type, } } +std::vector RequestManager::tokenize(std::string const &text) { + return tokenizer_->Encode(text); +} + void RequestManager::register_output_filepath( std::string const &_output_filepath) { this->output_filepath = _output_filepath; @@ -173,7 +523,7 @@ int RequestManager::register_ssm_model(FFModel *model) { } FFModel *RequestManager::get_ssm_model(int model_id) { - assert(model_id < ssm_models.size()); + assert(model_id >= 0 && model_id < ssm_models.size()); return ssm_models[model_id]; } @@ -182,27 +532,26 @@ size_t RequestManager::get_num_ssms() { } RequestManager::RequestGuid - RequestManager::register_new_request(std::vector const &prompt, - int max_sequence_length) { - const std::lock_guard lock(request_queue_mutex); - + RequestManager::register_new_request(GenerationRequest const &req) { // Add a new request 
Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - - if (prompt.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << prompt.size() << ".\n"; - - printf("tokens size: %zu\n", request.tokens.size()); - return INVALID_GUID; - } else { - request.initial_len = prompt.size(); - request.tokens = prompt; + request.add_special_tokens = req.add_special_tokens; + if (bos_token_id >= 0 && request.add_special_tokens && + model_type != ModelType::FALCON) { + request.tokens.push_back(bos_token_id); } + std::vector tokens = this->tokenizer_->Encode(req.prompt); + // for (int i = 0; i < tokens.size(); i++) { + // std::cout << "[" << i << "]" << tokens.at(i) << "\n"; + // } + // std::cout << "[slo ratio] " << req.slo_ratio << std::endl; + request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); + request.set_slo_ratio(req.slo_ratio); + printf("Registered as request[%ld] with slo %.3f ms\n", + request.guid, + get_slo_constraint(request)); if (get_num_ssms() == 0) { std::cout << "No small speculative model registered, using incremental " @@ -210,116 +559,67 @@ RequestManager::RequestGuid << std::endl; } else { std::cout << "Num of SSMs: " << get_num_ssms() << std::endl; - for (int i = 0; i < get_num_ssms(); i++) { - BeamTree beam_tree = BeamTree{}; - request.beam_trees.push_back(beam_tree); - } - } - - pending_request_queue.push(request); - all_requests[request.guid] = request; - { - const std::lock_guard lock(request_to_promise_mutex); - request_to_promise[request.guid] = new std::promise(); + assert(get_num_ssms() == 1 && "Only one SSM is supported now."); + init_token_tree(request.guid); } - if (verbose) { - std::cout << "new req: " << request.tokens.size() << std::endl; - for (int i = 0; i < request.tokens.size(); i++) { - std::cout << i << " : " << request.tokens[i] << std::endl; - } - } + request.streaming_cache_info = StreamingCacheInfo( + BatchConfig::SINK_SIZE, + BatchConfig::MAX_STREAMING_POS - BatchConfig::SINK_SIZE - + BatchConfig::get_max_tree_depth()); GenerationResult gr; gr.guid = request.guid; - gr.input_text = ""; - gr.input_tokens = prompt; - gr.output_text = ""; - gr.output_tokens = prompt; - request_generation_results[request.guid] = gr; - - return request.guid; -} - -RequestManager::RequestGuid - RequestManager::register_new_request(std::string const &prompt, - int max_sequence_length) { - const std::lock_guard lock(request_queue_mutex); - // Add a new request - Request request; - request.status = Request::PENDING; - request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - if (bos_token_id >= 0 && model_type != ModelType::FALCON) { - request.tokens.push_back(bos_token_id); - } - std::vector tokens = this->tokenizer_->Encode(prompt); - if (tokens.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << tokens.size() << ".\n"; - - printf("tokens size: %zu\n", tokens.size()); - return INVALID_GUID; - } - for (int i = 0; i < tokens.size(); i++) { - std::cout << "[" << i << "]" << tokens.at(i) << "\n"; - } - request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); - request.initial_len = request.tokens.size(); + gr.input_text = req.prompt; + gr.input_tokens = request.tokens; + gr.output_text = req.prompt; + 
gr.output_tokens = request.tokens; + gr.slo_ratio = req.slo_ratio; + gr.emission_time_ms = req.emission_time_ms; + + // Record time when request was enqueued + // Step idx -2: enqueueing; step idx -1: prefilling begins, step idx 0: + // prefilling finished + NewProfileInfo new_profile_info; + new_profile_info.timestamp = Realm::Clock::current_time_in_microseconds(); + new_profile_info.request_guid = request.guid; + new_profile_info.request_step_idx = -2; + new_profiling_info.push_back(new_profile_info); - if (get_num_ssms() == 0) { - std::cout << "No small speculative model registered, using incremental " - "decoding." - << std::endl; - } else { - std::cout << "Num of SSMs: " << get_num_ssms() << std::endl; - for (int i = 0; i < get_num_ssms(); i++) { - BeamTree beam_tree = BeamTree{}; - request.beam_trees.push_back(beam_tree); - } + { + std::lock_guard const lock(request_queue_mutex); + pending_request_queue.push(request); + all_requests[request.guid] = request; } - - pending_request_queue.push(request); - all_requests[request.guid] = request; + request_queue_cv.notify_all(); { - const std::lock_guard lock(request_to_promise_mutex); + std::lock_guard const lock(request_to_promise_mutex); request_to_promise[request.guid] = new std::promise(); } + { + std::lock_guard const lock(request_result_mutex); + request_generation_results[request.guid] = gr; + } { - std::string output = "New request tokens:"; - output = "[" + std::to_string(request.guid) + "]" + output; - for (int i = 0; i < request.tokens.size(); i++) { - output = output + " " + std::to_string(request.tokens[i]); - } - log_req_mgr.print("%s", output.c_str()); + // std::string output = "New request tokens:"; + // output = "[" + std::to_string(request.guid) + "] " + output; + // for (int i = 0; i < request.tokens.size(); i++) { + // output = output + " " + std::to_string(request.tokens[i]); + // } + // log_req_mgr.print("%s", output.c_str()); + // write_to_output_file("", output); } - GenerationResult gr; - gr.guid = request.guid; - gr.input_text = prompt; - gr.input_tokens = request.tokens; - gr.output_text = prompt; - gr.output_tokens = request.tokens; - request_generation_results[request.guid] = gr; return request.guid; } -bool RequestManager::is_request_completed(RequestGuid const &guid) { - const std::lock_guard lock(request_queue_mutex); - assert(all_requests.find(guid) != all_requests.end()); - Request const &request = all_requests[guid]; - // return request.tokens.size() >= request.max_sequence_length; - return request.status == Request::COMPLETED; -} - GenerationResult RequestManager::get_generation_result(RequestGuid const &guid) { // First get the future of the request std::future future; { - const std::lock_guard lock(request_to_promise_mutex); + std::lock_guard const lock(request_to_promise_mutex); assert(request_to_promise.find(guid) != request_to_promise.end()); future = request_to_promise[guid]->get_future(); } @@ -327,7 +627,7 @@ GenerationResult future.get(); // Get the generation result { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_result_mutex); assert(request_generation_results.find(guid) != request_generation_results.end()); return request_generation_results[guid]; @@ -338,1974 +638,2261 @@ size_t RequestManager::get_num_processed_requests() { return num_processed_requests; } -BatchConfigFuture - RequestManager::prepare_next_batch(BatchConfigFuture const &old_bc, - InferenceResultFuture const &result, - Context ctx, - Runtime *runtime) { +int 
RequestManager::get_num_active_requests() { + return num_available_requests; +} + +int RequestManager::get_empty_request_index() { + for (int i = 0; i < get_max_requests_per_batch(); i++) { + if (guid_of_requests[i] == INVALID_GUID) { + return i; + } + } + return -1; +} + +std::unordered_map + RequestManager::get_requests_profiling() { + return profiling_requests; +} + +std::unordered_map + RequestManager::get_request_generation_results() { + return request_generation_results; +} + +ProfileInfo RequestManager::get_profiling_info() { + return profiling; +} + +std::vector RequestManager::get_new_profiling_info() { + return new_profiling_info; +} + +BatchConfigFuture RequestManager::get_next_batch_config( + InferenceResultFuture const &result, Context ctx, Runtime *runtime) { RequestManager *rm = this; - TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_TASK_ID, + TaskLauncher launcher(RM_GET_NEXT_BATCH_CONFIG_TASK_ID, TaskArgument(&rm, sizeof(RequestManager *))); - launcher.add_future(old_bc); launcher.add_future(result); return runtime->execute_task(ctx, launcher); } -BatchConfig RequestManager::prepare_next_batch_task( +BatchConfig RequestManager::get_next_batch_config_task( Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { RequestManager *rm = *((RequestManager **)task->args); - BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - InferenceResult const &result = - Future(task->futures[1]).get_result(); - return rm->prepare_next_batch(*bc, result); -} - -BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, - InferenceResult const &result) { - const std::lock_guard lock(request_queue_mutex); - - // Step 1: append result from previous iteration to request's tokens - for (int i = 0; i < old_bc.num_tokens; i++) { - size_t guid = - old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid; - Request &request = all_requests[guid]; - if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < request.tokens.size()) { - // This is a prompt token - continue; - } else { - assert(old_bc.tokensInfo[i].abs_depth_in_request + 1 == - request.tokens.size()); - // This is a decoding token - log_req_mgr.print("Output token is: %d", result.token_ids[i]); - request.tokens.push_back(result.token_ids[i]); - // std::string output = this->tokenizer_->Decode(request.tokens); - // log_req_mgr.print("Output: %s", output.c_str()); - } + if (rm->request_manager_status == PREFILLING and rm->prefill_model == SSM and + rm->current_ssm_step != 0) { + // Return an empty batch config, because we only need on step for SSM + // prefilling, and the rest is placeholder for scheduling + return rm->get_next_batch_config(InferenceResult()); + } else if (rm->request_manager_status == SSM_SPEC and rm->ssm_completed) { + return rm->get_next_batch_config(InferenceResult()); } - int num_generation_tokens = 0; - int num_active_req = -1; - // Step 2: prepare the next batch for existing requests - BatchConfig new_bc; - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { - if (old_bc.request_completed[i]) { // add new requests to the next batch - continue; - } else { - assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); - Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - int processed_tokens = - old_bc.requestsInfo[i].first_token_depth_in_request + - old_bc.requestsInfo[i].num_tokens_in_batch; - assert(processed_tokens < request.tokens.size()); - bool request_completed = false; - // printf("model_type = %d\n", this->model_type); - if 
(request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { - request_completed = true; - } else if (request.tokens.back() == eos_token_id) { - // Encounter EOS token id - request_completed = true; - } - if (request_completed) { - std::string output = this->tokenizer_->Decode(request.tokens); - // Unlike Huggingface, the sentencepiece C++ library automatically - // removes the BOS token - if (model_type == ModelType::LLAMA && - request.tokens.at(0) == bos_token_id) { - output = " " + output; - } - { - // update generation result - GenerationResult &gr = request_generation_results[request.guid]; - assert(gr.guid == request.guid); - gr.output_tokens = request.tokens; - gr.output_text = output; - } - request.status = Request::COMPLETED; - trigger_request_completion_future(request.guid); - log_req_mgr.print("[Done] guid(%zu) final_length(%zu)", - old_bc.requestsInfo[i].request_guid, - request.tokens.size()); - log_req_mgr.print("Final output: %s", output.c_str()); - num_processed_requests++; - ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); - total_request_run_time += - profile_info.finish_time - profile_info.start_time; - profiling_requests[request.guid] = profile_info; - log_req_mgr.print( - "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.llm_decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); - // Write output to file if needed: - if (!output_filepath.empty()) { - std::ofstream outputFile(output_filepath, std::ios::app); - if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; - outputFile << "token IDs: "; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; - } - } - outputFile << std::endl; - outputFile << output; - outputFile.close(); - } else { - std::cout << "Unable to open the output file: " << output_filepath - << std::endl; - assert(false); - } - } + InferenceResult const &result = + Future(task->futures[0]).get_result(); + return rm->get_next_batch_config(result); +} - } else { - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = - old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - num_active_req++; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == - request.tokens.size()) { - // Incremental phase - new_bc.requestsInfo[i].num_tokens_in_batch = 1; - num_generation_tokens++; - new_bc.requestsInfo[i].prompt_phase = false; +BatchConfig + RequestManager::get_next_batch_config(InferenceResult const &result) { + static double process_this_start_us = 0.0, process_last_end_us = 0.0; + if (get_eval_overhead_breakdown()) { + process_this_start_us = Realm::Clock::current_time_in_microseconds(); + if (process_last_end_us != 0) { + if (request_manager_status == PREFILLING) { + if (prefill_model == SSM) { + eval_ssm_prefill_latency_us += + 
process_this_start_us - process_last_end_us; } else { - // Prompt phase - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, - (int)request.tokens.size() - - new_bc.requestsInfo[i].first_token_depth_in_request); - new_bc.requestsInfo[i].prompt_phase = true; - } - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - assert(depth < request.tokens.size()); - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[depth]; - new_bc.num_tokens++; + eval_llm_prefill_latency_us += + process_this_start_us - process_last_end_us; } - // Update profiling - profiling_requests[new_bc.requestsInfo[i].request_guid] - .llm_decoding_steps++; + } else if (request_manager_status == SSM_SPEC) { + eval_ssm_spec_latency_us += process_this_start_us - process_last_end_us; + } else if (request_manager_status == LLM_VERIFY) { + eval_llm_verify_latency_us += + process_this_start_us - process_last_end_us; } } } - new_bc.num_generation_tokens = num_generation_tokens; - - // Step 3: add new requests to the next batch - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { - if (new_bc.request_completed[i]) { - if (!pending_request_queue.empty() && - new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); - // all_requests[new_request.guid] = new_request; - - new_bc.requestsInfo[i].first_token_depth_in_request = 0; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = new_request.guid; - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, - (int)new_request.tokens.size()); - new_bc.requestsInfo[i].max_sequence_length = - new_request.max_sequence_length; - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].prompt_phase = true; - num_active_req++; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - // add profile_info for the new request - ProfileInfo profile_info; - profile_info.llm_decoding_steps = 1; - profile_info.start_time = Realm::Clock::current_time_in_microseconds(); - profiling_requests[new_request.guid] = profile_info; - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - assert(depth < new_request.tokens.size()); - new_bc.tokensInfo[new_bc.num_tokens].token_id = - new_request.tokens[depth]; - new_bc.num_tokens++; - } - if (new_bc.num_tokens == get_max_tokens_per_batch()) { - break; - } + update_inference_results(result); + BatchConfig bc = prepare_next_batch(); + if (get_eval_overhead_breakdown()) { + process_last_end_us = Realm::Clock::current_time_in_microseconds(); + double process_time_us = process_last_end_us - process_this_start_us; + // printf("Process time: %.3f us\n", process_time_us); + eval_process_latency_us += process_time_us; + } + return bc; +} + +// Return value: true if load a pending request to the batch +bool RequestManager::load_pending_request_to_batch() { + static double load_request_start = 0.0; + if (get_eval_overhead_breakdown()) { + load_request_start = 
Realm::Clock::current_time_in_microseconds(); + } + if (num_running_requests >= get_max_requests_per_batch()) { + if (get_eval_overhead_breakdown()) { + eval_other_latency_us += + Realm::Clock::current_time_in_microseconds() - load_request_start; + } + return false; + } + std::unique_lock lock(request_queue_mutex); + if (pending_request_queue.empty()) { + if (num_running_requests > 0) { + // No pending request to process, but there are running requests in the + // batch. Do nothing and return + if (get_eval_overhead_breakdown()) { + eval_other_latency_us += + Realm::Clock::current_time_in_microseconds() - load_request_start; + } + return false; + } + // Wait until there is a pending request or the background server is + // terminated + request_queue_cv.wait(lock, [&] { + return !pending_request_queue.empty() || + is_background_server_terminated(); + }); + // If the background server has been terminated, exit + if (is_background_server_terminated()) { + if (get_eval_overhead_breakdown()) { + eval_other_latency_us += + Realm::Clock::current_time_in_microseconds() - load_request_start; } + return false; } } + assert(!pending_request_queue.empty() && "No pending request to process."); + if (profiling.server_start_time == 0) { + reset_profiling_statistics(); + } + while (num_running_requests < get_max_requests_per_batch() && + !pending_request_queue.empty()) { + RequestGuid guid = pending_request_queue.front().guid; + pending_request_queue.pop(); + Request *request = &all_requests[guid]; + if (request->tokens.size() > get_max_sequence_length()) { + std::cerr << "Request " << guid + << " exceeds the maximum sequence length: " + << request->tokens.size() << " > " << get_max_sequence_length() + << std::endl; + continue; + } - return new_bc; + request->status = Request::RUNNING; + // Find an empty slot + int request_index = get_empty_request_index(); + assert(request_index != -1 && "No empty request slot to load the request."); + // Load request into batch + request->batch_index = request_index; + guid_of_requests[request_index] = guid; + num_running_requests++; + request_available[request_index] = true; + num_available_requests++; + // Initialize the bitmask for the new request with its prompt length + init_bitmask_prompt(guid, request->tokens.size()); + + prefilling_requests.push_back(request); + + profiling_requests[guid] = RequestProfileInfo(); + profiling_requests[guid].start_time = + Realm::Clock::current_time_in_microseconds(); + } + if (get_eval_overhead_breakdown()) { + eval_other_latency_us += + Realm::Clock::current_time_in_microseconds() - load_request_start; + } + return true; } -/* ----- Speculative Inference Specific functions ----- */ +void RequestManager::request_update_attainment(int batch_index, bool attained) { + Request &request = all_requests[guid_of_requests[batch_index]]; + request.attained &= attained; +} -/***** Request Init Phase *****/ -BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_init( - TreeVerifyBatchConfigFuture const &old_bc, - InferenceResultFuture const &result, - int model_id, - Context ctx, - Runtime *runtime) { +bool isPrefixAndRemove(std::vector const &prefix, std::vector &vec) { + if (prefix.size() > vec.size()) { + return false; + } - RequestManager *rm = this; - TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, - TaskArgument(&rm, sizeof(RequestManager *))); - launcher.add_future(old_bc); - launcher.add_future(result); - launcher.add_future(Future::from_value(model_id)); - return runtime->execute_task(ctx, launcher); -} + if 
(std::equal(prefix.begin(), prefix.end(), vec.begin())) { + vec.erase(vec.begin(), vec.begin() + prefix.size()); + return true; + } -BeamSearchBatchConfig RequestManager::prepare_next_batch_init_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - RequestManager *rm = *((RequestManager **)task->args); - TreeVerifyBatchConfig const &bc = - Future(task->futures[0]).get_result(); - InferenceResult const &result = - Future(task->futures[1]).get_result(); - int model_id = Future(task->futures[2]).get_result(); - return rm->prepare_next_batch_init(bc, result, model_id); + return false; } -BeamSearchBatchConfig - RequestManager::prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, - InferenceResult const &result, - int model_id) { - const std::lock_guard lock(request_queue_mutex); - if (verbose) { - std::cout << "\n############### prepare_next_batch_init ###############\n"; +void RequestManager::request_complete_clean_up(int batch_index) { + static double request_complete_start = 0.0; + if (get_eval_overhead_breakdown()) { + request_complete_start = Realm::Clock::current_time_in_microseconds(); } + RequestGuid guid = guid_of_requests[batch_index]; - // Step 1: use result to update requests - BeamSearchBatchConfig new_bc; - new_bc.num_tokens = 0; - new_bc.model_id = model_id; - int result_index = 0; + profiling_requests[guid].finish_time = + Realm::Clock::current_time_in_microseconds(); + Request &request = all_requests[guid]; + guid_of_requests[batch_index] = INVALID_GUID; + num_running_requests--; + request_available[batch_index] = false; + num_available_requests--; + request.status = Request::COMPLETED; + + // page attention: free the pages + PageManager *page_manager = PageManager::get_page_manager(); + page_manager->free_request(guid); + + // Find the sos and eos in the sequence + // auto bos_it = std::find( + // request.tokens.begin(), request.tokens.end(), this->bos_token_id); + // auto eos_rit = std::find( + // request.tokens.rbegin(), request.tokens.rend(), this->eos_token_id); + // std::vector::iterator eos_it; + // if (eos_rit != request.tokens.rend()) { + // eos_it = eos_rit.base(); + // } else { + // eos_it = request.tokens.end(); + // } + // std::string output = + // this->tokenizer_->Decode(std::vector(bos_it, eos_it)); + { + std::lock_guard const lock(request_result_mutex); + request_generation_results[guid].output_tokens = request.tokens; + assert(isPrefixAndRemove(request_generation_results[guid].input_tokens, + request_generation_results[guid].output_tokens)); + if (request_generation_results[guid].output_tokens.size() > 0 && + is_eos_token( + request_generation_results[guid].output_tokens + [request_generation_results[guid].output_tokens.size() - 1]) && + !request.add_special_tokens) { + request_generation_results[guid].output_tokens.pop_back(); + } + request_generation_results[guid].output_text = this->tokenizer_->Decode( + request_generation_results[guid].output_tokens); + request_generation_results[guid].decoding_steps = + profiling_requests[guid].llm_decoding_steps; + // request_generation_results[guid].output_tokens = + // std::vector(bos_it, eos_it); + } - int num_generation_tokens = 0; - int num_active_req = -1; + trigger_request_completion_future(guid); - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { - if (old_bc.request_completed[i]) { - continue; + std::string output = this->tokenizer_->Decode(request.tokens); + std::cout << "Request " << guid << " completed: " << std::endl; + std::cout << "" << 
output; + if (is_eos_token(request.tokens.back())) { + std::cout << ""; + } + std::cout << std::endl << std::endl; + { + RequestProfileInfo profile_info = profiling_requests[guid]; + + std::ostream *os = &std::cout; + std::ofstream output_file; + if (!output_filepath.empty()) { + output_file.open(output_filepath, std::ios::app); + if (output_file.is_open()) { + os = &output_file; + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); + } } - size_t guid = old_bc.requestsInfo[i].request_guid; - Request &request = all_requests[guid]; - - std::cout << "[ " << guid << " ]" << std::endl; + *os << "Request " << guid << " profiling: " << std::endl; + if (profile_info.start_decoding_time != 0) { + *os << "Decoding time: " + << (profile_info.finish_time - profile_info.start_decoding_time) * + 1e-3 + << " ms" << std::endl; + } else { + *os << "Decoding time: 0 ms" << std::endl; + } + *os << "Total time: " + << (profile_info.finish_time - profile_info.start_time) * 1e-3 << " ms" + << std::endl; + *os << "LLM decoding steps: " << profile_info.llm_decoding_steps + << std::endl; + if (decoding_mode == SPECULATIVE_DECODING) { + *os << "SSM decoding steps: " << profile_info.ssm_decoding_steps + << std::endl; + } + *os << std::endl; + // *os << output << std::endl << std::endl; - // Verify this: get verified tokens from result - std::vector> tree_outputs = - std::vector>(); + if (!output_filepath.empty()) { + output_file.close(); + } + } + // RequestProfileInfo profile_info = profiling_requests[guid]; + // std::string str = + // "[" + std::to_string(guid) + + // "] Request completed:" + " decoding_time_ms(" + + // std::to_string( + // (profile_info.finish_time - profile_info.start_decoding_time) * + // 1e-3) + + // ")" + " total_time_ms(" + + // std::to_string((profile_info.finish_time - profile_info.start_time) * + // 1e-3) + + // ")" + " LLM_decoding_steps(" + + // std::to_string(profile_info.llm_decoding_steps) + ")"; + // if (decoding_mode == SPECULATIVE_DECODING) { + // str = str + " SSM_decoding_steps(" + + // std::to_string(profile_info.ssm_decoding_steps) + ")"; + // } + // write_to_output_file("", str); + if (get_eval_overhead_breakdown()) { + eval_other_latency_us += + Realm::Clock::current_time_in_microseconds() - request_complete_start; + } +} - assert(old_bc.num_tokens > 0); +void RequestManager::request_offload_from_batch(int batch_index) { + RequestGuid guid = guid_of_requests[batch_index]; + Request &request = all_requests[guid]; + // Still keep the request in `guid_of_requests` where can be retrieved later + request_available[batch_index] = false; + num_available_requests--; +} - // reset committed_tokens - if (committed_tokens.count(guid) == 0) { - committed_tokens[guid] = {}; - } else { - committed_tokens[guid].clear(); - } - - // iterate through all the tokens that belong to request i - int root_abs_depth = request.tokens.size() - 1; - - while (result_index < old_bc.num_tokens && - old_bc.tokensInfo[result_index].request_index == i) { - int abs_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; - int token_id = result.token_ids[result_index]; - - if (request.status == Request::PENDING) { - committed_tokens[guid].emplace_back(abs_depth, result_index); - } else if (abs_depth >= root_abs_depth) { - tree_outputs.emplace_back(token_id, abs_depth + 1); - // std::cout << "committred tokens push: " << abs_depth - // << " ,result index: " << result_index << "\n"; - committed_tokens[guid].emplace_back(abs_depth, result_index); - 
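// Illustrative, self-contained sketch of the output assembly performed in
// request_complete_clean_up() above: the prompt tokens are stripped from the
// front of the full token list (mirroring the isPrefixAndRemove() helper),
// and a trailing EOS is dropped when special tokens were not requested.
// The token ids and EOS id below are made up for the example.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

// Erase `prefix` from the front of `vec` if and only if it really is a prefix.
static bool strip_prefix(std::vector<int> const &prefix, std::vector<int> &vec) {
  if (prefix.size() > vec.size() ||
      !std::equal(prefix.begin(), prefix.end(), vec.begin())) {
    return false;
  }
  vec.erase(vec.begin(), vec.begin() + prefix.size());
  return true;
}

int main() {
  int const eos_id = 2;
  bool const add_special_tokens = false;
  std::vector<int> prompt = {1, 15, 7, 42};
  std::vector<int> all_tokens = {1, 15, 7, 42, 9, 11, 2}; // prompt + generation
  assert(strip_prefix(prompt, all_tokens));               // keep only generated tokens
  if (!add_special_tokens && !all_tokens.empty() && all_tokens.back() == eos_id) {
    all_tokens.pop_back();                                // drop trailing EOS
  }
  for (int t : all_tokens) {
    std::printf("%d ", t); // prints: 9 11
  }
  std::printf("\n");
  return 0;
}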
- if (verbose) { - std::cout << "Index within old batch: " << result_index << std::endl; - printf(" Input: [%d] %d ---> [%d] %d \n", - abs_depth, - old_bc.tokensInfo[result_index].token_id, - tree_outputs.back().second, - token_id); - } - // std::cout << "Index within old batch: " << result_index << std::endl; - // printf(" Input: [%d] %d ---> [%d] %d \n", - // abs_depth, - // old_bc.tokensInfo[result_index].token_id, - // tree_outputs.back().second, - // token_id); - } - result_index++; - } +void RequestManager::request_load_onto_batch(int batch_index) { + RequestGuid guid = guid_of_requests[batch_index]; + Request &request = all_requests[guid]; + request_available[batch_index] = true; + num_available_requests++; +} - if (request.status == Request::RUNNING) { +void RequestManager::update_token_tree_depth() { + ssm_tree_depth = min(int(std::ceil((double)get_max_tokens_per_batch() / + get_num_active_requests())), + get_max_tree_depth()); +} - std::vector> verified_tokens = - traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); +void RequestManager::update_inference_results(InferenceResult const &result) { + // Update the inference results + if (num_running_requests == 0) { + // Update nothing + // Load the pending request to the batch + load_pending_request_to_batch(); + request_manager_status = PREFILLING; + if (decoding_mode == SPECULATIVE_DECODING) { + prefill_model = SSM; + current_ssm_step = 0; + } + return; + } - log_req_mgr.print("Number of Verified Tokens = %zu", - verified_tokens.size()); - // check if the request is finished - if (verified_tokens.size() + request.tokens.size() >= - request.max_sequence_length) { - // Append all verified tokens to the request - for (auto const &token_pair : verified_tokens) { - if (token_pair.second < request.max_sequence_length) { - request.tokens.push_back(token_pair.first); + switch (request_manager_status) { + case PREFILLING: + if (decoding_mode == INCREMENTAL_DECODING) { + // This indicates that the prefilling of the requests finishes + bool all_prefilled = update_llm_prefill_results(result); + // Check if there are more empty slots + if (load_pending_request_to_batch() or !all_prefilled) { + // Load the pending request to the batch + request_manager_status = PREFILLING; + } else { + // No more empty slots, start the decoding + while (!prefilled_requests.empty()) { + Request *request = prefilled_requests.front(); + request_load_onto_batch(request->batch_index); + prefilled_requests.pop(); } + request_manager_status = DECODING; } - log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", - request.guid, - request.tokens.size()); - std::string output = this->tokenizer_->Decode(request.tokens); - // Unlike Huggingface, the sentencepiece C++ library automatically - // removes the BOS token - if (model_type == ModelType::LLAMA && - request.tokens.at(0) == bos_token_id) { - output = " " + output; - } - { - // update generation result - GenerationResult &gr = request_generation_results[request.guid]; - assert(gr.guid == request.guid); - gr.output_tokens = request.tokens; - gr.output_text = output; - } - request.status = Request::COMPLETED; - trigger_request_completion_future(request.guid); - log_req_mgr.print("Final output: %s", output.c_str()); - - new_bc.request_completed[i] = true; - new_bc.request_running[i] = false; - num_processed_requests++; - - // Log profiling info - ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); - 
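// Illustrative only: the per-step speculation depth computed in
// update_token_tree_depth() above can be read as splitting the batch token
// budget evenly across active requests, clamped by the configured maximum
// depth. All names below are local to this sketch, not the RequestManager API.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>

static int compute_tree_depth(int max_tokens_per_batch,
                              int num_active_requests,
                              int max_tree_depth) {
  assert(num_active_requests > 0);
  int depth = static_cast<int>(std::ceil(
      static_cast<double>(max_tokens_per_batch) / num_active_requests));
  return std::min(depth, max_tree_depth);
}

int main() {
  // A 128-token budget shared by 3 requests with a depth cap of 8:
  // ceil(128 / 3) = 43, clamped to 8.
  std::printf("%d\n", compute_tree_depth(128, 3, 8));  // prints 8
  // With 64 active requests the budget is the limiting factor: ceil(128/64) = 2.
  std::printf("%d\n", compute_tree_depth(128, 64, 8)); // prints 2
  return 0;
}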
profile_info.ssm_decoding_steps = 0; - total_request_run_time += - profile_info.finish_time - profile_info.start_time; - profiling_requests[request.guid] = profile_info; - log_req_mgr.print( - "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.llm_decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); - - // Write output to file if needed: - if (!output_filepath.empty()) { - std::ofstream outputFile(output_filepath, std::ios::app); - if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; - outputFile << "token IDs: "; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; - } - } - outputFile << std::endl; - outputFile << output; + // Not completed, continue prefilling + } else if (decoding_mode == SPECULATIVE_DECODING) { + if (prefill_model == SSM) { + // A single iteration contains max_tree_depth SSM steps and a single + // LLM step. To align with this structure, we have to create + // max_tree_depth - 1 empty SSM steps during the prefilling phase. + if (current_ssm_step == 0) { + update_ssm_prefill_results(result); + } + // Except for the first step, we do nothing. + current_ssm_step++; - outputFile.close(); + if (current_ssm_step == get_max_tree_depth()) { + prefill_model = LLM; + } + } else if (prefill_model == LLM) { + // This indicates that the prefilling of the requests finishes + bool all_prefilled = update_llm_prefill_results(result); + if (load_pending_request_to_batch() or !all_prefilled) { + request_manager_status = PREFILLING; + prefill_model = SSM; + current_ssm_step = 0; } else { - std::cout << "Unable to open the output file: " << output_filepath - << std::endl; - assert(false); + // No more empty slots, start the speculation + while (!prefilled_requests.empty()) { + Request *request = prefilled_requests.front(); + request_load_onto_batch(request->batch_index); + prefilled_requests.pop(); + } + request_manager_status = SSM_SPEC; + // Reset the prefill_request + current_ssm_step = 0; + ssm_completed = false; } + } else { + assert(false && "Invalid prefill model."); } + } else { + assert(false && "Invalid inference mode."); + } + break; + case DECODING: { + bool request_completed = update_llm_decode_results(result); + if (load_pending_request_to_batch()) { + request_manager_status = PREFILLING; + } else { + request_manager_status = DECODING; + } + } break; + case LLM_VERIFY: { + bool request_completed = update_llm_verify_results(result); + if (load_pending_request_to_batch()) { + request_manager_status = PREFILLING; + prefill_model = SSM; + current_ssm_step = 0; + } else { + request_manager_status = SSM_SPEC; + current_ssm_step = 0; + ssm_completed = false; + } + } break; + case SSM_SPEC: + // Update current_ssm_step first because when we first call + // update_ssm_inference_results, there's already a step of small model + // inference + current_ssm_step++; + if (!ssm_completed) { + ssm_completed = update_ssm_inference_results(result); + } + // If the ssm speculation is completed, we do nothing - // delete the old input tree from cache - dfs_tree_inputs.erase(request.guid); - - } else { // Request not finished, pass verified_tokens to next iteration - - 
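// Schematic sketch of the step pattern described in the comment above: one
// speculative iteration issues get_max_tree_depth() SSM steps followed by a
// single LLM verify step. The types here are hypothetical and self-contained;
// this is not the RequestManager state machine itself.
#include <cstdio>

enum class Phase { SsmSpec, LlmVerify };

struct IterationDriver {
  int max_tree_depth;
  int current_ssm_step = 0;
  Phase phase = Phase::SsmSpec;

  // Advance by one scheduled step and return the phase that was just run.
  Phase step() {
    Phase ran = phase;
    if (phase == Phase::SsmSpec) {
      ++current_ssm_step;
      if (current_ssm_step == max_tree_depth) {
        phase = Phase::LlmVerify; // tree is full, hand off to the LLM
      }
    } else {
      current_ssm_step = 0;       // verification done, start a new tree
      phase = Phase::SsmSpec;
    }
    return ran;
  }
};

int main() {
  IterationDriver driver{/*max_tree_depth=*/4};
  // Two full iterations print: SSSSVSSSSV
  for (int i = 0; i < 10; ++i) {
    std::printf("%c", driver.step() == Phase::SsmSpec ? 'S' : 'V');
  }
  std::printf("\n");
  return 0;
}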
new_bc.request_completed[i] = false; - new_bc.request_running[i] = true; - num_active_req++; - - // Normal Request Info - new_bc.requestsInfo[i].first_token_depth_in_request = - verified_tokens.front().second; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = - old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - - // TODO: Beam Request Info, missing from VerifyTreeBatchConfig - int new_max_depth = - new_bc.requestsInfo[i].max_sequence_length - - new_bc.requestsInfo[i].first_token_depth_in_request - - verified_tokens.size(); - new_bc.beamRequestsInfo[i].current_depth = 1; - - profiling_requests[request.guid].ssm_decoding_steps = 0; - new_bc.requestsInfo[i].prompt_phase = true; - - int ssm_decoding_steps = 0; - new_bc.beamRequestsInfo[i].beam_size = - spec_infer_tree_width.size() > ssm_decoding_steps - ? spec_infer_tree_width[ssm_decoding_steps] - : 1; - new_bc.beamRequestsInfo[i].max_depth = - std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); - for (int j = 0; - j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; - j++) { - new_bc.beamRequestsInfo[i].parent_id[j] = 0; - new_bc.beamRequestsInfo[i].probs[j] = 1; - } - - new_bc.beamRequestsInfo[i].sub_request_num = 1; - - new_bc.sub_requests[i] = 1; - - updateBitMask(new_bc.causalMask[i], - verified_tokens.size(), - request.tokens.size()); - - // Token Info - for (int j = 0; j < verified_tokens.size(); j++) { - auto token = verified_tokens.at(j); - - // Normal Token Info - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = - token.second; + if (current_ssm_step == get_max_tree_depth()) { + request_manager_status = LLM_VERIFY; + } + break; + default: + assert(false && "Invalid request manager status."); + } +} - // Beam Token Info - new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; - new_bc.num_tokens++; +bool RequestManager::update_llm_prefill_results(InferenceResult const &result) { + int num_tokens = 0; + std::vector incomplete_requests; + incomplete_requests.reserve(prefilling_requests.size()); + for (Request *request : prefilling_requests) { + if (request->num_tokens_in_batch > 0) { + if (decoding_mode == INCREMENTAL_DECODING && streaming_cache) { + request->streaming_cache_info.commit_cache( + request->num_tokens_in_batch); + request->llm_cache_size = request->streaming_cache_info.commit_len; + } else { + request->llm_cache_size += request->num_tokens_in_batch; + } + request->llm_prefill_len += request->num_tokens_in_batch; - // Add verified token to request's token list - request.tokens.push_back(token.first); + if (request->llm_prefill_len == request->tokens.size()) { + // Indicates that this request's prefilling phase finishes + request->tokens.push_back( + result.token_ids[num_tokens + request->num_tokens_in_batch - 1]); - if (new_bc.num_tokens == get_max_tokens_per_batch()) { - break; + if (is_eos_token(request->tokens.back())) { + request_complete_clean_up(request->batch_index); + } else { + // Temporarily offload request from the batch + request_offload_from_batch(request->batch_index); + prefilled_requests.push(request); + + if (decoding_mode == SPECULATIVE_DECODING) { + // Add the last token 
to the token tree + assert(request->committed_tokens.empty() && + "The committed tokens should be empty."); + request->committed_tokens.push_back(Request::CommittedToken{ + -1, (int)request->tokens.size() - 1, request->tokens.back()}); + init_token_tree(request->guid); + add_root_to_spec_token_tree(request->guid, request->tokens.back()); + update_bitmask_prompt(request->guid, 1); } } - - std::string output = this->tokenizer_->Decode(request.tokens); - // Unlike Huggingface, the sentencepiece C++ library automatically - // removes the BOS token - if (model_type == ModelType::LLAMA && - request.tokens.at(0) == bos_token_id) { - output = " " + output; - } - log_req_mgr.print("Output: %s", output.c_str()); + } else { + // Next phase will still be prefilling + incomplete_requests.push_back(request); } + profiling_requests[request->guid].llm_prefilling_steps++; + num_tokens += request->num_tokens_in_batch; + } else if (request->llm_prefill_len < request->tokens.size()) { + // The request is not completed, continue prefilling + incomplete_requests.push_back(request); + } + } - } else if (request.status == Request::PENDING) { - new_bc.request_completed[i] = false; - new_bc.request_running[i] = false; - num_active_req++; - - std::cout << "ssm_cache_size: " << request.ssm_cache_size << ", " - << "initial_len: " << request.initial_len << std::endl; - assert(request.ssm_cache_size == request.initial_len); - - // Normal Request Info - new_bc.requestsInfo[i].first_token_depth_in_request = - request.ssm_cache_size; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[i].num_tokens_in_batch = 0; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - - // TODO: Beam Request Info, missing from VerifyTreeBatchConfig - new_bc.beamRequestsInfo[i].current_depth = 1; - int ssm_decoding_steps = - profiling_requests[request.guid].ssm_decoding_steps; - new_bc.beamRequestsInfo[i].beam_size = - spec_infer_tree_width.size() > ssm_decoding_steps - ? 
spec_infer_tree_width[ssm_decoding_steps] - : 1; - new_bc.beamRequestsInfo[i].max_depth = 0; - for (int j = 0; j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; - j++) { - new_bc.beamRequestsInfo[i].parent_id[j] = 0; - new_bc.beamRequestsInfo[i].probs[j] = 1; - } + prefilling_requests.swap(incomplete_requests); + return prefilling_requests.empty(); +} - new_bc.beamRequestsInfo[i].sub_request_num = 1; +bool RequestManager::update_llm_decode_results(InferenceResult const &result) { + bool request_completed = false; + int nb_requests_decoded = 0; + long long int current_time = Realm::Clock::current_time_in_microseconds(); - new_bc.sub_requests[i] = 1; + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + // Request in this slot is unavailable + continue; + } + int guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + if (streaming_cache) { + request.streaming_cache_info.commit_cache(1); + request.llm_cache_size = request.streaming_cache_info.commit_len; + } else { + request.llm_cache_size++; + } + request.tokens.push_back( + result.token_ids[request.first_token_offset_in_batch]); + + request.decode_latency_ms = + (current_time - profiling_requests[guid].start_decoding_time) * 1e-3; + bool attained = + request.decode_latency_ms <= get_request_expected_latency(request); + profiling_requests[guid].llm_decoding_steps++; + nb_requests_decoded++; + + NewProfileInfo new_profile_info; + new_profile_info.timestamp = Realm::Clock::current_time_in_microseconds(); + new_profile_info.request_guid = guid; + new_profile_info.request_step_idx = + profiling_requests[guid].llm_decoding_steps - 1; + new_profile_info.num_generated_tokens = 1; + new_profiling_info.push_back(new_profile_info); + + if (is_eos_token(request.tokens.back()) or + request.decode_length() >= get_max_output_length() or + request.tokens.size() >= get_max_sequence_length()) { + request_update_attainment(request_index, attained); + request_completed = true; + request_complete_clean_up(request_index); + } else if (!attained and slo_violation_early_termination) { + // Early drop that request + request_update_attainment(request_index, attained); + request_completed = true; + request_complete_clean_up(request_index); + } - // Token Info + if (verbose) { std::string output = this->tokenizer_->Decode(request.tokens); - // Unlike Huggingface, the sentencepiece C++ library automatically removes - // the BOS token - if (model_type == ModelType::LLAMA && - request.tokens.at(0) == bos_token_id) { - output = " " + output; - } - log_req_mgr.print("Output: %s", output.c_str()); - } else { - assert(false); + std::cout << "Request " << guid << " tokens: " << std::endl + << output << std::endl; } } + profiling.llm_step_times.push_back((current_time - profiling.llm_step_start) * + 1e-3); + profiling.requests_per_step.push_back(nb_requests_decoded); + profiling.generated_tokens_per_step.push_back(nb_requests_decoded); + return request_completed; +} - // Step 2: Initialize new request - for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { - if (new_bc.request_completed[i]) { - if (!pending_request_queue.empty() && - new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); - // all_requests[new_request.guid] = new_request; - num_active_req++; - 
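// Minimal sketch of the SLO attainment bookkeeping used in
// update_llm_decode_results() above: a request stays "attained" only while its
// measured decode latency is within its expected latency. The expected-latency
// formula here (slo_ratio times an assumed per-token baseline) is an
// illustration only; the patch obtains it from get_request_expected_latency(),
// whose definition is not shown in this hunk.
#include <cstdio>

struct SloTracker {
  double slo_ratio;             // relative slack granted to this request
  double baseline_ms_per_token; // assumed baseline decode speed
  bool attained = true;

  // Called once per decoded token with the elapsed decode time so far.
  bool observe(int tokens_decoded, double decode_latency_ms) {
    double expected_ms = slo_ratio * baseline_ms_per_token * tokens_decoded;
    bool ok = decode_latency_ms <= expected_ms;
    attained &= ok; // a single violation marks the whole request as missed
    return ok;      // a false return could trigger early termination
  }
};

int main() {
  SloTracker t{/*slo_ratio=*/1.5, /*baseline_ms_per_token=*/20.0};
  std::printf("%d\n", t.observe(10, 250.0)); // 250 <= 300 -> 1
  std::printf("%d\n", t.observe(20, 700.0)); // 700 >  600 -> 0
  std::printf("%d\n", t.attained);           // sticky: 0
  return 0;
}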
new_bc.requestsInfo[i].first_token_depth_in_request = 0; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = new_request.guid; - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, - (int)new_request.tokens.size()); - new_bc.requestsInfo[i].max_sequence_length = - new_request.max_sequence_length; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - - // add profile_info for the new request - ProfileInfo profile_info; - profile_info.llm_decoding_steps = 0; - profile_info.ssm_decoding_steps = 0; - profile_info.start_time = Realm::Clock::current_time_in_microseconds(); - profiling_requests[new_request.guid] = profile_info; - // init the beam search metadata per request - int ssm_decoding_steps = profile_info.ssm_decoding_steps; - - new_bc.beamRequestsInfo[i].beam_size = - spec_infer_tree_width.size() > ssm_decoding_steps - ? spec_infer_tree_width[ssm_decoding_steps] - : 1; - new_bc.beamRequestsInfo[i].current_depth = 1; - new_bc.beamRequestsInfo[i].max_depth = - std::min(BeamSearchBatchConfig::MAX_BEAM_DEPTH, - get_max_tokens_per_batch() - - new_bc.requestsInfo[i].num_tokens_in_batch - 1); - for (int j = 0; - j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; - j++) { - new_bc.beamRequestsInfo[i].parent_id[j] = 0; - new_bc.beamRequestsInfo[i].probs[j] = 1; - } - - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].prompt_phase = true; - - new_bc.beamRequestsInfo[i].sub_request_num = 1; - printf("sub request num == 1, %d \n", - new_bc.beamRequestsInfo[i].beam_size); - - new_bc.sub_requests[i] = 1; - - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - assert(depth < new_request.tokens.size()); - new_bc.tokensInfo[new_bc.num_tokens].token_id = - new_request.tokens[depth]; - - // beam search meta data, indicate which sub request this token - // belongs to, init to 0; - new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; - new_bc.num_tokens++; - } +void RequestManager::update_ssm_prefill_results( + InferenceResult const &ssm_prefill_result) { + // This function is called by update_inference_results when the + // request_manager_status is PREFILLING and the prefill_model is SSM. + // There's no results to update, but we should update ssm_cache_size. 
+ for (Request *request : prefilling_requests) { + if (request->num_tokens_in_batch > 0) { + if (streaming_cache) { + request->streaming_cache_info.commit_cache( + request->num_tokens_in_batch); + request->ssm_cache_size = request->streaming_cache_info.commit_len; + } else { + request->ssm_cache_size += request->num_tokens_in_batch; + } + request->ssm_prefill_len += request->num_tokens_in_batch; - initBitMask(new_bc.causalMask[i], - new_bc.requestsInfo[i].num_tokens_in_batch); - - // if (new_bc.requestsInfo[i].num_tokens_in_batch < - // new_request.initial_len) { - // all_requests[new_request.guid].status = Request::PENDING; - // new_bc.request_running[i] = false; - // std::cout << "Request " << new_request.guid << " is pending" - // << std::endl; - // } else { - // all_requests[new_request.guid].status = Request::RUNNING; - // new_bc.request_running[i] = true; - // std::cout << "Request " << new_request.guid << " is running" - // << std::endl; - // } - all_requests[new_request.guid].status = Request::PENDING; - all_requests[new_request.guid].ssm_cache_size = - new_bc.requestsInfo[i].num_tokens_in_batch; - new_bc.request_running[i] = false; - std::cout << "SSM KV Cache Size init: " - << all_requests[new_request.guid].ssm_cache_size << std::endl; - std::cout << "LLM KV Cache Size init: " - << all_requests[new_request.guid].llm_cache_size << std::endl; - - std::cout << "load " << new_bc.requestsInfo[i].num_tokens_in_batch - << " tokens for request " << new_request.guid << std::endl; - std::cout << "total prompt in request: " << new_request.initial_len - << std::endl; + profiling_requests[request->guid].ssm_prefilling_steps++; + } + } +} - if (new_bc.num_tokens == get_max_tokens_per_batch()) { - break; +BatchConfig RequestManager::prepare_next_batch() { + if (is_background_server_terminated()) { + return BatchConfig(); + } + switch (request_manager_status) { + case PREFILLING: + if (decoding_mode == INCREMENTAL_DECODING) { + return prepare_llm_prefilling_batch(); + } else if (decoding_mode == SPECULATIVE_DECODING) { + if (prefill_model == SSM) { + if (current_ssm_step == 0) { + return prepare_ssm_prefilling_batch(); + } else { + // Return an empty batch config + return BatchConfig(); + } + } else if (prefill_model == LLM) { + return prepare_llm_prefilling_batch(); + } else { + assert(false && "Invalid prefill model."); } + } else { + assert(false && "Invalid inference mode."); } + break; + case DECODING: + if (get_fcfs_slo()) { + return prepare_decoding_batch_fcfs_slo(); + } else if (get_stta()) { + return prepare_decoding_batch_stta(); + } else { + return prepare_decoding_batch(); + } + case SSM_SPEC: + if (current_ssm_step == 0) { + return prepare_first_spec_batch_config(); + } else if (!ssm_completed) { + return prepare_next_spec_batch_config(); + } else { + // Return an empty batch config + return BatchConfig(); + } + case LLM_VERIFY: + return prepare_verify_batch_config(); + default: + std::cout << "Invalid request manager status: " << request_manager_status + << std::endl; + assert(false); + } +} + +BatchConfig RequestManager::prepare_llm_prefilling_batch() { + // This function is called when the request_manager_status is PREFILLING, + // which means that there is a request in the prefilling phase. + // This function load its prefilling tokens, constructing a BatchConfig with + // only one request. 
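// Rough sketch of the token-budget split used by the prefilling batch
// construction below: each prefilling request receives
// min(remaining budget, remaining prompt) tokens, so a long prompt may need
// several prefilling batches. Standalone, hypothetical code for illustration.
#include <algorithm>
#include <cstdio>
#include <vector>

struct PrefillState {
  int prompt_len; // total prompt tokens of the request
  int prefilled;  // tokens already prefilled in earlier batches
};

// Returns how many tokens each request contributes to the next batch.
static std::vector<int> plan_prefill_batch(std::vector<PrefillState> const &reqs,
                                           int max_tokens_per_batch) {
  std::vector<int> plan(reqs.size(), 0);
  int used = 0;
  for (size_t i = 0; i < reqs.size(); ++i) {
    int remaining_prompt = reqs[i].prompt_len - reqs[i].prefilled;
    int n = std::min(max_tokens_per_batch - used, remaining_prompt);
    plan[i] = std::max(n, 0);
    used += plan[i];
  }
  return plan;
}

int main() {
  // Budget of 8 tokens shared by prompts with 5 and 6 tokens left to prefill:
  // the first request gets 5, the second gets 3 and finishes in a later batch.
  std::vector<PrefillState> reqs = {{5, 0}, {10, 4}};
  for (int n : plan_prefill_batch(reqs, 8)) {
    std::printf("%d ", n); // prints: 5 3
  }
  std::printf("\n");
  return 0;
}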
+ if (verbose) { + std::cout << "\n############### prepare_llm_prefilling_batch " + "##############\n"; + } + assert(prefilling_requests.size() > 0 && + "No prefilling request to process in the prefilling phase."); + + // get page manager + PageManager *page_manager = PageManager::get_page_manager(); + + BatchConfig bc; + if (decoding_mode == INCREMENTAL_DECODING) { + bc.inference_mode = InferenceMode::INC_DECODING_MODE; + } else if (decoding_mode == SPECULATIVE_DECODING) { + bc.inference_mode = InferenceMode::TREE_VERIFY_MODE; + } + bc.prompt_phase = true; + bc.num_available_requests = 0; + int num_tokens = 0; + for (Request *request : prefilling_requests) { + int request_index = request->batch_index; + + assert(request->status == Request::RUNNING); + + // Request Info + bc.requestsInfo[request_index].first_token_offset_in_batch = num_tokens; + bc.requestsInfo[request_index].first_token_index_in_request = + request->llm_cache_size; + int num_tokens_in_batch = + std::min(get_max_tokens_per_prefilling_batch() - num_tokens, + (int)request->tokens.size() - request->llm_prefill_len); + num_tokens_in_batch = std::max(num_tokens_in_batch, 0); + bc.requestsInfo[request_index].num_tokens_in_batch = num_tokens_in_batch; + + // Copy the streaming cache info + bc.streamingCacheInfo[request_index] = request->streaming_cache_info; + + request->first_token_offset_in_batch = num_tokens; + request->num_tokens_in_batch = num_tokens_in_batch; + + // Token Info + for (int idx = 0; idx < num_tokens_in_batch; idx++) { + int token_idx = num_tokens + idx; + int abs_idx = request->llm_cache_size + idx; + + bc.tokensInfo[token_idx].request_index = request_index; + bc.tokensInfo[token_idx].abs_index_in_request = abs_idx; + bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx; + assert(request->llm_prefill_len + idx < request->tokens.size()); + bc.tokensInfo[token_idx].token_id = + request->tokens[request->llm_prefill_len + idx]; + + append_token_to_block( + *request, request->tokens[request->llm_prefill_len + idx], true); + } + num_tokens += num_tokens_in_batch; + if (num_tokens_in_batch > 0) { + bc.num_available_requests++; + bc.request_available[request_index] = true; + } + bc.requestsInfo[request_index].request_guid = request->guid; + + // Record prefilling start time. 
We don't do this for speculative decoding, + // because in that case we start the timer in the ssm prefilling Step idx + // -2: enqueueing; step idx -1: prefilling begins, step idx 0: prefilling + // finished + if (decoding_mode == INCREMENTAL_DECODING) { + NewProfileInfo new_profile_info; + new_profile_info.timestamp = Realm::Clock::current_time_in_microseconds(); + new_profile_info.request_guid = request->guid; + new_profile_info.request_step_idx = -1; + new_profiling_info.push_back(new_profile_info); } } - new_bc.num_generation_tokens = num_generation_tokens; + bc.num_tokens = num_tokens; if (verbose) { - std::cout << "prepare_next_batch_init OLD vs NEW batchconfigs below:" - << std::endl; - old_bc.print(); - new_bc.print(); + std::cout << "prepare_llm_prefilling_batch NEW batchconfig:" << std::endl; + bc.print(); } - return new_bc; + return bc; } -/***** Beam Search Phase *****/ -BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_beam( - BeamSearchBatchConfigFuture const &old_bc, - BeamInferenceResultFuture const &result, - Context ctx, - Runtime *runtime) { +BatchConfig RequestManager::prepare_ssm_prefilling_batch() { + // This function is called when the request_manager_status is PREFILLING, + // which means that there is a request in the prefilling phase. + // This function load its prefilling tokens, constructing a BatchConfig with + // only one request. + if (verbose) { + std::cout << "\n############### prepare_ssm_prefilling_batch " + "##############\n"; + } + assert(prefilling_requests.size() > 0 && + "No prefilling request to process in the prefilling phase."); + + BatchConfig bc; + bc.inference_mode = InferenceMode::TREE_SEARCH_MODE; + bc.prompt_phase = true; + bc.num_available_requests = 0; + int num_tokens = 0; + for (Request *request : prefilling_requests) { + int request_index = request->batch_index; + + // Request Info + bc.requestsInfo[request_index].first_token_offset_in_batch = num_tokens; + bc.requestsInfo[request_index].first_token_index_in_request = + request->ssm_cache_size; + int num_tokens_in_batch = + std::min(get_max_tokens_per_prefilling_batch() - num_tokens, + (int)request->tokens.size() - request->ssm_prefill_len); + num_tokens_in_batch = std::max(num_tokens_in_batch, 0); + bc.requestsInfo[request_index].num_tokens_in_batch = num_tokens_in_batch; + + // Copy the streaming cache info + bc.streamingCacheInfo[request_index] = request->streaming_cache_info; + + request->first_token_offset_in_batch = num_tokens; + request->num_tokens_in_batch = num_tokens_in_batch; + + // Token Info + for (int idx = 0; idx < num_tokens_in_batch; idx++) { + int token_idx = num_tokens + idx; + int abs_idx = request->ssm_cache_size + idx; + + bc.tokensInfo[token_idx].request_index = request_index; + bc.tokensInfo[token_idx].abs_index_in_request = abs_idx; + bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx; + assert(request->ssm_prefill_len + idx < request->tokens.size()); + bc.tokensInfo[token_idx].token_id = + request->tokens[request->ssm_prefill_len + idx]; + } + num_tokens += num_tokens_in_batch; + if (num_tokens_in_batch > 0) { + bc.num_available_requests++; + // Only set the prefilling request to be available + bc.request_available[request_index] = true; + } - RequestManager *rm = this; - TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, - TaskArgument(&rm, sizeof(RequestManager *))); - launcher.add_future(old_bc); - launcher.add_future(result); - return runtime->execute_task(ctx, launcher); -} + // Record prefilling start time + // Step idx -2: 
enqueueing; step idx -1: prefilling begins, step idx 0: + // prefilling finished + NewProfileInfo new_profile_info; + new_profile_info.timestamp = Realm::Clock::current_time_in_microseconds(); + new_profile_info.request_guid = request->guid; + new_profile_info.request_step_idx = -1; + new_profiling_info.push_back(new_profile_info); + } + bc.num_tokens = num_tokens; -BeamSearchBatchConfig RequestManager::prepare_next_batch_beam_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - RequestManager *rm = *((RequestManager **)task->args); - BeamSearchBatchConfig const &bc = - Future(task->futures[0]).get_result(); - BeamInferenceResult const &result = - Future(task->futures[1]).get_result(); - return rm->prepare_next_batch_beam(bc, result); -} - -// update beam search metadata -BeamSearchBatchConfig - RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, - BeamInferenceResult const &result) { - const std::lock_guard lock(request_queue_mutex); if (verbose) { - std::cout << "\n############### prepare_next_batch_beam ###############\n"; + std::cout << "prepare_ssm_prefilling_batch NEW batchconfig:" << std::endl; + bc.print(); } + return bc; +} + +BatchConfig RequestManager::prepare_decoding_batch() { + // This function is called when the request_manager_status is DECODING. It + // fills the last token of each request in the current batch to the + // BatchConfig for the LLM to decode. if (verbose) { - std::cout << "print all results" - << "\n"; - for (int i = 0; i < 40; i++) { - std::cout << result.token_ids[i] << ", "; - } - std::cout << "Current Beam Depth: " - << old_bc.beamRequestsInfo[0].current_depth << "\n"; - std::cout << "Current sub request num: " - << old_bc.beamRequestsInfo[0].sub_request_num << "\n"; - } - // Step 1: Store result to the beam tree struct - store_beam_metadata(old_bc, result); - - // Step 2: preparing the next batch for existing requests - BeamSearchBatchConfig new_bc; - new_bc.model_id = old_bc.model_id; - // std::cout << "old_bc.model_id: " << old_bc.model_id << "\n"; - int num_generation_tokens = 0; - - // Add incremental tokens to the batch - int num_active_req = -1; - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { - if (old_bc.request_completed[i] || !old_bc.request_running[i]) { + std::cout << "\n############### prepare_decoding_batch " + "##############\n"; + } + + BatchConfig bc; + bc.inference_mode = InferenceMode::INC_DECODING_MODE; + bc.prompt_phase = false; + std::copy(std::begin(request_available), + std::end(request_available), + std::begin(bc.request_available)); + bc.num_available_requests = num_available_requests; + + for (int request_index = 0; request_index < get_max_requests_per_batch(); + request_index++) { + if (!request_available[request_index]) { continue; } - num_active_req++; - // Comment out this assertion since num_tokens_in_batch can be - // zero when beam search has reached required sequence length - // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); - Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - int processed_tokens = old_bc.requestsInfo[i].first_token_depth_in_request + - old_bc.requestsInfo[i].num_tokens_in_batch; - - // assert(processed_tokens < request.tokens.size()); - log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; - { - log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " - << new_bc.num_tokens; - new_bc.request_completed[i] = false; - 
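// Sketch of how the step-index convention recorded above (-2 = enqueued,
// -1 = prefilling begins, 0 = prefilling finished) can be turned into queueing
// and prefill latencies. The struct loosely mirrors NewProfileInfo; the
// post-processing itself is an illustrative use of the recorded timestamps,
// not something this patch performs.
#include <cstdio>
#include <map>
#include <vector>

struct StepEvent {
  long long timestamp_us;
  long long request_guid;
  int step_idx; // -2 enqueued, -1 prefill start, 0 prefill done, >0 decode steps
};

int main() {
  std::vector<StepEvent> events = {
      {1'000'000, 7, -2}, {1'250'000, 7, -1}, {1'400'000, 7, 0}};
  std::map<int, long long> by_step;
  for (auto const &e : events) {
    if (e.request_guid == 7) {
      by_step[e.step_idx] = e.timestamp_us;
    }
  }
  std::printf("queueing delay: %.3f ms\n",
              (by_step[-1] - by_step[-2]) * 1e-3); // 250.000 ms
  std::printf("prefill latency: %.3f ms\n",
              (by_step[0] - by_step[-1]) * 1e-3);  // 150.000 ms
  return 0;
}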
new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - profiling_requests[request.guid].ssm_decoding_steps += 1; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - // update the beam search metadata - // how many sub request in current request - // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH - // entries? - // update the parentid, accumalated_probs, depth, and token_ids - int ssm_decoding_steps = - profiling_requests[request.guid].ssm_decoding_steps; - - new_bc.beamRequestsInfo[i].beam_size = - spec_infer_tree_width.size() > ssm_decoding_steps - ? spec_infer_tree_width[ssm_decoding_steps] - : 1; - - new_bc.beamRequestsInfo[i].max_depth = - old_bc.beamRequestsInfo[i].max_depth; - - new_bc.sub_requests[i] = - old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; - new_bc.beamRequestsInfo[i].sub_request_num = - old_bc.beamRequestsInfo[i].sub_request_num * - old_bc.beamRequestsInfo[i].beam_size; - - assert(new_bc.beamRequestsInfo[i].sub_request_num <= - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES && - "exceed maximum nodes per layer"); - - if (request.status == Request::RUNNING) { - new_bc.beamRequestsInfo[i].current_depth = - old_bc.beamRequestsInfo[i].current_depth + 1; - new_bc.request_running[i] = true; - // do the slot exchange to minimize the cache exchange in kernel. - update_beam_metadata( - new_bc, old_bc, request.beam_trees.at(old_bc.model_id), i); + Request &request = all_requests[guid_of_requests[request_index]]; + assert(request.status == Request::RUNNING); - } else { - assert(false && "Request should not be pending in beam search phase"); - } + // Per Request Info + bc.requestsInfo[request_index].first_token_index_in_request = + request.llm_cache_size; + bc.requestsInfo[request_index].first_token_offset_in_batch = bc.num_tokens; + bc.requestsInfo[request_index].num_tokens_in_batch = 1; - // do the slot exchange to minimize the cache exchange in kernel. 
- // update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), - // i); - if (new_bc.requestsInfo[i].first_token_depth_in_request >= - request.tokens.size()) { - // Incremental phase - if (request.status == Request::RUNNING) { - // todo this is replaced by this_layer_size, but should check it - new_bc.requestsInfo[i].num_tokens_in_batch = 1; - } else { - assert(false && "Request should be done"); - // new_bc.requestsInfo[i].num_tokens_in_batch = 0; - } + // Copy the streaming cache info + bc.streamingCacheInfo[request_index] = request.streaming_cache_info; - if (verbose) { - std::cout << "[ Beam Spec] " << request.guid << std::endl; - std::cout << "Incremental phase: " << request.tokens.size() - << ", num_tokens_in_batch: " - << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; - } - } + request.first_token_offset_in_batch = bc.num_tokens; + request.num_tokens_in_batch = 1; - if (verbose) { - std::cout << "SSM KV Cache Size beam: " << request.ssm_cache_size - << std::endl; - std::cout << "LLM KV Cache Size beam: " << request.llm_cache_size - << std::endl; - } + // Per Token Info + bc.tokensInfo[bc.num_tokens].request_index = request_index; + bc.tokensInfo[bc.num_tokens].abs_index_in_request = request.llm_cache_size; + bc.tokensInfo[bc.num_tokens].abs_depth_in_request = request.llm_cache_size; + bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back(); + bc.requestsInfo[request_index].request_guid = request.guid; - // register more tokens due to the beam width - - // copy metadata - memcpy(&new_bc.causalMask[i], - &old_bc.causalMask[i], - sizeof(BatchConfig::BitMask)); - BeamTree tree = request.beam_trees[old_bc.model_id]; - appendBitMask(new_bc.causalMask[i], - new_bc.beamRequestsInfo[i].sub_request_num, - old_bc.beamRequestsInfo[i].beam_size, - old_bc.beamRequestsInfo[i].sub_request_num, - tree, - old_bc.beamRequestsInfo[i].current_depth); - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - - // get value from requestinfo - new_bc.tokensInfo[new_bc.num_tokens].token_id = - new_bc.beamRequestsInfo[i].tokens[k]; - - new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; - new_bc.num_tokens++; + bc.num_tokens++; - num_generation_tokens++; - } - } + if (profiling_requests[request.guid].llm_decoding_steps == 0) { + profiling_requests[request.guid].start_decoding_time = + Realm::Clock::current_time_in_microseconds(); } } - // how many requests is in speculative phase - new_bc.speculative_request_num = num_active_req + 1; + if (verbose) { + std::cout << "prepare_decoding_batch NEW batchconfig:" << std::endl; + bc.print(); + } + profiling.llm_step_start = Realm::Clock::current_time_in_microseconds(); + return bc; +} - // Add prompt tokens to the batch - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { - if (old_bc.request_completed[i] || old_bc.request_running[i]) { +BatchConfig RequestManager::prepare_decoding_batch_fcfs_slo() { + // This function is called when the request_manager_status is DECODING. It + // fills the last token of each request in the current batch to the + // BatchConfig for the LLM to decode. 
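As an aside on the decoding-batch construction used by prepare_decoding_batch() and its SLO-aware variants: in DECODING mode every running request contributes exactly one token, placed at the position equal to that request's current LLM KV-cache length. A minimal standalone sketch of that layout, using simplified stand-in types rather than FlexFlow's actual BatchConfig and Request classes:

// Illustrative sketch only: simplified stand-ins for BatchConfig/Request,
// not FlexFlow's real types.
#include <cassert>
#include <vector>

struct DecodeRequest {        // hypothetical, mirrors the fields used above
  int llm_cache_size;         // tokens already in the LLM KV cache
  int last_token_id;          // request.tokens.back()
};

struct DecodeSlot {           // one entry of a simplified tokensInfo array
  int request_index;
  int abs_index_in_request;   // == llm_cache_size for a decode step
  int token_id;
};

// One token per running request: the new token's position equals the current
// KV-cache length, and the token fed in is the last generated one.
std::vector<DecodeSlot> build_decode_batch(
    std::vector<DecodeRequest> const &running) {
  std::vector<DecodeSlot> batch;
  for (int i = 0; i < (int)running.size(); i++) {
    DecodeSlot slot;
    slot.request_index = i;
    slot.abs_index_in_request = running[i].llm_cache_size;
    slot.token_id = running[i].last_token_id;
    batch.push_back(slot);
  }
  assert(batch.size() == running.size()); // exactly one token per request
  return batch;
}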
+ if (verbose) { + std::cout << "\n############### prepare_decoding_batch_fcfs_slo " + "##############\n"; + } + + BatchConfig bc; + bc.inference_mode = InferenceMode::INC_DECODING_MODE; + bc.prompt_phase = false; + + // Check if there are any requests whose SLO is in the fastest category + std::fill(request_available, + request_available + get_max_requests_per_batch(), + false); + num_available_requests = 0; + std::vector fcfs_request_queue; + for (int request_index = 0; request_index < get_max_requests_per_batch(); + request_index++) { + if (guid_of_requests[request_index] == INVALID_GUID) { continue; } - num_active_req++; - // Comment out this assertion since num_tokens_in_batch can be - // zero when beam search has reached required sequence length - // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); - Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - int processed_tokens = old_bc.requestsInfo[i].first_token_depth_in_request + - old_bc.requestsInfo[i].num_tokens_in_batch; - - // assert(processed_tokens < request.tokens.size()); - log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; - - { - log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " - << new_bc.num_tokens; - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - - // update the beam search metadata - // how many sub request in current request - // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH - // entries? - int ssm_decoding_steps = - profiling_requests[request.guid].ssm_decoding_steps; - - new_bc.beamRequestsInfo[i].beam_size = 1; - // printf("beam size: %d, %d\n", - // new_bc.beamRequestsInfo[i].beam_size, - // ssm_decoding_steps); - new_bc.beamRequestsInfo[i].max_depth = - old_bc.beamRequestsInfo[i].max_depth; - // new_bc.sub_requests[i] = - // old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; - new_bc.sub_requests[i] = 1; - new_bc.beamRequestsInfo[i].sub_request_num = - old_bc.beamRequestsInfo[i].sub_request_num; - - assert(new_bc.beamRequestsInfo[i].sub_request_num <= - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES && - "exceed maximum nodes per layer"); - - // update the parentid, accumalated_probs, depth, and token_ids - - if (request.status == Request::PENDING) { - // if the request is pending, we need to update the beam search - // metadata based on the initial length - new_bc.beamRequestsInfo[i].current_depth = - old_bc.beamRequestsInfo[i].current_depth; - new_bc.request_running[i] = false; - } else { - assert(false && "Request should be pending"); + Request &request = all_requests[guid_of_requests[request_index]]; + assert(request.status == Request::RUNNING); + fcfs_request_queue.push_back(request); + } + + // Sort the requests in the FCFS queue based on the decoding time in + // descending order + std::sort(fcfs_request_queue.begin(), + fcfs_request_queue.end(), + [](Request const &a, Request const &b) { + return a.decode_latency_ms < b.decode_latency_ms; + }); + + // Include the requests one by one until: + // 1. If the batch includes a request whose SLO is in the fastest category, + // limit the number of requests in the batch to 8. + // 2. 
If the batch does not include a request whose SLO is in the fastest + // category, keep adding requests until a request whose SLO is in the fastest + // category is met (do not include it). + bool has_fastest_slo = false; + for (Request &request : fcfs_request_queue) { + if (has_fastest_slo and num_available_requests >= 8) { + break; + } + if (request.get_slo_ratio() <= 1.0) { + has_fastest_slo = true; + if (num_available_requests >= 8) { + break; } + } + request_load_onto_batch(request.batch_index); + } - memcpy(&new_bc.causalMask[i], - &old_bc.causalMask[i], - sizeof(BatchConfig::BitMask)); - - new_bc.requestsInfo[i].prompt_phase = true; - if (new_bc.requestsInfo[i].first_token_depth_in_request >= - request.tokens.size()) { - // request is done - new_bc.requestsInfo[i].num_tokens_in_batch = 0; - new_bc.causalMask[i].this_layer_size = 0; - new_bc.beamRequestsInfo[i].sub_request_num = 0; - new_bc.beamRequestsInfo[i].beam_size = 1; - } else { - // Prompt phase - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens - - BatchConfig::max_requests_per_batch() + i, - (int)request.tokens.size() - - new_bc.requestsInfo[i].first_token_depth_in_request); - request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; - BeamTree tree = request.beam_trees[old_bc.model_id]; - appendPendingRequest(new_bc.causalMask[i], - new_bc.requestsInfo[i].num_tokens_in_batch); - } + std::copy(std::begin(request_available), + std::end(request_available), + std::begin(bc.request_available)); + bc.num_available_requests = num_available_requests; - if (verbose) { - std::cout << "[ Beam Spec] " << request.guid << std::endl; - std::cout << "Prompt phase: " << request.tokens.size() - << ", num_tokens_in_batch:" - << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; - std::cout << "Update ssm cache size: " << request.ssm_cache_size - << std::endl; + for (int request_index = 0; request_index < get_max_requests_per_batch(); + request_index++) { + if (!request_available[request_index]) { + continue; + } + Request &request = all_requests[guid_of_requests[request_index]]; + assert(request.status == Request::RUNNING); - std::cout << "SSM KV Cache Size beam: " << request.ssm_cache_size - << std::endl; - std::cout << "LLM KV Cache Size beam: " << request.llm_cache_size - << std::endl; - } + // Per Request Info + bc.requestsInfo[request_index].first_token_index_in_request = + request.llm_cache_size; + bc.requestsInfo[request_index].first_token_offset_in_batch = bc.num_tokens; + bc.requestsInfo[request_index].num_tokens_in_batch = 1; - // register more tokens due to the beam width - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; + // Copy the streaming cache info + bc.streamingCacheInfo[request_index] = request.streaming_cache_info; - // get value from requestinfo - new_bc.tokensInfo[new_bc.num_tokens].token_id = - request.tokens[request.tokens.size() - - new_bc.requestsInfo[i].num_tokens_in_batch + j]; + request.first_token_offset_in_batch = bc.num_tokens; + request.num_tokens_in_batch = 1; - new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; - new_bc.num_tokens++; - } - } + // Per Token Info + bc.tokensInfo[bc.num_tokens].request_index = request_index; + 
bc.tokensInfo[bc.num_tokens].abs_index_in_request = request.llm_cache_size; + bc.tokensInfo[bc.num_tokens].abs_depth_in_request = request.llm_cache_size; + bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back(); + + bc.num_tokens++; + + if (profiling_requests[request.guid].llm_decoding_steps == 0) { + profiling_requests[request.guid].start_decoding_time = + Realm::Clock::current_time_in_microseconds(); } } - new_bc.num_generation_tokens = num_generation_tokens; if (verbose) { - std::cout << "prepare_next_batch_beam OLD vs NEW batchconfigs:" + std::cout << "prepare_decoding_batch_fcfs_slo NEW batchconfig:" << std::endl; - old_bc.print(); - new_bc.print(); + bc.print(); } - return new_bc; + profiling.llm_step_start = Realm::Clock::current_time_in_microseconds(); + return bc; } -/***** Verify Phase *****/ +BatchConfig RequestManager::prepare_decoding_batch_stta() { + // This function is called when the request_manager_status is DECODING. It + // fills the last token of each request in the current batch to the + // BatchConfig for the LLM to decode. + if (verbose) { + std::cout << "\n############### prepare_decoding_batch_stta " + "##############\n"; + } + + BatchConfig bc; + bc.inference_mode = InferenceMode::INC_DECODING_MODE; + bc.prompt_phase = false; + + // Check if there are any requests whose SLO is in the fastest category + std::fill(request_available, + request_available + get_max_requests_per_batch(), + false); + num_available_requests = 0; + std::vector> tta_2_batch_index; + for (int request_index = 0; request_index < get_max_requests_per_batch(); + request_index++) { + if (guid_of_requests[request_index] == INVALID_GUID) { + continue; + } + Request &request = all_requests[guid_of_requests[request_index]]; + assert(request.status == Request::RUNNING); + tta_2_batch_index.push_back(std::make_pair( + get_request_expected_latency(request) - request.decode_latency_ms, + request_index)); + } + + // Sort the requests in the queue based on the time to attain SLO in ascending + // order + std::sort(tta_2_batch_index.begin(), + tta_2_batch_index.end(), + [](std::pair const &a, + std::pair const &b) { return a.first < b.first; }); + + // Include the requests one by one until: + // 1. If the batch includes a request whose SLO is in the fastest category, + // limit the number of requests in the batch to 8. + // 2. If the batch does not include a request whose SLO is in the fastest + // category, keep adding requests until a request whose SLO is in the fastest + // category is met (do not include it). 
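The admission rule described in the comment above is shared by prepare_decoding_batch_fcfs_slo() and prepare_decoding_batch_stta(); only the sort key differs (decode latency vs. time-to-attain). A hedged standalone sketch of that rule follows; the cap of 8 requests and the slo_ratio <= 1.0 threshold come from the surrounding code, while the helper and its types are illustrative, not FlexFlow API.

// Illustrative sketch of the shared SLO-aware admission rule; candidate
// ordering (FCFS by decode latency vs. shortest time-to-attain) is assumed
// to have been applied by the caller.
#include <vector>

struct Candidate {            // hypothetical stand-in for a runnable request
  int batch_index;
  double slo_ratio;           // <= 1.0 means "fastest" SLO category
};

// Returns the batch indices to load, given candidates already sorted by the
// scheduling policy's priority key.
std::vector<int> admit_requests(std::vector<Candidate> const &sorted,
                                int fastest_slo_cap = 8) {
  std::vector<int> admitted;
  bool has_fastest_slo = false;
  for (Candidate const &c : sorted) {
    // Once a fastest-SLO request is in the batch, cap the batch size.
    if (has_fastest_slo && (int)admitted.size() >= fastest_slo_cap) {
      break;
    }
    if (c.slo_ratio <= 1.0) {
      has_fastest_slo = true;
      if ((int)admitted.size() >= fastest_slo_cap) {
        break; // do not include the fastest-SLO request itself
      }
    }
    admitted.push_back(c.batch_index);
  }
  return admitted;
}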
+ bool has_fastest_slo = false; + for (auto const &[tta, request_index] : tta_2_batch_index) { + Request &request = all_requests[guid_of_requests[request_index]]; + if (has_fastest_slo and num_available_requests >= 8) { + break; + } + if (request.get_slo_ratio() <= 1.0) { + has_fastest_slo = true; + if (num_available_requests >= 8) { + break; + } + } + request_load_onto_batch(request_index); + } -TreeVerifyBatchConfigFuture RequestManager::prepare_next_batch_verify( - std::vector const &old_batches, - Context ctx, - Runtime *runtime) { + std::copy(std::begin(request_available), + std::end(request_available), + std::begin(bc.request_available)); + bc.num_available_requests = num_available_requests; - RequestManager *rm = this; - TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, - TaskArgument(&rm, sizeof(RequestManager *))); - for (auto const &bcf : old_batches) { - launcher.add_future(bcf); - } - return runtime->execute_task(ctx, launcher); -} + for (int request_index = 0; request_index < get_max_requests_per_batch(); + request_index++) { + if (!request_available[request_index]) { + continue; + } + Request &request = all_requests[guid_of_requests[request_index]]; + assert(request.status == Request::RUNNING); -TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - RequestManager *rm = *((RequestManager **)task->args); - std::vector old_batches; - for (auto const &bcf : task->futures) { - old_batches.push_back(Future(bcf).get_result()); - } - return rm->prepare_next_batch_verify(old_batches); -} + // Per Request Info + bc.requestsInfo[request_index].first_token_index_in_request = + request.llm_cache_size; + bc.requestsInfo[request_index].first_token_offset_in_batch = bc.num_tokens; + bc.requestsInfo[request_index].num_tokens_in_batch = 1; -TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( - std::vector const &old_batches) { - const std::lock_guard lock(request_queue_mutex); + // Copy the streaming cache info + bc.streamingCacheInfo[request_index] = request.streaming_cache_info; - if (verbose) { - std::cout - << "\n############### prepare_next_batch_verify ###############\n"; - } + request.first_token_offset_in_batch = bc.num_tokens; + request.num_tokens_in_batch = 1; - assert(old_batches.size() > 0); + // Per Token Info + bc.tokensInfo[bc.num_tokens].request_index = request_index; + bc.tokensInfo[bc.num_tokens].abs_index_in_request = request.llm_cache_size; + bc.tokensInfo[bc.num_tokens].abs_depth_in_request = request.llm_cache_size; + bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back(); - TreeVerifyBatchConfig new_bc; - new_bc.num_tokens_to_commit = 0; - new_bc.num_tokens = 0; + bc.num_tokens++; - int max_prompt_load_size = get_max_verify_tokens_per_batch(); - for (int i = 0; i < TreeVerifyBatchConfig::max_requests_per_batch(); i++) { - if (old_batches.at(0).request_completed[i]) { - continue; - } else if (old_batches.at(0).request_running[i]) { - max_prompt_load_size -= (BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1); - } else { - max_prompt_load_size -= 1; + if (profiling_requests[request.guid].llm_decoding_steps == 0) { + profiling_requests[request.guid].start_decoding_time = + Realm::Clock::current_time_in_microseconds(); } } - int num_active_req = -1; - for (int i = 0; i < TreeVerifyBatchConfig::max_requests_per_batch(); i++) { - if (old_batches.at(0).request_completed[i]) { - continue; - } - num_active_req++; - size_t guid = 
old_batches.at(0).requestsInfo[i].request_guid; - Request &request = all_requests[guid]; - - // Profiling - profiling_requests[request.guid].llm_decoding_steps += 1; - if (request.status == Request::RUNNING) { - new_bc.request_running[i] = true; + if (verbose) { + std::cout << "prepare_decoding_batch_fcfs_slo NEW batchconfig:" + << std::endl; + bc.print(); + } + profiling.llm_step_start = Realm::Clock::current_time_in_microseconds(); + return bc; +} +/* ----- Speculative Inference Specific functions ----- */ - // Get the dfs tree - std::vector>> - all_dfs_trees; +/***** Request Init Phase *****/ +BatchConfig RequestManager::prepare_first_spec_batch_config() { + if (verbose) { + std::cout << "\n############### prepare_first_spec_batch_config " + "##############\n"; + } + // This method does the following: + // 1. Commit the verified tokens through BatchConfig. The infomation + // of the committed tokens are stored in request.committed_tokens. Put the + // information of the committed tokens into BatchConfig.TokensInfo. + // 2. Maintain BatchConfig::RequestsInfo and all other fields of + // BatchConfig. + assert(current_ssm_step == 0); - for (int j = 0; j < old_batches.size(); j++) { - std::vector> new_tree = - traverse_beam_tree(old_batches.at(j), i, request.tokens.size() - 1); - all_dfs_trees.push_back(new_tree); + BatchConfig new_bc; + new_bc.inference_mode = InferenceMode::TREE_SEARCH_MODE; + // Assume that only one small model is in use now + new_bc.prompt_phase = true; + std::copy(std::begin(request_available), + std::end(request_available), + std::begin(new_bc.request_available)); + new_bc.num_available_requests = num_available_requests; + + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + + std::vector &committed_tokens = + request.committed_tokens; + + // Maintain requestsInfo + new_bc.requestsInfo[request_index].first_token_offset_in_batch = + new_bc.num_tokens; + new_bc.requestsInfo[request_index].first_token_index_in_request = + request.ssm_cache_size; + + // Store committed tokens to tokensInfo + int num_committed_tokens = committed_tokens.size(); + if (num_committed_tokens == 1) { + new_bc.requestsInfo[request_index].num_tokens_in_batch = 1; + // The case where the prefilling is just finished. Although the last + // token's kv cache is already there, the we need to decode the last + // token because it's the root of the token tree. 
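Taken together, this case and the multi-token case handled next implement a simple rule: the SSM processes exactly one token when prefilling has just finished (that token becomes the root of the new speculative token tree), and otherwise all committed tokens except the first one, whose KV entry the SSM already holds. A small illustrative helper (hypothetical, not part of RequestManager) capturing that rule:

// Which committed-token indices must be fed to the SSM in the first spec
// batch. Illustrative only; mirrors the two branches in the surrounding code.
#include <vector>

std::vector<int> ssm_tokens_to_feed(int num_committed_tokens) {
  std::vector<int> indices;
  if (num_committed_tokens == 1) {
    // Prefilling just finished: re-feed the single committed token, since it
    // is the root of the new speculative token tree.
    indices.push_back(0);
  } else {
    // Skip index 0 (its KV cache is already present); feed the rest.
    for (int i = 1; i < num_committed_tokens; i++) {
      indices.push_back(i);
    }
  }
  return indices; // size is 1 or num_committed_tokens - 1, matching
                  // requestsInfo[...].num_tokens_in_batch above
}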
+ new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index; + if (streaming_cache) { + new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = + request.ssm_cache_size; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + request.ssm_cache_size; + } else { + new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = + committed_tokens[0].to_index; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + committed_tokens[0].to_index; } - assert(all_dfs_trees.size() == old_batches.size()); - std::vector> dfs_tree_inputs = - merge_dfs_trees(all_dfs_trees, request.tokens.size() - 1, guid); - - if (verbose) { - std::cout << "Request Tokens Size: " << request.tokens.size() - << std::endl; - for (int k = 0; k < request.tokens.size(); k++) { - std::cout << k << ": " << request.tokens[k] << std::endl; + new_bc.tokensInfo[new_bc.num_tokens].token_id = + committed_tokens[0].token_id; + new_bc.num_tokens++; + } else { + for (int committed_token_index = 1; + committed_token_index < committed_tokens.size(); + committed_token_index++) { + new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index; + if (streaming_cache) { + new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = + request.ssm_cache_size + committed_token_index - 1; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + request.ssm_cache_size + committed_token_index - 1; + } else { + new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = + committed_tokens[committed_token_index].to_index; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + committed_tokens[committed_token_index].to_index; } + new_bc.tokensInfo[new_bc.num_tokens].token_id = + committed_tokens[committed_token_index].token_id; + new_bc.num_tokens++; } + new_bc.requestsInfo[request_index].num_tokens_in_batch = + num_committed_tokens - 1; + } - // Normal Request Info - new_bc.requestsInfo[i].first_token_depth_in_request = - dfs_tree_inputs.front().second; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = - old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_batches.at(0).requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - - // copy bitmask to verify batchconfig - memcpy(&(new_bc.causalMask[i]), - &(old_batches.at(0).causalMask[i]), - sizeof(BatchConfig::BitMask)); - // TODO: Check this - new_bc.requestsInfo[i].num_tokens_in_batch = 0; - new_bc.request_completed[i] = false; - - // std::cout << "dfs_tree_inputs: " << dfs_tree_inputs.size() << ", " - // << new_bc.causalMask[i].tree_size << ", " - // << new_bc.causalMask[i].non_tree_cache_size << "\n"; - // std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[0]) - // << "\n"; - - // Committed Tokens - if (committed_tokens.find(guid) != committed_tokens.end()) { - for (int j = 0; j < committed_tokens.at(guid).size(); j++) { - // if (j < committed_tokens.at(guid).size()) { - - auto committed_token = committed_tokens.at(guid).at(j); - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = - committed_token.second; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = - i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = - committed_token.first; - if (verbose) { - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " - << committed_token.first - << ", token_index: " << committed_token.second - << std::endl; - } - 
new_bc.num_tokens_to_commit++; - request.llm_cache_size++; - // } - } - } - if (verbose) { - std::cout << "new_bc.num_tokens_to_commit: " - << new_bc.num_tokens_to_commit << std::endl; - } + request.first_token_offset_in_batch = + new_bc.requestsInfo[request_index].first_token_offset_in_batch; + request.num_tokens_in_batch = + new_bc.requestsInfo[request_index].num_tokens_in_batch; + + // Copy the causal mask, it should already been updated in + // update_llm_verify_results + new_bc.causalMask[request_index] = request.causal_mask; + if (streaming_cache) { + new_bc.causalMask[request_index].non_tree_cache_size = + request.ssm_cache_size - 1; + } - // Incremental phase: only add the last committed token - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = - request.tokens.size() - 1; + // Copy the streaming cache info + new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info; - new_bc.num_tokens++; - new_bc.requestsInfo[i].num_tokens_in_batch++; + if (profiling_requests[guid].ssm_decoding_steps == 0) { + profiling_requests[guid].start_decoding_time = + Realm::Clock::current_time_in_microseconds(); + } + profiling.ssm_step_start = Realm::Clock::current_time_in_microseconds(); + } - if (new_bc.num_tokens > get_max_verify_tokens_per_batch()) { - assert(false && - "Exceeding the space available in the TreeVerify batch"); - break; - } + if (!spec_infer_old_version) { + // Only dynamically update the tree depth in the new version + update_token_tree_depth(); + } + if (verbose) { + std::cout << "prepare_first_spec_batch_config NEW batchconfig:" + << std::endl; + new_bc.print(); + } + return new_bc; +} - new_bc.requestsInfo[i].first_token_depth_in_request = - request.tokens.size() - 1; +/***** Speculative Decoding Phase *****/ +BatchConfig RequestManager::prepare_next_spec_batch_config() { + if (verbose) { + std::cout << "\n############### prepare_next_spec_batch_config " + "###############\n"; + std::cout << "Current tree depth: " << current_ssm_step + 1 << "\n"; + } - bool cutLayer = false; - // Add Tokens from the DFS Tree to the next batch - for (int j = 1; j < dfs_tree_inputs.size(); j++) { - auto token = dfs_tree_inputs.at(j); - if (verbose) { - std::cout << "[" << j << "] Token: " << token.first - << ", Depth:" << token.second << std::endl; - } - // Normal Token Info - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; + // Prepare the next batch for existing requests + BatchConfig new_bc; + new_bc.inference_mode = InferenceMode::TREE_SEARCH_MODE; + // We assume that only one small model is in use now + new_bc.model_id = 0; + std::copy(std::begin(request_available), + std::end(request_available), + std::begin(new_bc.request_available)); + new_bc.num_available_requests = num_available_requests; + + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + int guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + new_bc.requestsInfo[request_index].first_token_offset_in_batch = + new_bc.num_tokens; + + // Fill in the tokens + TokenTree &token_tree = request.speculative_token_trees.at(new_bc.model_id); + if (token_tree.tree_layers.size() <= current_ssm_step) { + // This request has no token to decode in 
this and the following small + // model inference steps + new_bc.requestsInfo[request_index].num_tokens_in_batch = 0; + // non_tree_cache_size = ssm_cache_size - 1 + new_bc.requestsInfo[request_index].first_token_index_in_request = + request.ssm_cache_size - 1 + request.causal_mask.tree_or_prompt_size - + request.causal_mask.current_layer_size; + request.num_tokens_in_batch = 0; + request.first_token_offset_in_batch = new_bc.num_tokens; + continue; + } else { + std::vector> ¤t_layer = + token_tree.tree_layers.back(); + // Exclude the current layer from the token tree, because we want the + // start index + // non_tree_cache_size = ssm_cache_size - 1 + new_bc.requestsInfo[request_index].first_token_index_in_request = + request.ssm_cache_size - 1 + request.causal_mask.tree_or_prompt_size - + request.causal_mask.current_layer_size; + new_bc.requestsInfo[request_index].num_tokens_in_batch = + request.causal_mask.current_layer_size; + + request.num_tokens_in_batch = + new_bc.requestsInfo[request_index].num_tokens_in_batch; + request.first_token_offset_in_batch = new_bc.num_tokens; + + int child_index = 0; + for (auto const &node_ptr : current_layer) { + new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index; + new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = + new_bc.requestsInfo[request_index].first_token_index_in_request + + child_index; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = - token.second; + request.ssm_cache_size - 1 + current_ssm_step; + new_bc.tokensInfo[new_bc.num_tokens].token_id = node_ptr->id; new_bc.num_tokens++; - new_bc.requestsInfo[i].num_tokens_in_batch++; - - if (new_bc.num_tokens == get_max_verify_tokens_per_batch() && - (j != dfs_tree_inputs.size() - 1)) { - cutLayer = true; - break; - } + child_index++; } + } - // delete the last incomplete layer - if (cutLayer) { - int total_tokens = new_bc.num_tokens; - for (int j = total_tokens - 1; j >= 1; j--) { - new_bc.num_tokens--; - new_bc.requestsInfo[i].num_tokens_in_batch--; - // std::cout << "cut: " << j << "\n"; - if (new_bc.tokensInfo[j].abs_depth_in_request != - new_bc.tokensInfo[j - 1].abs_depth_in_request) { - break; - } - } - } + // Copy the causal mask, it should already been updated by + // update_ssm_inference_results + new_bc.causalMask[request_index] = request.causal_mask; + if (streaming_cache) { + new_bc.causalMask[request_index].non_tree_cache_size = + request.ssm_cache_size - 1; + } - } else if (request.status == Request::PENDING) { - new_bc.request_running[i] = false; - if (verbose) { - std::cout << "[Verify] Request " << request.guid - << " is pending in loading prompt phase" << std::endl; - std::cout << "SSM KV Cache Size verify: " << request.ssm_cache_size - << std::endl; - std::cout << "LLM KV Cache Size verify: " << request.llm_cache_size - << std::endl; - } + // Copy the streaming cache info + new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info; + } - // Commit all tokens from the last loading batch - if (committed_tokens.find(guid) != committed_tokens.end()) { - for (int j = 0; j < committed_tokens.at(guid).size(); j++) { - auto token = committed_tokens.at(guid).at(j); - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = - token.second; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = - i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = - token.first; - - new_bc.num_tokens_to_commit++; - request.llm_cache_size++; - } - std::cout << "[Verify] Committed Tokens from last loading batch: " 
- << new_bc.num_tokens_to_commit << std::endl; - } + if (verbose) { + std::cout << "prepare_next_spec_batch_config NEW batchconfig:" << std::endl; + new_bc.print(); + } + return new_bc; +} - memcpy(&(new_bc.causalMask[i]), - &(old_batches.at(0).causalMask[i]), - sizeof(BatchConfig::BitMask)); - - // Normal Request Info - new_bc.requestsInfo[i].first_token_depth_in_request = - request.llm_cache_size; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = - old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_batches.at(0).requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - - new_bc.request_completed[i] = false; - - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(max_prompt_load_size, - (int)request.initial_len - - new_bc.requestsInfo[i].first_token_depth_in_request); - max_prompt_load_size -= new_bc.requestsInfo[i].num_tokens_in_batch; - - std::cout << "max_prompt_load_size: " << max_prompt_load_size - << std::endl; +/***** Verify Phase *****/ +BatchConfig RequestManager::prepare_verify_batch_config() { + if (verbose) { + std::cout + << "\n############### prepare_verify_batch_config ###############\n"; + } + // This method does the following: + // 1. Commit the verified tokens in the last iteration through the + // BatchConfig. We can do this request by request. + // The information of the committed tokens is stored in + // Request.llm_committed_tokens. Put the information of the committed tokens + // into BatchConfig.committed_tokens. + // 2. Load the tokens on the token tree that are not yet pruned to + // BatchConfig.tokensInfo. Be careful with the abs_depth etc. + // (skip the pruned tokens). + // 3. Create the causal mask for the large model based on the small model + // causal mask (call create_llm_bitmask()). + // 4. Maintain BatchConfig::RequestsInfo and all other fields of + // BatchConfig. + // Please refer to the implementation of prepare_next_spec_batch_config() + // for more details. + BatchConfig new_bc; + new_bc.inference_mode = InferenceMode::TREE_VERIFY_MODE; + std::copy(std::begin(request_available), + std::end(request_available), + std::begin(new_bc.request_available)); + new_bc.num_available_requests = num_available_requests; + + // get page manager + PageManager *page_manager = PageManager::get_page_manager(); + + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + int guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + + // before commit token, reset the pages assigned by cleaning all the tokens + std::vector block_table_before_commit = + page_manager->get_block_table_indices(guid); + // also need to reset the pages + reset_block_table(request); + + // 1. Maintain requestsInfo + new_bc.requestsInfo[request_index].first_token_index_in_request = + request.tokens.size() - 1; // Exclude the last token + new_bc.requestsInfo[request_index].first_token_offset_in_batch = + new_bc.num_tokens; + new_bc.requestsInfo[request_index].num_tokens_in_batch = 0; + + // Put the information of the committed tokens into + // BatchConfig.committed_tokens. + // Note here, we shouldn't put the last token in request.committed_tokens + // into new_bc. Because the LLM don't have that token's KV cache. 
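The commit loop that follows translates each committed token's logical position in the request into a physical slot of the paged KV cache: the logical index is split into a page number and an in-page offset, the page number is looked up in the request's block table, and the offset is reused. A hedged sketch of that address translation; kPageSize and the block-table vector here are stand-ins for the PageManager state (the real code uses kPagesize and get_block_table_indices()).

// Illustrative sketch of the logical-to-physical KV-cache index translation
// used when committing verified tokens.
#include <vector>

constexpr int kPageSize = 64; // assumed page size; the real value comes from
                              // the page manager configuration

int logical_to_physical(int logical_index,
                        std::vector<int> const &block_table) {
  int page = logical_index / kPageSize;          // which logical page
  int offset = logical_index % kPageSize;        // position inside the page
  return block_table[page] * kPageSize + offset; // physical slot in KV cache
}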
+ std::vector &committed_tokens = + request.committed_tokens; + for (int committed_token_index = 0; + committed_token_index < committed_tokens.size() - 1; + committed_token_index++) { + Request::CommittedToken &committed_token = + committed_tokens.at(committed_token_index); + + int idx_to_physical = + append_token_to_block(request, committed_token.token_id, true); + int idx_from_logical = committed_token.from_index; + int idx_from_physical = + block_table_before_commit[idx_from_logical / kPagesize] * kPagesize + + idx_from_logical % kPagesize; + + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + request_index; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].index_in_kv_cache = + idx_from_physical; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = + idx_to_physical; + new_bc.num_tokens_to_commit++; + } - if (request.llm_cache_size < request.initial_len) { - // std::cout << "Initialization (prompt) phase: " - // << new_bc.requestsInfo[i].num_tokens_in_batch << ", " - // << old_batches.at(0).beamRequestsInfo[i].beam_size << "\n"; - // Initialization (prompt) phase - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = - request.tokens[request.llm_cache_size + j]; + // Load the tokens on the token tree that are not yet pruned to + // BatchConfig.tokensInfo. + TokenTree &token_tree = request.speculative_token_trees[0]; + int token_tree_index = 0; + int layer_index = 0; + for (auto const &tree_layer : token_tree.tree_layers) { + for (auto const &tree_node : tree_layer) { + if (tree_node->included == true) { + new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index; + new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = + request.tokens.size() - 1 + token_tree_index; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = - request.llm_cache_size + j; + request.tokens.size() - 1 + layer_index; + new_bc.tokensInfo[new_bc.num_tokens].token_id = tree_node->id; new_bc.num_tokens++; - } + token_tree_index++; - if (new_bc.num_tokens > get_max_verify_tokens_per_batch()) { - printf("Exceeding (%i) the space available (%i) in the TreeVerify " - "batch\n", - new_bc.num_tokens, - get_max_verify_tokens_per_batch()); - assert(false); + // Append the token to the block + append_token_to_block(request, tree_node->id, false); } + } + layer_index++; + } + if (verbose) { + // print token tree + std::cout << "Token tree for request " << request_index << ": " + << std::endl; + std::cout << token_tree << std::endl; + } + new_bc.requestsInfo[request_index].num_tokens_in_batch = token_tree_index; - if (new_bc.requestsInfo[i].num_tokens_in_batch + - request.llm_cache_size >= - request.initial_len) { - // launch the request into running phase after loading all prompt - request.status = Request::RUNNING; - new_bc.request_running[i] = true; - - // std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " - // << new_bc.requestsInfo[i].num_tokens_in_batch << - // std::endl; - new_bc.requestsInfo[i].prompt_phase = true; - - dfs_tree_inputs[guid] = - std::vector>{std::make_pair( - request.tokens.back(), request.tokens.size() - 1)}; - } - } else { // launch the request into running phase after loading all prompt - if (get_max_verify_tokens_per_batch() - new_bc.num_tokens > 0) { - // std::cout << "Initialization running phase: " - // << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; - request.status = Request::RUNNING; - 
new_bc.request_running[i] = true; - - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = - request.tokens.size() - 1; + request.first_token_offset_in_batch = new_bc.num_tokens - token_tree_index; + request.num_tokens_in_batch = token_tree_index; - new_bc.num_tokens++; - new_bc.requestsInfo[i].num_tokens_in_batch++; - // std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch2: " - // << new_bc.requestsInfo[i].num_tokens_in_batch << - // std::endl; - - new_bc.requestsInfo[i].prompt_phase = true; - dfs_tree_inputs[guid] = - std::vector>{std::make_pair( - request.tokens.back(), request.tokens.size() - 1)}; - } - } + // Create the causal mask for the large model based on the small model + // causal mask. + new_bc.causalMask[request_index] = create_llm_bitmask(guid); - } else { - assert(false && "Request status is not RUNNING or PENDING"); - } + // Copy the streaming cache info + new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info; + new_bc.requestsInfo[request_index].request_guid = request.guid; } + if (verbose) { + std::cout << "prepare_verify_batch_config NEW batchconfig:" << std::endl; + new_bc.print(); + } + profiling.llm_step_start = Realm::Clock::current_time_in_microseconds(); return new_bc; } -void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, - BeamInferenceResult const &result) { - // step1 store the outputs - if (old_bc.num_tokens <= 0) { - return; +int get_tree_size(Request const &request) { + int size = 0; + for (auto &layer : request.speculative_token_trees[0].tree_layers) { + size += (int)layer.size(); } - auto guid = - old_bc.requestsInfo[old_bc.tokensInfo[0].request_index].request_guid; - auto start_depth = old_bc.tokensInfo[0].abs_depth_in_request; - int result_index = 0; - - if (verbose) { - std::cout << "Store total of " << old_bc.num_tokens - << " tokens in the current batch.\n"; - } - - for (int i = 0; i <= old_bc.num_tokens; i++) { - if (i == old_bc.num_tokens || - old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid != - guid) { - - // std::cout << "i is: " << i << "old guid" << guid << " new guid" - // << old_bc.requestsInfo[old_bc.tokensInfo[i].request_index] - // .request_guid - // << "\n"; - - int index = old_bc.tokensInfo[i - 1].request_index; - int beam_size = old_bc.beamRequestsInfo[index].beam_size; - - // int leaf_node_num = old_bc.sub_requests[index]; - int leaf_node_num = - old_bc.beamRequestsInfo[index].sub_request_num * beam_size; - int depth = old_bc.beamRequestsInfo[index].current_depth; - - // Each token yields (beam_width) results - // int beam_width = old_bc.beamRequestsInfo[index].beam_size; - - // Count tokens sent to model in this request to find the final token's - // index - result_index += - (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * - beam_size; - - if (verbose) { - std::cout << "i = " << i << ", result index = " << result_index - << ", value: " << result.token_ids[result_index] - << ", leaf node num: " << leaf_node_num << ", depth" << depth - << ", beam size: " << beam_size << "\n"; - } - - Request &request = all_requests[old_bc.requestsInfo[index].request_guid]; - - if (old_bc.requestsInfo[index].num_tokens_in_batch == 0) { - continue; - } - - if (depth == 1) { - // store the last input into the tree; - if (verbose) { - std::cout << "try to store the input" - << "\n"; - } + return size; +} - 
request.beam_trees.at(old_bc.model_id).treeLayers[0].tokens[0] = - request.tokens.back(); - request.beam_trees.at(old_bc.model_id).treeLayers[0].probs[0] = 1; - request.beam_trees.at(old_bc.model_id).treeLayers[0].parent_ids[0] = -1; - request.beam_trees.at(old_bc.model_id) - .treeLayers[0] - .nodes_num_this_layer = 1; - - if (verbose) { - std::cout << "Store the previous last token to the tree root: " - << request.tokens.back() << "\n"; - } - } - request.beam_trees.at(old_bc.model_id) - .treeLayers[depth] - .nodes_num_this_layer = leaf_node_num; - for (int beam_id = 0; beam_id < leaf_node_num; beam_id++) { - - request.beam_trees.at(old_bc.model_id) - .treeLayers[depth] - .tokens[beam_id] = result.token_ids[result_index]; - request.beam_trees.at(old_bc.model_id) - .treeLayers[depth] - .probs[beam_id] = result.probs[result_index]; - request.beam_trees.at(old_bc.model_id) - .treeLayers[depth] - .parent_ids[beam_id] = result.parent_id[result_index]; - - if (verbose) { - std::cout << "tree value: " << depth << "token: " - << request.beam_trees.at(old_bc.model_id) - .treeLayers[depth] - .tokens[beam_id] - << "result tokens: " << result.token_ids[result_index]; - } - result_index += 1; - } - // update the guid and start_depth for current request - if (i < old_bc.num_tokens) { - int new_req_idx = old_bc.tokensInfo[i].request_index; - guid = old_bc.requestsInfo[new_req_idx].request_guid; - start_depth = old_bc.tokensInfo[i].abs_depth_in_request; - } +bool RequestManager::is_eos_token(TokenId token_id) { + for (int eos_token : eos_token_ids) { + if (token_id == eos_token) { + return true; } } + return false; } -// for updating the beam search metadata in requests in incremental phase -void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, - BeamSearchBatchConfig const &old_bc, - BeamTree &tree, - int request_index) { +bool RequestManager::update_llm_verify_results( + InferenceResult const &llm_verify_result) { + // We may have two types of InferenceResults, one is the results from + // sampling the large model, the other is the top-p / top-k logits of the + // large model, we can first implement the former one. For the latter one, + // we have to add a CPU based verify function. + + // Compare the results returned from the LLM and compare them with the + // SSM's speculative token tree. For the greedy construction of the + // speculative token tree, we can simply compare LLM's sample result at each + // token, this is implemented in get_verify_results_greedy(). This function + // stores the commmitted tokens into the corresponding fields in the + // Request. For the sampling construction of the speculative token tree, we + // need to implement a CPU based verify function. 
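As a rough mental model for get_verify_results_greedy(): walking down from the root, a speculated token is accepted only if it equals the token the LLM sampled at its parent's position, and verification stops at the first mismatch, where the LLM's own sample is committed as a bonus token. The sketch below is deliberately simplified to a single speculated chain (the real code verifies a token tree) and uses illustrative types only.

// Simplified, chain-shaped illustration of greedy speculative verification;
// the actual implementation walks a token tree, not a single path.
#include <cassert>
#include <vector>

// speculated[i] is the SSM's token at depth i+1 below the root;
// llm_samples[i] is the LLM's sample after position i (i = 0 is the root),
// so llm_samples must contain one more entry than speculated.
std::vector<int> verify_chain_greedy(std::vector<int> const &speculated,
                                     std::vector<int> const &llm_samples) {
  assert(llm_samples.size() >= speculated.size() + 1);
  std::vector<int> committed;
  size_t i = 0;
  while (i < speculated.size() && speculated[i] == llm_samples[i]) {
    committed.push_back(speculated[i]); // speculation verified at this depth
    ++i;
  }
  // The LLM's sample at the first mismatch (or past the end of the chain) is
  // always a valid next token and is committed as well.
  committed.push_back(llm_samples[i]);
  return committed;
}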
+ + // Update llm_cache_size with the last committed_tokens, and clear + // committed_tokens + int nb_requests_decoded = 0; + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + // Request in this slot is unavailable + continue; + } + int guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + request.llm_cache_size += request.committed_tokens.size() - 1; + request.committed_tokens.clear(); - // do the exchange - if (new_bc.request_completed[request_index]) { - assert(false); + profiling_requests[guid].llm_decoding_steps++; + nb_requests_decoded++; } - int depth = new_bc.beamRequestsInfo[request_index].current_depth - 1; - int beam_size = new_bc.beamRequestsInfo[request_index].beam_size; - - // int leaf_node_num = old_bc.sub_requests[request_index]; - int leaf_node_num = new_bc.beamRequestsInfo[request_index].sub_request_num; - if (new_bc.beamRequestsInfo[request_index].current_depth == - 1) { // TODO: check if this is correct - // for (int j = 0; j < beam_size; j++) { - // new_bc.beamRequestsInfo[request_index].parent_id[j] = j; - // new_bc.beamRequestsInfo[request_index].probs[j] = - // tree.treeLayers[depth].probs[j]; // ? - // new_bc.beamRequestsInfo[request_index].tokens[j] = - // tree.treeLayers[depth].tokens[j]; // ? - // } - // Do nothing - // assert(false); + // Process the LLM results greedily + if (speculative_sampling) { + get_verify_results_sample(llm_verify_result); } else { - for (int j = 0; j < leaf_node_num; j++) { - new_bc.beamRequestsInfo[request_index].parent_id[j] = - tree.treeLayers[depth].parent_ids[j]; - new_bc.beamRequestsInfo[request_index].probs[j] = - tree.treeLayers[depth].probs[j]; - new_bc.beamRequestsInfo[request_index].tokens[j] = - tree.treeLayers[depth].tokens[j]; - // std::cout << "token: " << j << ": " - // << new_bc.beamRequestsInfo[request_index].tokens[j] << "\n"; - } - } - if (verbose) { - std::cout << "-----------after parent id exchange-----------" << std::endl; - for (int j = 0; j < beam_size; j++) { - std::cout << "after request id: " << request_index << "beam id = " << j - << "parent: " - << new_bc.beamRequestsInfo[request_index].parent_id[j] - << "token: " << new_bc.beamRequestsInfo[request_index].tokens[j] - << "probs: " << new_bc.beamRequestsInfo[request_index].probs[j] - << std::endl; - } + get_verify_results_greedy(llm_verify_result); } -} -// bit mask related function + long long int current_time = Realm::Clock::current_time_in_microseconds(); + profiling.llm_step_times.push_back((current_time - profiling.llm_step_start) * + 1e-3); + profiling.requests_per_step.push_back(nb_requests_decoded); -// prompt phase, init task -void RequestManager::initBitMask(BatchConfig::BitMask &bitmask, - int initLength) { - assert(initLength > 0); - // eg. 
4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: - // 0000000..1000 - bitmask.non_tree_cache_size = 0; - bitmask.tree_size = 1; - - bitmask.prompt_size = initLength; - bitmask.this_layer_size = initLength; - // std::cout << "see bit mask" << bitmask.prompt_size << "\n"; - // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[0]) << "\n"; - // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[1]) << "\n"; - // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[2]) << "\n"; -} - -// prepare next init -void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, - int initLength, - int non_tree_size) { - // assert(initLength == 1); - // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: - // 0000000..1000 - assert(initLength <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && - "do not support tree size > 64"); - assert(initLength >= 1 && "verified token num should >= 1"); - - // std::cout << "non tree size: " << non_tree_size << ", " - // << bitmask.non_tree_cache_size << "\n"; - - bitmask.non_tree_cache_size = non_tree_size + initLength - 1; - bitmask.tree_size = 1; - bitmask.this_layer_size = initLength; - // std::cout << "non_tree_size: " << non_tree_size << "\n"; - bitmask.prompt_size = 1; - for (int i = 0; i < bitmask.prompt_size; i++) { - for (int j = i; j < bitmask.prompt_size; j++) { - bitmask.mask[i] |= (1 << j); - } - } - - // std::cout << "see bit mask update" << bitmask.prompt_size << "\n"; - // std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[0]) - // << "\n"; -} - -// prompt phase, init task -void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask, - int initLength) { - assert(initLength > 0); - // std::cout << "append pending bit mask: " << initLength << "\n"; - // eg. 
4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: - // 0000000..1000 - bitmask.non_tree_cache_size = 0; - bitmask.tree_size = 1; - bitmask.prompt_size += initLength; - bitmask.this_layer_size = initLength; - - // for (int i = 0; i < bitmask.prompt_size; i++) { - // for (int j = i; j < bitmask.prompt_size; j++) { - // bitmask.mask[i] |= (1 << j); - // } - // } -} + bool request_completed = false; + + // Iterate over the requests + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + // Request in this slot is unavailable + continue; + } + int guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + if (verbose) { + std::cout << "Request " << guid << " token tree: " << std::endl; + std::cout << request.speculative_token_trees[0]; + } -// prepare next beam, append layers to the tree -void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, - int newNodes, - int preBeamSize, - int old_sub_num, - BeamTree const tree, - int currentDepth) { - int pre_tree_size = bitmask.tree_size; - bitmask.tree_size += newNodes; - bitmask.this_layer_size = newNodes; - assert(bitmask.tree_size <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && - "do not support tree size > 64"); - // preBeamSize: replicate num - - // add relationship with input/prompt - for (int i = 0; i < bitmask.prompt_size; i++) { - for (int j = pre_tree_size; j < bitmask.tree_size; j++) { - bitmask.mask[i] |= (1 << j); - // std::cout << "see bit mask append: " << i << ", to" << j - // << std::bitset<64>(bitmask.mask[i]) << "\n"; - } - } - - // std::cout << "bitmask.tree_size: " << bitmask.tree_size << ", " - // << pre_tree_size << ", " << bitmask.prompt_size << ", " - // << preBeamSize << "\n"; - - // int num_groups = newNodes / preBeamSize; - // int group_size = newNodes / num_groups; - // add relations to branch - // requests in same groups share same relations, except the last token. - - // set middle layers - // skip the root prompt/tokens - int token_idx = bitmask.prompt_size; - int new_nodes_start_idx = pre_tree_size; - // std::cout << "new nodes start " << new_nodes_start_idx << "\n"; - for (int i = 1; i < currentDepth; i++) { - new_nodes_start_idx = pre_tree_size; - int nodes_this_layer = tree.treeLayers[i].nodes_num_this_layer; - // std::cout << "tree layer: " << i << " nodes:" << nodes_this_layer - // << "group size: " << newNodes / nodes_this_layer << "\n"; - for (int j = 0; j < nodes_this_layer; j++) { - int group_size = newNodes / nodes_this_layer; - for (int k = 0; k < group_size; k++) { - bitmask.mask[token_idx] |= (1 << new_nodes_start_idx); - new_nodes_start_idx += 1; + request.decode_latency_ms = + (current_time - profiling_requests[guid].start_decoding_time) * 1e-3; + bool attained = + request.decode_latency_ms <= get_request_expected_latency(request); + bool current_attained = + request.decode_latency_ms <= + get_request_expected_latency(request) + get_slo_constraint(request) * 6; + + // Initialize the token tree for the request + init_token_tree(guid); + assert(!request.committed_tokens.empty() && + "The committed tokens should not be empty."); + // Add the last committed token as the root of the speculative token tree + add_root_to_spec_token_tree(guid, request.committed_tokens.back().token_id); + + // Check if the request is completed. If its completed, clean up the + // metainfo stored in the RequestManager. Otherwise, update its bitmask. 
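The completion check implemented next can be read as a single predicate over four conditions: an EOS token was committed, the output-length budget is exhausted, the sequence-length budget is exhausted, or (when early termination is enabled) the SLO can no longer be attained. A hedged standalone restatement of that predicate (hypothetical helper, not a RequestManager method):

// Conditions under which a request leaves the batch after verification.
bool should_finish_request(bool eos_token_found,
                           int decode_length,
                           int max_output_length,
                           int sequence_length,
                           int max_sequence_length,
                           bool slo_still_attainable,
                           bool early_termination_enabled) {
  if (eos_token_found) {
    return true; // an EOS token was committed in this step
  }
  if (decode_length >= max_output_length ||
      sequence_length >= max_sequence_length) {
    return true; // generation or context budget exhausted
  }
  if (!slo_still_attainable && early_termination_enabled) {
    return true; // drop requests that can no longer meet their SLO
  }
  return false;
}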
+ bool eos_token_found = false; + for (auto const &committed_token : request.committed_tokens) { + if (is_eos_token(committed_token.token_id)) { + eos_token_found = true; + break; } - token_idx += 1; + } + if (eos_token_found or request.decode_length() >= get_max_output_length() or + request.tokens.size() >= get_max_sequence_length()) { + // Request is completed + request_update_attainment(request_index, attained); + request_completed = true; + request_complete_clean_up(request_index); + } else if (!current_attained and slo_violation_early_termination) { + // Early drop that request + request_update_attainment(request_index, attained); + request_completed = true; + request_complete_clean_up(request_index); + } else { + update_bitmask_prompt(guid, request.committed_tokens.size() - 1); } } - assert(token_idx == pre_tree_size); - assert(currentDepth <= 1 || new_nodes_start_idx == bitmask.tree_size); + // Some requests may be completed after appending the verified tokens. + // If there is a request completed, return true. + return request_completed; +} - // assert(currentDepth <= 2); - // set last layer, all tokens are only relevant to it self; - for (int i = token_idx; i < bitmask.tree_size; i++) { - bitmask.mask[i] |= (1 << i); - // std::cout << "set rel: " << i << "to: " << i << "\n"; +bool RequestManager::update_ssm_inference_results( + InferenceResult const &ssm_inference_result) { + // This function returns true if no tokens are added to the token tree, + // which indicates that the ssm inference phase is done. + assert(current_ssm_step >= 1 && + "The current speculation step should be no less than 1"); + + // Here we assume that the order of the tokens in the last + // BatchConfig and hence the last InferenceResult is equal to + // the order of the request in the last BatchConfig + if (!spec_infer_old_version) { + static double schedule_start = 0.0; + if (get_eval_overhead_breakdown()) { + schedule_start = Realm::Clock::current_time_in_microseconds(); + } + add_tokens_to_spec_token_tree(ssm_inference_result); + if (get_eval_overhead_breakdown()) { + eval_schedule_latency_us += + Realm::Clock::current_time_in_microseconds() - schedule_start; + } + } else { + add_tokens_to_spec_token_tree_old_version(ssm_inference_result); } - // if(bitmask.non_tree_cache_size == 19 && bitmask.tree_size > 2){ - // assert(false); - // } + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + // Request in this slot is unavailable + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); - // std::cout << "see bit mask append" << bitmask.prompt_size << "\n"; - // std::cout << "see bit mask append" << bitmask.non_tree_cache_size << "\n"; - // std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[0]) - // << "\n"; -} - -bool PreOrder( - BeamTree const &tree, - int max_depth, - int current_depth, - int beam_width, - int id, - std::vector> &serializedTree, - bool verbose) { - // terminate - if (current_depth >= max_depth) { - serializedTree.push_back(std::make_pair( - tree.treeLayers[current_depth].tokens[id], current_depth)); - if (verbose) { - std::cout << "last tokens: " << tree.treeLayers[current_depth].tokens[id] - << "\n"; - std::cout << "return true" - << "\n"; + if (current_ssm_step == 1) { + if (streaming_cache) { + request.streaming_cache_info.commit_cache(request.num_tokens_in_batch); + 
request.ssm_cache_size = request.streaming_cache_info.commit_len; + } else { + request.ssm_cache_size = request.tokens.size(); + } + } + + if (current_ssm_step == 1) { + init_bitmask_spec(guid); + } + append_bitmask(guid); + + profiling_requests[guid].ssm_decoding_steps++; + + if (current_ssm_step == ssm_tree_depth) { + profiling_requests[guid].speculation_start_timestamp = + profiling.ssm_step_start; + profiling_requests[guid].speculation_end_timestamp = + Realm::Clock::current_time_in_microseconds(); } - return true; } - // add to tree; - // std::cout<<"node: " << current_depth << ", id: " << - serializedTree.push_back( - std::make_pair(tree.treeLayers[current_depth].tokens[id], current_depth)); - if (verbose) { - std::cout << "push something: " << tree.treeLayers[current_depth].tokens[id] - << ", " << current_depth << std::endl; - } - int index = serializedTree.size() - 1; - int next_layers = current_depth + 1; - - bool flag = false; - // recursion - for (int i = 0; i < beam_width; i++) { - int child_id = i; - int child_parent = tree.treeLayers[next_layers].parent_ids[i]; - - // for all childs, do preOrder - if (child_parent == id) { - if (verbose) { - std::cout << "current depth: " << current_depth << ", child_parent, " - << child_parent << ", child_id, " << child_id << "\n"; + // Stop conditions + if (current_ssm_step == ssm_tree_depth) { + // Prune the token tree at the last step + if (!spec_infer_old_version) { + static double schedule_start = 0.0; + if (get_eval_overhead_breakdown()) { + schedule_start = Realm::Clock::current_time_in_microseconds(); + } + prune_token_tree(); + if (get_eval_overhead_breakdown()) { + eval_schedule_latency_us += + Realm::Clock::current_time_in_microseconds() - schedule_start; } - bool res = PreOrder(tree, - max_depth, - current_depth + 1, - beam_width, - child_id, - serializedTree, - verbose); - flag = flag || res; - } - } - // if (!flag) { - // // no child for this token, delete it - // std::cout << "delete a node: " << - // tree.treeLayers[current_depth].tokens[id] - // << ", " << current_depth << std::endl; - // serializedTree.erase(serializedTree.begin() + index); - // } - return flag; -} - -std::vector> - RequestManager::traverse_verify_tree( - size_t guid, - std::vector> const - &inputSerializedTree, - std::vector> const - &outputSerializedTree) { - std::vector> verifiedTree; - // verifiedTree.push_back(inputSerializedTree.at(0)); - std::vector> new_committed_tokens = - std::vector>(); - - log_req_mgr.print("Input tree size (%zu) Output tree size (%zu)", - inputSerializedTree.size(), - outputSerializedTree.size()); - { // Input tree - std::ostringstream oss; - // inputSerializedTree is the dfs_tree_inputs_map[guid] array og (token id, - // depth) pairs - for (auto const &pair : inputSerializedTree) { - oss << " " << pair.second << ":" << pair.first; - // log_req_mgr.print("(%d, %d)", pair.first, pair.second); - } - log_req_mgr.print("Input tree:%s", oss.str().c_str()); - } - { // Output tree - // log_req_mgr.print("========Output============"); - // outputSerializedTree is an array of (token id, depth + 1) pairs - std::ostringstream oss; - for (auto const &pair : outputSerializedTree) { - // log_req_mgr.print("(%d, %d)", pair.first, pair.second); - oss << " " << pair.second << ":" << pair.first; - } - log_req_mgr.print("Output tree:%s", oss.str().c_str()); - } - { - // log_req_mgr.print("========Committed============"); - // committed_tokens[guid] is an array of (depth, result_index) pairs for - // the given request - std::ostringstream oss; - 
for (auto const &pair : committed_tokens.at(guid)) { - // log_req_mgr.print("(%d, %d)", pair.first, pair.second); - oss << " " << pair.second << ":" << pair.first; - } - log_req_mgr.print("Committed tokens:%s", oss.str().c_str()); - } - - // It's safe to have inputSerializedTree.size() > outputSerializedTree.size() - // In this case the inputSeriedTree ends with padding 0s - assert(inputSerializedTree.size() >= outputSerializedTree.size()); - - int *treeLayers = new int[inputSerializedTree.size()]; - int node_num = 1; - int layer_num = 0; - for (int token_id = 0; token_id < inputSerializedTree.size(); token_id++) { - if (token_id == (inputSerializedTree.size() - 1) || - inputSerializedTree.at(token_id + 1).second != - inputSerializedTree.at(token_id).second) { - treeLayers[layer_num] = node_num; - layer_num += 1; - node_num = 1; - } else { - node_num++; } + // Update profiling statistics before returning + profiling.ssm_step_times.push_back( + (Realm::Clock::current_time_in_microseconds() - + profiling.ssm_step_start) * + 1e-3); + profiling.ssm_steps.push_back(current_ssm_step); + return true; } + return false; +} - // to avoid branch switch when same tokens in input tree. - // todo, only checked for N->1->1->1 cases +/* --------- Bitmask Related Functions --------- */ - bool findFirst = false; - layer_num = -1; - int first_layer_slot = 0; - int first_layer_slot_total = 0; - int processed_whole_layer_tokens = 0; +void RequestManager::init_bitmask_prompt(RequestGuid guid, int prompt_length) { + // This method is called by load_pending_request_to_batch when there is a + // new request to load into the batch + Request &request = all_requests[guid]; + BatchConfig::BitMask &bitmask = request.causal_mask; - for (int i = 0; i < outputSerializedTree.size(); i++) { - auto input = inputSerializedTree.at(i); - auto output = outputSerializedTree.at(i); + // Clear because the prompt kernel doesn't use mask + bitmask.clear_bitmask(); + // Set the info for the mask which is used to store the KV cache + bitmask.tree_or_prompt_size = prompt_length; + bitmask.current_layer_size = prompt_length; + bitmask.non_tree_cache_size = 0; +} - if (i == 0 || inputSerializedTree.at(i - 1).second != - inputSerializedTree.at(i).second) { - layer_num += 1; - processed_whole_layer_tokens += i == 0 ? 0 : treeLayers[layer_num - 1]; - } +void RequestManager::update_bitmask_prompt(RequestGuid guid, + int num_committed_tokens) { + // This method modifies the bitmask in place + // This method is called by update_llm_verify_results + // 1. Clear the causal mask because the first SSM inference uses the prompt + // kernel and it doesn't use mask. + // 2. Maintain all other fields. 
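Note: for readers new to the bitmask bookkeeping, the sketch below restates what init_bitmask_prompt and update_bitmask_prompt do using a simplified stand-in for BatchConfig::BitMask (the struct layout and function names here are illustrative, not the actual FlexFlow types). During prompt processing the mask itself stays cleared because the prompt kernel is already causal; only the size fields that locate the tree/prompt window relative to the committed KV cache are maintained.

```cpp
#include <bitset>

// Illustrative stand-in for the patch's BatchConfig::BitMask.
constexpr int kMaxSpecTreeTokens = 64;

struct SimpleBitMask {
  std::bitset<kMaxSpecTreeTokens> rows[kMaxSpecTreeTokens]; // rows[i]: positions token i attends to
  int tree_or_prompt_size = 0;  // tokens currently in the tree (or prompt chunk)
  int current_layer_size = 0;   // tokens in the newest layer
  int non_tree_cache_size = 0;  // committed KV-cache entries that sit outside the tree
  void clear() {
    for (auto &row : rows) {
      row.reset();
    }
  }
};

// Prefill: the prompt kernel is causal on its own, so only the sizes matter.
void init_prompt_mask(SimpleBitMask &m, int prompt_length) {
  m.clear();
  m.tree_or_prompt_size = prompt_length;
  m.current_layer_size = prompt_length;
  m.non_tree_cache_size = 0; // nothing committed to the KV cache yet
}

// After a verify step, the freshly committed tokens are decoded like a short
// prompt; everything the request already holds counts as non-tree cache.
void update_prompt_mask(SimpleBitMask &m, int request_len, int num_committed) {
  m.clear();
  m.tree_or_prompt_size = num_committed;
  m.current_layer_size = num_committed;
  if (m.non_tree_cache_size == 0) { // the request just finished prefilling
    m.non_tree_cache_size = request_len - num_committed;
  }
}
```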
+ Request &request = all_requests[guid]; + BatchConfig::BitMask &bitmask = request.causal_mask; + // Clear because the prompt kernel doesn't use mask + bitmask.clear_bitmask(); + bitmask.tree_or_prompt_size = num_committed_tokens; + bitmask.current_layer_size = num_committed_tokens; - if (i == 0) { - verifiedTree.push_back(output); + // If the request just finishes the prefilling phase, we need to set the + // non_tree_cache_size to the size of the prompt + if (bitmask.non_tree_cache_size == 0) { + bitmask.non_tree_cache_size = request.tokens.size() - num_committed_tokens; + } +} + +void RequestManager::init_bitmask_spec(RequestGuid guid) { + // This method modifies the bitmask in place + // This method is called by the first call of update_ssm_inference_results + // in a speculative iteration CAUTION: You should still call + // append_bitmask() after this method + // 1. Clear the causal mask and add a root into it, because the tree is + // currently empty but we have a root. + // 2. Maintain all other fields. + assert(current_ssm_step == 1 && "The current speculation step should be 1"); + Request &request = all_requests[guid]; + request.causal_mask = BatchConfig::BitMask(); + // Set the mask for the root + request.causal_mask.bit_mask[0].set_bit(0); + request.causal_mask.tree_or_prompt_size = 1; + request.causal_mask.non_tree_cache_size = request.tokens.size() - 1; + request.causal_mask.current_layer_size = 1; +} - new_committed_tokens.push_back(std::make_pair( - input.second, - committed_tokens.at(guid).at(i).second)); // - // std::cout << committed_tokens.at(guid).at(i).first << ", " - // << committed_tokens.at(guid).at(i).second << std::endl; - // std::cout << input.first << ", " << input.second << std::endl; +void RequestManager::append_bitmask(RequestGuid guid) { + // This method changes the bitmask in place + // This method is called by update_ssm_inference_results(), after the new + // tokens are added to the token tree + assert(current_ssm_step >= 1 && + "The current speculation step should be no less than 1"); - assert(committed_tokens.at(guid).at(i).first == input.second); - continue; - } + Request &request = all_requests[guid]; + BatchConfig::BitMask &bitmask = request.causal_mask; + TokenTree &token_tree = request.speculative_token_trees[0]; - if (input.first == verifiedTree.back().first && - input.second == verifiedTree.back().second) { - if (findFirst) { - // must in this branch. 
- int layer_slot = i - processed_whole_layer_tokens; - int layer_slot_total = treeLayers[layer_num]; - if ((first_layer_slot == layer_slot)) { - verifiedTree.push_back(output); - new_committed_tokens.push_back(std::make_pair( - input.second, committed_tokens.at(guid).at(i).second)); - // at this point, you'll not go other branches - // std::cout << "verify tree push back: " << output.first - // << ", tree size is: " << verifiedTree.size() - // << ", ??: " << input.first << ", " << input.second << - // "\n"; + if (token_tree.tree_layers.size() <= current_ssm_step) { + // This request has no token added in this and the following small model + // inference steps, skip it + return; + } + std::vector> &tree_layer = + request.speculative_token_trees[0].tree_layers.back(); + int new_layer_size = tree_layer.size(); + int last_layer_size = bitmask.current_layer_size; + int previous_tree_size = bitmask.tree_or_prompt_size; + bitmask.current_layer_size = new_layer_size; + bitmask.tree_or_prompt_size += new_layer_size; + + assert(bitmask.tree_or_prompt_size <= get_max_spec_tree_token_num()); + + int parent_offset = previous_tree_size - last_layer_size; + int child_offset = previous_tree_size; + + int child_idx = 0; + for (auto const &child_ptr : tree_layer) { + // Each child copy its parent's mask + bitmask.bit_mask[child_offset + child_idx] = + bitmask.bit_mask[parent_offset + child_ptr->parent_pos]; + // Each child attend to itself + bitmask.bit_mask[child_offset + child_idx].set_bit(child_offset + + child_idx); + child_idx++; + } +} + +BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) { + // This method creates a new bitmask for LLM verification model's bitmask, + // it does not modify the small model's bitmask This method is called by + // prepare_verify_batch_config(). 
+ Request &request = all_requests[guid]; + TokenTree &token_tree = request.speculative_token_trees[0]; + BatchConfig::BitMask llm_bitmask = BatchConfig::BitMask(); + + int abs_index_in_tree = 0; + std::vector parent_pos_2_abs_index; + std::vector current_layer_abs_index; + for (auto const &tree_layer : token_tree.tree_layers) { + for (auto const &tree_node : tree_layer) { + current_layer_abs_index.push_back(abs_index_in_tree); + if (tree_node->included == true) { + if (abs_index_in_tree == 0) { + // The root token, set itself + llm_bitmask.bit_mask[0].set_bit(0); } else { - printf("not correct slot\n"); + // Copy from the parent, and set itself + int parent_abs_index = parent_pos_2_abs_index[tree_node->parent_pos]; + llm_bitmask.bit_mask[abs_index_in_tree] = + llm_bitmask.bit_mask[parent_abs_index]; + llm_bitmask.bit_mask[abs_index_in_tree].set_bit(abs_index_in_tree); } - } else { - verifiedTree.push_back(output); - first_layer_slot = i - processed_whole_layer_tokens; - first_layer_slot_total = treeLayers[layer_num]; - findFirst = true; - new_committed_tokens.push_back(std::make_pair( - input.second, - committed_tokens.at(guid).at(i).second)); // - // at this point, you'll not go other branches - // std::cout << "verify tree push back: " << output.first - // << ", tree size is: " << verifiedTree.size() - // << ", ??: " << input.first << ", " << input.second << "\n"; + abs_index_in_tree++; } - - assert(committed_tokens.at(guid).at(i).first == input.second); } + parent_pos_2_abs_index.clear(); + parent_pos_2_abs_index.swap(current_layer_abs_index); } - committed_tokens[guid] = new_committed_tokens; - { - // log_req_mgr.print("========Verified============"); - std::ostringstream oss; - for (auto const &pair : verifiedTree) { - // log_req_mgr.print("(%d, %d)", pair.first, pair.second); - oss << " " << pair.second << ":" << pair.first; - } - log_req_mgr.print("Verified:%s", oss.str().c_str()); + + // Maintain other fields of llm_bitmask + llm_bitmask.non_tree_cache_size = request.causal_mask.non_tree_cache_size; + llm_bitmask.tree_or_prompt_size = request.causal_mask.tree_or_prompt_size; + // We don't need to set llm_bitmask.current_layer_size here because they are + // not used in LLM verification. 
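Both append_bitmask and create_llm_bitmask build attention rows by the same rule: a node copies its parent's row and then sets its own bit, so each token attends exactly to its ancestors and itself. The sketch below shows that rule in isolation, assuming simplified types (std::bitset rows, layers of nodes identified by a parent position) and ignoring the pruning flags and non-tree cache offsets that the patch handles.

```cpp
#include <bitset>
#include <cassert>
#include <vector>

constexpr int kMaxTreeTokens = 64;
using Row = std::bitset<kMaxTreeTokens>;

struct Node {
  int parent_pos; // position of the parent within the previous layer; unused for the root
};
using Layer = std::vector<Node>;

// Row i of the result is the set of tree positions token i may attend to,
// i.e. its ancestors plus itself.
std::vector<Row> build_tree_mask(std::vector<Layer> const &layers) {
  std::vector<Row> mask;
  std::vector<int> prev_abs; // absolute index of each node in the previous layer
  for (Layer const &layer : layers) {
    std::vector<int> cur_abs;
    for (Node const &node : layer) {
      int abs_idx = static_cast<int>(mask.size());
      Row row;
      if (!prev_abs.empty()) { // non-root: start from the parent's ancestors
        assert(node.parent_pos >= 0 && node.parent_pos < (int)prev_abs.size());
        row = mask[prev_abs[node.parent_pos]];
      }
      row.set(abs_idx); // every token attends to itself
      mask.push_back(row);
      cur_abs.push_back(abs_idx);
    }
    prev_abs.swap(cur_abs);
  }
  return mask;
}
// Example: layers {{root}, {A, B}, {C with parent A}} give C the row {root, A, C}.
```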
+ return llm_bitmask; +} + +/* --------- Page Attention Related Functions --------- */ +int RequestManager::get_num_blocks_allocated(Request &request) const { + // needs some assertion + return request.blocks.size(); +} + +int RequestManager::get_len_last_block(Request &request) const { + int num_tokens = request.blocks.back().get_num_tokens(); + if (request.blocks.empty()) { + return 0; } - { - // log_req_mgr.print("========New Committed============"); - std::ostringstream oss; - for (auto const &pair : committed_tokens.at(guid)) { - // log_req_mgr.print("(%d, %d)", pair.first, pair.second); - oss << " " << pair.second << ":" << pair.first; - } - log_req_mgr.print("New committed:%s", oss.str().c_str()); + return request.blocks.back().get_num_tokens(); +} + +// get the index of the last token in the request +int RequestManager::get_idx_last_logical_token(Request &request) const { + if (request.blocks.empty()) { + printf("Error: request.blocks is empty\n"); + return -1; + } else { + return (request.blocks.size() - 1) * kPagesize + + request.blocks.back().get_num_tokens() - 1; } +} - return verifiedTree; +int RequestManager::idx_logical_to_physical(Request &request, int idx_logical) { + // get physical indices + PageManager *page_manager = PageManager::get_page_manager(); + std::vector block_table_indices = + page_manager->get_block_table_indices(request.guid); + return block_table_indices[idx_logical / kPagesize] * kPagesize + + idx_logical % kPagesize; } -std::vector> - RequestManager::traverse_beam_tree(BeamSearchBatchConfig const &old_bc, - int request_index, - int first_token_depth_in_request) { - if (verbose) { - std::cout << "[Traverse Beam Tree] request_index: " << request_index - << "\n"; - std::cout << "[Traverse Beam Tree] max_depth: " - << old_bc.beamRequestsInfo[request_index].max_depth << "\n"; - std::cout << "[Traverse Beam Tree] current_depth: " - << old_bc.beamRequestsInfo[request_index].current_depth << "\n"; - std::cout << "[Traverse Beam Tree] beam_width: " - << old_bc.beamRequestsInfo[request_index].beam_size << "\n"; - std::cout << "[Traverse Beam Tree] start index: " - << first_token_depth_in_request << "\n"; - } - - auto guid = old_bc.requestsInfo[request_index].request_guid; - Request &request = all_requests[guid]; - // std::cout << "request.beam_trees.size(): " << request.beam_trees.size() - // << std::endl; - BeamTree tree = request.beam_trees.at(old_bc.model_id); - - // std::cout << "print beam tree: " - // << "\n"; - std::vector> serializedTree; - for (int i = 0; i <= old_bc.beamRequestsInfo[request_index].max_depth; i++) { - // std::cout << "tree layer: " << i - // << ", num_nodes: " << tree.treeLayers[i].nodes_num_this_layer - // << "\n"; - // push tokens into tree - for (int j = 0; j < tree.treeLayers[i].nodes_num_this_layer; j++) { - // std::cout << "token: " << tree.treeLayers[i].tokens[j] << "\n"; - serializedTree.push_back(std::make_pair(tree.treeLayers[i].tokens[j], i)); - } - } - // token, index - // todo make this one global for different stages - - // PreOrder(tree, - // old_bc.beamRequestsInfo[request_index].max_depth, - // 0, - // old_bc.beamRequestsInfo[request_index].beam_size, - // 0, - // serializedTree, - // verbose); - - // print it - if (verbose) { - std::cout << "Print serialized tree: size:" << request_index - << serializedTree.size() << "\n"; - } - for (int k = 0; k < serializedTree.size(); k++) { - serializedTree.at(k).second += first_token_depth_in_request; - if (verbose) { - std::cout << "token id: " << serializedTree.at(k).first - << ", 
depth: " << serializedTree.at(k).second << "\n"; - } +// this will allocate one logical block and one physical block to the request +void RequestManager::_append_block_to_request(Request &request, + bool is_commit) { + PageManager *page_manager = PageManager::get_page_manager(); + // Append the logical block to the request + // page attention: in this function we need to remember the last logical block + // number that still contains committed tokens + LogicalTokenBlock block(request.blocks.size(), kPagesize); + request.blocks.push_back(block); + page_manager->allocate_one_block(request.guid); + std::vector block_table_indices = + page_manager->get_block_table_indices(request.guid); + // update page_id_commit + if (is_commit) { + request.page_last_committed++; + int size_blocks = request.blocks.size(); } +} - // if (dfs_tree_inputs.find(old_bc.requestsInfo[request_index].request_guid) - // != - // dfs_tree_inputs.end()) { - // dfs_tree_inputs[old_bc.requestsInfo[request_index].request_guid] = - // serializedTree; - // } else { - // dfs_tree_inputs.insert(std::make_pair( - // old_bc.requestsInfo[request_index].request_guid, serializedTree)); - // } +// this function is used for appending a token to the last logical block and +// also the last physical block it will return the physical position of this +// token +int RequestManager::append_token_to_block(Request &request, + TokenId token, + bool is_commit) { + PageManager *page_manager = PageManager::get_page_manager(); + if (request.blocks.empty() || request.blocks.back().is_full()) { + // Append a new logical block + _append_block_to_request(request, is_commit); + // also allocate one physical page + } + // insert token to both logical block and physical block + request.blocks.back().append_tokens({token}, is_commit); + int idx_logical = get_idx_last_logical_token(request); + assert(idx_logical >= 0); + int idx_physical = idx_logical_to_physical(request, idx_logical); + assert(idx_physical >= 0); + return idx_physical; +} - return serializedTree; - // } +void RequestManager::reset_block_table(Request &request) { + // get the indices of original physical block table for request + PageManager *page_manager = PageManager::get_page_manager(); + assert(request.page_last_committed < static_cast(request.blocks.size())); + std::vector block_table_indices = + page_manager->get_block_table_indices(request.guid); + // reset the block table according to the request's page_last_commit + page_manager->free_multiple_blocks(request.guid, + block_table_indices.size() - + request.page_last_committed - 1); + // reset this request's logical block table + if (request.page_last_committed < static_cast(request.blocks.size())) { + request.blocks.erase(request.blocks.begin() + request.page_last_committed + + 1, + request.blocks.end()); + } + request.blocks.back().reset_num_spec_tokens(); + // the indices of block table should be the same as the number of blocks + std::vector block_table = + page_manager->get_block_table_indices(request.guid); + return; } -std::vector> - RequestManager::merge_dfs_trees( - std::vector>> - input_trees, - int root_depth, - RequestGuid guid) { - assert(input_trees.size() == 1 && "currently using one ssm"); - dfs_tree_inputs[guid] = input_trees.at(0); - return input_trees.at(0); +/* --------- Bitmask Related Functions --------- */ +void RequestManager::gumbel_conditioned_on_max( + double target_max, std::vector> &logits) { + // Assume the logits are sorted in descending order + if (logits.size() == 0) { + return; + } + double max_logit 
= logits[0].first; + for (auto &logit_n_idx : logits) { + logit_n_idx.first = + -log(exp(-target_max) - exp(-max_logit) + exp(-logit_n_idx.first)); + } +} - std::vector> merged_tree; +void RequestManager::renormalize(std::vector> &D, + std::unordered_map &R, + TokenId token_id) { + float token_prob; + for (auto &kv : D) { + TokenId d_token_id = kv.first; + float d_prob = kv.second; + if (R.find(d_token_id) != R.end()) { + float r_prob = R[d_token_id]; + R[d_token_id] = max(0.0f, r_prob - d_prob); + } + if (d_token_id == token_id) { + token_prob = d_prob; + kv.second = 0.0f; + } + } + // Normalize R + float sum_r = 0.0f; + for (auto &kv : R) { + sum_r += kv.second; + } + for (auto &kv : R) { + kv.second /= (sum_r + 1e-6); + } + // Normalize D + for (auto &kv : D) { + kv.second /= (1.0f - token_prob - 1e-6); + } +} - std::unordered_map> childrens; - std::unordered_map curr_path; +std::tuple + RequestManager::reject_sampling(std::vector> &D, + std::unordered_map &R, + int k) { + assert(D.size() == k); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(0.0, 1.0); + double r; + for (int i = 0; i < k; ++i) { + // Generate a random number in the range [0, 1) + r = dis(gen); + double d_prob = (double)D[i].second; + if (R.find(D[i].first) != R.end()) { + double r_prob = (double)R[D[i].first]; + if (r < d_prob / d_prob + 1e-6) { + return {i, D[i].first, true}; + } + } + // else, r_prob = 0.0, reject the token + renormalize(D, R, D[i].first); + } + std::vector r_probs; + std::vector r_tokens; + for (auto &kv : R) { + r_probs.push_back(kv.second); + r_tokens.push_back(kv.first); + } + std::discrete_distribution<> r_dist(r_probs.begin(), r_probs.end()); + int sampled_index = r_dist(gen); + return {-1, r_tokens[sampled_index], false}; +} - // convert pair to an integer - auto root = input_trees.at(0).at(0); - int root_id = root.first * 10000 + root.second; +void RequestManager::get_verify_results_sample( + InferenceResult const &llm_verify_result) { + // This function maintain the generated token list of the request and the + // committed tokens. + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + + int llm_result_offset = + request.first_token_offset_in_batch * BatchConfig::MAX_K_LOGITS; + int llm_input_offset = request.first_token_offset_in_batch; + int committed_token_index = request.tokens.size() - 1; + + TokenTree &token_tree = request.speculative_token_trees[0]; + // First add the root to the committed tokens + request.committed_tokens.push_back(Request::CommittedToken( + llm_input_offset, committed_token_index, request.tokens.back())); + committed_token_index++; + // Don't add it to request.tokens because it has already been added. 
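renormalize and reject_sampling above generalize the accept/reject test of speculative sampling to several draft candidates per tree position, renormalizing both distributions after each rejection. For reference, here is the single-candidate textbook form of that test under simplified, illustrative types; the multi-candidate bookkeeping in the patch follows the same accept-or-sample-from-the-residual pattern but differs in the exact renormalization, so treat this as a sketch rather than a drop-in equivalent.

```cpp
#include <algorithm>
#include <random>
#include <unordered_map>
#include <utility>
#include <vector>

using TokenId = int;
using Dist = std::unordered_map<TokenId, double>; // token -> probability

// One accept/reject step of speculative sampling in its textbook single-draft
// form: accept the draft token with probability min(1, p_target / p_draft),
// otherwise sample a replacement from the renormalized residual max(p_target - p_draft, 0).
std::pair<TokenId, bool> verify_one(Dist const &p_draft,
                                    Dist p_target,
                                    TokenId draft_token,
                                    std::mt19937 &gen) {
  std::uniform_real_distribution<double> uni(0.0, 1.0);
  double q = p_draft.count(draft_token) ? p_draft.at(draft_token) : 0.0;
  double p = p_target.count(draft_token) ? p_target.at(draft_token) : 0.0;
  if (q > 0.0 && uni(gen) < std::min(1.0, p / q)) {
    return {draft_token, true}; // keep the draft token
  }
  // Rejected: build the residual distribution and sample a replacement from it.
  double norm = 0.0;
  for (auto &kv : p_target) {
    double qv = p_draft.count(kv.first) ? p_draft.at(kv.first) : 0.0;
    kv.second = std::max(0.0, kv.second - qv);
    norm += kv.second;
  }
  if (norm <= 0.0) {
    return {draft_token, false}; // degenerate residual; sketch-level fallback
  }
  std::vector<TokenId> tokens;
  std::vector<double> weights;
  for (auto const &kv : p_target) {
    tokens.push_back(kv.first);
    weights.push_back(kv.second / norm);
  }
  std::discrete_distribution<int> residual(weights.begin(), weights.end());
  return {tokens[residual(gen)], false};
}
```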
+ + // The position of the last accepted token in its tree layer (includeing + // the pruned tokens) + int last_accepted_token_index_in_layer = 0; + // The index of the last accepted token in the entire tree (excluding the + // pruned tokens) + int last_accepted_token_index = 0; + float last_accepted_token_accumulated_log_prob = 0.0f; + int current_token_index = 1; // Because we skip the root + bool rejected = false; + + auto layer_it = token_tree.tree_layers.begin(); + ++layer_it; + for (; layer_it != token_tree.tree_layers.end(); ++layer_it) { + // We skip the first layer + std::vector> const &tree_layer = *layer_it; + std::vector> D; + std::unordered_map R; + // Data format: + std::unordered_map> d_token_info; + + int current_token_index_in_layer = 0; + + // Iterate through the tokens in the current layer to find the candidate + // tokens whose parent is the last accepted token + for (auto const &node_ptr : tree_layer) { + if (!node_ptr->included) { + // Don't increase current_token_index here + current_token_index_in_layer++; + continue; + } + if (node_ptr->parent_pos != last_accepted_token_index_in_layer) { + // The token's parent is not accepted + current_token_index++; + current_token_index_in_layer++; + continue; + } else { + // The token's parent is accepted + float prob = std::exp(node_ptr->log_accumulated_prob - + last_accepted_token_accumulated_log_prob); + D.push_back({node_ptr->id, prob}); + d_token_info[node_ptr->id] = {current_token_index, + current_token_index_in_layer, + node_ptr->log_accumulated_prob}; + current_token_index++; + current_token_index_in_layer++; + } + } - for (int i = 0; i < input_trees.size(); i++) { - auto tree = input_trees.at(i); - // all trees should have the same root - assert(tree.at(0) == root); + int result_offset = llm_result_offset + + last_accepted_token_index * BatchConfig::MAX_K_LOGITS; + for (int i = 0; i < BatchConfig::MAX_K_LOGITS; ++i) { + TokenId token_id = llm_verify_result.token_ids[result_offset + i]; + R[token_id] = llm_verify_result.probs[result_offset + i]; + } - for (auto const &pair : tree) { - int id = pair.first * 10000 + pair.second; // current node - curr_path[pair.second] = id; // log node in current search + auto [sampled_index, token_id, accepted] = + reject_sampling(D, R, D.size()); + if (accepted) { + // The token's parent is accepted, and this token's id equals the + // llm's sample at its parent's position. We accept this token. 
+ // from_index: the index of the token in the tree (excluding the + // pruned tokens) + // to_index: the committed token index in the request + request.committed_tokens.push_back(Request::CommittedToken( + llm_input_offset + std::get<0>(d_token_info[token_id]), + committed_token_index, + token_id)); + request.tokens.push_back(token_id); + + last_accepted_token_index = std::get<0>(d_token_info[token_id]); + last_accepted_token_index_in_layer = + std::get<1>(d_token_info[token_id]); + last_accepted_token_accumulated_log_prob = + std::get<2>(d_token_info[token_id]); + committed_token_index++; + } else { + request.committed_tokens.push_back( + Request::CommittedToken(-1, committed_token_index, token_id)); + rejected = true; + break; + } + } - if (childrens.find(id) == childrens.end()) { - // init empty set - childrens[id] = std::set(); + // Add the last token (that is not in the cache of the LLM) if the + // sampling procedure succeed in the last layer from_index: since this + // token is not in the token tree, the llm doesn't have its KV cache, so + // the from_index should be a place holder, which is -1 + if (!rejected) { + std::unordered_map R; + std::vector> D; + int result_offset = llm_result_offset + + last_accepted_token_index * BatchConfig::MAX_K_LOGITS; + for (int i = 0; i < BatchConfig::MAX_K_LOGITS; ++i) { + TokenId token_id = llm_verify_result.token_ids[result_offset + i]; + R[token_id] = llm_verify_result.probs[result_offset + i]; } + auto [sampled_index, token_id, accepted] = + reject_sampling(D, R, D.size()); + request.committed_tokens.push_back( + Request::CommittedToken(-1, committed_token_index, token_id)); + request.tokens.push_back(token_id); + } - if (pair.second > root_depth) { - int parent_id = curr_path[pair.second - 1]; - childrens[parent_id].insert(id); + if (verbose) { + std::cout << "Request " << request.guid << " committed tokens: "; + for (auto const &committed_token : request.committed_tokens) { + std::cout << committed_token.token_id << " (" + << tokenizer_->Decode({committed_token.token_id}) << ") "; } + std::cout << std::endl; + std::string output = this->tokenizer_->Decode(request.tokens); + // std::cout << "Output sequence: " << output << std::endl; } } +} - std::stack q; - q.push(root_id); - - while (!q.empty()) { - int curr = q.top(); - q.pop(); - merged_tree.push_back(std::make_pair(curr / 10000, curr % 10000)); - for (int child : childrens[curr]) { - q.push(child); +void RequestManager::get_verify_results_greedy( + InferenceResult const &llm_verify_result) { + // This function maintain the generated token list of the request and the + // committed tokens. + int total_nb_generated_tokens = 0; + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + + int llm_result_offset = request.first_token_offset_in_batch; + int llm_cache_size = request.tokens.size() - 1; + int committed_token_index = request.tokens.size() - 1; + + TokenTree &token_tree = request.speculative_token_trees[0]; + // First add the root to the committed tokens + request.committed_tokens.push_back(Request::CommittedToken( + llm_cache_size, committed_token_index, request.tokens.back())); + committed_token_index++; + // Don't add it to request.tokens because it has already been added. 
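The greedy path that follows walks the speculative tree layer by layer and accepts at most one token per layer: a child survives only if its parent was accepted and its id matches the large model's greedy prediction at the parent's position, and the large model's prediction at the last accepted position is appended as a bonus token. A condensed sketch over a flat node array, with hypothetical field names and with EOS handling and pruning flags omitted:

```cpp
#include <vector>

using TokenId = int;

struct TreeNode {
  TokenId id;
  int parent; // index of the parent node in `nodes`, -1 for the root
  int layer;  // depth in the tree (root = 0)
};

// nodes: speculative tree in layer order (root first); target_argmax[i] is the
// target model's greedy token conditioned on the prefix ending at node i
// (target_argmax must cover every node index).
// Returns the accepted token ids, excluding the root and including one bonus token.
std::vector<TokenId> greedy_verify(std::vector<TreeNode> const &nodes,
                                   std::vector<TokenId> const &target_argmax) {
  std::vector<TokenId> accepted;
  int last_accepted = 0; // start at the root
  int depth = 1;
  while (true) {
    int match = -1;
    for (int i = 0; i < (int)nodes.size(); ++i) {
      if (nodes[i].layer == depth && nodes[i].parent == last_accepted &&
          nodes[i].id == target_argmax[last_accepted]) {
        match = i; // this child agrees with the target's argmax
        break;
      }
    }
    if (match == -1) {
      break; // no child in this layer survives verification
    }
    accepted.push_back(nodes[match].id);
    last_accepted = match;
    ++depth;
  }
  // Bonus token: the target's own prediction at the last accepted position.
  accepted.push_back(target_argmax[last_accepted]);
  return accepted;
}
```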
+ + // The position of the last accepted token in its tree layer (includeing + // the pruned tokens) + int last_accepted_token_index_in_layer = 0; + // The index of the last accepted token in the entire tree (excluding the + // pruned tokens) + int last_accepted_token_index = 0; + + int current_token_index = 1; // Because we skip the root + // We skip the first layer + bool found_eos = false; + for (auto layer_it = token_tree.tree_layers.begin() + 1; + layer_it != token_tree.tree_layers.end(); + ++layer_it) { + std::vector> const &tree_layer = *layer_it; + + bool token_accepted_this_layer = false; + int current_token_index_in_layer = 0; + + for (auto const &node_ptr : tree_layer) { + if (!node_ptr->included) { + current_token_index_in_layer++; + continue; + } + if ((node_ptr->parent_pos != last_accepted_token_index_in_layer) || + token_accepted_this_layer) { + // The token's parent is not accepted, or there is already another + // token accepted in this layer + current_token_index++; + current_token_index_in_layer++; + continue; + } else { + // The token's parent is accepted, and no token has been accepted in + // this layer yet + if (node_ptr->id == + llm_verify_result + .token_ids[llm_result_offset + last_accepted_token_index]) { + // The token's parent is accepted, and this token's id equals the + // llm's sample at its parent's position. We accept this token. + + // from_index: the index of the token in the tree (excluding the + // pruned tokens) + // to_index: the committed token index in the request + request.committed_tokens.push_back( + Request::CommittedToken(llm_cache_size + current_token_index, + committed_token_index, + node_ptr->id)); + request.tokens.push_back(node_ptr->id); + + token_accepted_this_layer = true; + last_accepted_token_index = current_token_index; + last_accepted_token_index_in_layer = current_token_index_in_layer; + committed_token_index++; + if (is_eos_token(node_ptr->id)) { + found_eos = true; + } + } + current_token_index++; + current_token_index_in_layer++; + } + if (found_eos) { + break; + } + } + if (!token_accepted_this_layer) { + // No token is accepted in this layer, we should stop the traversal + break; + } + if (found_eos) { + break; + } } - } - if (verbose) { - for (auto &pair : merged_tree) { - std::cout << pair.first << ", depth: " << pair.second << std::endl; + // Add the last token (that is not verified by the LLM) + // from_index: since this token is not in the token tree, the llm + // doesn't have its KV cache, so the from_index should be a place + // holder, which is -1 + if (!found_eos) { + request.committed_tokens.push_back(Request::CommittedToken( + -1, + committed_token_index, + llm_verify_result + .token_ids[llm_result_offset + last_accepted_token_index])); + request.tokens.push_back( + llm_verify_result + .token_ids[llm_result_offset + last_accepted_token_index]); } - } - dfs_tree_inputs[guid] = merged_tree; + assert(request.committed_tokens.size() >= 2); + int nb_generated_tokens = (int)request.committed_tokens.size() - + 1; // exclude previous bonus token + int accepted_tokens = (int)request.committed_tokens.size() - + 1; // exclude previous bonus token + if (!found_eos) { + accepted_tokens--; // exclude the last bonus token (if we found eos, we + // don't add it) + } + total_nb_generated_tokens += nb_generated_tokens; + + NewProfileInfo new_profile_info; + new_profile_info.timestamp = Realm::Clock::current_time_in_microseconds(); + new_profile_info.request_guid = guid; + new_profile_info.request_step_idx = + 
profiling_requests[guid].llm_decoding_steps - + 1; // check if this has already been incremented + new_profile_info.num_speculated_tokens = get_tree_size(request); + new_profile_info.num_accepted_tokens = accepted_tokens; + new_profile_info.speculation_score = -1.0; + new_profile_info.num_generated_tokens = nb_generated_tokens; + new_profile_info.speculation_start_timestamp = + profiling_requests[guid].speculation_start_timestamp; + new_profile_info.speculation_end_timestamp = + profiling_requests[guid].speculation_end_timestamp; + new_profiling_info.push_back(new_profile_info); - return merged_tree; + if (verbose) { + std::cout << "Request " << request.guid << " committed tokens: "; + for (auto const &committed_token : request.committed_tokens) { + std::cout << committed_token.token_id << " (" + << tokenizer_->Decode({committed_token.token_id}) << ") "; + } + std::cout << std::endl; + std::string output = this->tokenizer_->Decode(request.tokens); + std::cout << "Output sequence: " << output << std::endl; + } + } + profiling.generated_tokens_per_step.push_back(total_nb_generated_tokens); } std::vector - FFModel::generate(std::vector &prompts, int max_seq_length) { + FFModel::generate(std::vector &requests, + EmissionMachine &emission_machine) { RequestManager *rm = RequestManager::get_request_manager(); std::vector guids; - for (int i = 0; i < prompts.size(); i++) { - RequestManager::RequestGuid guid = - rm->register_new_request(prompts.at(i), max_seq_length); + + // Wait until the request manager is ready + while (!rm->is_background_server_serving()) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + for (size_t i = 0; i < requests.size(); i++) { + requests[i].slo_ratio = emission_machine.sample_slo_ratio(); + requests[i].emission_time_ms = emission_machine.get_elapsed_time_ms(); + printf("Prompt[%ld]: %s\n", i, requests[i].prompt.c_str()); + RequestManager::RequestGuid guid = rm->register_new_request(requests[i]); if (guid != RequestManager::INVALID_GUID) { guids.push_back(guid); } + emission_machine.wait_until_next_request(); } std::vector results; for (int i = 0; i < guids.size(); i++) { @@ -2314,9 +2901,18 @@ std::vector return results; } +std::vector + FFModel::generate(std::vector &prompts, + EmissionMachine &emission_machine) { + std::vector requests; + for (std::string &prompt : prompts) { + requests.push_back(GenerationRequest(prompt, -1.0, 0)); + } + return generate(requests, emission_machine); +} + void RequestManager::start_background_server(FFModel *model) { - assert(request_manager_status == INITIALIZED); - request_manager_status = SERVING; + assert(background_server_status == INITIALIZED); // Start background task Runtime *runtime = Runtime::get_runtime(); Context ctx = Runtime::get_context(); @@ -2358,17 +2954,23 @@ void RequestManager::background_serving_task( ssm->config.lg_ctx = ctx; } } - if (rm->get_num_ssms() == 0) { + // page attention: initalize the page manager here + int kv_cache_size = rm->get_max_kv_cache_size(); + PageManager::get_page_manager(llm, rm->get_max_kv_cache_size()); + if (rm->decoding_mode == INCREMENTAL_DECODING) { // No SSMs: perform incremental decoding - rm->serve_incr_decoding(llm); + rm->serve_decoding(llm); } else { // Registered SSMs: perform speculative inference rm->serve_spec_infer(llm); } +#ifdef FF_USE_NCCL + llm->finish_nccl_comms(); +#endif } /*static*/ -void RequestManager::serve_incr_decoding(FFModel *llm) { +void RequestManager::serve_decoding(FFModel *llm) { Context ctx = llm->config.lg_ctx; Runtime *runtime 
= llm->config.lg_hlr; // Compile the llm @@ -2377,50 +2979,45 @@ void RequestManager::serve_incr_decoding(FFModel *llm) { assert(im->model_weights_loaders.find(llm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[llm]->load_weights(llm); + im->model_weights_loaders[llm]->load_weights_parallel(llm, ctx, runtime); // init operators im->init_operators_inference(llm); // Legion futures for inc_decoding and spec_infer - BatchConfigFuture last_bcf; InferenceResultFuture last_irf; { // Initialize futures for incr decoding - BatchConfig bc; InferenceResult ir; - last_bcf = Future::from_value(bc); last_irf = Future::from_value(ir); } - std::queue> - batch_pipeline; - { batch_pipeline.push(std::make_pair(last_bcf, last_irf)); } + std::queue batch_pipeline; + { batch_pipeline.push(last_irf); } + // reset_profiling_statistics(); + background_server_status = SERVING; while (!is_background_server_terminated()) { if (batch_pipeline.size() >= 4) { // Block here to avoid launching too many batches - auto const &batch = batch_pipeline.front(); - batch.second.get_void_result(); + auto const &ir = batch_pipeline.front(); + ir.get_void_result(); } // deque finished batches while (batch_pipeline.size() > 1) { - auto const &batch = batch_pipeline.front(); - if (batch.second.is_ready()) { + auto const &ir = batch_pipeline.front(); + if (ir.is_ready()) { batch_pipeline.pop(); } else { break; } } runtime->begin_trace(ctx, 12346 /*trace_id*/); - auto const &next_batch = batch_pipeline.back(); - BatchConfigFuture bcf = - prepare_next_batch(next_batch.first, next_batch.second, ctx, runtime); + InferenceResultFuture next_ir = batch_pipeline.back(); + BatchConfigFuture bcf = get_next_batch_config(next_ir, ctx, runtime); FutureMap fm = im->inference(llm, 0, bcf); assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture irf = fm.get_future(0); - batch_pipeline.push(std::make_pair(bcf, irf)); - last_bcf = bcf; - last_irf = irf; + batch_pipeline.push(irf); runtime->end_trace(ctx, 12346 /*trace_id*/); } } @@ -2436,91 +3033,132 @@ void RequestManager::serve_spec_infer(FFModel *llm) { assert(im->model_weights_loaders.find(llm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[llm]->load_weights(llm); + im->model_weights_loaders[llm]->load_weights_parallel(llm, ctx, runtime); // init operators im->init_operators_inference(llm); } for (size_t i = 0; i < get_num_ssms(); i++) { // Compile the i-th ssm FFModel *ssm = get_ssm_model(i); - im->compile_model_and_allocate_buffer(ssm); - assert(im->model_weights_loaders.find(llm) != + im->compile_model_and_allocate_buffer(ssm, false); + assert(im->model_weights_loaders.find(ssm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[ssm]->load_weights(ssm); + im->model_weights_loaders[ssm]->load_weights_parallel(ssm, ctx, runtime); // init operators im->init_operators_inference(ssm); } - std::queue> - batch_pipeline; - // Legion futures for inc_decoding and spec_infer - TreeVerifyBatchConfigFuture last_tree_bcf; - InferenceResultFuture last_tree_irf; + InferenceResultFuture irf_0; { - // Initialize futures for spec infer - TreeVerifyBatchConfig tree_bc; - InferenceResult tree_ir; - last_tree_bcf = Future::from_value(tree_bc); - last_tree_irf = Future::from_value(tree_ir); + // Initialize futures for incr decoding + InferenceResult ir_0; + irf_0 = Future::from_value(ir_0); } - batch_pipeline.push(std::make_pair(last_tree_bcf, last_tree_irf)); - while 
(!is_background_server_terminated()) { + request_manager_status = PREFILLING; + prefill_model = SSM; + ssm_tree_depth = get_max_tree_depth(); - if (batch_pipeline.size() >= 4) { + std::queue infer_result_future_pipeline; + infer_result_future_pipeline.push(irf_0); + + // reset_profiling_statistics(); + background_server_status = SERVING; + while (!is_background_server_terminated()) { + if (infer_result_future_pipeline.size() >= 4) { // Block here to avoid launching too many batches - auto const &batch = batch_pipeline.front(); - batch.second.get_void_result(); + auto const &ir = infer_result_future_pipeline.front(); + ir.get_void_result(); } // deque finished batches - while (batch_pipeline.size() > 1) { - auto const &batch = batch_pipeline.front(); - if (batch.second.is_ready()) { - batch_pipeline.pop(); + while (infer_result_future_pipeline.size() > 1) { + auto const &ir = infer_result_future_pipeline.front(); + if (ir.is_ready()) { + infer_result_future_pipeline.pop(); } else { break; } } - auto const &next_batch = batch_pipeline.back(); - BeamSearchBatchConfigFuture beam_bcf = prepare_next_batch_init( - next_batch.first, next_batch.second, 0, ctx, runtime); - std::vector beam_bcf_vec(get_num_ssms()); - for (size_t ssm_id = 0; ssm_id < get_num_ssms(); ssm_id++) { - beam_bcf_vec[ssm_id] = beam_bcf; - } + runtime->begin_trace(ctx, 12345 /*trace_id*/); + for (int ssm_step_i = 0; ssm_step_i < get_max_tree_depth(); ssm_step_i++) { + InferenceResultFuture irf = infer_result_future_pipeline.back(); + BatchConfigFuture bcf = get_next_batch_config(irf, ctx, runtime); + FutureMap fm = im->inference(get_ssm_model(0), 0, bcf); + infer_result_future_pipeline.push(fm.get_future(0)); + } + InferenceResultFuture irf = infer_result_future_pipeline.back(); + BatchConfigFuture bcf = get_next_batch_config(irf, ctx, runtime); + FutureMap fm = im->inference(llm, 0, bcf); + infer_result_future_pipeline.push(fm.get_future(0)); + runtime->end_trace(ctx, 12345 /*trace_id*/); + } +} + +/*static*/ +void RequestManager::serve_spec_infer_sync(FFModel *llm) { + Context ctx = llm->config.lg_ctx; + Runtime *runtime = llm->config.lg_hlr; + InferenceManager *im = InferenceManager::get_inference_manager(); + { + // Compile the llm + im->compile_model_and_allocate_buffer(llm); + assert(im->model_weights_loaders.find(llm) != + im->model_weights_loaders.end()); + // Load model weights + im->model_weights_loaders[llm]->load_weights_parallel(llm, ctx, runtime); + // init operators + im->init_operators_inference(llm); + } + for (size_t i = 0; i < get_num_ssms(); i++) { + // Compile the i-th ssm + FFModel *ssm = get_ssm_model(i); + im->compile_model_and_allocate_buffer(ssm, false); + assert(im->model_weights_loaders.find(ssm) != + im->model_weights_loaders.end()); + // Load model weights + im->model_weights_loaders[ssm]->load_weights_parallel(ssm, ctx, runtime); + // init operators + im->init_operators_inference(ssm); + } + + InferenceResultFuture irf_0; + { + // Initialize futures for incr decoding + InferenceResult ir_0; + irf_0 = Future::from_value(ir_0); + } - for (size_t i = 0; i < get_num_ssms(); i++) { - for (int depth = 0; depth < BeamSearchBatchConfig::MAX_BEAM_DEPTH; - depth++) { - beam_bcf = beam_bcf_vec[i]; + request_manager_status = PREFILLING; + prefill_model = SSM; - FutureMap fm = im->inference(get_ssm_model(i), 0, beam_bcf_vec[i]); - assert(fm.get_future_map_domain().get_volume() == 1); - BeamInferenceResultFuture beam_irf = fm.get_future(0); - beam_bcf_vec[i] = - prepare_next_batch_beam(beam_bcf_vec[i], 
beam_irf, ctx, runtime); - } - } - // Token Tree Verification - { - TreeVerifyBatchConfigFuture tree_bcf = - prepare_next_batch_verify(beam_bcf_vec, ctx, runtime); - FutureMap fm = im->inference(llm, 0, tree_bcf); - assert(fm.get_future_map_domain().get_volume() == 1); - InferenceResultFuture tree_irf = fm.get_future(0); - batch_pipeline.push(std::make_pair(tree_bcf, tree_irf)); - last_tree_bcf = tree_bcf; - last_tree_irf = tree_irf; + background_server_status = SERVING; + while (!is_background_server_terminated()) { + BatchConfigFuture bcf = get_next_batch_config(irf_0, ctx, runtime); + bcf.get_void_result(); + if ((request_manager_status == PREFILLING and prefill_model == LLM) or + request_manager_status == LLM_VERIFY) { + runtime->begin_trace(ctx, 12345 /*trace_id*/); + FutureMap fm = im->inference(llm, 0, bcf); + irf_0 = fm.get_future(0); + runtime->end_trace(ctx, 12345 /*trace_id*/); + } else if ((request_manager_status == PREFILLING and + prefill_model == SSM) or + request_manager_status == SSM_SPEC) { + runtime->begin_trace(ctx, 23456 /*trace_id*/); + FutureMap fm = im->inference(get_ssm_model(0), 0, bcf); + irf_0 = fm.get_future(0); + runtime->end_trace(ctx, 23456 /*trace_id*/); + } else { + assert(false && "Invalid request manager status"); } - runtime->end_trace(ctx, 12345 /*trace_id*/); } } void RequestManager::trigger_request_completion_future( RequestGuid const &guid) { - const std::lock_guard lock(request_to_promise_mutex); + std::lock_guard const lock(request_to_promise_mutex); assert(request_to_promise.find(guid) != request_to_promise.end()); // Set the completion promise in case other threads are waiting request_to_promise[guid]->set_value(); @@ -2533,8 +3171,222 @@ void RequestManager::terminate_background_server_at_exit() { } void RequestManager::terminate_background_server() { - if (request_manager_status == SERVING) { - request_manager_status = TERMINATED; + if (is_background_server_serving()) { + assert(profiling.llm_step_times.size() == + profiling.requests_per_step.size()); + // Write the last profiling statistics to output file + std::string str = "[Profiling Statistics]"; + + profiling.server_end_time = Realm::Clock::current_time_in_microseconds(); + long long total_time = + profiling.server_end_time - profiling.server_start_time; + int total_requests = 0; + for (auto const &profiling_info : profiling_requests) { + int request_id = profiling_info.first; + Request &request = all_requests[request_id]; + if (request.status == Request::COMPLETED) { + total_requests++; + } + } + int total_tokens = 0; + for (int num_tokens : profiling.generated_tokens_per_step) { + total_tokens += num_tokens; + } + + if (profiling_requests.size() != all_requests.size()) { + std::cerr << "profiling_requests.size()=" << profiling_requests.size() + << " != all_requests.size()=" << all_requests.size() + << std::endl; + } + assert(profiling_requests.size() == all_requests.size()); + str += "\nDecoding Steps: "; + for (auto const &profiling_info : profiling_requests) { + int request_id = profiling_info.first; + Request &request = all_requests[request_id]; + str += "Request " + std::to_string(request_id) + ": "; + str += std::to_string(profiling_info.second.llm_decoding_steps); + str += "/"; + str += std::to_string(request.decode_length()); + float speedup = (float)request.decode_length() / + profiling_info.second.llm_decoding_steps; + str += " " + std::to_string(speedup) + "\n"; + } + str += "\n total_time_ms(" + std::to_string(total_time / 1000.0) + ")"; + str += "\n total_requests(" + 
std::to_string(total_requests) + "/" + + std::to_string(all_requests.size()) + ")"; + str += "\n total_tokens(" + std::to_string(total_tokens) + ")"; + // throughput + str += "\n throughput_requests_per_sec(" + + std::to_string(total_requests / (total_time / 1e6)) + ")"; + str += "\n throughput_tokens_per_sec(" + + std::to_string(total_tokens / (total_time / 1e6)) + ")"; + + double average_latency_per_request = 0; + std::string latency_per_request_ms = "\n latency_per_request_ms( "; + for (auto const &profiling_info : profiling_requests) { + double latency_ms = (profiling_info.second.finish_time - + profiling_info.second.start_time) / + 1000.0; + + // latency_per_request_ms += "[" + std::to_string(profiling_info.first) + // + + // ","; latency_per_request_ms += std::to_string(latency_ms) + "] "; + latency_per_request_ms += std::to_string(latency_ms) + " "; + average_latency_per_request += latency_ms; + } + latency_per_request_ms += ")"; + str += latency_per_request_ms; + + average_latency_per_request /= total_requests; + str += "\n average_latency_per_request_ms(" + + std::to_string(average_latency_per_request) + ")"; + + std::string ttft_per_request_ms = "\n ttft_per_request_ms( "; + for (auto const &profiling_info : profiling_requests) { + double prefilling_time_ms = 0; + auto const &profiling = profiling_info.second; + if (profiling.start_decoding_time != 0) { + prefilling_time_ms = + (profiling.start_decoding_time - profiling.start_time) / 1000.0; + } else { + prefilling_time_ms = + (profiling.finish_time - profiling.start_time) / 1000.0; + } + ttft_per_request_ms += std::to_string(prefilling_time_ms) + " "; + } + ttft_per_request_ms += ")"; + str += ttft_per_request_ms; + + std::unordered_map> tpots; + std::string tpot_per_request_ms = "\n tpot_per_request_ms( "; + for (auto const &profiling_info : profiling_requests) { + double per_token_time_ms = 0; + auto const &request = all_requests[profiling_info.first]; + auto const &profiling = profiling_info.second; + if (profiling.start_decoding_time != 0) { + per_token_time_ms = + (profiling.finish_time - profiling.start_decoding_time) / 1000.0 / + request.decode_length(); + } + tpot_per_request_ms += std::to_string(per_token_time_ms) + " "; + auto &tpot = tpots[request.slo_ratio]; + tpot.first++; + tpot.second += per_token_time_ms; + } + tpot_per_request_ms += ")"; + str += tpot_per_request_ms; + + std::string average_tpot_per_slo_ms = "\n average_tpot_per_slo_ms( "; + for (auto const &kv : tpots) { + double average_tpot = kv.second.second / kv.second.first; + average_tpot_per_slo_ms += + std::to_string(kv.first) + ":" + std::to_string(average_tpot) + " "; + } + average_tpot_per_slo_ms += ")"; + str += average_tpot_per_slo_ms; + + std::string req_per_step = "\n requests_per_step( "; + for (int nb : profiling.requests_per_step) { + req_per_step += std::to_string(nb) + " "; + } + req_per_step += ")"; + str += req_per_step; + + if (profiling.ssm_step_times.size() > 0) { + // assert(profiling.ssm_step_times.size() == + // profiling.llm_step_times.size()); + std::string ssm_step_times_ms = "\n ssm_step_times_ms( "; + for (double time : profiling.ssm_step_times) { + ssm_step_times_ms += std::to_string(time) + " "; + } + ssm_step_times_ms += ")"; + str += ssm_step_times_ms; + } + + if (profiling.ssm_steps.size() > 0) { + std::string ssm_steps = "\n ssm_steps( "; + for (int nb : profiling.ssm_steps) { + ssm_steps += std::to_string(nb) + " "; + } + ssm_steps += ")"; + str += ssm_steps; + } + + std::string llm_step_times_ms = "\n llm_step_times_ms( 
"; + for (double time : profiling.llm_step_times) { + llm_step_times_ms += std::to_string(time) + " "; + } + llm_step_times_ms += ")"; + str += llm_step_times_ms; + + std::string generated_tokens_per_step = "\n generated_tokens_per_step( "; + for (int nb : profiling.generated_tokens_per_step) { + generated_tokens_per_step += std::to_string(nb) + " "; + } + generated_tokens_per_step += ")"; + str += generated_tokens_per_step; + + std::string mean_generated_tokens_per_step = + "\n mean_generated_tokens_per_step( "; + double mean_generated_tokens = + (double)std::accumulate(profiling.generated_tokens_per_step.begin(), + profiling.generated_tokens_per_step.end(), + 0); + double total_request_steps = + (double)std::accumulate(profiling.requests_per_step.begin(), + profiling.requests_per_step.end(), + 0); + mean_generated_tokens /= total_request_steps; + mean_generated_tokens_per_step += std::to_string(mean_generated_tokens); + mean_generated_tokens_per_step += ")"; + str += mean_generated_tokens_per_step; + + double attainment = 0, goodput = 0; + for (auto request_pair : all_requests) { + Request &request = request_pair.second; + if (request.attained) { + attainment += 1; + goodput += request.decode_length(); + } + } + attainment /= total_requests; + goodput /= total_time / 1e6; + + std::string slo_attainment = "\n slo_attainment( "; + slo_attainment += std::to_string(attainment); + slo_attainment += ")"; + str += slo_attainment; + + std::string goodput_str = "\n goodput( "; + goodput_str += std::to_string(goodput); + goodput_str += ")"; + str += goodput_str; + + if (get_eval_overhead_breakdown()) { + eval_process_latency_us -= + eval_schedule_latency_us + eval_other_latency_us; + std::string eval_overhead_breakdown_str = "\n eval_overhead_breakdown( "; + eval_overhead_breakdown_str += + "\n ssm_prefill_us: " + std::to_string(eval_ssm_prefill_latency_us); + eval_overhead_breakdown_str += + "\n ssm_spec_us: " + std::to_string(eval_ssm_spec_latency_us); + eval_overhead_breakdown_str += + "\n llm_prefill_us: " + std::to_string(eval_llm_prefill_latency_us); + eval_overhead_breakdown_str += + "\n llm_verify_us: " + std::to_string(eval_llm_verify_latency_us); + eval_overhead_breakdown_str += + "\n process_us: " + std::to_string(eval_process_latency_us); + eval_overhead_breakdown_str += + "\n scheduling_us: " + std::to_string(eval_schedule_latency_us); + eval_overhead_breakdown_str += + "\n other_us: " + std::to_string(eval_other_latency_us); + eval_overhead_breakdown_str += ")"; + str += eval_overhead_breakdown_str; + } + + write_to_output_file("", str); + background_server_status = TERMINATED; + request_queue_cv.notify_all(); // Wait for the background server to terminate Runtime *runtime = Runtime::get_runtime(); Context ctx = Runtime::get_context(); @@ -2542,8 +3394,12 @@ void RequestManager::terminate_background_server() { } } +bool RequestManager::is_background_server_serving() { + return background_server_status == SERVING; +} + bool RequestManager::is_background_server_terminated() { - return request_manager_status == TERMINATED; + return background_server_status == TERMINATED; } RequestManager *request_manager_singleton = nullptr; @@ -2556,4 +3412,554 @@ RequestManager *RequestManager::get_request_manager() { return request_manager_singleton; } +/* --------- Request Token Tree Related Functions --------- */ +void RequestManager::init_token_tree(RequestGuid guid) { + Request &request = all_requests[guid]; + request.speculative_token_trees.clear(); + // Assume we only use one small model for 
speculation + request.speculative_token_trees.emplace_back(); +} + +void RequestManager::add_root_to_spec_token_tree( + RequestGuid guid, BatchConfig::TokenId token_id) { + // This method is called by update_llm_verify_results() + // The last token in the accepted sequence should be the root of the next + // speculation tree. The reason is that the KV cache of this token is not + // computed yet, and we need the large model to decode the logit of this + // token to verify its childs (the tokens in the first layer). This method + // should: construct and add the root token to the empty speculative token + // tree, with parent_pos being -1 and log_accumulated_prob being 0.0 + Request &request = all_requests[guid]; + TokenTree &speculative_token_tree = request.speculative_token_trees[0]; + speculative_token_tree.add_layer(); + auto node_ptr = std::make_shared(token_id, 0.0, -1); + node_ptr->included = true; + if (speculative_sampling) { + node_ptr->gumbel = true; + } + speculative_token_tree.tree_layers[0].push_back(node_ptr); +} + +void RequestManager::add_tokens_to_spec_token_tree( + InferenceResult const &ssm_inference_result) { + // TODO: parameterize MAX_SPECULATIVE_TREE_BRANCHES + // TODO: support gumbel sampling + + int tree_width = + min(get_max_tokens_per_ssm_batch() / get_num_active_requests(), + get_max_tree_width()); + assert(tree_width >= 1); + + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + // Request in this slot is unavailable + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + + int parent_num = request.num_tokens_in_batch; + if (parent_num == 0) { + continue; + } + + // ssm_first_step only decode the last token (the root of the tree) + int result_offset = + (request.first_token_offset_in_batch + + (current_ssm_step == 1 ? 
(request.num_tokens_in_batch - 1) : 0)) * + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + TokenTree &spec_token_tree = request.speculative_token_trees[0]; + std::vector> &last_layer = + spec_token_tree.tree_layers.back(); + std::vector> child_probs_v; + child_probs_v.reserve(BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES * + get_max_tree_width()); + int parent_pos = 0; + for (auto const &parent_ptr : last_layer) { + double parent_log_prob = parent_ptr->log_accumulated_prob; + int child_start_idx = + result_offset + + parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + for (int result_idx = child_start_idx; + result_idx < + child_start_idx + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + result_idx++) { + double log_prob = log((double)ssm_inference_result.probs[result_idx]); + if (log_prob == -std::numeric_limits::infinity()) { + continue; + } + if (log_prob == 0.0) { + // Slightly perturb the log prob to make it strictly less than 0 + log_prob -= 1e-10; + } + + double accumulated_log_prob = log_prob + parent_log_prob; + + child_probs_v.emplace_back(accumulated_log_prob, result_idx); + } + parent_pos++; + } + + spec_token_tree.add_layer(); + int actual_width = min(tree_width, (int)child_probs_v.size()); + if (actual_width == 0) { + continue; + } + std::partial_sort(child_probs_v.begin(), + child_probs_v.begin() + actual_width, + child_probs_v.end(), + std::greater>()); + for (int i = 0; i < actual_width; i++) { + auto [accumulated_log_prob, result_idx] = child_probs_v[i]; + int parent_pos = (result_idx - result_offset) / + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + std::shared_ptr node_ptr = std::make_shared( + ssm_inference_result.token_ids[result_idx], + accumulated_log_prob, + parent_pos); + spec_token_tree.tree_layers.back().push_back(node_ptr); + request.token_tree_nodes_acc_prob_pair_pq.push( + std::make_pair(node_ptr, accumulated_log_prob)); + } + } +} + +void RequestManager::add_tokens_to_spec_token_tree_old_version( + InferenceResult const &ssm_inference_result) { + + std::vector tree_width_vector = { + 1, 1, this->expansion_degree, 1, 1, 1, 1, 1}; + + int expand_width = tree_width_vector[current_ssm_step - 1]; + + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + // Request in this slot is unavailable + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + + int parent_num = request.num_tokens_in_batch; + if (parent_num == 0) { + continue; + } + + // ssm_first_step only decode the last token (the root of the tree) + int result_offset = + (request.first_token_offset_in_batch + + (current_ssm_step == 1 ? 
(request.num_tokens_in_batch - 1) : 0)) * + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + TokenTree &spec_token_tree = request.speculative_token_trees[0]; + std::vector> &last_layer = + spec_token_tree.tree_layers.back(); + spec_token_tree.add_layer(); + + int parent_pos = 0; + for (auto const &parent_ptr : last_layer) { + double parent_log_prob = parent_ptr->log_accumulated_prob; + int child_start_idx = + result_offset + + parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + std::vector> child_probs_v; + child_probs_v.reserve(BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); + for (int result_idx = child_start_idx; + result_idx < + child_start_idx + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + result_idx++) { + double log_prob = log((double)ssm_inference_result.probs[result_idx]); + if (log_prob == -std::numeric_limits::infinity()) { + continue; + } + if (log_prob == 0.0) { + // Slightly perturb the log prob to make it strictly less than 0 + log_prob -= 1e-10; + } + + double accumulated_log_prob = log_prob + parent_log_prob; + + child_probs_v.emplace_back(accumulated_log_prob, result_idx); + } + int actual_width = min(expand_width, (int)child_probs_v.size()); + if (actual_width == 0) { + continue; + } + std::partial_sort(child_probs_v.begin(), + child_probs_v.begin() + actual_width, + child_probs_v.end(), + std::greater>()); + for (int i = 0; i < actual_width; i++) { + auto [accumulated_log_prob, result_idx] = child_probs_v[i]; + std::shared_ptr node_ptr = + std::make_shared( + ssm_inference_result.token_ids[result_idx], + accumulated_log_prob, + parent_pos); + node_ptr->included = true; + spec_token_tree.tree_layers.back().push_back(node_ptr); + } + parent_pos++; + } + } +} + +void RequestManager::prune_token_tree() { + if (get_greedy_schedule()) { + return prune_token_tree_greedy(); + } else if (get_equal_schedule()) { + return prune_token_tree_equal(); + } + + // Each reqeust has at least one token + int budget = get_max_tokens_per_batch() - num_available_requests; + assert(budget >= 0); + + std::vector> num_tokens_to_decode_2_request_index; + num_tokens_to_decode_2_request_index.reserve(get_max_requests_per_batch()); + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + if (request.get_slo_ratio() > 999) { // infinity + continue; + } + double num_tokens_to_decode_per_step = + (ssm_spec_latency_ms + llm_verify_latency_ms) * correction_factor / + get_slo_constraint(request); + double expected_num_tokens_decoded = + request.decode_latency_ms / get_slo_constraint(request); + double num_tokens_to_decode = + max(1.0, + num_tokens_to_decode_per_step + expected_num_tokens_decoded - + request.decode_length()); + num_tokens_to_decode = + min(num_tokens_to_decode, (double)ssm_tree_depth + 1); + num_tokens_to_decode_2_request_index.push_back( + std::make_pair(num_tokens_to_decode, request_index)); + } + + // Sort the requests by spare latency in ascending order + std::sort(num_tokens_to_decode_2_request_index.begin(), + num_tokens_to_decode_2_request_index.end(), + std::less>()); + + for (auto const &spare_latency_request_index_pair : + num_tokens_to_decode_2_request_index) { + int request_index = spare_latency_request_index_pair.second; + RequestGuid guid = guid_of_requests[request_index]; + if (all_requests[guid].get_slo_ratio() < 0) { + continue; + } 
+ add_tokens_toward_slo( + guid, budget, num_tokens_to_decode_2_request_index.size()); + } + + assert(budget >= 0); + if (budget > 0) { + if (memory_occupancy) { + add_tokens_toward_memory_occupancy(budget); + } else { + add_tokens_toward_goodput(budget); + } + } +} + +void RequestManager::prune_token_tree_equal() { + // Each reqeust has at least one token + int const equal_budget = + get_max_tokens_per_batch() / get_num_active_requests(); + assert(equal_budget >= 0); + + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + int budget = equal_budget; + assert(budget >= 0); + if (budget > 0) { + add_tokens_toward_goodput_per_request(budget, request_index); + } + } +} + +void RequestManager::prune_token_tree_greedy() { + // Each reqeust has at least one token + int budget = get_max_tokens_per_batch(); + assert(budget >= 0); + + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + } + + if (budget > 0) { + add_tokens_toward_goodput(budget); + } +} + +void RequestManager::add_tokens_toward_slo(RequestGuid guid, + int &budget, + int num_req_with_slo) { + Request &request = all_requests[guid]; + double num_tokens_to_decode_per_step = + (ssm_spec_latency_ms + llm_verify_latency_ms) * correction_factor / + get_slo_constraint(request); + double expected_num_tokens_decoded = + request.decode_latency_ms / get_slo_constraint(request); + + double num_tokens_to_decode = + max(1.0, + num_tokens_to_decode_per_step + expected_num_tokens_decoded - + request.decode_length()); + num_tokens_to_decode = min(num_tokens_to_decode, (double)ssm_tree_depth + 1); + + // The root is already included + // In function add_root_to_spec_token_tree + double current_added = 1.0; + + // The max token that can be added to the token tree when fulfilling the SLO + int max_token_toward_slo = + int(get_max_tokens_per_batch() * 1.2 / num_available_requests); + + while (budget > 0 and max_token_toward_slo > 0 and + current_added < num_tokens_to_decode) { + if (request.token_tree_nodes_acc_prob_pair_pq.empty()) { + break; + } + auto [node_ptr, log_acc_prob] = + request.token_tree_nodes_acc_prob_pair_pq.top(); + request.token_tree_nodes_acc_prob_pair_pq.pop(); + node_ptr->included = true; + current_added += exp(log_acc_prob); + budget--; + max_token_toward_slo--; + } +} + +void RequestManager::add_tokens_toward_memory_occupancy(int budget) { + // This is a helper data structure to store help the pruning of the token + // trees across different requests. 
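// --- Editorial sketch (not part of the patch) --------------------------------
// The per-request budget used above and in prune_token_tree() comes from a
// simple latency model: at an SLO of `slo_constraint` ms per token, one
// speculation + verification round costing
// (ssm_spec_latency_ms + llm_verify_latency_ms) * correction_factor ms must
// produce that many tokens, plus any deficit accumulated so far
// (decode_latency_ms / slo_constraint - decode_length). The result is clamped
// to [1, ssm_tree_depth + 1]. Parameter names mirror the fields used in the
// surrounding code; the helper itself is hypothetical.
#include <algorithm>

static double tokens_to_decode_this_step(double ssm_spec_latency_ms,
                                         double llm_verify_latency_ms,
                                         double correction_factor,
                                         double slo_constraint_ms_per_token,
                                         double decode_latency_ms,
                                         double decode_length,
                                         int ssm_tree_depth) {
  double const per_step = (ssm_spec_latency_ms + llm_verify_latency_ms) *
                          correction_factor / slo_constraint_ms_per_token;
  double const expected_so_far =
      decode_latency_ms / slo_constraint_ms_per_token;
  double target = std::max(1.0, per_step + expected_so_far - decode_length);
  // Never ask for more tokens than one speculative tree can contribute.
  return std::min(target, static_cast<double>(ssm_tree_depth + 1));
}
// --- End editorial sketch -----------------------------------------------------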
+ std::vector, RequestGuid>> + global_token_tree_node_vector; + global_token_tree_node_vector.reserve(get_max_requests_per_batch()); + std::priority_queue< + std::pair, RequestGuid>, + std::vector, RequestGuid>>, + SharedTokenTreeNodePtrRequestGuidWeightedLess> + global_token_tree_node_pq(SharedTokenTreeNodePtrRequestGuidWeightedLess(), + std::move(global_token_tree_node_vector)); + + // Initialie the priority queue with the top element in each request's token + // tree + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + if (request.token_tree_nodes_acc_prob_pair_pq.empty()) { + continue; + } + if (!request.token_tree_nodes_acc_prob_pair_pq.empty()) { + global_token_tree_node_pq.push( + {request.token_tree_nodes_acc_prob_pair_pq.top().first, guid}); + request.token_tree_nodes_acc_prob_pair_pq.pop(); + } + } + + // Perform dequeue and enqueue until the budget is used up + while (budget > 0 and !global_token_tree_node_pq.empty()) { + auto [node_ptr, guid] = global_token_tree_node_pq.top(); + global_token_tree_node_pq.pop(); + node_ptr->included = true; + if (!get_request_with_guid(guid) + .token_tree_nodes_acc_prob_pair_pq.empty()) { + global_token_tree_node_pq.push( + {get_request_with_guid(guid) + .token_tree_nodes_acc_prob_pair_pq.top() + .first, + guid}); + get_request_with_guid(guid).token_tree_nodes_acc_prob_pair_pq.pop(); + } + budget--; + } + + // Clear the priority queue in each requests + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + std::vector, double>> + _prealloc_vector; + _prealloc_vector.reserve(BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); + request.token_tree_nodes_acc_prob_pair_pq = std::priority_queue< + std::pair, double>, + std::vector, double>>, + SharedTokenTreeNodePtrDoubleLess>(SharedTokenTreeNodePtrDoubleLess(), + std::move(_prealloc_vector)); + } +} + +void RequestManager::add_tokens_toward_goodput(int budget) { + // This is a helper data structure to store help the pruning of the token + // trees across different requests. 
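// --- Editorial sketch (not part of the patch) --------------------------------
// add_tokens_toward_memory_occupancy() above and add_tokens_toward_goodput()
// below follow the same k-way-merge pattern: seed a global heap with the best
// candidate from every available request, repeatedly pop the global best, mark
// it as included, and refill the heap from the owning request until the budget
// is exhausted. A compact stand-alone illustration over plain scores (the real
// code carries shared_ptr tree nodes and request guids); the helper name
// `spend_budget_greedily` is hypothetical.
#include <cstddef>
#include <queue>
#include <utility>
#include <vector>

static int spend_budget_greedily(
    std::vector<std::priority_queue<double>> &per_request, int budget) {
  // Global max-heap of (score, owning request index).
  std::priority_queue<std::pair<double, std::size_t>> global_pq;
  for (std::size_t r = 0; r < per_request.size(); ++r) {
    if (!per_request[r].empty()) {
      global_pq.push({per_request[r].top(), r});
      per_request[r].pop();
    }
  }
  int taken = 0;
  while (budget > 0 && !global_pq.empty()) {
    std::size_t const r = global_pq.top().second;
    global_pq.pop();
    ++taken; // in the real code: node_ptr->included = true;
    --budget;
    if (!per_request[r].empty()) { // refill from the same request
      global_pq.push({per_request[r].top(), r});
      per_request[r].pop();
    }
  }
  return taken;
}
// --- End editorial sketch -----------------------------------------------------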
+ std::vector, double, RequestGuid>> + global_token_tree_node_vector; + global_token_tree_node_vector.reserve(get_max_requests_per_batch()); + std::priority_queue< + std::tuple, double, RequestGuid>, + std::vector< + std::tuple, double, RequestGuid>>, + SharedTokenTreeNodePtrDoubleRequestGuidLess> + global_token_tree_node_pq(SharedTokenTreeNodePtrDoubleRequestGuidLess(), + std::move(global_token_tree_node_vector)); + + // Initialie the priority queue with the top element in each request's token + // tree + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + if (request.token_tree_nodes_acc_prob_pair_pq.empty()) { + continue; + } + if (!request.token_tree_nodes_acc_prob_pair_pq.empty()) { + global_token_tree_node_pq.push( + {request.token_tree_nodes_acc_prob_pair_pq.top().first, + request.token_tree_nodes_acc_prob_pair_pq.top().second, + guid}); + request.token_tree_nodes_acc_prob_pair_pq.pop(); + } + } + + // Perform dequeue and enqueue until the budget is used up + while (budget > 0 and !global_token_tree_node_pq.empty()) { + auto [node_ptr, acc_log_prob, guid] = global_token_tree_node_pq.top(); + global_token_tree_node_pq.pop(); + node_ptr->included = true; + if (!get_request_with_guid(guid) + .token_tree_nodes_acc_prob_pair_pq.empty()) { + global_token_tree_node_pq.push( + {get_request_with_guid(guid) + .token_tree_nodes_acc_prob_pair_pq.top() + .first, + get_request_with_guid(guid) + .token_tree_nodes_acc_prob_pair_pq.top() + .second, + guid}); + get_request_with_guid(guid).token_tree_nodes_acc_prob_pair_pq.pop(); + } + budget--; + } + + // Clear the priority queue in each requests + for (int request_index = 0; request_index < get_max_requests_per_batch(); + ++request_index) { + if (!request_available[request_index]) { + continue; + } + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + std::vector, double>> + _prealloc_vector; + _prealloc_vector.reserve(BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); + request.token_tree_nodes_acc_prob_pair_pq = std::priority_queue< + std::pair, double>, + std::vector, double>>, + SharedTokenTreeNodePtrDoubleLess>(SharedTokenTreeNodePtrDoubleLess(), + std::move(_prealloc_vector)); + } +} + +void RequestManager::add_tokens_toward_goodput_per_request(int budget, + int request_index) { + RequestGuid guid = guid_of_requests[request_index]; + Request &request = all_requests[guid]; + assert(request.status == Request::RUNNING); + if (request.token_tree_nodes_acc_prob_pair_pq.empty()) { + return; + } + + auto &pq = request.token_tree_nodes_acc_prob_pair_pq; + + // Perform dequeue and enqueue until the budget is used up + while (budget > 0 and !pq.empty()) { + auto [node_ptr, acc_log_prob] = pq.top(); + pq.pop(); + node_ptr->included = true; + budget--; + } + + // Clear the priority queue in the request + std::vector, double>> + _prealloc_vector; + _prealloc_vector.reserve(BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); + request.token_tree_nodes_acc_prob_pair_pq = std::priority_queue< + std::pair, double>, + std::vector, double>>, + SharedTokenTreeNodePtrDoubleLess>(SharedTokenTreeNodePtrDoubleLess(), + std::move(_prealloc_vector)); +} + +std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree) { + os << "Token tree: " << std::endl; 
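// --- Editorial sketch (not part of the patch) --------------------------------
// The "clear" step at the end of the functions above does not pop elements one
// by one; it rebuilds the per-request priority queue from a freshly reserved
// vector via the std::priority_queue(const Compare&, Container&&) constructor,
// so the next scheduling round starts with preallocated storage. Minimal
// illustration; `make_reset_pq` is a hypothetical helper name.
#include <cstddef>
#include <functional>
#include <queue>
#include <vector>

static std::priority_queue<double, std::vector<double>, std::less<double>>
    make_reset_pq(std::size_t capacity) {
  std::vector<double> storage;
  storage.reserve(capacity); // e.g. BatchConfig::MAX_SPEC_TREE_TOKEN_NUM
  return std::priority_queue<double, std::vector<double>, std::less<double>>(
      std::less<double>(), std::move(storage));
}
// --- End editorial sketch -----------------------------------------------------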
+ int layer_idx = 0; + for (auto const &layer : token_tree.tree_layers) { + os << "Layer: " << layer_idx << std::endl; + int token_pos = 0; + for (auto const &node : layer) { + os << std::fixed << std::setprecision(12); + os << "token pos: " << token_pos << "\ttoken id: " << node->id + << "\tparent pos: " << node->parent_pos + << "\tlog prob: " << node->log_accumulated_prob + << (node->included ? " included" : " not included") << std::endl; + token_pos++; + } + layer_idx++; + } + return os; +} + +/* --------- Request Token Tree Related Functions --------- */ + +/* --------- Profiling Related Functions --------- */ +void RequestManager::reset_profiling_statistics() { + profiling.llm_step_times.clear(); + profiling.requests_per_step.clear(); + profiling.ssm_step_times.clear(); + profiling.ssm_steps.clear(); + profiling.generated_tokens_per_step.clear(); + profiling.llm_step_start = 0; + profiling.ssm_step_start = 0; + profiling.server_start_time = Realm::Clock::current_time_in_microseconds(); +} +/* --------- Profiling Related Functions --------- */ }; // namespace FlexFlow diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index fadbf80d6..e3e5a5d5f 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/request_manager.h" +#include "flexflow/ffconst.h" #include "flexflow/utils/hip_helper.h" #include @@ -35,9 +36,14 @@ void RequestManager::load_tokens_task( // Extreme long prompts are not supported, only load up to // max_tokens_per_batch as prompt - if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch()) { + int max_tokens_per_batch = + std::max(batch_config->get_mode() == TREE_SEARCH_MODE + ? BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()); + if (batch_config->num_tokens > max_tokens_per_batch) { printf("Warning: too many tokens in prompt, only load up to %d tokens\n", - BatchConfig::max_tokens_per_batch()); + max_tokens_per_batch); printf("Got: %d tokens\n", batch_config->num_tokens); } @@ -90,53 +96,30 @@ void RequestManager::load_batch_config_task( total_copy_size += sizeof(BatchConfig::requestsInfo); // load speculative metadata - if (batch_config->get_mode() == BEAM_SEARCH_MODE) { - BeamSearchBatchConfig const *beam_batch_config = - static_cast(batch_config); - - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, - &(beam_batch_config->beamTokenInfo), - sizeof(BeamSearchBatchConfig::beamTokenInfo), - hipMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - + if (batch_config->get_mode() == TREE_SEARCH_MODE) { checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamRequestsInfo), - sizeof(BeamSearchBatchConfig::beamRequestsInfo), - hipMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, - &(beam_batch_config->causalMask), + &(batch_config->causalMask), sizeof(BatchConfig::causalMask), hipMemcpyHostToDevice, stream)); total_copy_size += sizeof(BatchConfig::causalMask); } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { - TreeVerifyBatchConfig const *tree_batch_config = - static_cast(batch_config); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + total_copy_size, - 
&(tree_batch_config->causalMask), + &(batch_config->causalMask), sizeof(BatchConfig::causalMask), hipMemcpyHostToDevice, stream)); total_copy_size += sizeof(BatchConfig::causalMask); checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->committed_tokens), - sizeof(TreeVerifyBatchConfig::committed_tokens), + &(batch_config->committed_tokens), + sizeof(BatchConfig::committed_tokens), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); + total_copy_size += sizeof(BatchConfig::committed_tokens); } // add a size check @@ -160,7 +143,7 @@ void RequestManager::load_positions_task( int dram_copy[BatchConfig::MAX_NUM_TOKENS]; for (int i = 0; i < batch_config->num_tokens; i++) { - dram_copy[i] = batch_config->tokensInfo[i].abs_depth_in_request + offset; + dram_copy[i] = batch_config->tokensInfo[i].abs_index_in_request + offset; } hipStream_t stream; checkCUDA(get_legion_stream(&stream)); diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 8380d6be7..be09ee7b2 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -13,13 +13,25 @@ * limitations under the License. */ +#include "flashinfer/decode_attention_decl.cuh" +#include "flashinfer/prefill_attention_decl.cuh" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/request_manager.h" #include "flexflow/utils/cuda_helper.h" +#include namespace FlexFlow { using namespace Legion; +using flashinfer::BatchDecodeHandler; +using flashinfer::BatchPrefillHandler; +using flashinfer::LogitsPostHook; +using flashinfer::paged_kv_t; +using flashinfer::PageStorage; +using flashinfer::PosEncodingMode; +using flashinfer::QKVLayout; + void RequestManager::load_tokens_task( Task const *task, std::vector const ®ions, @@ -35,36 +47,295 @@ void RequestManager::load_tokens_task( // Extreme long prompts are not supported, only load up to // BatchConfig::max_tokens_per_batch() as prompt - if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch() && - batch_config->get_mode() == INC_DECODING_MODE) { + int max_tokens_per_batch = + std::max(batch_config->get_mode() == TREE_SEARCH_MODE + ? BatchConfig::max_tokens_per_ssm_batch() + : BatchConfig::max_tokens_per_batch(), + BatchConfig::max_tokens_per_prefilling_batch()); + if (batch_config->num_tokens > max_tokens_per_batch) { printf("Warning: too many tokens in prompt, only load up to %d tokens\n", - BatchConfig::max_tokens_per_batch()); - printf("Got: %d tokens\n", batch_config->num_tokens); - } else if (batch_config->num_tokens > - BatchConfig::max_verify_tokens_per_batch()) { - printf("Warning: Speculative decoding. 
too many tokens in prompt, only " - "load up to %d tokens\n", - BatchConfig::max_verify_tokens_per_batch()); + max_tokens_per_batch); printf("Got: %d tokens\n", batch_config->num_tokens); } - for (int i = 0; i < batch_config->num_tokens; i++) { - dram_copy[i] = batch_config->tokensInfo[i].token_id; + if (batch_config->num_tokens > 0) { + for (int i = 0; i < batch_config->num_tokens; i++) { + dram_copy[i] = batch_config->tokensInfo[i].token_id; + } + TokenId *fb_ptr = helperGetTensorPointerWO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + Domain domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + assert(batch_config->num_tokens <= domain.get_volume()); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDA(cudaMemcpyAsync(fb_ptr, + dram_copy, + sizeof(TokenId) * batch_config->num_tokens, + cudaMemcpyHostToDevice, + stream)); } - TokenId *fb_ptr = helperGetTensorPointerWO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - Domain domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - assert(batch_config->num_tokens <= domain.get_volume()); - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkCUDA(cudaMemcpyAsync(fb_ptr, - dram_copy, - sizeof(TokenId) * batch_config->num_tokens, +} + +void prepare_inference_params_kernel_h(BatchConfig const *batch_config, + PageManager *pm, + AttentionMetaData *attention_metadata, + cudaStream_t stream, + uint32_t const max_num_pages, + int32_t *q_indptr_h, + int32_t *kv_indptr_h, + int32_t *kv_indices_h, + int32_t *kv_last_page_len_h, + int32_t *qk_indptr_h) { + int batch_size = batch_config->num_active_requests(); + // we just search for the page number for each request + q_indptr_h[0] = 0; + kv_indptr_h[0] = 0; + qk_indptr_h[0] = 0; + int q_lens = 0, qk_lens = 0; + int indices_offset = 0, indices_lens = 0; + for (int req_idx = 0, indptr_idx = 0; + req_idx < batch_config->max_requests_per_batch(); + req_idx++) { + if (batch_config->request_available[req_idx]) { + int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch; + int kv_len = + batch_config->requestsInfo[req_idx].num_tokens_in_batch + + batch_config->requestsInfo[req_idx].first_token_index_in_request; + + q_lens += q_len; + qk_lens += (q_len * kv_len + 7) / 8; + indices_offset = indices_lens; + indices_lens += (kv_len + kPagesize - 1) / kPagesize; + q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len; + kv_indptr_h[indptr_idx + 1] = + round_up_pages(kv_len) + kv_indptr_h[indptr_idx]; + std::vector kv_indices = pm->get_block_table_indices( + batch_config->requestsInfo[req_idx].request_guid); + for (int i = indices_offset; i < indices_lens; i++) { + kv_indices_h[i] = kv_indices[i - indices_offset]; + } + kv_last_page_len_h[indptr_idx] = (kv_len - 1) % kPagesize + 1; + qk_indptr_h[indptr_idx + 1] = qk_lens; + indptr_idx++; + } + } + + // do the copy + checkCUDA(cudaMemcpyAsync(attention_metadata->q_indptr, + q_indptr_h, + sizeof(int32_t) * (batch_size + 1), + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(attention_metadata->kv_indptr, + kv_indptr_h, + sizeof(int32_t) * (batch_size + 1), + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(attention_metadata->kv_indices, + kv_indices_h, + sizeof(int32_t) * batch_size * max_num_pages, + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(attention_metadata->kv_last_page_len, + kv_last_page_len_h, + sizeof(int32_t) * batch_size, + 
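// --- Editorial worked example (not part of the patch) -------------------------
// How the host-side index arrays computed above fit together, for a
// hypothetical batch of two available requests and an illustrative page size
// of 64 (the actual kPagesize is whatever the build defines):
//   request A: num_tokens_in_batch = 4, first_token_index = 60  -> kv_len = 64
//   request B: num_tokens_in_batch = 1, first_token_index = 129 -> kv_len = 130
//   q_indptr         = [0, 4, 5]    (prefix sums of q_len)
//   kv_indptr        = [0, 1, 4]    (prefix sums of ceil(kv_len / 64))
//   kv_indices       = 1 page id for A followed by 3 page ids for B, taken
//                      from PageManager::get_block_table_indices()
//   kv_last_page_len = [64, 2]      ((kv_len - 1) % 64 + 1)
//   qk_indptr        = [0, 32, 49]  (prefix sums of ceil(q_len * kv_len / 8),
//                                    i.e. bytes of the bit-packed custom mask)
// ------------------------------------------------------------------------------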
cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(attention_metadata->qk_indptr, + qk_indptr_h, + sizeof(int32_t) * (batch_size + 1), cudaMemcpyHostToDevice, stream)); } +// q_indptr: the start offset of q in the batch for each request, +// the length is `num_requests + 1`: [0, num_q_0, num_q_0 + num_q_1, +// ..., num_q_0 + num_q_1 + ... + num_q_{num_requests - 1}] +// kv_indptr: the start offset of kv page_indices for each request, +// the length is `num_requests + 1`. +// kv_indices: the page indices for kv, the length is `num_kv_pages`. +// kv_last_page_len: the cache length in the last page for each request, +// the length is `num_requests`. +// qk_indptr: the start offset of custom_mask in the flattened mask for each +// request, the length is `num_requests + 1`. It can be calculated as +// accumulative `ceil(qk_len / 8)`. +__global__ void + prepare_inference_params_kernel(int const num_requests, + BatchConfig::PerRequestInfo *request_infos, + bool *request_available, + uint32_t const max_num_pages, + int32_t *q_indptr, + int32_t *kv_indptr, + int32_t *kv_indices, + int32_t *kv_last_page_len, + int32_t *qk_indptr) { + int const request_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (request_idx >= num_requests) { + return; + } + + // request id in batch config + int requext_idx_in_batch = -1; + int cnt_1 = 0, q_lens = 0, qk_lens = 0; + int indices_offset = 0, indices_lens = 0, kv_len = 0; + while (cnt_1 < request_idx + 1) { + requext_idx_in_batch++; + if (request_available[requext_idx_in_batch]) { + cnt_1++; + int q_len = request_infos[requext_idx_in_batch].num_tokens_in_batch; + q_lens += q_len; + kv_len = request_infos[requext_idx_in_batch].num_tokens_in_batch + + request_infos[requext_idx_in_batch].first_token_index_in_request; + qk_lens += (q_len * kv_len + 7) / 8; + indices_offset = indices_lens; + indices_lens += (kv_len + kPagesize - 1) / kPagesize; + } + } + + if (request_idx == 0) { + q_indptr[0] = 0; + kv_indptr[0] = 0; + qk_indptr[0] = 0; + } + __syncthreads(); + q_indptr[request_idx + 1] = q_lens; + kv_indptr[request_idx + 1] = indices_lens; + for (int i = indices_offset; i < indices_lens; i++) { + kv_indices[i] = max_num_pages * requext_idx_in_batch + (i - indices_offset); + } + kv_last_page_len[request_idx] = (kv_len - 1) % kPagesize + 1; + qk_indptr[request_idx + 1] = qk_lens; +} + +#define test_bit_orig(bit_mask, idx, pos) \ + (((bit_mask)[idx].bits[(pos) / 64] & (1ULL << ((pos) % 64))) != 0) + +// cache = (global-sink) % window + sink +#define cache_2_global_index(cache_info, cache_index) \ + do { \ + if (cache_index >= (cache_info).sink_cache_size) { \ + cache_index -= (cache_info).sink_cache_size; \ + int num_window = \ + ((cache_info).total_len - (cache_info).sink_cache_size) / \ + (cache_info).window_cache_size - \ + ((cache_info).window_back <= cache_index); \ + cache_index += (cache_info).sink_cache_size + \ + num_window * (cache_info).window_cache_size; \ + } \ + } while (0) + +__global__ void + update_custom_mask_kernel(uint8_t *custom_mask, + int32_t const *qk_indptr, + BatchConfig::BitMask *causalMask, + BatchConfig::PerRequestInfo *request_infos, + bool *request_available, + uint32_t const num_requests, + StreamingCacheInfo *streaming_cache_infos, + bool streaming_cache) { + int byte_idx = blockIdx.x * blockDim.x + threadIdx.x; + int request_idx = 0; + while (request_idx < num_requests) { + if (qk_indptr[request_idx + 1] > byte_idx) { + break; + } + request_idx++; + } + + if (request_idx >= num_requests) { + return; + } + byte_idx -= 
qk_indptr[request_idx]; + + // request id in batch config + int requext_idx_in_batch = -1, cnt_1 = 0; + while (cnt_1 < request_idx + 1) { + requext_idx_in_batch++; + if (request_available[requext_idx_in_batch]) { + cnt_1++; + } + } + + BatchConfig::PerRequestInfo const &request_info = + request_infos[requext_idx_in_batch]; + BatchConfig::BitMask &causal_mask = causalMask[requext_idx_in_batch]; + + int const q_length = request_info.num_tokens_in_batch, + q_start = request_info.first_token_index_in_request - + causal_mask.non_tree_cache_size, + non_tree_cache_size = causal_mask.non_tree_cache_size, + kv_len = request_info.num_tokens_in_batch + + request_info.first_token_index_in_request; + + uint8_t packed_bits = 0; + for (int bit_idx = 0; bit_idx < 8; bit_idx++) { + int const bit_offset = byte_idx * 8 + bit_idx, q_idx = bit_offset / kv_len; + int kv_idx = bit_offset % kv_len; + if (streaming_cache) { // recover to the original index + if (kv_idx < streaming_cache_infos[requext_idx_in_batch].commit_len) { + cache_2_global_index(streaming_cache_infos[requext_idx_in_batch], + kv_idx); + } else { + kv_idx += streaming_cache_infos[requext_idx_in_batch].total_len - + streaming_cache_infos[requext_idx_in_batch].commit_len; + } + } + if (kv_idx < non_tree_cache_size || q_idx >= q_length) { + packed_bits |= 1 << bit_idx; + } else { + if (test_bit_orig(causal_mask.bit_mask, + q_start + q_idx, + kv_idx - non_tree_cache_size)) { + packed_bits |= 1 << bit_idx; + } + } + } + custom_mask[qk_indptr[request_idx] + byte_idx] = packed_bits; +} + +// Passing the CPU-side causalMask, then output the bit-packed custom_mask for +// attention forward. +// Layout of causalMask: [num_requests][tree_size][tree_size] +// Layout of custom_mask: [num_requests][q_length][kv_length] (bit-packed) +// Note that for spec-decoding, q_length == last_layer_length != tree_size +// Also we should consider the influence of StreamingCache +void update_custom_mask(BatchConfig const *batch_config, + AttentionMetaData *metadata, + BatchConfig::BitMask *causalMask, + BatchConfig::PerRequestInfo *request_infos, + bool *request_available, + int batch_size, + StreamingCacheInfo *streaming_cache_infos, + cudaStream_t stream) { + InferenceMode mode = batch_config->get_mode(); + assert(mode == TREE_SEARCH_MODE || mode == TREE_VERIFY_MODE); + bool streaming_cache = + mode == TREE_SEARCH_MODE && batch_config->streaming_cache(); + int parallelism = 0; + for (int req_idx = 0; req_idx < batch_config->max_requests_per_batch(); + req_idx++) { + if (batch_config->request_available[req_idx]) { + int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch; + int kv_len = + batch_config->requestsInfo[req_idx].num_tokens_in_batch + + batch_config->requestsInfo[req_idx].first_token_index_in_request; + parallelism += (q_len * kv_len + 7) / 8; + } + } + update_custom_mask_kernel<<>>(metadata->custom_mask, + metadata->qk_indptr, + causalMask, + request_infos, + request_available, + batch_size, + streaming_cache_infos, + streaming_cache); +} + void RequestManager::load_batch_config_task( Task const *task, std::vector const ®ions, @@ -81,86 +352,406 @@ void RequestManager::load_batch_config_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); size_t total_copy_size = 0; - checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata, - &(batch_config->tokensInfo), - sizeof(BatchConfig::tokensInfo), - cudaMemcpyHostToDevice, - stream)); + if (batch_config->num_tokens > 0) { + // The tokensInfo is compact + 
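// --- Editorial sketch (not part of the patch) --------------------------------
// update_custom_mask_kernel above flattens each request's
// [q_length x kv_length] visibility matrix row-major and packs 8 entries per
// byte (least-significant bit first), starting at byte qk_indptr[request] of
// the shared custom_mask buffer. A host-side helper that reads one entry back
// out of that layout; `custom_mask_bit` is hypothetical and for illustration
// only.
#include <cstdint>

static bool custom_mask_bit(uint8_t const *custom_mask,
                            int32_t const *qk_indptr,
                            int request_idx,
                            int q_idx,
                            int kv_idx,
                            int kv_len) {
  int const bit_offset = q_idx * kv_len + kv_idx;            // row-major flatten
  uint8_t const byte =
      custom_mask[qk_indptr[request_idx] + bit_offset / 8];  // per-request base
  return (byte >> (bit_offset % 8)) & 1;                     // LSB-first packing
}
// --- End editorial sketch -----------------------------------------------------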
checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata, + &(batch_config->tokensInfo), + batch_config->num_tokens * + sizeof(BatchConfig::PerTokenInfo), + cudaMemcpyHostToDevice, + stream)); + } total_copy_size += sizeof(BatchConfig::tokensInfo); + for (int request_idx = 0; request_idx < BatchConfig::max_requests_per_batch(); + request_idx++) { + if (batch_config->request_available[request_idx]) { + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size + + request_idx * sizeof(BatchConfig::PerRequestInfo), + &(batch_config->requestsInfo[request_idx]), + sizeof(BatchConfig::PerRequestInfo), + cudaMemcpyHostToDevice, + stream)); + } + } + total_copy_size += sizeof(BatchConfig::requestsInfo); + checkCUDA(cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->requestsInfo), - sizeof(BatchConfig::requestsInfo), + &(batch_config->request_available), + sizeof(BatchConfig::request_available), cudaMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::requestsInfo); + total_copy_size += sizeof(BatchConfig::request_available); + + for (int request_idx = 0; request_idx < BatchConfig::max_requests_per_batch(); + request_idx++) { + if (batch_config->request_available[request_idx]) { + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size + + request_idx * sizeof(BatchConfig::BitMask), + &(batch_config->causalMask[request_idx]), + sizeof(BatchConfig::BitMask), + cudaMemcpyHostToDevice, + stream)); + } + } + total_copy_size += sizeof(BatchConfig::causalMask); - // load speculative metadata - if (batch_config->get_mode() == BEAM_SEARCH_MODE) { - BeamSearchBatchConfig const *beam_batch_config = - static_cast(batch_config); + checkCUDA(cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(batch_config->streamingCacheInfo), + sizeof(BatchConfig::streamingCacheInfo), + cudaMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::streamingCacheInfo); + if (batch_config->num_tokens_to_commit > 0) { checkCUDA(cudaMemcpyAsync( static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamTokenInfo), - sizeof(BeamSearchBatchConfig::beamTokenInfo), + &(batch_config->committed_tokens), + batch_config->num_tokens_to_commit * + sizeof(BatchConfig::CommittedTokensInfo), cudaMemcpyHostToDevice, stream)); + } + total_copy_size += sizeof(BatchConfig::committed_tokens); - total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); + checkCUDA(cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(batch_config->num_tokens_to_commit), + sizeof(int), + cudaMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(int); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamRequestsInfo), - sizeof(BeamSearchBatchConfig::beamRequestsInfo), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); + // load attention metadata + if (batch_config->get_mode() == INC_DECODING_MODE) { + PageManager *pm = PageManager::get_page_manager(); + static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1], + kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1]; + static int32_t kv_indices_h[BatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_NUM_TOKENS]; + static int32_t qk_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1]; + static int32_t kv_last_page_len_h[BatchConfig::MAX_NUM_REQUESTS]; + if 
(handle.incr_attention_metadata->enabled()) { + // calculate the attention meta data + { + int batch_size = batch_config->num_active_requests(); + uint32_t const max_num_pages = + round_up_pages(BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BatchConfig::causalMask); + prepare_inference_params_kernel_h(batch_config, + pm, + handle.incr_attention_metadata, + stream, + max_num_pages, + q_indptr_h, + kv_indptr_h, + kv_indices_h, + kv_last_page_len_h, + qk_indptr_h); + } - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->request_completed), - sizeof(BatchConfig::request_completed), - cudaMemcpyHostToDevice, - stream)); + // prepare attention forward handler + { + int batch_size = batch_config->num_active_requests(); + if (!batch_config->prompt_phase) { + BatchDecodeHandler *handler = nullptr; + if (handle.incr_attention_metadata->decode_handler_collections.count( + batch_size) == 0) { + handle.incr_attention_metadata + ->decode_handler_collections[batch_size] = static_cast( + new flashinfer::BatchDecodeHandler(true, batch_size)); + } + handler = static_cast( + handle.incr_attention_metadata + ->decode_handler_collections[batch_size]); + + handler->SetCUDAStream(stream); + DISPATCH_HEADDIM( + handle.incr_attention_metadata->head_dim(), HEAD_DIM, { + handler->BeginForwardDispatched( + static_cast( + handle.incr_attention_metadata->float_workspace), + handle.incr_attention_metadata->float_workspace_size, + static_cast( + handle.incr_attention_metadata->int_workspace), + handle.incr_attention_metadata->int_workspace_size, + static_cast(kv_indptr_h), + static_cast(kv_last_page_len_h), + batch_size, + handle.incr_attention_metadata->num_q_heads(), + handle.incr_attention_metadata->num_kv_heads(), + kPagesize); + }); + } else { + BatchPrefillHandler *handler = nullptr; + if (handle.incr_attention_metadata->prompt_handler_collections.count( + batch_size) == 0) { + handle.incr_attention_metadata + ->prompt_handler_collections[batch_size] = + static_cast(new flashinfer::BatchPrefillHandler(true)); + } + handler = static_cast( + handle.incr_attention_metadata + ->prompt_handler_collections[batch_size]); + + handler->SetCUDAStream(stream); + handler->BeginForward( + static_cast( + handle.incr_attention_metadata->float_workspace), + handle.incr_attention_metadata->float_workspace_size, + static_cast( + handle.incr_attention_metadata->int_workspace), + handle.incr_attention_metadata->int_workspace_size, + static_cast(q_indptr_h), + static_cast(kv_indptr_h), + batch_size, + handle.incr_attention_metadata->num_q_heads(), + handle.incr_attention_metadata->num_kv_heads(), + handle.incr_attention_metadata->head_dim(), + kPagesize); + } + } + } + } else if (batch_config->get_mode() == TREE_SEARCH_MODE) { + if (handle.tree_search_attention_metadata->enabled()) { + // calculate the attention meta data + { + BatchConfig::PerRequestInfo *request_infos = + reinterpret_cast( + static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo)); + bool *request_available = reinterpret_cast( + static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo)); + BatchConfig::BitMask *causalMask = + reinterpret_cast( + 
static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::request_available)); + StreamingCacheInfo *streaming_cache_infos = + reinterpret_cast( + static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::request_available) + + sizeof(BatchConfig::causalMask)); + int batch_size = batch_config->num_active_requests(); + uint32_t const max_num_pages = + round_up_pages(BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); + + int parallelism = batch_size; + prepare_inference_params_kernel<<>>( + batch_size, + request_infos, + request_available, + max_num_pages, + handle.tree_search_attention_metadata->q_indptr, + handle.tree_search_attention_metadata->kv_indptr, + handle.tree_search_attention_metadata->kv_indices, + handle.tree_search_attention_metadata->kv_last_page_len, + handle.tree_search_attention_metadata->qk_indptr); - total_copy_size += sizeof(BatchConfig::request_completed); + // Update gpu-side custom mask referring from CaualMask + if (!batch_config->prompt_phase) { + update_custom_mask(batch_config, + handle.tree_search_attention_metadata, + causalMask, + request_infos, + request_available, + batch_size, + streaming_cache_infos, + stream); + } + } + + // prepare attention forward handler + { + int batch_size = batch_config->num_active_requests(); + BatchPrefillHandler *handler = nullptr; + + if (!batch_config->prompt_phase) { + if (handle.tree_search_attention_metadata->decode_handler_collections + .count(batch_size) == 0) { + handle.tree_search_attention_metadata + ->decode_handler_collections[batch_size] = + static_cast(new flashinfer::BatchPrefillHandler(true)); + } + handler = static_cast( + handle.tree_search_attention_metadata + ->decode_handler_collections[batch_size]); + } else { + if (handle.tree_search_attention_metadata->prompt_handler_collections + .count(batch_size) == 0) { + handle.tree_search_attention_metadata + ->prompt_handler_collections[batch_size] = + static_cast(new flashinfer::BatchPrefillHandler(true)); + } + handler = static_cast( + handle.tree_search_attention_metadata + ->prompt_handler_collections[batch_size]); + } + + static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1], + kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1]; + q_indptr_h[0] = 0; + kv_indptr_h[0] = 0; + for (int req_idx = 0, indptr_idx = 0; + req_idx < batch_config->max_requests_per_batch(); + req_idx++) { + if (batch_config->request_available[req_idx]) { + int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch; + int kv_len = + batch_config->requestsInfo[req_idx].num_tokens_in_batch + + batch_config->requestsInfo[req_idx] + .first_token_index_in_request; + q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len; + kv_indptr_h[indptr_idx + 1] = + kv_indptr_h[indptr_idx] + round_up_pages(kv_len); + indptr_idx++; + } + } + + handler->SetCUDAStream(stream); + handler->BeginForward( + static_cast( + handle.tree_search_attention_metadata->float_workspace), + handle.tree_search_attention_metadata->float_workspace_size, + static_cast( + handle.tree_search_attention_metadata->int_workspace), + handle.tree_search_attention_metadata->int_workspace_size, + static_cast(q_indptr_h), + static_cast(kv_indptr_h), + batch_size, + handle.tree_search_attention_metadata->num_q_heads(), + handle.tree_search_attention_metadata->num_kv_heads(), + handle.tree_search_attention_metadata->head_dim(), + 
kPagesize); + } + } } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { - TreeVerifyBatchConfig const *tree_batch_config = - static_cast(batch_config); + PageManager *pm = PageManager::get_page_manager(); + static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1], + kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1]; + static int32_t kv_indices_h[BatchConfig::MAX_NUM_REQUESTS * + BatchConfig::MAX_NUM_TOKENS]; + static int32_t qk_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1]; + static int32_t kv_last_page_len_h[BatchConfig::MAX_NUM_REQUESTS]; + if (handle.tree_verify_attention_metadata->enabled()) { + // calculate the attention meta data + { + BatchConfig::PerRequestInfo *request_infos = + reinterpret_cast( + static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo)); + bool *request_available = reinterpret_cast( + static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo)); + BatchConfig::BitMask *causalMask = + reinterpret_cast( + static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::request_available)); + StreamingCacheInfo *streaming_cache_infos = + reinterpret_cast( + static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::request_available) + + sizeof(BatchConfig::causalMask)); + int batch_size = batch_config->num_active_requests(); + uint32_t const max_num_pages = + round_up_pages(BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BatchConfig::causalMask); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->committed_tokens), - sizeof(TreeVerifyBatchConfig::committed_tokens), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); + // int parallelism = batch_size; + prepare_inference_params_kernel_h(batch_config, + pm, + handle.tree_verify_attention_metadata, + stream, + max_num_pages, + q_indptr_h, + kv_indptr_h, + kv_indices_h, + kv_last_page_len_h, + qk_indptr_h); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->request_completed), - sizeof(BatchConfig::request_completed), - cudaMemcpyHostToDevice, - stream)); + // Update gpu-side custom mask referring from CaualMask + if (!batch_config->prompt_phase) { + update_custom_mask(batch_config, + handle.tree_verify_attention_metadata, + causalMask, + request_infos, + request_available, + batch_size, + streaming_cache_infos, + stream); + } + } + + // prepare attention forward handler + { + int batch_size = batch_config->num_active_requests(); + BatchPrefillHandler *handler = nullptr; + + if (!batch_config->prompt_phase) { + if (handle.tree_verify_attention_metadata->decode_handler_collections + .count(batch_size) == 0) { + handle.tree_verify_attention_metadata + ->decode_handler_collections[batch_size] = + static_cast(new flashinfer::BatchPrefillHandler(true)); + } + handler = static_cast( + handle.tree_verify_attention_metadata + ->decode_handler_collections[batch_size]); + } else { + if (handle.tree_verify_attention_metadata->prompt_handler_collections + 
.count(batch_size) == 0) { + handle.tree_verify_attention_metadata + ->prompt_handler_collections[batch_size] = + static_cast(new flashinfer::BatchPrefillHandler(true)); + } + handler = static_cast( + handle.tree_verify_attention_metadata + ->prompt_handler_collections[batch_size]); + } - total_copy_size += sizeof(BatchConfig::request_completed); + handler->SetCUDAStream(stream); + handler->BeginForward( + static_cast( + handle.tree_verify_attention_metadata->float_workspace), + handle.tree_verify_attention_metadata->float_workspace_size, + static_cast( + handle.tree_verify_attention_metadata->int_workspace), + handle.tree_verify_attention_metadata->int_workspace_size, + static_cast(q_indptr_h), + static_cast(kv_indptr_h), + batch_size, + handle.tree_verify_attention_metadata->num_q_heads(), + handle.tree_verify_attention_metadata->num_kv_heads(), + handle.tree_verify_attention_metadata->head_dim(), + kPagesize); + } + } } // add a size check diff --git a/src/runtime/simulator.cc b/src/runtime/simulator.cc index d94337641..b71af0d47 100644 --- a/src/runtime/simulator.cc +++ b/src/runtime/simulator.cc @@ -31,10 +31,10 @@ namespace FlexFlow { using namespace Legion; -LegionRuntime::Logger::Category log_sim("sim"); -LegionRuntime::Logger::Category log_ps_sim("ps_sim"); -LegionRuntime::Logger::Category log_xfer_sim("xfer_sim"); -LegionRuntime::Logger::Category log_xfer_est("xfer_est"); +Legion::Logger log_sim("sim"); +Legion::Logger log_ps_sim("ps_sim"); +Legion::Logger log_xfer_sim("xfer_sim"); +Legion::Logger log_xfer_est("xfer_est"); // template class std::map; // for debugging in gdb // template class std::map; // for debugging in gdb diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index c0804d6e1..176133c49 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -54,8 +54,8 @@ namespace FlexFlow::PCG { using namespace Legion; -LegionRuntime::Logger::Category log_xfers("xfers"); -LegionRuntime::Logger::Category log_xfer_matches("xfer_matches"); +Legion::Logger log_xfers("xfers"); +Legion::Logger log_xfer_matches("xfer_matches"); const TensorX TensorX::NO_TX = TensorX(); @@ -3826,9 +3826,13 @@ bool FFModel::convert_graph_to_operators( case OP_SIGMOID_SILU_MULTI: { assert(inList.size() == 2); SigmoidSiluMulti *ssm = (SigmoidSiluMulti *)node.ptr; - SigmoidSiluMultiParams params = ssm->get_params(); - new_op = new SigmoidSiluMulti( - *this, ssm->layer_guid, inputs[0], inputs[1], NULL); + new_op = new SigmoidSiluMulti(*this, + ssm->layer_guid, + inputs[0], + inputs[1], + ssm->intermediate_size, + ssm->tensor_parallelism_degree, + NULL); break; } default: { diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc deleted file mode 100644 index 841c735f5..000000000 --- a/src/runtime/tree_verify_batch_config.cc +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright 2023 CMU, Stanford, Facebook, LANL - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "flexflow/batch_config.h" -#include "flexflow/request_manager.h" -#include "legion.h" -#include -#include - -namespace FlexFlow { - -LegionRuntime::Logger::Category log_tree_bc("TreeVerifyBatchConfig"); - -TreeVerifyBatchConfig::TreeVerifyBatchConfig() : BatchConfig() {} - -TreeVerifyBatchConfig::~TreeVerifyBatchConfig() {} - -InferenceMode TreeVerifyBatchConfig::get_mode() const { - return TREE_VERIFY_MODE; -} - -std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { - os << "@@@@@@@@@@@@@@ TreeVerifyBatchConfig (mode " << bc.get_mode() - << ") @@@@@@@@@@@@@@" << std::endl; - // Max values - os << "Max number of requests: " << bc.max_requests_per_batch() << std::endl; - os << "Max number of tokens: " << bc.max_tokens_per_batch() << std::endl; - os << "Max sequence length: " << bc.max_sequence_length() << std::endl; - // Current values - os << "Number of tokens: " << bc.num_active_tokens() << std::endl; - os << "Number of requests: " << bc.num_active_requests() << std::endl; - os << "Number of tokens to commit: " << bc.num_tokens_to_commit << std::endl; - - os << "Per-request info:\n"; - for (int i = 0; i < bc.max_requests_per_batch(); i++) { - if (!bc.request_completed[i]) { - os << " Request " << i << ":\n"; - os << " First token depth in request: " - << bc.requestsInfo[i].first_token_depth_in_request << std::endl; - os << " First token offset in batch: " - << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; - os << " Number of tokens in batch: " - << bc.requestsInfo[i].num_tokens_in_batch << std::endl; - os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; - os << " Request completed: " << bc.request_completed[i] << std::endl; - os << " Request running: " << bc.request_running[i] << std::endl; - } - } - - os << "Per-token info:\n"; - for (int i = 0; i < bc.num_tokens; i++) { - os << " Token " << i << ":\n"; - os << " Absolute depth in request: " - << bc.tokensInfo[i].abs_depth_in_request << std::endl; - os << " Request index: " << bc.tokensInfo[i].request_index << std::endl; - os << " Token id: " << bc.tokensInfo[i].token_id << std::endl; - } - - os << "Tokens to commit info:\n"; - for (int i = 0; i < bc.num_tokens_to_commit; i++) { - os << " Token " << i << ":\n"; - os << " token_index: " << bc.committed_tokens[i].token_index - << std::endl; - os << " request_index: " << bc.committed_tokens[i].request_index - << std::endl; - os << " token_depth: " << bc.committed_tokens[i].token_depth - << std::endl; - } - - os << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" << std::endl; - return os; -} - -void TreeVerifyBatchConfig::print() const { - std::cout << *this << std::endl; -} - -void TreeVerifyBatchConfig::save_to_file(std::string const &filename) const { - std::ofstream outputFile(filename); - if (outputFile.is_open()) { - outputFile << *this << std::endl; - outputFile.close(); - } else { - std::cerr << "Error: Unable to open the batch config output file: " - << filename << std::endl; - assert(false); - } -} - -}; // namespace FlexFlow diff --git a/src/utils/communication_buffer.cu b/src/utils/communication_buffer.cu new file mode 100644 index 000000000..83b0385a3 --- /dev/null +++ b/src/utils/communication_buffer.cu @@ -0,0 +1,147 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/utils/communication_buffer.h" +#include "flexflow/utils/cuda_helper.h" +#include "tensorrt_llm/custom_allreduce_kernels.h" +#include +#include + +// Given a local CUDA data pointer, return the peer memory pointers group. +// For the i-th pointer, if i is the worker id of the given device, +// then the returned i-th ptr_group is the local pointer, +// or otherwise it is an peer memory pointer from the remote device. +std::vector create_peer_ptr_group(Legion::Context ctx, + Legion::Runtime *runtime, + int num_devices, + int device_id, + ncclComm_t ncclComm, + void *allgather_src, + void *allgather_dst, + void *local_ptr, + cudaStream_t stream) { + // Ensure we are on the correct device + int device = 0; + checkCUDA(cudaGetDevice(&device)); + assert(device == device_id && "Device ID does not match current device."); + + // Next we all-gather the peer memory pointers across all distributed workers. + // On each worker, we copy the peer pointers to GPU memory. And nccl AllGather + // is used to all-gather the pointers. Finally the all-gathered pointers + // on each worker are copied from GPU to CPU. + + checkCUDA(cudaMemcpyAsync(allgather_src, + &local_ptr, + sizeof(void *), + cudaMemcpyHostToDevice, + stream)); + + runtime->concurrent_task_barrier(ctx); + checkNCCL(ncclAllGather(allgather_src, + allgather_dst, + sizeof(void *), + ncclChar, + ncclComm, + stream)); + runtime->concurrent_task_barrier(ctx); + + std::vector peer_pointers(num_devices); + checkCUDA(cudaMemcpyAsync(peer_pointers.data(), + allgather_dst, + sizeof(void *) * num_devices, + cudaMemcpyDeviceToHost, + stream)); + checkCUDA(cudaStreamSynchronize(stream)); + + return peer_pointers; +} + +// Free the peer memory pointers group. +void free_peer_ptr_group(std::vector ptr_group, + int device_id, + bool free_local) { + for (int i = 0; i < static_cast(ptr_group.size()); ++i) { + if (i == device_id && free_local) { + // Free the local buffer. + checkCUDA(cudaFree(ptr_group[i])); + } + // No need to do anything for other devices. + } +} + +// Given a local CUDA data pointer, return the CommunicationBuffer of the +// pointer. The CommunicationBuffer contains the local pointer and the peer +// memory pointers group. It contains the barrier helpers for synchronization +// across distributed workers, which should also be peer-based. The +// allgather_src and allgather_dst are device buffers, which are used for +// all-gathering peer pointers across devices. The size of allgather_src should +// be sizeof(void*), and the size of allgather_dst should be sizeof(void*) * +// num_devices. 
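// --- Editorial note (not part of the patch) -----------------------------------
// Caller-side sizing of the scratch buffers used by the pointer all-gather
// described in the comment above (buffer names follow the parameters; the
// cudaMalloc calls are only an illustrative sketch):
//   void *allgather_src;  // device buffer of size sizeof(void *)
//   void *allgather_dst;  // device buffer of size sizeof(void *) * num_devices
//   checkCUDA(cudaMalloc(&allgather_src, sizeof(void *)));
//   checkCUDA(cudaMalloc(&allgather_dst, sizeof(void *) * num_devices));
// create_peer_ptr_group() stages the local pointer into allgather_src,
// ncclAllGather()s sizeof(void *) bytes from every rank into allgather_dst,
// and then copies the num_devices pointers back to a host-side vector.
// ------------------------------------------------------------------------------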
+CommunicationBuffer *create_comm_buf_with_local_ptr(Legion::Context ctx, + Legion::Runtime *runtime, + int num_devices, + int device_id, + ncclComm_t ncclComm, + void *allgather_src, + void *allgather_dst, + void *local_ptr, + void *barrier_in_ptr, + void *barrier_out_ptr, + int *barrier_flag, + cudaStream_t stream) { + assert(local_ptr != nullptr && "Local pointer is nullptr."); + CommunicationBuffer *comm_buf = new CommunicationBuffer(); + comm_buf->num_devices = num_devices; + comm_buf->device_id = device_id; + comm_buf->local_ptr = local_ptr; + comm_buf->comm_ptrs = create_peer_ptr_group(ctx, + runtime, + num_devices, + device_id, + ncclComm, + allgather_src, + allgather_dst, + local_ptr, + stream); + comm_buf->barrier_in = create_peer_ptr_group(ctx, + runtime, + num_devices, + device_id, + ncclComm, + allgather_src, + allgather_dst, + barrier_in_ptr, + stream); + comm_buf->barrier_out = create_peer_ptr_group(ctx, + runtime, + num_devices, + device_id, + ncclComm, + allgather_src, + allgather_dst, + barrier_out_ptr, + stream); + comm_buf->barrier_flag = barrier_flag; + + return comm_buf; +} + +// Release the CommunicationBuffer. +void release_comm_buf(CommunicationBuffer *comm_buf) { + free_peer_ptr_group(comm_buf->comm_ptrs, comm_buf->device_id, false); + free_peer_ptr_group(comm_buf->barrier_in, comm_buf->device_id, false); + free_peer_ptr_group(comm_buf->barrier_out, comm_buf->device_id, false); + delete comm_buf; +} diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh index 8beea5599..cceca7845 100755 --- a/tests/inference/cpp_inference_tests.sh +++ b/tests/inference/cpp_inference_tests.sh @@ -10,26 +10,26 @@ cd "${BASH_SOURCE[0]%/*}" ############################################################################################### # LLAMA -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 20000 -ll:zsize 30000 --verbose --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 # LLAMA (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 # OPT -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision 
-llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 # OPT (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json 
-output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 fi ############################################################################################### @@ -37,63 +37,63 @@ fi ############################################################################################### # LLAMA (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 ../../build/inference/incr_decoding/incr_decoding -ll:gpu 1 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 1 # LLAMA (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 # LLAMA (big model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 20000 -ll:zsize 30000 --verbose --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4 # LLAMA (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4 # OPT (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m 
-prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 # OPT (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 # OPT (big model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 # OPT (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 # Falcon (full precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # Falcon (half precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # # StarCoder (full precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision 
-llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4 # # StarCoder (half precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B_half.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B_half.txt -pipeline-parallelism-degree 4 # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (big model) - 
../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # OPT (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file 
../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # OPT (big model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 fi ############################################################################################### diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index 6857b5cbc..8fa17f153 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -87,7 +87,7 @@ def main(): # Get Tokenizer hf_config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=True) hf_arch = getattr(hf_config, "architectures")[0] - if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM" or hf_arch == "MistralForCausalLM": tokenizer = LlamaTokenizer.from_pretrained(args.model_name, use_fast=True) else: tokenizer = AutoTokenizer.from_pretrained(args.model_name) diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index 895b74c79..5fb142282 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -10,9 +10,9 @@ cleanup() { cd "${BASH_SOURCE[0]%/*}" # Enable Python tests (on by default) -PYTHON_INFERENCE_TESTS=${PYTHON_INFERENCE_TESTS:-ON} +PYTHON_INFERENCE_TESTS=${PYTHON_INFERENCE_TESTS:-OFF} # Enable C++ tests, (off by default) -CPP_INFERENCE_TESTS=${CPP_INFERENCE_TESTS:-OFF} +CPP_INFERENCE_TESTS=${CPP_INFERENCE_TESTS:-ON} # Enable model parallelism tests in C++, if desired TENSOR_PARALLELISM_TESTS=${TENSOR_PARALLELISM_TESTS:-OFF} @@ -25,9 +25,6 @@ fi # Clean up before test (just in case) cleanup -# Make sure supported version of protobuf is installed -pip3 install protobuf==3.20.3 - # Create test prompt file mkdir -p ../inference/prompt echo '["Three tips for staying healthy are: "]' > ../inference/prompt/test.json diff --git a/tests/ops/batch_matmul_test.cc b/tests/ops/batch_matmul_test.cc index 
7931f4412..f61048feb 100644 --- a/tests/ops/batch_matmul_test.cc +++ b/tests/ops/batch_matmul_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("bmm_test"); +Legion::Logger log_app("bmm_test"); struct BMMTestMeta { int m, k, n, d; diff --git a/tests/ops/concat_test.cc b/tests/ops/concat_test.cc index c67b718e0..b0489d1ad 100644 --- a/tests/ops/concat_test.cc +++ b/tests/ops/concat_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("concat_test"); +Legion::Logger log_app("concat_test"); struct ConcatTestMeta { int batch_size, i_dim, num_channels, projected_num_channels, diff --git a/tests/ops/flat_test.cc b/tests/ops/flat_test.cc index 428893a0d..61de83b6b 100644 --- a/tests/ops/flat_test.cc +++ b/tests/ops/flat_test.cc @@ -7,7 +7,7 @@ #include using namespace Legion; -LegionRuntime::Logger::Category log_app("Flat_test"); +Legion::Logger log_app("Flat_test"); struct FlatTestMeta { int i_dim, o_dim; diff --git a/tests/ops/linear_test.cc b/tests/ops/linear_test.cc index 5b65de3a5..7c84ad107 100644 --- a/tests/ops/linear_test.cc +++ b/tests/ops/linear_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("linear_test"); +Legion::Logger log_app("linear_test"); struct LinearTestMeta { int batch_size, i_dim, num_channels, dense_projection_o_dim, diff --git a/tests/ops/reshape_test.cc b/tests/ops/reshape_test.cc index e8f4586b2..a8aa046a6 100644 --- a/tests/ops/reshape_test.cc +++ b/tests/ops/reshape_test.cc @@ -6,7 +6,7 @@ #include #define PRECISION 16 using namespace Legion; -LegionRuntime::Logger::Category log_app("Reshape_test"); +Legion::Logger log_app("Reshape_test"); struct ReshapeTestMeta { int i_dim, o_dim; diff --git a/tests/ops/tanh_test.cc b/tests/ops/tanh_test.cc index 1c24d96aa..1e86934f8 100644 --- a/tests/ops/tanh_test.cc +++ b/tests/ops/tanh_test.cc @@ -6,7 +6,7 @@ #include #define PRECISION 16 using namespace Legion; -LegionRuntime::Logger::Category log_app("Tanh_test"); +Legion::Logger log_app("Tanh_test"); struct TanhTestMeta { int i_dim, o_dim; diff --git a/tests/ops/transpose_test.cc b/tests/ops/transpose_test.cc index 10481aa14..045f28479 100644 --- a/tests/ops/transpose_test.cc +++ b/tests/ops/transpose_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("transpose_test"); +Legion::Logger log_app("transpose_test"); struct TransposeTestMeta { int m, k, d;
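
The create_comm_buf_with_local_ptr and release_comm_buf routines added in this patch bundle a GPU buffer's local pointer with the peer pointers exchanged over NCCL via create_peer_ptr_group, plus the device-side barrier state and host-side barrier flag consumed by the multi-GPU kernels. The following C++ sketch illustrates one plausible call pattern for a per-GPU task. It is a minimal sketch, not part of the patch: the header path, the allocation and sizing of the allgather scratch space and barrier buffers, the barrier_flag initialization, and the ownership assumptions are all hypothetical; only the two function signatures and the CommunicationBuffer field names come from the code above.

#include <cuda_runtime.h>
#include <nccl.h>
#include "legion.h"
#include "flexflow/utils/communication_buffer.h" // hypothetical header path for the code above

void setup_and_teardown_comm_buffer(Legion::Context ctx,
                                    Legion::Runtime *runtime,
                                    int num_devices,
                                    int device_id,
                                    ncclComm_t nccl_comm,
                                    cudaStream_t stream,
                                    void *local_buffer_ptr, // device memory this GPU exposes to its peers
                                    size_t barrier_bytes) {
  // Device scratch space for the pointer allgather performed inside
  // create_peer_ptr_group (sizes are assumptions, not taken from the patch).
  void *allgather_src = nullptr, *allgather_dst = nullptr;
  cudaMalloc(&allgather_src, sizeof(void *));
  cudaMalloc(&allgather_dst, sizeof(void *) * num_devices);

  // Device-side barrier state shared by the kernels that consume the peer pointers.
  void *barrier_in = nullptr, *barrier_out = nullptr;
  cudaMalloc(&barrier_in, barrier_bytes);
  cudaMalloc(&barrier_out, barrier_bytes);
  cudaMemsetAsync(barrier_in, 0, barrier_bytes, stream);
  cudaMemsetAsync(barrier_out, 0, barrier_bytes, stream);

  // Host-side flag; assumed to be bumped on every kernel invocation.
  int *barrier_flag = new int(1);

  CommunicationBuffer *comm_buf = create_comm_buf_with_local_ptr(
      ctx, runtime, num_devices, device_id, nccl_comm,
      allgather_src, allgather_dst, local_buffer_ptr,
      barrier_in, barrier_out, barrier_flag, stream);

  // ... launch kernels that read comm_buf->comm_ptrs, comm_buf->barrier_in,
  //     and comm_buf->barrier_out ...

  // release_comm_buf drops the peer handles; the local allocations above appear
  // to stay owned by the caller (inferred from the final false argument passed
  // to free_peer_ptr_group), so the sketch frees them here.
  release_comm_buf(comm_buf);
  delete barrier_flag;
  cudaFree(barrier_out);
  cudaFree(barrier_in);
  cudaFree(allgather_dst);
  cudaFree(allgather_src);
}

The teardown ordering (release the CommunicationBuffer before freeing the underlying device allocations) is an inference from release_comm_buf only touching the peer pointer groups; treat it as an assumption rather than documented behavior.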
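
The tests/ops changes above replace the old LegionRuntime::Logger::Category declarations with Legion::Logger. The short sketch below shows the new declaration style next to the old spelling for comparison; the report_shapes helper and the message text are illustrative only, and the printf-style print call reflects the logging helpers Legion's logger generally exposes rather than anything introduced by this patch.

#include "legion.h"

using namespace Legion;

// Old spelling removed by this patch:
//   LegionRuntime::Logger::Category log_app("bmm_test");
// New spelling:
Legion::Logger log_app("bmm_test");

// Illustrative helper (not part of the patch) showing typical usage.
void report_shapes(int m, int k, int n) {
  log_app.print("batch matmul test with m=%d k=%d n=%d", m, k, n);
}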