Commit be863c6

Merge pull request #186 from MollySophia/v7-new: Add initial support for RWKV v7

2 parents: 84fea22 + 5658a65

39 files changed: +937, -1066 lines

.github/workflows/build.yml (36 additions, 10 deletions)

```diff
@@ -186,9 +186,8 @@ jobs:
             defines: '-DRWKV_AVX512=ON'
           - build: 'cuda12'
             defines: '-DRWKV_CUBLAS=ON'
-          - build: 'rocm5.5'
-            defines: '-G "Unix Makefiles" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DRWKV_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030"'
-
+          - build: 'hip'
+            defines: ''
     steps:
       - name: Clone
         id: checkout
@@ -206,25 +205,52 @@ jobs:

       - name: Install rocm-toolkit
         id: rocm-toolkit
-        if: ${{ matrix.build == 'rocm5.5' }}
-        uses: Cyberhan123/rocm-toolkit@v0.1.0
-        with:
-          rocm: '5.5.0'
+        if: ${{ matrix.build == 'hip' }}
+        run: |
+          $ErrorActionPreference = "Stop"
+          write-host "Downloading AMD HIP SDK Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP SDK"
+          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
+          write-host "Completed AMD HIP SDK installation"
+
+      - name: Verify ROCm
+        id: rocm-verify
+        if: ${{ matrix.build == 'hip' }}
+        run: |
+          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version

       - name: Install Ninja
         id: install-ninja
-        if: ${{ matrix.build == 'rocm5.5' }}
+        if: ${{ matrix.build == 'hip' }}
         uses: urkle/action-get-ninja@v1
         with:
           version: 1.11.1

+      - name: Install ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          key: ${{ github.job }}
+
       - name: Build
         id: cmake_build
+        if: ${{ matrix.build != 'hip' }}
         run: |
           mkdir build
           cd build
           cmake .. ${{ matrix.defines }}
-          cmake --build . --config Release
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+
+      - name: Build-hip
+        id: cmake_build_hip
+        if: ${{ matrix.build == 'hip' }}
+        run: |
+          mkdir build
+          cd build
+          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake .. -G "Unix Makefiles" -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DRWKV_HIPBLAS=ON -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

       - name: Check AVX512F support
         id: check_avx512f
@@ -242,7 +268,7 @@ jobs:
       - name: Test
         id: cmake_test
         # Test AVX-512 only when possible
-        if: ${{ (matrix.build != 'avx512' || env.HAS_AVX512F == '1') && matrix.build != 'cuda12' && matrix.build != 'rocm5.5'}}
+        if: ${{ (matrix.build != 'avx512' || env.HAS_AVX512F == '1') && matrix.build != 'cuda12' && matrix.build != 'hip'}}
         run: |
           cd build
           ctest -C Release --verbose
```
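Note: the `Build-hip` step derives `HIP_PATH` by resolving the SDK's bundled `clang.exe` and stripping two path components. For readers less familiar with the PowerShell idiom, here is a minimal Python sketch of the same lookup; it is illustrative only and not part of this change:

```python
# Illustrative sketch of the HIP_PATH lookup done by the Build-hip step:
# find the HIP SDK's clang.exe and take its grandparent directory.
from pathlib import Path

def find_hip_path() -> Path:
    matches = sorted(Path('C:/Program Files/AMD/ROCm').glob('*/bin/clang.exe'))
    if not matches:
        raise FileNotFoundError('AMD HIP SDK not found under C:/Program Files/AMD/ROCm')
    # .../ROCm/<version>/bin/clang.exe -> .../ROCm/<version>
    return matches[-1].parent.parent

if __name__ == '__main__':
    print(find_hip_path())
```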

CMakeLists.txt (32 additions, 4 deletions)

```diff
@@ -58,7 +58,7 @@ endfunction()

 set(GGML_ACCELERATE ${RWKV_ACCELERATE})
 set(GGML_CUDA ${RWKV_CUBLAS})
-set(GGML_HIPBLAS ${RWKV_HIPBLAS})
+set(GGML_HIP ${RWKV_HIPBLAS})
 set(GGML_METAL ${RWKV_METAL})
 if (RWKV_OPENBLAS)
     set(GGML_BLAS_VENDOR "OpenBLAS")
@@ -107,6 +107,7 @@ if (RWKV_ALL_WARNINGS)
         -Wcast-qual
         -Wno-unused-function
         -Wno-multichar
+        -Wno-nonnull
     )
 else()
     set(c_flags
@@ -234,7 +235,7 @@ if (GGML_METAL)
     )
 endif()

-if (GGML_HIPBLAS)
+if (GGML_HIP)
     # CMake on Windows doesn't support the HIP language yet
     if (WIN32)
         set(CXX_IS_HIPCC TRUE)
@@ -262,12 +263,39 @@ if (GGML_HIPBLAS)
 endif()

 target_include_directories(rwkv PUBLIC .)
-target_include_directories(rwkv PRIVATE ggml/include)
+target_include_directories(rwkv PRIVATE ggml/include ggml/src)
 target_compile_features(rwkv PUBLIC cxx_std_11)
-target_link_libraries(rwkv PRIVATE $<TARGET_OBJECTS:ggml> ${RWKV_EXTRA_LIBS})
+
+if (GGML_METAL)
+    set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} $<TARGET_OBJECTS:ggml-metal> $<TARGET_OBJECTS:ggml-blas>)
+endif()
+if (GGML_CUDA)
+    set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} $<TARGET_OBJECTS:ggml-cuda>)
+endif()
+if (GGML_HIP)
+    set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} $<TARGET_OBJECTS:ggml-hip>)
+endif()
+if (GGML_RPC)
+    set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} $<TARGET_OBJECTS:ggml-rpc>)
+endif()
+
+target_link_libraries(rwkv PRIVATE $<TARGET_OBJECTS:ggml> $<TARGET_OBJECTS:ggml-base> $<TARGET_OBJECTS:ggml-cpu> ${RWKV_EXTRA_LIBS})

 if (RWKV_BUILD_SHARED_LIBRARY)
     set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    set_target_properties(ggml-base PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    set_target_properties(ggml-cpu PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    if (GGML_METAL)
+        set_target_properties(ggml-metal PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        set_target_properties(ggml-blas PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    endif()
+    if (GGML_CUDA)
+        set_target_properties(ggml-cuda PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    endif()
+    if (GGML_HIP)
+        set_target_properties(ggml-hip PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    endif()
+
     target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD)
     set_target_properties(rwkv PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(rwkv PRIVATE RWKV_SHARED RWKV_BUILD)
```

README.md (4 additions, 6 deletions)

```diff
@@ -6,20 +6,18 @@ Besides the usual **FP32**, it supports **FP16**, **quantized INT4, INT5 and INT

 This project provides [a C library rwkv.h](rwkv.h) and [a convinient Python wrapper](python%2Frwkv_cpp%2Frwkv_cpp_model.py) for it.

-[RWKV](https://arxiv.org/abs/2305.13048) is a large language model architecture, [with the largest model in the family having 14B parameters](https://huggingface.co/BlinkDL/rwkv-4-pile-14b). In contrast to Transformer with `O(n^2)` attention, RWKV requires only state from previous step to calculate logits. This makes RWKV very CPU-friendly on large context lenghts.
+[RWKV](https://arxiv.org/abs/2305.13048) is a large language model architecture. In contrast to Transformer with `O(n^2)` attention, RWKV requires only state from previous step to calculate logits. This makes RWKV very CPU-friendly on large context lenghts.

-[RWKV v5](https://huggingface.co/BlinkDL/rwkv-5-world) is a major upgrade to RWKV architecture, making it competitive with Transformers in quality. RWKV v5 models are supported.
-
-[RWKV v6](https://huggingface.co/BlinkDL/rwkv-6-world) is a further improvement to RWKV architecture, with better quality. RWKV v6 models are supported.
+This project supports RWKV [v4](https://huggingface.co/BlinkDL/rwkv-4-pile-14b), [v5](https://huggingface.co/BlinkDL/rwkv-5-world), [v6](https://huggingface.co/BlinkDL/rwkv-6-world) and the latest [v7](https://huggingface.co/BlinkDL/rwkv-7-world) architectures.

 Loading LoRA checkpoints in [Blealtan's format](https://github.com/Blealtan/RWKV-LM-LoRA) is supported through [merge_lora_into_ggml.py script](rwkv%2Fmerge_lora_into_ggml.py).

+<!-- TODO: Update data below -->
+
 ## Quality and performance

 If you use `rwkv.cpp` for anything serious, please [test all available formats for perplexity and latency](rwkv%2Fmeasure_pexplexity.py) on a representative dataset, and decide which trade-off is best for you.

-In general, **`RWKV v5` models are as fast as `RWKV v4` models**, with minor differencies in latency and memory consumption, and with having way higher quality than `v4`. Therefore, it is recommended to use `RWKV v5`.
-
 Below table is for reference only. Measurements were made on 4C/8T x86 CPU with AVX2, 4 threads. The models are `RWKV v4 Pile 169M`, `RWKV v4 Pile 1.5B`.

 | Format | Perplexity (169M) | Latency, ms (1.5B) | File size, GB (1.5B) |
```
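Note: the README's point that RWKV "requires only state from previous step to calculate logits" shows up directly in the wrapper API. A hedged sketch, assuming the modules import as in this repository's example scripts and that `RWKVModel.eval` returns a `(logits, state)` pair:

```python
# Hedged sketch of state-based inference; model path and token ids are
# placeholders. Each step consumes only the fixed-size recurrent state,
# not the whole token history, so per-token cost is flat in context length.
from rwkv_cpp import rwkv_cpp_model, rwkv_cpp_shared_library

library = rwkv_cpp_shared_library.load_rwkv_shared_library()
model = rwkv_cpp_model.RWKVModel(library, 'model.bin')

state = None
for token in [1, 2, 3]:
    logits, state = model.eval(token, state)
```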

extras/quantize.c (2 additions, 0 deletions)

```diff
@@ -25,8 +25,10 @@ bool QueryPerformanceCounter(uint64_t* lpPerformanceCount);
 static enum ggml_type type_from_string(const char * string) {
     if (strcmp(string, "Q4_0") == 0) return GGML_TYPE_Q4_0;
     if (strcmp(string, "Q4_1") == 0) return GGML_TYPE_Q4_1;
+    if (strcmp(string, "Q4_K") == 0) return GGML_TYPE_Q4_K;
     if (strcmp(string, "Q5_0") == 0) return GGML_TYPE_Q5_0;
     if (strcmp(string, "Q5_1") == 0) return GGML_TYPE_Q5_1;
+    if (strcmp(string, "Q5_K") == 0) return GGML_TYPE_Q5_K;
     if (strcmp(string, "Q8_0") == 0) return GGML_TYPE_Q8_0;
     return GGML_TYPE_COUNT;
 }
```
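Note: with `Q4_K` and `Q5_K` now recognized by `type_from_string`, those names can be passed wherever the quantizer accepts a format string. A hedged sketch through the Python wrapper, assuming a `rwkv_quantize_model_file` binding with this signature (file names are placeholders):

```python
# Hedged sketch: quantize an FP16 ggml model to the newly accepted Q5_K
# format, mirroring what extras/quantize.c does at the C level.
from rwkv_cpp import rwkv_cpp_shared_library

library = rwkv_cpp_shared_library.load_rwkv_shared_library()
library.rwkv_quantize_model_file('model-FP16.bin', 'model-Q5_K.bin', 'Q5_K')
```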

ggml (submodule): updated from 3e7e5e2 to c8bd0fe

python/chat_with_bot.py (9 additions, 8 deletions)

```diff
@@ -40,6 +40,7 @@

 parser = argparse.ArgumentParser(description='Provide terminal-based chat interface for RWKV model')
 parser.add_argument('model_path', help='Path to RWKV model in ggml format')
+parser.add_argument('-ngl', '--num_gpu_layers', type=int, default=99, help='Number of layers to run on GPU')
 add_tokenizer_argument(parser)
 args = parser.parse_args()

@@ -48,7 +49,7 @@
 with open(script_dir / 'prompt' / f'{LANGUAGE}-{PROMPT_TYPE}.json', 'r', encoding='utf8') as json_file:
     prompt_data = json.load(json_file)

-user, bot, separator, init_prompt = prompt_data['user'], prompt_data['bot'], prompt_data['separator'], prompt_data['prompt']
+user, assistant, separator, init_prompt = prompt_data['user'], prompt_data['assistant'], prompt_data['separator'], prompt_data['prompt']

 if init_prompt == '':
     raise ValueError('Prompt must not be empty')
@@ -57,7 +58,7 @@
 print(f'System info: {library.rwkv_get_system_info_string()}')

 print('Loading RWKV model')
-model = rwkv_cpp_model.RWKVModel(library, args.model_path)
+model = rwkv_cpp_model.RWKVModel(library, args.model_path, gpu_layer_count=args.num_gpu_layers)

 tokenizer_decode, tokenizer_encode = get_tokenizer(args.tokenizer, model.n_vocab)

@@ -154,7 +155,7 @@ def split_last_end_of_line(tokens: List[int]) -> List[int]:
     if msg == '+reset':
         load_thread_state('chat_init')
         save_thread_state('chat')
-        print(f'{bot}{separator} Chat reset.\n')
+        print(f'{assistant}{separator} Chat reset.\n')
         continue
     elif msg[:5].lower() == '+gen ' or msg[:3].lower() == '+i ' or msg[:4].lower() == '+qa ' or msg[:4].lower() == '+qq ' or msg.lower() == '+++' or msg.lower() == '++':

@@ -194,7 +195,7 @@ def split_last_end_of_line(tokens: List[int]) -> List[int]:
            load_thread_state('chat_init')

            real_msg = msg[4:].strip()
-           new = f'{user}{separator} {real_msg}\n\n{bot}{separator}'
+           new = f'{user}{separator} {real_msg}\n\n{assistant}{separator}'

            process_tokens(tokenizer_encode(new))
            save_thread_state('gen_0')
@@ -225,17 +226,17 @@ def split_last_end_of_line(tokens: List[int]) -> List[int]:
        except Exception as e:
            print(e)
        continue
-    # chat with bot
+    # chat with assistant
     else:
         load_thread_state('chat')
-        new = f'{user}{separator} {msg}\n\n{bot}{separator}'
+        new = f'{user}{separator} {msg}\n\n{assistant}{separator}'
         process_tokens(tokenizer_encode(new), new_line_logit_bias=-999999999)
         save_thread_state('chat_pre')

     thread = 'chat'

-    # Print bot response
-    print(f'> {bot}{separator}', end='')
+    # Print assistant response
+    print(f'> {assistant}{separator}', end='')

     start_index: int = len(processed_tokens)
     accumulated_tokens: List[int] = []
```

python/convert_pytorch_to_ggml.py (39 additions, 3 deletions)

```diff
@@ -35,8 +35,11 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t
     is_v5_1_or_2: bool = 'blocks.0.att.ln_x.weight' in state_dict
     is_v5_2: bool = 'blocks.0.att.gate.weight' in state_dict
     is_v6_0: bool = 'blocks.0.att.time_maa_x' in state_dict
+    is_v7_0: bool = 'blocks.0.att.k_k' in state_dict

-    if is_v6_0:
+    if is_v7_0:
+        print('Detected RWKV v7.0')
+    elif is_v6_0:
         print('Detected RWKV v6.0')
     elif is_v5_2:
         print('Detected RWKV v5.2')
@@ -45,6 +48,23 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t
     else:
         print('Detected RWKV v4')

+    if is_v7_0:
+        # concat to reduce some cpu overhead during ggml inference
+        state_dict_new = {}
+        for k in state_dict.keys():
+            if 'att.x_' in k:
+                l = int(k.split('.')[1].split('.')[0])
+                try:
+                    state_dict_new[f'blocks.{l}.att.x_rwkvag'] = torch.cat(
+                        [state_dict_new[f'blocks.{l}.att.x_rwkvag'], state_dict[k]], dim=0)
+                except KeyError:
+                    state_dict_new[f'blocks.{l}.att.x_rwkvag'] = state_dict[k]
+            else:
+                state_dict_new[k] = state_dict[k]
+
+            del state_dict[k]
+        state_dict = state_dict_new
+
     with open(dest_path, 'wb') as out_file:
         is_FP16: bool = data_type == 'FP16' or data_type == 'float16'

@@ -68,7 +88,16 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t
             if '.time_' in k:
                 tensor = tensor.squeeze()

-            if is_v6_0:
+            if is_v7_0:
+                if any(s in k for s in [
+                    '.w1', '.w2',
+                    '.a1', '.a2',
+                    '.v1', '.v2',
+                    '.g1', '.g2',
+                ]):
+                    tensor = tensor.transpose(0, 1)
+
+            elif is_v6_0:
                 if '.time_faaaa' in k:
                     tensor = tensor.unsqueeze(-1)
                 if '.time_maa_w1' in k or '.time_decay_w' in k:
@@ -95,7 +124,14 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t
                 tensor = -torch.exp(tensor)

             # Keep 1-dim vectors and small matrices in FP32
-            if is_FP16 and len(tensor.shape) > 1 and '.time_' not in k:
+            if is_FP16 and len(tensor.shape) > 1 and all(
+                s not in k for s in [
+                    '.time_',
+                    '.k_k', '.k_a', '.r_k',
+                    '.x_rwkvag', '.x_k',
+                    '.w0', '.a0', '.v0',
+                ]
+            ):
                 tensor = tensor.half()

             shape = tensor.shape
```
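Note: the v7 branch above folds the per-layer `att.x_*` interpolation vectors into a single `x_rwkvag` tensor (the name suggests the r, w, k, v, a, g mixing vectors), so the ggml graph reads one tensor per layer instead of six. A toy reproduction of that concatenation, with made-up shapes, purely for illustration:

```python
# Toy illustration of the x_* fusion performed by the converter above.
import torch

n_embd = 8  # hypothetical embedding size
parts = [torch.randn(1, n_embd) for _ in range(6)]  # stands in for att.x_r .. att.x_g

# torch.cat along dim 0 stacks the six (1, n_embd) vectors into one
# (6, n_embd) tensor, cutting per-layer tensor lookups during inference.
x_rwkvag = torch.cat(parts, dim=0)
assert x_rwkvag.shape == (6, n_embd)
```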

python/generate_completions.py (2 additions, 1 deletion)

```diff
@@ -29,6 +29,7 @@

 parser = argparse.ArgumentParser(description='Generate completions from RWKV model based on a prompt')
 parser.add_argument('model_path', help='Path to RWKV model in ggml format')
+parser.add_argument('-ngl', '--num_gpu_layers', type=int, default=99, help='Number of layers to run on GPU')
 add_tokenizer_argument(parser)
 args = parser.parse_args()

@@ -39,7 +40,7 @@
 print(f'System info: {library.rwkv_get_system_info_string()}')

 print('Loading RWKV model')
-model = rwkv_cpp_model.RWKVModel(library, args.model_path, gpu_layers_count=0)
+model = rwkv_cpp_model.RWKVModel(library, args.model_path, gpu_layers_count=args.num_gpu_layers)

 tokenizer_decode, tokenizer_encode = get_tokenizer(args.tokenizer, model.n_vocab)
```

python/inference_example.py (2 additions, 1 deletion)

```diff
@@ -10,12 +10,13 @@
 # Parse received arguments.
 parser = argparse.ArgumentParser(description='Generate some text with an RWKV model')
 parser.add_argument('model_path', help='Path to RWKV model in ggml format')
+parser.add_argument('-ngl', '--num_gpu_layers', type=int, default=99, help='Number of layers to run on GPU')
 add_tokenizer_argument(parser)
 args = parser.parse_args()

 # Load the model.
 library = rwkv_cpp_shared_library.load_rwkv_shared_library()
-model = rwkv_cpp_model.RWKVModel(library, args.model_path)
+model = rwkv_cpp_model.RWKVModel(library, args.model_path, gpu_layer_count=args.num_gpu_layers)

 # Set up the tokenizer.
 tokenizer_decode, tokenizer_encode = get_tokenizer(args.tokenizer, model.n_vocab)
```
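Note: all three example scripts now take `-ngl`/`--num_gpu_layers` and forward it to the model constructor; the default of 99 effectively offloads every layer for models shallower than that. A hedged usage sketch, with the keyword name taken from the `inference_example.py` diff above and a placeholder model path:

```python
# Hedged sketch of GPU layer offload; equivalent to the new CLI flag:
#   python python/inference_example.py model.bin -ngl 24
from rwkv_cpp import rwkv_cpp_model, rwkv_cpp_shared_library

library = rwkv_cpp_shared_library.load_rwkv_shared_library()
model = rwkv_cpp_model.RWKVModel(library, 'model.bin', gpu_layer_count=24)
```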

python/prompt/Chinese-Chat.json (1 addition, 1 deletion)

```diff
@@ -1,6 +1,6 @@
 {
     "user": "Bob",
-    "bot": "Alice",
+    "assistant": "Alice",
     "separator": ":",
     "prompt": "\nThe following is a coherent verbose detailed conversation between a Chinese girl named Alice and her friend Bob. Alice is very intelligent, creative and friendly. Alice likes to tell Bob a lot about herself and her opinions. Alice usually gives Bob kind, helpful and informative advices.\n\nBob: lhc\n\nAlice: LHC是指大型强子对撞机(Large Hadron Collider),是世界最大最强的粒子加速器,由欧洲核子中心(CERN)在瑞士日内瓦地下建造。LHC的原理是加速质子(氢离子)并让它们相撞,让科学家研究基本粒子和它们之间的相互作用,并在2012年证实了希格斯玻色子的存在。\n\nBob: 企鹅会飞吗\n\nAlice: 企鹅是不会飞的。企鹅的翅膀短而扁平,更像是游泳时的一对桨。企鹅的身体结构和羽毛密度也更适合在水中游泳,而不是飞行。\n\n"
 }
```
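Note: since `chat_with_bot.py` now reads `prompt_data['assistant']`, custom prompt files must use the renamed key. A minimal sanity check mirroring the fields the chat script reads (file path as in this repository):

```python
# Verify a prompt file matches the renamed schema ('assistant', not 'bot').
import json

with open('python/prompt/Chinese-Chat.json', 'r', encoding='utf8') as f:
    prompt_data = json.load(f)

for key in ('user', 'assistant', 'separator', 'prompt'):
    assert key in prompt_data, f'missing key: {key}'
```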
