Bump version to 0.1.2

simonJJJ · simonJJJ · commit 9d7bee71d717 · 2023-10-11T01:53:28.000+08:00
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,21 @@
+global-include CMakeLists.txt *.cmake README.md LICENSE
+include *.cpp *.h
+
+# absl
+graft third_party/abseil-cpp/absl
+graft third_party/abseil-cpp/CMake
+include third_party/abseil-cpp/*
+
+# re2
+graft third_party/re2/re2
+graft third_party/re2/util
+include third_party/re2/*
+
+# ggml
+graft third_party/ggml/include
+graft third_party/ggml/src
+include third_party/ggml/*
+
+# pybind11
+graft third_party/pybind11/include
+graft third_party/pybind11/tools
diff --git a/README.md b/README.md
@@ -79,7 +79,10 @@ The Python binding provides high-level `chat` and `stream_chat` interface simila
 
 **Installation**
 
-Install from PyPI (recommended): WIP.
+Install from PyPI (recommended). This will trigger compilation on your platform.
+```sh
+pip install -U qwen-cpp
+```
 
 You may also install from source.
 ```sh
diff --git a/pyproject.toml b/pyproject.toml
@@ -33,3 +33,4 @@ dynamic = ["version"]
 [project.urls]
 Homepage = "https://github.com/QwenLM/qwen.cpp"
 Repository = "https://github.com/QwenLM/qwen.cpp.git"
+BugTracker = "https://github.com/QwenLM/qwen.cpp/issues"
diff --git a/qwen_cpp/__init__.py b/qwen_cpp/__init__.py
@@ -0,0 +1,140 @@
+import tempfile
+from pathlib import Path
+from typing import Iterator, List, Optional, Union
+
+import qwen_cpp._C as _C
+
+
+class Pipeline(_C.Pipeline):
+    def __init__(
+        self, model_path: str, tiktoken_path: str, *, dtype: Optional[str] = None
+    ) -> None:
+        if Path(model_path).is_file() and Path(tiktoken_path).is_file():
+            super().__init__(str(model_path), str(tiktoken_path))
+        else:
+            from qwen_cpp.convert import convert
+
+            if dtype is None:
+                dtype = "q4_0"  # default dtype
+
+            with tempfile.NamedTemporaryFile("wb") as f:
+                convert(f, model_path, dtype=dtype)
+                super().__init__(f.name, str(tiktoken_path))
+
+    def chat(
+        self,
+        history: List[str],
+        *,
+        max_length: int = 2048,
+        max_context_length: int = 512,
+        do_sample: bool = True,
+        top_k: int = 0,
+        top_p: float = 0.7,
+        temperature: float = 0.95,
+        repetition_penalty: float = 1.0,
+        num_threads: int = 0,
+        stream: bool = False,
+    ) -> Union[Iterator[str], str]:
+        input_ids = self.tokenizer.encode_history(history, max_context_length)
+        return self._generate(
+            input_ids=input_ids,
+            max_length=max_length,
+            max_context_length=max_context_length,
+            do_sample=do_sample,
+            top_k=top_k,
+            top_p=top_p,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            num_threads=num_threads,
+            stream=stream,
+        )
+
+    def _generate(
+        self,
+        input_ids: List[int],
+        *,
+        max_length: int = 2048,
+        max_context_length: int = 512,
+        do_sample: bool = True,
+        top_k: int = 0,
+        top_p: float = 0.7,
+        temperature: float = 0.95,
+        repetition_penalty: float = 1.0,
+        num_threads: int = 0,
+        stream: bool = False,
+    ) -> Union[Iterator[str], str]:
+        gen_config = _C.GenerationConfig(
+            max_length=max_length,
+            max_context_length=max_context_length,
+            do_sample=do_sample,
+            top_k=top_k,
+            top_p=top_p,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            num_threads=num_threads,
+        )
+
+        generate_fn = self._stream_generate if stream else self._sync_generate
+        return generate_fn(input_ids=input_ids, gen_config=gen_config)
+
+    def _stream_generate(
+        self, input_ids: List[int], gen_config: _C.GenerationConfig
+    ) -> Iterator[str]:
+        input_ids = [x for x in input_ids]  # make a copy
+        n_past = 0
+        n_ctx = len(input_ids)
+
+        token_cache = []
+        print_len = 0
+        while len(input_ids) < gen_config.max_length:
+            next_token_id = self.model.generate_next_token(
+                input_ids, gen_config, n_past, n_ctx
+            )
+            n_past = len(input_ids)
+            input_ids.append(next_token_id)
+
+            token_cache.append(next_token_id)
+            output = self.tokenizer.decode(token_cache)
+
+            if output.endswith("\n"):
+                yield output[print_len:]
+                token_cache = []
+                print_len = 0
+            elif output.endswith((",", "!", ":", ";", "?", "�")):
+                pass
+            else:
+                yield output[print_len:]
+                print_len = len(output)
+
+            if next_token_id in (
+                self.model.config.eos_token_id,
+                self.model.config.im_start_id,
+                self.model.config.im_end_id,
+            ):
+                break
+
+        output = self.tokenizer.decode(token_cache)
+        yield output[print_len:]
+
+    def _sync_generate(
+        self, input_ids: List[int], gen_config: _C.GenerationConfig
+    ) -> str:
+        input_ids = [x for x in input_ids]  # make a copy
+        n_past = 0
+        n_ctx = len(input_ids)
+
+        while len(input_ids) < gen_config.max_length:
+            next_token_id = self.model.generate_next_token(
+                input_ids, gen_config, n_past, n_ctx
+            )
+            n_past = len(input_ids)
+            input_ids.append(next_token_id)
+            if next_token_id in (
+                self.model.config.eos_token_id,
+                self.model.config.im_start_id,
+                self.model.config.im_end_id,
+            ):
+                break
+
+        output = self.tokenizer.decode(input_ids[n_ctx:])
+        return output
diff --git a/qwen_pybind.cpp b/qwen_pybind.cpp
@@ -40,6 +40,24 @@ PYBIND11_MODULE(_C, m) {
     .def("encode", &QwenTokenizer::encode)
     .def("decode", &QwenTokenizer::decode)
     .def("encode_history", &QwenTokenizer::encode_history);
+
+  py::class_<GenerationConfig>(m, "GenerationConfig") // generation settings exposed to Python
+    .def(py::init<int, int, bool, int, float, float, float, int>(), "max_length"_a = 2048,
+         "max_context_length"_a = 512, "do_sample"_a = true, "top_k"_a = 0, "top_p"_a = 0.7, "temperature"_a = 0.95,
+         "repetition_penalty"_a = 1.0, "num_threads"_a = 0) // every argument is named and has a default
+    .def_readwrite("max_length", &GenerationConfig::max_length) // fields below are readable and writable from Python
+    .def_readwrite("max_context_length", &GenerationConfig::max_context_length)
+    .def_readwrite("do_sample", &GenerationConfig::do_sample)
+    .def_readwrite("top_k", &GenerationConfig::top_k)
+    .def_readwrite("top_p", &GenerationConfig::top_p)
+    .def_readwrite("temperature", &GenerationConfig::temperature)
+    .def_readwrite("repetition_penalty", &GenerationConfig::repetition_penalty)
+    .def_readwrite("num_threads", &GenerationConfig::num_threads);
+
+  py::class_<Pipeline>(m, "Pipeline") // native pipeline; subclassed by Pipeline in qwen_cpp/__init__.py
+    .def(py::init<const std::string &, const std::string &>()) // called from Python as (model_path, tiktoken_path)
+    .def_property_readonly("model", [](const Pipeline &self) { return self.model.get(); })
+    .def_property_readonly("tokenizer", [](const Pipeline &self) { return self.tokenizer.get(); });
 }
 
 } // namespace qwen
diff --git a/setup.py b/setup.py
@@ -114,7 +114,7 @@ def build_extension(self, ext: CMakeExtension) -> None:
 HERE = Path(__file__).resolve().parent
 
 setup(
-    version="0.1",
+    version="0.1.2",
     author="Shijie Wang",
     packages=find_packages(),
     ext_modules=[CMakeExtension("qwen_cpp._C")],
diff --git a/tiktoken.h b/tiktoken.h
@@ -164,10 +164,9 @@ class tiktoken {
 		}
 
 	private:
-		template <typename T>
 		auto split_with_allowed_special_token(
 			re2::StringPiece &input,
-			const T &allowed_special
+			const ankerl::unordered_dense::map<std::string, int> &allowed_special
 		) const -> std::pair<std::optional<std::string>, re2::StringPiece> {
 			if (special_regex_ == nullptr) return { std::nullopt, input };
 
@@ -206,7 +205,7 @@ class tiktoken {
 		auto _encode_native(
 			const std::string &text,
 			const ankerl::unordered_dense::map<std::string, int> &allowed_special
-		) const -> const std::pair<std::vector<int>, int> {
+		) const -> std::pair<std::vector<int>, int> {
 			std::vector<int> ret;
 			int last_piece_token_len = 0;
 			re2::StringPiece input(text);