Disable shared library by default. Set default max_length in api server. (#317)

li-plus · web-flow · commit a0f2d4ad9c0a · 2024-06-14T20:56:37.000+08:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -17,6 +17,7 @@ option(CHATGLM_ENABLE_EXAMPLES "chatglm: enable c++ examples" ON)
 option(CHATGLM_ENABLE_PYBIND "chatglm: enable python binding" OFF)
 option(CHATGLM_ENABLE_TESTING "chatglm: enable testing" OFF)
 
+set(BUILD_SHARED_LIBS OFF CACHE BOOL "")  # default to static libs; no FORCE here, so a BUILD_SHARED_LIBS value already in the cache (e.g. -DBUILD_SHARED_LIBS=ON) is kept
 if (CHATGLM_ENABLE_PYBIND)
     set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)
     set(CMAKE_POSITION_INDEPENDENT_CODE ON)
diff --git a/Dockerfile b/Dockerfile
@@ -47,7 +47,6 @@ RUN \
     rm -rf /var/lib/apt/lists/*
 
 COPY --from=build /chatglm.cpp/build/bin/main /chatglm.cpp/build/bin/main
-COPY --from=build /chatglm.cpp/build/lib/*.so /chatglm.cpp/build/lib/
 COPY --from=build /chatglm.cpp/dist/ /chatglm.cpp/dist/
 
 ADD examples examples
diff --git a/chatglm_cpp/langchain_api.py b/chatglm_cpp/langchain_api.py
@@ -12,6 +12,7 @@
 
 class Settings(BaseSettings):
     model: str = "models/chatglm-ggml.bin"
+    max_length: int = 4096  # forwarded as max_length= to chatglm_cpp.Pipeline when the module-level pipeline is built
 
 
 class ChatRequest(BaseModel):
@@ -48,7 +49,7 @@ class ChatResponse(BaseModel):
 settings = Settings()
 logging.info(settings)
 
-pipeline = chatglm_cpp.Pipeline(settings.model)
+pipeline = chatglm_cpp.Pipeline(settings.model, max_length=settings.max_length)
 
 
 @app.post("/")
diff --git a/chatglm_cpp/openai_api.py b/chatglm_cpp/openai_api.py
@@ -17,6 +17,7 @@
 
 class Settings(BaseSettings):
     model: str = "models/chatglm3-ggml.bin"
+    max_length: int = 4096  # forwarded as max_length= to chatglm_cpp.Pipeline when the module-level pipeline is built
     num_threads: int = 0
 
 
@@ -129,7 +130,7 @@ class ChatCompletionResponse(BaseModel):
 app.add_middleware(
     CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
 )
-pipeline = chatglm_cpp.Pipeline(settings.model)
+pipeline = chatglm_cpp.Pipeline(settings.model, max_length=settings.max_length)
 lock = asyncio.Lock()
 
 
diff --git a/examples/cli_demo.py b/examples/cli_demo.py
@@ -63,7 +63,7 @@ def main() -> None:
     if args.sp:
         system = args.sp.read_text()
 
-    pipeline = chatglm_cpp.Pipeline(args.model)
+    pipeline = chatglm_cpp.Pipeline(args.model, max_length=args.max_length)
 
     if args.mode != "chat" and args.interactive:
         print("interactive demo is only supported for chat mode, falling back to non-interactive one")
diff --git a/examples/web_demo.py b/examples/web_demo.py
@@ -21,7 +21,7 @@
 parser.add_argument("--plain", action="store_true", help="display in plain text without markdown support")
 args = parser.parse_args()
 
-pipeline = chatglm_cpp.Pipeline(args.model)
+pipeline = chatglm_cpp.Pipeline(args.model, max_length=args.max_length)
 
 
 def postprocess(text):
diff --git a/tests/perplexity.cpp b/tests/perplexity.cpp
@@ -94,7 +94,7 @@ static float cross_entropy(const ggml_tensor *input, const ggml_tensor *target)
 // reference: https://huggingface.co/docs/transformers/perplexity
 static void perplexity(Args &args) {
     std::cout << "Loading model from " << args.model_path << " ...\n";
-    chatglm::Pipeline pipeline(args.model_path);
+    chatglm::Pipeline pipeline(args.model_path, args.max_length);
 
     std::cout << "Loading corpus from " << args.corpus_path << " ...\n";
     std::string corpus = read_text(args.corpus_path);
diff --git a/tests/test_chatglm_cpp.py b/tests/test_chatglm_cpp.py
@@ -147,3 +147,39 @@ def test_internlm7b_pipeline():
 @pytest.mark.skipif(not INTERNLM20B_MODEL_PATH.exists(), reason="model file not found")
 def test_internlm20b_pipeline():
     check_pipeline(model_path=INTERNLM20B_MODEL_PATH, prompt="你好", target="你好！有什么我可以帮助你的吗？")
+
+
+@pytest.mark.skipif(not CHATGLM4_MODEL_PATH.exists(), reason="model file not found")
+def test_langchain_api():  # POST one prompt to the langchain_api app's "/" route and check the exact reply text
+    import os
+    from unittest.mock import patch
+
+    from fastapi.testclient import TestClient
+
+    with patch.dict(os.environ, {"MODEL": str(CHATGLM4_MODEL_PATH)}):  # MODEL is set only while the import below runs
+        from chatglm_cpp.langchain_api import app  # importing runs the module-level Settings() and Pipeline construction
+
+    client = TestClient(app)
+    response = client.post("/", json={"prompt": "你好", "temperature": 0})  # presumably temperature 0 keeps the reply deterministic; confirm
+    assert response.status_code == 200
+    assert response.json()["response"] == "你好👋！有什么可以帮助你的吗？"
+
+
+@pytest.mark.skipif(not CHATGLM4_MODEL_PATH.exists(), reason="model file not found")
+def test_openai_api():  # POST one user message to /v1/chat/completions and check the first choice's role and content
+    import os
+    from unittest.mock import patch
+
+    from fastapi.testclient import TestClient
+
+    with patch.dict(os.environ, {"MODEL": str(CHATGLM4_MODEL_PATH)}):  # MODEL is set only while the import below runs
+        from chatglm_cpp.openai_api import app  # importing constructs the module-level Pipeline from settings.model
+
+    client = TestClient(app)
+    response = client.post(
+        "/v1/chat/completions", json={"messages": [{"role": "user", "content": "你好"}], "temperature": 0}
+    )
+    assert response.status_code == 200
+    response_message = response.json()["choices"][0]["message"]  # only the first returned choice is inspected
+    assert response_message["role"] == "assistant"
+    assert response_message["content"] == "你好👋！有什么可以帮助你的吗？"