diff --git a/koboldcpp.py b/koboldcpp.py
index ea83bbf98e8..a01cf868e45 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -146,6 +146,10 @@
 MaxMemory = [0]
 MaxFreeMemory = [0]
 
+server_process: subprocess.Popen | None = None
+is_proxy = "KOBOLDCPP_SERVER" not in os.environ
+current_model = None
+
 class logit_bias(ctypes.Structure):
     _fields_ = [("token_id", ctypes.c_int32), ("bias", ctypes.c_float)]
 
@@ -1343,7 +1347,19 @@ def auto_set_backend_cli():
     if not found_new_backend:
         print(f"Auto Selected Default Backend (flag={cpusupport})\n")
 
+def get_models():
+    if args.admin and args.admindir and os.path.exists(args.admindir):
+        from pathlib import Path
+        return [path for path in Path(args.admindir).iterdir() if (path.suffix in [".kcpps", ".kcppt", ".gguf"] and path.is_file())]
+    else:
+        return []
+
 def load_model(model_filename):
+    if is_proxy:
+        global current_model; current_model = model_filename  # 'global' required: a bare assignment would only create a dead shadowing local
+        print("Deferred model loading.", current_model)
+        return True
+
     global args
     inputs = load_model_inputs()
     inputs.model_filename = model_filename.encode("UTF-8")
@@ -3260,9 +3276,8 @@ def do_GET(self):
         elif self.path.endswith(('/api/admin/list_options')): #used by admin to get info about a kcpp instance
             opts = []
-            if args.admin and args.admindir and os.path.exists(args.admindir) and self.check_header_password(args.adminpassword):
-                dirpath = os.path.abspath(args.admindir)
-                opts = [f for f in sorted(os.listdir(dirpath)) if (f.endswith(".kcpps") or f.endswith(".kcppt") or f.endswith(".gguf")) and os.path.isfile(os.path.join(dirpath, f))]
+            if self.check_header_password(args.adminpassword):
+                opts = sorted(path.name for path in get_models())  # keep the sorted order the old os.listdir code produced
             opts.append("unload_model")
 
             response_body = (json.dumps(opts).encode())
 
@@ -3332,7 +3347,7 @@ def do_GET(self):
             response_body = (json.dumps({"logprobs":logprobsdict}).encode())
 
         elif self.path.endswith('/v1/models'):
-            response_body = (json.dumps({"object":"list","data":[{"id":friendlymodelname,"object":"model","created":int(time.time()),"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
+            response_body = (json.dumps({"object":"list","data":[{"id":path.stem,"object":"model","created":int(path.stat().st_mtime),"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"} for path in get_models()]}).encode())
 
         elif self.path.endswith('/sdapi/v1/sd-models'):
             if friendlysdmodelname=="inactive" or fullsdmodelpath=="":
@@ -3799,7 +3814,7 @@ def do_POST(self):
                 else:
                     dirpath = os.path.abspath(args.admindir)
                     targetfilepath = os.path.join(dirpath, targetfile)
-                    opts = [f for f in os.listdir(dirpath) if (f.lower().endswith(".kcpps") or f.lower().endswith(".kcppt") or f.lower().endswith(".gguf")) and os.path.isfile(os.path.join(dirpath, f))]
+                    opts = [path.name for path in get_models()]  # basenames: 'targetfile in opts' compares file names, not full paths
                     if targetfile in opts and os.path.exists(targetfilepath):
                         global_memory["restart_override_config_target"] = ""
                         if targetfile.lower().endswith(".gguf") and overrideconfig:
@@ -4020,6 +4035,73 @@ def do_POST(self):
         if args.foreground:
             bring_terminal_to_foreground()
 
+        # Proxy mode: forward generation requests to a child server process, (re)loading the requested model on demand.
+        if is_proxy:
+            global server_process
+            global current_model
+
+            model = genparams["model"]
+            model_path = next((str(path) for path in get_models() if path.stem == model), None)
+            if model_path is None:
+                self.send_response(404)
+                self.end_headers(content_type='application/json')
+                self.wfile.write(json.dumps({"detail": {
+                    "error": "Model Not Found",
+                    "msg": f"Model file {model} not found.",
+                    "type": "bad_input",
+                }}).encode())
+                return
+
+            if server_process is None:
+                server_process = subprocess.Popen([sys.executable] + sys.argv + ["--port", str(args.port + 1), "--model", model_path], env={**os.environ,  # merge, not replace: the child still needs PATH/HOME/etc.
+                    "KOBOLDCPP_SERVER": "True"
+                })
+            elif current_model != model:
+                with urllib.request.urlopen(urllib.request.Request(f"http://localhost:{args.port + 1}/api/admin/reload_config", method="POST", data=json.dumps({"filename": model_path}).encode(), headers={
+                    "Authorization": f"Bearer {args.adminpassword}"
+                })) as response:
+                    if response.status != 200:
+                        self.send_response(500)
+                        self.end_headers(content_type='application/json')
+                        self.wfile.write(json.dumps({"detail": {
+                            "error": "Failed to switch model",
+                            "msg": f"Failed to switch model to {model}.",
+                            "type": "server_error",
+                        }}).encode())
+                        return
+
+            current_model = model
+
+            # Poke the server until it has the new model
+            while True:
+                try:
+                    with urllib.request.urlopen(urllib.request.Request(f"http://localhost:{args.port + 1}/api/v1/model", method="GET"), timeout=1000) as response:
+                        data = json.loads(response.read().decode())
+                        if response.status == 200 and data.get("result") == f"koboldcpp/{model}":
+                            break
+
+                    time.sleep(1)
+                except Exception:
+                    time.sleep(1)
+
+
+            request = urllib.request.Request(f"http://localhost:{args.port + 1}" + self.path, data=body, headers=dict(self.headers), method="POST")
+            with urllib.request.urlopen(request) as response:
+                self.send_response_only(response.status)
+                for keyword, value in response.headers.items():
+                    self.send_header(keyword, value)
+                super(KcppServerRequestHandler, self).end_headers()
+
+                while True:
+                    chunk = response.read()
+                    if not chunk:
+                        break
+                    self.wfile.write(chunk)
+
+                self.wfile.flush()
+                self.close_connection = True
+            return
+
         if api_format > 0: #text gen
             # Check if streaming chat completions, if so, set stream mode to true
             if (api_format == 4 or api_format == 3) and "stream" in genparams and genparams["stream"]: