Skip to content

Commit 25d5b0c

Browse files
committed
updated server-local-image-loading feature branch with latest master branch changes
1 parent 583cb83 commit 25d5b0c

File tree

8 files changed

+155
-2
lines changed

8 files changed

+155
-2
lines changed

common/arg.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2338,6 +2338,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
23382338
params.port = value;
23392339
}
23402340
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
2341+
add_opt(common_arg(
2342+
{"--allowed-local-media-path"}, "PATH",
2343+
string_format("path from which local media files are allowed to be read from (default: none)"),
2344+
[](common_params & params, const std::string & value) {
2345+
try {
2346+
params.allowed_local_media_path = std::filesystem::canonical(std::filesystem::path(value));
2347+
if (!std::filesystem::is_directory(params.allowed_local_media_path)) {
2348+
throw std::invalid_argument(string_format("allowed local media path must be a dir: %s", params.allowed_local_media_path.c_str()));
2349+
}
2350+
} catch (std::filesystem::filesystem_error &err) {
2351+
throw std::invalid_argument(string_format("invalid allowed local media path: %s", err.what()));
2352+
}
2353+
}
2354+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALLOWED_LOCAL_MEDIA_PATH"));
2355+
add_opt(common_arg(
2356+
{"--local-media-max-size-mb"}, "N",
2357+
string_format("max size in mb for local media files (default: %lu)", params.local_media_max_size_mb),
2358+
[](common_params & params, int value) {
2359+
params.local_media_max_size_mb = static_cast<size_t>(value);
2360+
}
2361+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_LOCAL_MEDIA_MAX_SIZE_MB"));
23412362
add_opt(common_arg(
23422363
{"--path"}, "PATH",
23432364
string_format("path to serve static files from (default: %s)", params.public_path.c_str()),

common/common.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "ggml-opt.h"
66
#include "llama-cpp.h"
77

8+
#include <filesystem>
89
#include <set>
910
#include <sstream>
1011
#include <string>
@@ -454,9 +455,11 @@ struct common_params {
454455
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
455456
int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
456457
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
458+
size_t local_media_max_size_mb = 15; // 0 = no limit, 15 = 15 MiB, etc. Max size of loaded local media files
457459

458460
std::string hostname = "127.0.0.1";
459461
std::string public_path = ""; // NOLINT
462+
std::filesystem::path allowed_local_media_path; // NOLINT
460463
std::string api_prefix = ""; // NOLINT
461464
std::string chat_template = ""; // NOLINT
462465
bool use_jinja = false; // NOLINT

tools/server/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,8 @@ The project is under active development, and we are [looking for feedback and co
171171
| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
172172
| `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
173173
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
174+
| `--allowed-local-media-path PATH` | path from which local media files are allowed to be read from (default: none)<br/>(env: LLAMA_ARG_ALLOWED_LOCAL_MEDIA_PATH) |
175+
| `--local-media-max-size-mb N` | max size in mb for local media files (default: 15)<br/>(env: LLAMA_ARG_LOCAL_MEDIA_MAX_SIZE_MB) |
174176
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
175177
| `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
176178
| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
@@ -1213,6 +1215,8 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
12131215

12141216
If model supports multimodal, you can input the media file via `image_url` content part. We support both base64 and remote URL as input. See OAI documentation for more.
12151217

1218+
We also support local files as input via `file://` URLs when enabled (see `--allowed-local-media-path` and `--local-media-max-size-mb` for details).
1219+
12161220
*Options:*
12171221

12181222
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). llama.cpp `/completion`-specific features such as `mirostat` are also supported.

tools/server/server-common.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include <random>
1313
#include <sstream>
14+
#include <fstream>
1415

1516
json format_error_response(const std::string & message, const enum error_type type) {
1617
std::string type_str;
@@ -881,6 +882,37 @@ json oaicompat_chat_params_parse(
881882
throw std::runtime_error("Failed to download image");
882883
}
883884

885+
} else if (string_starts_with(url, "file://")) {
886+
if (opt.allowed_local_media_path.empty()) {
887+
throw std::runtime_error("Local media paths are not enabled");
888+
}
889+
// Strip off the leading "file://"
890+
const std::string fname = url.substr(7);
891+
const std::filesystem::path input_path = std::filesystem::canonical(std::filesystem::path(fname));
892+
auto [allowed_end, nothing] = std::mismatch(opt.allowed_local_media_path.begin(), opt.allowed_local_media_path.end(), input_path.begin());
893+
if (allowed_end != opt.allowed_local_media_path.end()) {
894+
throw std::runtime_error("Local media file path not allowed: " + fname);
895+
}
896+
if (!std::filesystem::is_regular_file(input_path)) {
897+
throw std::runtime_error("Local media file does not exist: " + fname);
898+
}
899+
const auto file_size = std::filesystem::file_size(input_path);
900+
if (file_size > opt.local_media_max_size_mb * 1024 * 1024) {
901+
throw std::runtime_error("Local media file exceeds maximum allowed size");
902+
}
903+
// load local file path
904+
std::ifstream f(input_path, std::ios::binary);
905+
if (!f) {
906+
SRV_ERR("Unable to open file %s: %s\n", fname.c_str(), strerror(errno));
907+
throw std::runtime_error("Unable to open local media file: " + fname);
908+
}
909+
raw_buffer buf((std::istreambuf_iterator(f)), std::istreambuf_iterator<char>());
910+
if (buf.size() != file_size) {
911+
SRV_ERR("Failed to read entire file %s", fname.c_str());
912+
throw std::runtime_error("Failed to read entire image file");
913+
}
914+
out_files.push_back(buf);
915+
884916
} else {
885917
// try to decode base64 image
886918
std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');

tools/server/server-common.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,8 @@ struct oaicompat_parser_options {
286286
bool allow_image;
287287
bool allow_audio;
288288
bool enable_thinking = true;
289+
size_t local_media_max_size_mb;
290+
std::filesystem::path allowed_local_media_path;
289291
};
290292

291293
// used by /chat/completions endpoint

tools/server/server.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -751,6 +751,8 @@ struct server_context {
751751
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
752752
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
753753
/* enable_thinking */ enable_thinking,
754+
/* local_media_max_size_mb */ params_base.local_media_max_size_mb,
755+
/* allowed_local_media_path */ params_base.allowed_local_media_path,
754756
};
755757

756758
// print sample chat example to make it clear which template is used

tools/server/tests/unit/test_vision_api.py

Lines changed: 85 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@
22
from utils import *
33
import base64
44
import requests
5+
from pathlib import Path
56

67
server: ServerProcess
78

8-
def get_img_url(id: str) -> str:
9+
def get_img_url(id: str, tmp_path: str | None = None) -> str:
910
IMG_URL_0 = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/11_truck.png"
1011
IMG_URL_1 = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/91_cat.png"
12+
IMG_FILE_2 = "https://picsum.photos/id/237/5000"
1113
if id == "IMG_URL_0":
1214
return IMG_URL_0
1315
elif id == "IMG_URL_1":
@@ -28,6 +30,45 @@ def get_img_url(id: str) -> str:
2830
response = requests.get(IMG_URL_1)
2931
response.raise_for_status() # Raise an exception for bad status codes
3032
return base64.b64encode(response.content).decode("utf-8")
33+
elif id == "IMG_FILE_0":
34+
if tmp_path is None:
35+
raise RuntimeError("get_img_url must be called with a tmp_path if using local files")
36+
image_name = IMG_URL_0.split('/')[-1]
37+
file_name: Path = Path(tmp_path) / image_name
38+
if file_name.exists():
39+
return f"file://{file_name}"
40+
else:
41+
response = requests.get(IMG_URL_0)
42+
response.raise_for_status() # Raise an exception for bad status codes
43+
with open(file_name, 'wb') as f:
44+
f.write(response.content)
45+
return f"file://{file_name}"
46+
elif id == "IMG_FILE_1":
47+
if tmp_path is None:
48+
raise RuntimeError("get_img_url must be called with a tmp_path if using local files")
49+
image_name = IMG_URL_1.split('/')[-1]
50+
file_name: Path = Path(tmp_path) / image_name
51+
if file_name.exists():
52+
return f"file://{file_name}"
53+
else:
54+
response = requests.get(IMG_URL_1)
55+
response.raise_for_status() # Raise an exception for bad status codes
56+
with open(file_name, 'wb') as f:
57+
f.write(response.content)
58+
return f"file://{file_name}"
59+
elif id == "IMG_FILE_2":
60+
if tmp_path is None:
61+
raise RuntimeError("get_img_url must be called with a tmp_path if using local files")
62+
image_name = "dog.jpg"
63+
file_name: Path = Path(tmp_path) / image_name
64+
if file_name.exists():
65+
return f"file://{file_name}"
66+
else:
67+
response = requests.get(IMG_FILE_2)
68+
response.raise_for_status() # Raise an exception for bad status codes
69+
with open(file_name, 'wb') as f:
70+
f.write(response.content)
71+
return f"file://{file_name}"
3172
else:
3273
return id
3374

@@ -70,6 +111,9 @@ def test_v1_models_supports_multimodal_capability():
70111
("What is this:\n", "malformed", False, None),
71112
("What is this:\n", "https://google.com/404", False, None), # non-existent image
72113
("What is this:\n", "https://ggml.ai", False, None), # non-image data
114+
("What is this:\n", "IMG_FILE_0", False, None),
115+
("What is this:\n", "IMG_FILE_1", False, None),
116+
("What is this:\n", "IMG_FILE_2", False, None),
73117
# TODO @ngxson : test with multiple images, no images and with audio
74118
]
75119
)
@@ -83,7 +127,46 @@ def test_vision_chat_completion(prompt, image_url, success, re_content):
83127
{"role": "user", "content": [
84128
{"type": "text", "text": prompt},
85129
{"type": "image_url", "image_url": {
86-
"url": get_img_url(image_url),
130+
"url": get_img_url(image_url, "./tmp"),
131+
}},
132+
]},
133+
],
134+
})
135+
if success:
136+
assert res.status_code == 200
137+
choice = res.body["choices"][0]
138+
assert "assistant" == choice["message"]["role"]
139+
assert match_regex(re_content, choice["message"]["content"])
140+
else:
141+
assert res.status_code != 200
142+
143+
@pytest.mark.parametrize(
144+
"allowed_mb_size, allowed_path, img_dir_path, prompt, image_url, success, re_content",
145+
[
146+
# test model is trained on CIFAR-10, but it's quite dumb due to small size
147+
(0, "./tmp", "./tmp", "What is this:\n", "IMG_FILE_0", True, "(cat)+"),
148+
(0, "./tmp", "./tmp", "What is this:\n", "IMG_FILE_1", True, "(frog)+"),
149+
(1, "./tmp", "./tmp", "What is this:\n", "IMG_FILE_2", False, None),
150+
(0, "./tmp/allowed", "./tmp", "What is this:\n", "IMG_FILE_0", False, None),
151+
(0, "./tm", "./tmp", "What is this:\n", "IMG_FILE_0", False, None),
152+
(0, "./tmp/allowed", "./tmp/allowed/..", "What is this:\n", "IMG_FILE_0", False, None),
153+
(0, "./tmp/allowed", "./tmp/allowed/../.", "What is this:\n", "IMG_FILE_0", False, None),
154+
]
155+
)
156+
def test_vision_chat_completion_local_files(allowed_mb_size, allowed_path, img_dir_path, prompt, image_url, success, re_content):
157+
global server
158+
server.local_media_max_size_mb = allowed_mb_size
159+
server.allowed_local_media_path = allowed_path
160+
Path(allowed_path).mkdir(exist_ok=True)
161+
server.start()
162+
res = server.make_request("POST", "/chat/completions", data={
163+
"temperature": 0.0,
164+
"top_k": 1,
165+
"messages": [
166+
{"role": "user", "content": [
167+
{"type": "text", "text": prompt},
168+
{"type": "image_url", "image_url": {
169+
"url": get_img_url(image_url, img_dir_path),
87170
}},
88171
]},
89172
],

tools/server/tests/utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,8 @@ class ServerProcess:
9595
chat_template_file: str | None = None
9696
server_path: str | None = None
9797
mmproj_url: str | None = None
98+
local_media_max_size_mb: int | None = None
99+
allowed_local_media_path: str | None = None
98100

99101
# session variables
100102
process: subprocess.Popen | None = None
@@ -215,6 +217,10 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
215217
server_args.extend(["--chat-template-file", self.chat_template_file])
216218
if self.mmproj_url:
217219
server_args.extend(["--mmproj-url", self.mmproj_url])
220+
if self.local_media_max_size_mb:
221+
server_args.extend(["--local-media-max-size-mb", self.local_media_max_size_mb])
222+
if self.allowed_local_media_path:
223+
server_args.extend(["--allowed-local-media-path", self.allowed_local_media_path])
218224

219225
args = [str(arg) for arg in [server_path, *server_args]]
220226
print(f"tests: starting server with: {' '.join(args)}")

0 commit comments

Comments
 (0)