diff --git a/KK2-0905.sh b/KK2-0905.sh
new file mode 100755
index 0000000000000..fd0493ce7d6f7
--- /dev/null
+++ b/KK2-0905.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Shell script to run KK2-0905-UD-Q3_K_XL without having to type the monster CLI.
+# Assumes the script is run from ~/Development/llama.cpp.
+
+MODEL_PATH="/Users/edsilmacstudio/Development/llama.cpp/models/Kimi-K2-Instruct-0905-UD-Q3_K_XL/models--unsloth--Kimi-K2-Instruct-0905-GGUF/snapshots/ca516d05c7621c0615db3fc7efa63c9617547363/UD-Q3_K_XL/Kimi-K2-Instruct-0905-UD-Q3_K_XL-00001-of-00010.gguf"
+
+./build/bin/llama-server \
+ -m "$MODEL_PATH" \
+ -c 225176 \
+ -ngl 99 \
+ --parallel 4 \
+ --no-warmup \
+ --host 0.0.0.0 \
+ --port 3000 \
+ --log-file "KK2_0905_log_4"
diff --git a/tools/server/public/index.html b/tools/server/public/index.html
new file mode 100644
index 0000000000000..3d86dcbf6974e
--- /dev/null
+++ b/tools/server/public/index.html
@@ -0,0 +1,605 @@
diff --git a/tools/server/webui/src/components/ChatMessage.tsx b/tools/server/webui/src/components/ChatMessage.tsx
--- a/tools/server/webui/src/components/ChatMessage.tsx
+++ b/tools/server/webui/src/components/ChatMessage.tsx
@@ -55,6 +70,22 @@ export default function ChatMessage({
const nextSibling = siblingLeafNodeIds[siblingCurrIdx + 1];
const prevSibling = siblingLeafNodeIds[siblingCurrIdx - 1];
+ const { getConversationTokenTotal, addTokensToConversation } =
+ useAppContext();
+ const [hasAddedTokens, setHasAddedTokens] = useState(false);
+
+ // Get current conversation token total
+ const conversationTotal = getConversationTokenTotal(msg.convId);
+
+ // Add tokens to running total when timings are available
+ useEffect(() => {
+ if (timings && !hasAddedTokens && msg.role === 'assistant') {
+ const messageTokens = timings.prompt_n + timings.predicted_n;
+ addTokensToConversation(msg.convId, messageTokens);
+ setHasAddedTokens(true);
+ }
+ }, [timings, hasAddedTokens, msg.convId, msg.role, addTokensToConversation]);
+
// for reasoning model, we split the message into content and thought
// TODO: implement this as remark/rehype plugin in the future
const { content, thought, isThinking }: SplitMessage = useMemo(() => {
@@ -87,6 +118,9 @@ export default function ChatMessage({
const isUser = msg.role === 'user';
+  // @ts-expect-error -- default_generation_settings is not part of the typed serverProps
+ const contextSize = serverProps?.['default_generation_settings']?.['n_ctx'];
+
return (
- Speed: {timings.predicted_per_second.toFixed(1)} t/s
+ Speed test: {timings.predicted_per_second.toFixed(1)} t/s |
+ Tokens: {timings.prompt_n + timings.predicted_n} this msg,{' '}
+ {conversationTotal} total
-
-
Prompt
-
- Tokens: {timings.prompt_n}
-
- Time: {timings.prompt_ms} ms
-
- Speed: {timings.prompt_per_second.toFixed(1)} t/s
-
-
Generation
-
- Tokens: {timings.predicted_n}
-
- Time: {timings.predicted_ms} ms
+
+
Chat Stats:
+ This Response
+
- Generated: {timings.predicted_n} tokens
- Speed: {timings.predicted_per_second.toFixed(1)} t/s
+ Total Conversation
+
- Context used:{' '}
+ {timings.prompt_n + timings.predicted_n} tokens
+
- Prompt history: {timings.prompt_n} tokens
+
- This response: {timings.predicted_n} tokens
+ {contextSize && (
+ <>
+
- Context limit: {contextSize} tokens
+
- Remaining:{' '}
+ {contextSize -
+ timings.prompt_n -
+ timings.predicted_n}{' '}
+ tokens
+
- Usage:{' '}
+ {Math.round(
+ ((timings.prompt_n + timings.predicted_n) /
+ contextSize) *
+ 100
+ )}
+ %
+ >
+ )}
)}
diff --git a/tools/server/webui/src/components/Header.tsx b/tools/server/webui/src/components/Header.tsx
index ccddc21ddab73..e66a371c9f96e 100644
--- a/tools/server/webui/src/components/Header.tsx
+++ b/tools/server/webui/src/components/Header.tsx
@@ -12,7 +12,16 @@ import {
export default function Header() {
const [selectedTheme, setSelectedTheme] = useState(StorageUtils.getTheme());
- const { setShowSettings } = useAppContext();
+ const { setShowSettings, serverProps } = useAppContext();
+
+ const fullFile = serverProps?.model_path?.split(/[/\\]/).pop() ?? '';
+ const build = serverProps?.build_info ?? '?';
+
+ // Extract model name from model_path and remove the sharding suffix if present
+ const modelName = serverProps?.model_path
+ ?.split(/(\\|\/)/)
+ .pop()
+ ?.replace(/-\d{5}-of-\d{5}(?=\.gguf$)/, '');
const setTheme = (theme: string) => {
StorageUtils.setTheme(theme);
@@ -34,7 +43,12 @@ export default function Header() {
- llama.cpp
+
+ llama server.cpp: {modelName}
+
{/* action buttons (top right) */}
diff --git a/tools/server/webui/src/utils/app.context.tsx b/tools/server/webui/src/utils/app.context.tsx
index 1860ecc721821..137417c57df79 100644
--- a/tools/server/webui/src/utils/app.context.tsx
+++ b/tools/server/webui/src/utils/app.context.tsx
@@ -52,6 +52,10 @@ interface AppContextValue {
// props
serverProps: LlamaCppServerProps | null;
+
+ // Token tracking
+ getConversationTokenTotal: (convId: string) => number;
+ addTokensToConversation: (convId: string, tokens: number) => void;
}
// this callback is used for scrolling to the bottom of the chat and switching to the last node
@@ -93,6 +97,9 @@ export const AppContextProvider = ({
const [config, setConfig] = useState(StorageUtils.getConfig());
  const [canvasData, setCanvasData] = useState<CanvasData | null>(null);
const [showSettings, setShowSettings] = useState(false);
+  const [conversationTokenTotals, setConversationTokenTotals] = useState<
+    Record<string, number>
+  >({});
// get server props
useEffect(() => {
@@ -386,6 +393,17 @@ export const AppContextProvider = ({
setConfig(config);
};
+ const getConversationTokenTotal = (convId: string): number => {
+ return conversationTokenTotals[convId] || 0;
+ };
+
+ const addTokensToConversation = (convId: string, tokens: number) => {
+ setConversationTokenTotals((prev) => ({
+ ...prev,
+ [convId]: (prev[convId] || 0) + tokens,
+ }));
+ };
+
return (
{children}
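
The hunks above declare getConversationTokenTotal and addTokensToConversation on AppContextValue and implement them inside AppContextProvider, but the lines that add them to the provider's value object are not visible in this excerpt. Below is a minimal, self-contained sketch of how the two helpers fit together end to end; the standalone TokenTrackingProvider, TokenTrackingContext, and useTokenTotal names are illustrative only (the real patch would simply append the two functions to the existing AppContext value so useAppContext() consumers such as ChatMessage.tsx can destructure them).

```tsx
// Sketch only: not the actual AppContext wiring from this patch.
// Shows the state shape and helper signatures introduced in the diff
// behind a hypothetical standalone provider.
import { createContext, useContext, useState, type ReactNode } from 'react';

interface TokenTrackingValue {
  getConversationTokenTotal: (convId: string) => number;
  addTokensToConversation: (convId: string, tokens: number) => void;
}

const TokenTrackingContext = createContext<TokenTrackingValue | null>(null);

export function TokenTrackingProvider({ children }: { children: ReactNode }) {
  // Same shape as conversationTokenTotals in the diff: convId -> running token count
  const [totals, setTotals] = useState<Record<string, number>>({});

  const getConversationTokenTotal = (convId: string) => totals[convId] ?? 0;

  const addTokensToConversation = (convId: string, tokens: number) =>
    setTotals((prev) => ({ ...prev, [convId]: (prev[convId] ?? 0) + tokens }));

  return (
    <TokenTrackingContext.Provider
      value={{ getConversationTokenTotal, addTokensToConversation }}
    >
      {children}
    </TokenTrackingContext.Provider>
  );
}

// Consumer side, mirroring the usage added to ChatMessage.tsx: read the
// running total for one conversation (0 if nothing has been recorded yet).
export function useTokenTotal(convId: string): number {
  const ctx = useContext(TokenTrackingContext);
  return ctx ? ctx.getConversationTokenTotal(convId) : 0;
}
```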