Commit 43bc5f1

Merge pull request #34 from BradHutchings/work-in-progress
Work in progress
2 parents ae5e913 + c2deea2 commit 43bc5f1

File tree: 13 files changed (+773, −317 lines)

README.md (2 additions, 1 deletion)

```diff
@@ -74,4 +74,5 @@ In no particular order of importance, these are the things that bother me:
 - The args thing is cute, but it might be easier as a YAML file: key-value pairs, with flags as keys with null values.
 - The `--ctx-size` parameter doesn't seem quite right, given that new models carry their training (or max) context size in their metadata. That size should be used, subject to a maximum set by a passed parameter, so that e.g. a 128K model can run comfortably on a smaller device.
 - Write docs for a Deploying step. It should address the args file, removing the extra executable depending on platform, models, host, port, and context size.
-- Make a `.gitattributes` file so we can set the default file to be displayed and keep the README.md from llama.cpp. This will help in syncing changes continually from upstream. Reference: https://git-scm.com/docs/gitattributes
+- ~~Make a `.gitattributes` file so we can set the default file to be displayed and keep the README.md from llama.cpp. This will help in syncing changes continually from upstream. Reference: https://git-scm.com/docs/gitattributes~~ -- This doesn't actually work.
+- Cosmo needs libssl and libcrypto. Building these from scratch gets an error about Cosmo not liking assembly files. Sort this out.
```
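The `--ctx-size` item above boils down to a clamp: default to the model's trained context, cap it with a user-supplied maximum. A minimal sketch of that policy, with hypothetical names (`n_ctx_train` read from GGUF metadata, `max_ctx_user` from a flag); this is not llama.cpp's actual API:

```cpp
#include <algorithm>
#include <cstdint>

// Hypothetical helper: choose the serving context size.
// n_ctx_train would come from the model's GGUF metadata; max_ctx_user is
// an optional user-supplied ceiling (0 = unlimited). Names are illustrative.
static uint32_t pick_ctx_size(uint32_t n_ctx_train, uint32_t max_ctx_user) {
    uint32_t n_ctx = n_ctx_train;               // default: trained context
    if (max_ctx_user > 0) {
        n_ctx = std::min(n_ctx, max_ctx_user);  // clamp for smaller devices
    }
    return n_ctx;
}
// e.g. a 128K-trained model capped at 8192 serves with n_ctx = 8192.
```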

convert_hf_to_gguf.py (363 additions, 271 deletions)

Large diffs are not rendered by default.

convert_lora_to_gguf.py (3 additions, 3 deletions)

```diff
@@ -24,7 +24,7 @@
 import gguf

 # reuse model definitions from convert_hf_to_gguf.py
-from convert_hf_to_gguf import LazyTorchTensor, Model
+from convert_hf_to_gguf import LazyTorchTensor, ModelBase

 logger = logging.getLogger("lora-to-gguf")

@@ -340,11 +340,11 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
         sys.exit(1)
     else:
         logger.info(f"Loading base model: {dir_base_model.name}")
-        hparams = Model.load_hparams(dir_base_model)
+        hparams = ModelBase.load_hparams(dir_base_model)

     with torch.inference_mode():
         try:
-            model_class = Model.from_model_architecture(hparams["architectures"][0])
+            model_class = ModelBase.from_model_architecture(hparams["architectures"][0])
         except NotImplementedError:
             logger.error(f"Model {hparams['architectures'][0]} is not supported")
             sys.exit(1)
```

These import changes track the rename of `Model` to `ModelBase` in `convert_hf_to_gguf.py` (whose large diff is not rendered above).

docs/Buidling-ls1-Brads-Env.md (25 additions, 2 deletions)

````diff
@@ -7,9 +7,15 @@ This file contains instructions for building `llama.cpp` with `cosmocc` to yield

 ### Environment Variables

-Let's define some environment variables:
+Let's define some environment variables, resetting those that affect the Makefile:
 ```
 BUILDING_DIR="1-BUILDING-llama.cpp"
+unset CC
+unset CXX
+unset AR
+unset UNAME_S
+unset UNAME_P
+unset UNAME_M
 printf "\n**********\n*\n* FINISHED: Environment Variables.\n*\n**********\n\n"
 ```

@@ -60,6 +66,7 @@ We use the old `Makefile` rather than CMake. We've updated the `Makefile` in this
 ```
 cd ~/$BUILDING_DIR
 export LLAMA_MAKEFILE=1
+export LLAMA_SERVER_SSL=ON
 make clean
 make
 printf "\n**********\n*\n* FINISHED: Make llama.cpp.\n*\n**********\n\n"
@@ -100,13 +107,29 @@ export PATH="$(pwd)/cosmocc/bin:$PATH"
 export CC="cosmocc -I$(pwd)/cosmocc/include -L$(pwd)/cosmocc/lib"
 export CXX="cosmocc -I$(pwd)/cosmocc/include \
     -I$(pwd)/cosmocc/include/third_party/libcxx \
-    -L$(pwd)/cosmocc/lib"
+    -L$(pwd)/cosmocc/lib -L$(pwd)/openssl"
+export AR="cosmoar"
 export UNAME_S="cosmocc"
 export UNAME_P="cosmocc"
 export UNAME_M="cosmocc"
 printf "\n**********\n*\n* FINISHED: Prepare to make llama.cpp with Cosmo.\n*\n**********\n\n"
 ```

+---
+### Make openssl with Cosmo
+We need cross-architecture `libssl` and `libcrypto` static libraries to support SSL in `llama-server-one`.
+```
+cp -r /usr/include/openssl/ ./cosmocc/include/
+cp -r /usr/include/x86_64-linux-gnu/openssl/* ./cosmocc/include/openssl
+git clone https://github.com/openssl/openssl.git
+cd openssl
+./Configure no-asm no-dso no-afalgeng no-shared no-pinshared no-apps
+make
+cd ..
+printf "\n**********\n*\n* FINISHED: Make openssl with Cosmo.\n*\n**********\n\n"
+
+```
+
 ---
 ### Make llama.cpp with Cosmo
 ```
````
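For context on the `LLAMA_SERVER_SSL=ON` and OpenSSL steps above: the server's HTTPS support comes from building the bundled cpp-httplib against OpenSSL (its `CPPHTTPLIB_OPENSSL_SUPPORT` switch), which is why the static `libssl`/`libcrypto` built here must be on the linker path (`-L$(pwd)/openssl`). A minimal sketch of that kind of compile-time gate; the route and cert paths are illustrative, not the actual server code:

```cpp
#include "httplib.h"  // cpp-httplib, single-header HTTP/HTTPS server

int main() {
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
    // HTTPS build: cert/key paths are placeholders for the files we
    // later pack into the llama-server-one archive.
    httplib::SSLServer svr("certs/server.crt", "certs/server.key");
#else
    httplib::Server svr;  // plain HTTP when built without SSL
#endif
    svr.Get("/health", [](const httplib::Request &, httplib::Response & res) {
        res.set_content("ok", "text/plain");
    });
    svr.listen("0.0.0.0", 8080);
}
```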

docs/Configuring-ls1-Brads-Env.md (35 additions, 18 deletions)

````diff
@@ -54,25 +54,46 @@ zip -d $LLAMA_SERVER_ONE_ZIP "/usr/*"
 printf "\n**********\n*\n* FINISHED: Delete Extraneous Timezone Files.\n*\n**********\n\n"
 ```

----
-### Verify Contents of Zip Archive
+#### Verify Contents of Zip Archive

 Verify that these files are no longer in the archive:
 ```
 unzip -l $LLAMA_SERVER_ONE_ZIP
 printf "\n**********\n*\n* FINISHED: Verify Contents of Zip Archive.\n*\n**********\n\n"
 ```

+---
+### Add Certs to Archive
+
+Add self-signed certs to the archive. The CA cert is added to the website folder.
+```
+mkdir certs
+cp /mnt/hyperv/Mmojo-Raspberry-Pi/Mmojo-certs/mmojo.local.crt certs
+cp /mnt/hyperv/Mmojo-Raspberry-Pi/Mmojo-certs/mmojo.local.key certs
+cp /mnt/hyperv/Mmojo-Raspberry-Pi/Mmojo-certs/selfsignCA.crt certs
+zip -0 -r $LLAMA_SERVER_ONE_ZIP certs/*
+printf "\n**********\n*\n* FINISHED: Add Certs to Archive.\n*\n**********\n\n"
+```
+
+#### Verify certs Directory in Archive
+
+Verify that the archive has your certs:
+```
+unzip -l $LLAMA_SERVER_ONE_ZIP
+printf "\n**********\n*\n* FINISHED: Verify certs Directory in Archive.\n*\n**********\n\n"
+```
+
 ---
 ### Create website Directory in Archive

 `llama.cpp` has a built-in chat UI. If you'd like to provide a custom UI, you should add a `website` directory to the `llama-server-one` archive. `llama.cpp`'s chat UI is optimized for serving from inside the project's source tree, but we can copy the unoptimized source:
 ```
 mkdir website
 cp -r /mnt/hyperv/web-apps/completion-tool/* website
+cp /mnt/hyperv/Mmojo-Raspberry-Pi/Mmojo-certs/selfsignCA.crt website/CA.crt
 rm website/*.txt
-rm website/images/*.svg
-rm website/images/*.psd
+rm website/completion/images/*.svg
+rm website/completion/images/*.psd
 zip -0 -r $LLAMA_SERVER_ONE_ZIP website/*
 printf "\n**********\n*\n* FINISHED: Create website Directory in Archive.\n*\n**********\n\n"
 ```
@@ -84,8 +105,9 @@ Verify that the archive has your website:
 unzip -l $LLAMA_SERVER_ONE_ZIP
 printf "\n**********\n*\n* FINISHED: Verify website Directory in Archive.\n*\n**********\n\n"
 ```
+
 ---
-### Create default-args File
+### Create default-args File in Archive

 A `default-args` file in the archive can specify sane default parameters. The format of the file is: parameter name on one line, parameter value on the next, rinse, repeat. End the file with a `...` line to include user-specified parameters.

@@ -106,22 +128,17 @@ model.gguf
 8
 --path
 /zip/website
+--ssl-key-file
+/zip/certs/mmojo.local.key
+--ssl-cert-file
+/zip/certs/mmojo.local.crt
 ...
 EOF
-printf "\n**********\n*\n* FINISHED: Create Default args File.\n*\n**********\n\n"
-```
-
----
-### Add default-args File to Archive
-
-Add the `default-args` file to the archive:
-```
 zip -0 -r $LLAMA_SERVER_ONE_ZIP $DEFAULT_ARGS
-printf "\n**********\n*\n* FINISHED: Add default-args File to Archive.\n*\n**********\n\n"
+printf "\n**********\n*\n* FINISHED: Create Default args File in Archive.\n*\n**********\n\n"
 ```

----
-### Verify default-args File in Archive
+#### Verify default-args File in Archive

 Verify that the archive contains the `default-args` file:
 ```
@@ -163,8 +180,7 @@ After starting up and loading the model, it should display:

 Hit `ctrl-C` on your keyboard to stop it.

----
-### Test Run on Public Interfaces
+#### Test Run on Public Interfaces

 If you'd like it to listen on all available interfaces, so you can connect from a browser on another computer:
 ```
@@ -184,5 +200,6 @@ Congratulations! You are ready to copy the `llama-server-one` executable to the shar

 ```
 sudo cp llama-server-one /mnt/hyperv/Mmojo-Raspberry-Pi/Mmojo-LLMs
+sudo cp llama-server-one /mnt/hyperv/Mmojo-Raspberry-Pi/Mmojo-LLMs/llama-server-one.exe
 printf "\n**********\n*\n* FINISHED: Copy llama-server-one for Deployment.\n*\n**********\n\n"
 ```
````
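Since the archive is baked into the executable (Cosmopolitan exposes its members under `/zip/...`), the `default-args` format above is easy to machine-read: one token per line, names and values alternating, with a final `...` line meaning "splice in the user's own arguments". A sketch of such a parser under those assumptions, illustrative rather than `llama-server-one`'s actual code:

```cpp
#include <fstream>
#include <string>
#include <vector>

// Illustrative parser for the default-args format described above:
// one token per line; a terminating "..." line means "append the
// user's argv here". Not llama-server-one's actual implementation.
static std::vector<std::string> merge_args(const std::string & path,
                                           int argc, char ** argv) {
    std::vector<std::string> args;
    std::ifstream in(path);            // e.g. "/zip/default-args"
    std::string line;
    bool splice_user_args = false;
    while (std::getline(in, line)) {
        if (line == "...") { splice_user_args = true; break; }
        args.push_back(line);
    }
    if (splice_user_args) {
        for (int i = 1; i < argc; i++) {
            args.push_back(argv[i]);
        }
    }
    return args;
}
```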

examples/llava/clip-impl.h (0 additions, 3 deletions)

```diff
@@ -50,7 +50,6 @@
 // tensor name constants
 //

-#define TN_TOKEN_EMBD "%s.token_embd.weight"
 #define TN_POS_EMBD   "%s.position_embd.weight"
 #define TN_CLASS_EMBD "v.class_embd"
 #define TN_PATCH_EMBD "v.patch_embd.weight" // not renamed with ".0" postfix for backward compat
@@ -66,8 +65,6 @@
 #define TN_LN_2    "%s.blk.%d.ln2.%s"
 #define TN_LN_PRE  "%s.pre_ln.%s"
 #define TN_LN_POST "%s.post_ln.%s"
-#define TN_TEXT_PROJ "text_projection.weight"
-#define TN_VIS_PROJ  "visual_projection.weight"
 #define TN_LLAVA_PROJ "mm.%d.%s"
 #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
```

examples/llava/clip.h (3 additions, 2 deletions)

```diff
@@ -30,12 +30,13 @@ struct clip_image_size {
     int height;
 };

+struct clip_image_f32;
 struct clip_image_u8_batch;
 struct clip_image_f32_batch;

 struct clip_context_params {
     bool use_gpu;
-    ggml_log_level verbosity;
+    enum ggml_log_level verbosity;
 };

 // deprecated, use clip_init
@@ -84,7 +85,7 @@ CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
 CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
 CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
 CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
-CLIP_API clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
+CLIP_API struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data

 /**
  * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
```

The forward declaration of `struct clip_image_f32` and the explicit `struct`/`enum` keywords keep this header compilable as plain C, where tag names are not automatically type names.

ggml/src/ggml-metal/ggml-metal.m (15 additions, 0 deletions)

```diff
@@ -481,6 +481,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
     GGML_METAL_KERNEL_TYPE_SQRT,
     GGML_METAL_KERNEL_TYPE_SIN,
     GGML_METAL_KERNEL_TYPE_COS,
+    GGML_METAL_KERNEL_TYPE_NEG,
     GGML_METAL_KERNEL_TYPE_SUM_ROWS,
     GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
     GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
@@ -1159,6 +1160,7 @@ @implementation GGMLMetalClass
     GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT,     sqrt,     true);
     GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN,      sin,      true);
     GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS,      cos,      true);
+    GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG,      neg,      true);
     GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
     GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX,   argmax,   true);
     GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true);
@@ -1320,6 +1322,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
         case GGML_UNARY_OP_GELU_QUICK:
         case GGML_UNARY_OP_SILU:
         case GGML_UNARY_OP_ELU:
+        case GGML_UNARY_OP_NEG:
             return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
         default:
             return false;
@@ -2010,6 +2013,18 @@ static void ggml_metal_encode_node(

                 [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;
+        case GGML_UNARY_OP_NEG:
+            {
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NEG].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                const int64_t n = ggml_nelements(dst);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
         default:
             {
                 GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
```
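Together these hunks register the NEG pipeline, advertise `GGML_UNARY_OP_NEG` for contiguous F32 tensors in `ggml_metal_supports_op`, and encode its dispatch. A graph reaches this kernel through ggml's public `ggml_neg`; a minimal sketch of building such a graph with the standard ggml C API (backend and scheduler setup omitted):

```cpp
#include "ggml.h"

// Minimal sketch: build a graph containing a NEG node. On an Apple GPU
// build, the Metal backend can now claim and execute this op.
int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * y = ggml_neg(ctx, x);  // lowers to GGML_UNARY_OP_NEG

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);

    ggml_free(ctx);
    return 0;
}
```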

ggml/src/ggml-metal/ggml-metal.metal (7 additions, 0 deletions)

```diff
@@ -949,6 +949,13 @@ kernel void kernel_cos(
     dst[tpig] = cos(src0[tpig]);
 }

+kernel void kernel_neg(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = -src0[tpig];
+}
+
 kernel void kernel_sum_rows(
     device const float * src0,
     device float * dst,
```

ggml/src/ggml-vulkan/ggml-vulkan.cpp (18 additions, 4 deletions)

```diff
@@ -2397,7 +2397,7 @@ static void ggml_vk_load_shaders(vk_device& device) {

     ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_rms_norm_back_f32, "rms_norm_back_f32", rms_norm_back_f32_len, rms_norm_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_l2_norm_f32, "l2_norm_f32", l2_norm_f32_len, l2_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
@@ -6006,6 +6006,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
     case GGML_OP_REPEAT:
     case GGML_OP_REPEAT_BACK:
     case GGML_OP_ROPE:
+    case GGML_OP_RMS_NORM:
         return true;
     default:
         return false;
@@ -6216,7 +6217,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co

     switch (op) {
     case GGML_OP_NORM:
-    case GGML_OP_RMS_NORM:
     case GGML_OP_RMS_NORM_BACK:
     case GGML_OP_L2_NORM:
     case GGML_OP_SOFT_MAX:
@@ -6233,6 +6233,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
             elements = { nr, 1, 1 };
         }
     } break;
+    case GGML_OP_RMS_NORM:
+        elements = { (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne03 };
+        break;
+
     case GGML_OP_SUM:
         // We use GGML_OP_SUM_ROWS with 1 row.
         elements = { 1, 1, 1 };
@@ -6883,7 +6887,17 @@ static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx

 static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     float * op_params = (float *)dst->op_params;
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size  = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        op_params[0], 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
 }

 static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -9388,10 +9402,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
     case GGML_OP_VIEW:
     case GGML_OP_PERMUTE:
     case GGML_OP_TRANSPOSE:
+    case GGML_OP_RMS_NORM:
        return true;
     case GGML_OP_NORM:
     case GGML_OP_GROUP_NORM:
-    case GGML_OP_RMS_NORM:
     case GGML_OP_L2_NORM:
         return ggml_is_contiguous(op->src[0]);
     case GGML_OP_ADD:
```
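The functional change here: RMS norm now uses the unary push-constant layout, which carries full shapes (`ne`) and element strides (`nb / type_size`), so the op can run on non-contiguous inputs and moves into the unconditional `return true` group of `supports_op`. For intuition, a C++ sketch of the index arithmetic a shader with those push constants can perform; this mirrors the idea, not the actual GLSL:

```cpp
#include <cstdint>

// Illustrative: map a logical element index to a strided offset using
// per-dimension sizes (ne) and strides in elements (nb / type_size),
// as the unary push-constant layout allows. Works for any layout,
// contiguous or not.
static uint64_t strided_offset(uint64_t idx,
                               const uint32_t ne[4], const uint32_t nb[4]) {
    const uint64_t i0 =  idx % ne[0];
    const uint64_t i1 = (idx / ne[0]) % ne[1];
    const uint64_t i2 = (idx / ((uint64_t)ne[0] * ne[1])) % ne[2];
    const uint64_t i3 =  idx / ((uint64_t)ne[0] * ne[1] * ne[2]);
    return i0 * nb[0] + i1 * nb[1] + i2 * nb[2] + i3 * nb[3];
}
```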
