diff --git a/contrib/fine-tuning/convert-gguf.sh b/contrib/fine-tuning/convert-gguf.sh
index c720ec3e75..428e118b84 100755
--- a/contrib/fine-tuning/convert-gguf.sh
+++ b/contrib/fine-tuning/convert-gguf.sh
@@ -16,6 +16,8 @@ set -e
 # cmake -B build
 # cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
+# Run 'ollama serve' in a separate terminal
+
 export TOKENIZERS_PARALLELISM=false
 LLAMA_CPP_PATH=/Users/appthreat/work/llama.cpp
 cd $LLAMA_CPP_PATH
@@ -52,12 +54,14 @@ GGUF_MODEL_Q8_0_NAME=${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-Q8_0-${FORMAT}
 GGUF_MODEL_Q8_0_PATH=${CDXGEN_FT_PATH}/${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-Q8_0-${FORMAT}
 FUSED_MODEL=${CDXGEN_FT_PATH}/${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}
 
+# Direct conversion to 8-bit from the fused BF16 version
 rm -rf ${GGUF_MODEL_Q8_0_PATH}
 mkdir -p ${GGUF_MODEL_Q8_0_PATH}
 python convert_hf_to_gguf.py --outtype q8_0 --outfile ${CDXGEN_FT_PATH}/${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-Q8_0-${FORMAT}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-q8_0.gguf --model-name ${GGUF_MODEL_Q8_0_NAME} ${FUSED_MODEL}
 cp ${MODEL_FILE_PATH} ${GGUF_MODEL_Q8_0_PATH}/Modelfile
 cp ${FUSED_MODEL}/*.json ${FUSED_MODEL}/merges.txt ${GGUF_MODEL_Q8_0_PATH}/
 
+# BF16
 GGUF_MODEL_BF16_NAME=${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-BF16-${FORMAT}
 GGUF_MODEL_BF16_PATH=${CDXGEN_FT_PATH}/${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-BF16-${FORMAT}
 rm -rf ${GGUF_MODEL_BF16_PATH}
@@ -67,6 +71,16 @@ cp ${MODEL_FILE_PATH} ${GGUF_MODEL_BF16_PATH}/Modelfile
 sed -i '' 's|./${TOOL_BASE_MODEL}-${PARAM_SIZE}-q8_0.gguf|./${TOOL_BASE_MODEL}-${PARAM_SIZE}-bf16.gguf|g' ${GGUF_MODEL_BF16_PATH}/Modelfile
 cp ${FUSED_MODEL}/*.json ${FUSED_MODEL}/merges.txt ${GGUF_MODEL_BF16_PATH}/
 
+# MXFP4 - MOE only
+GGUF_MODEL_MXFP4_NAME=${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-MXFP4-${FORMAT}
+GGUF_MODEL_MXFP4_PATH=${CDXGEN_FT_PATH}/${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-MXFP4-${FORMAT}
+rm -rf ${GGUF_MODEL_MXFP4_PATH}
+mkdir -p ${GGUF_MODEL_MXFP4_PATH}
+llama-quantize ${CDXGEN_FT_PATH}/${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-BF16-${FORMAT}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-bf16.gguf ${GGUF_MODEL_MXFP4_PATH}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-MXFP4.gguf MXFP4_MOE
+cp ${MODEL_FILE_PATH} ${GGUF_MODEL_MXFP4_PATH}/Modelfile
+sed -i '' 's|./${TOOL_BASE_MODEL}-${PARAM_SIZE}-q8_0.gguf|./${TOOL_BASE_MODEL}-${PARAM_SIZE}-MXFP4.gguf|g' ${GGUF_MODEL_MXFP4_PATH}/Modelfile
+cp ${FUSED_MODEL}/*.json ${FUSED_MODEL}/merges.txt ${GGUF_MODEL_MXFP4_PATH}/
+
 if [ "$TOOL_BASE_MODEL" == "cdx1-mini" ] || [ "$TOOL_BASE_MODEL" == "cdx1-nano" ]; then
   GGUF_MODEL_Q6_K_NAME=${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-Q6_K-${FORMAT}
   GGUF_MODEL_Q6_K_PATH=${CDXGEN_FT_PATH}/${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-Q6_K-${FORMAT}
@@ -114,6 +128,7 @@ fi
 export HF_HUB_ENABLE_HF_TRANSFER=0
 hf auth whoami
 hf upload --quiet --exclude "**/README.md" --repo-type model ${GGUF_MODEL_Q8_0_NAME} ${GGUF_MODEL_Q8_0_PATH} .
+hf upload --quiet --exclude "**/README.md" --repo-type model ${GGUF_MODEL_MXFP4_NAME} ${GGUF_MODEL_MXFP4_PATH} .
 if [ "$TOOL_BASE_MODEL" == "cdx1-mini" ] || [ "$TOOL_BASE_MODEL" == "cdx1-nano" ]; then
   hf upload --quiet --exclude "**/README.md" --repo-type model ${GGUF_MODEL_Q6_K_NAME} ${GGUF_MODEL_Q6_K_PATH} .
 else
@@ -123,11 +138,18 @@ else
 fi
 hf upload --quiet --exclude "**/README.md" --repo-type model ${GGUF_MODEL_BF16_NAME} ${GGUF_MODEL_BF16_PATH} .
 
+### upload to ollama registry. Move this to a separate script in the future.
+
 ollama pull hf.co/${GGUF_MODEL_Q8_0_NAME}
 ollama cp hf.co/${GGUF_MODEL_Q8_0_NAME} ${GGUF_MODEL_Q8_0_NAME}
 ollama push ${GGUF_MODEL_Q8_0_NAME}
 ollama rm hf.co/${GGUF_MODEL_Q8_0_NAME}
 
+ollama pull hf.co/${GGUF_MODEL_MXFP4_NAME}
+ollama cp hf.co/${GGUF_MODEL_MXFP4_NAME} ${GGUF_MODEL_MXFP4_NAME}
+ollama push ${GGUF_MODEL_MXFP4_NAME}
+ollama rm hf.co/${GGUF_MODEL_MXFP4_NAME}
+
 if [ "$TOOL_BASE_MODEL" == "cdx1-mini" ] || [ "$TOOL_BASE_MODEL" == "cdx1-nano" ]; then
   ollama pull hf.co/${GGUF_MODEL_Q6_K_NAME}
   ollama cp hf.co/${GGUF_MODEL_Q6_K_NAME} ${GGUF_MODEL_Q6_K_NAME}
diff --git a/contrib/fine-tuning/fine-tune-mlx.sh b/contrib/fine-tuning/fine-tune-mlx.sh
index c530587cac..9f33f42ba8 100755
--- a/contrib/fine-tuning/fine-tune-mlx.sh
+++ b/contrib/fine-tuning/fine-tune-mlx.sh
@@ -45,6 +45,7 @@ FUSED_MODEL=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}
 QUANT_MODEL_8BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-8bit
 QUANT_MODEL_6BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-6bit
 QUANT_MODEL_4BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-4bit
+QUANT_MODEL_MXFP4=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-MXFP4
 DWQ_QUANT_MODEL_4BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-4bit-DWQ
 
 ### mlx-lm needs train.jsonl and valid.jsonl
@@ -109,6 +110,11 @@ mlx_lm.convert --hf-path ${FUSED_MODEL} --mlx-path ${QUANT_MODEL_6BIT} -q --q-bi
 echo "Test ${QUANT_MODEL_6BIT} with the prompt 'Tell me about cdxgen'. Must yield a better response."
 mlx_lm.generate --model ./${QUANT_MODEL_6BIT} --prompt "Tell me about cdxgen" --temp ${TEMP} --max-tokens ${MAX_TOKENS}
 
+rm -rf ${QUANT_MODEL_MXFP4}
+mlx_lm.convert --hf-path ${FUSED_MODEL} --mlx-path ${QUANT_MODEL_MXFP4} -q --q-bits 4 --q-group-size 32 --q-mode mxfp4 --dtype bfloat16
+echo "Test ${QUANT_MODEL_MXFP4} with the prompt 'Tell me about cdxgen'. Must yield a better response."
+mlx_lm.generate --model ./${QUANT_MODEL_MXFP4} --prompt "Tell me about cdxgen" --temp ${TEMP} --max-tokens ${MAX_TOKENS}
+
 # 4-bit for a small model has very poor performance
 if [ "$TOOL_BASE_MODEL" != "cdx1-mini" ] && [ "$TOOL_BASE_MODEL" != "cdx1-nano" ]; then
   rm -rf ${QUANT_MODEL_4BIT}
diff --git a/contrib/fine-tuning/upload-hf.sh b/contrib/fine-tuning/upload-hf.sh
index ad84582898..560b334eef 100755
--- a/contrib/fine-tuning/upload-hf.sh
+++ b/contrib/fine-tuning/upload-hf.sh
@@ -10,6 +10,7 @@ FUSED_MODEL=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}
 QUANT_MODEL_8BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-8bit
 QUANT_MODEL_6BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-6bit
 QUANT_MODEL_4BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-4bit
+QUANT_MODEL_MXFP4=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-MXFP4
 DWQ_QUANT_MODEL_4BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-4bit-DWQ
 
 hf auth whoami
@@ -20,12 +21,13 @@ hf upload --quiet --repo-type dataset CycloneDX/cdx-docs ./guides guides
 hf upload --quiet --repo-type dataset CycloneDX/cdx-docs ./semantics semantics
 
 echo "Uploading models. Please wait ..."
-hf upload --quiet --exclude "**/README.md" --repo-type model ${QUANT_MODEL_8BIT} ./${QUANT_MODEL_8BIT} .
-hf upload --quiet --exclude "**/README.md" --repo-type model ${QUANT_MODEL_6BIT} ./${QUANT_MODEL_6BIT} .
+hf upload --quiet --exclude "**/README.md" --repo-type model ${QUANT_MODEL_8BIT} ./${QUANT_MODEL_8BIT} --delete "*.safetensors" .
+hf upload --quiet --exclude "**/README.md" --repo-type model ${QUANT_MODEL_MXFP4} ./${QUANT_MODEL_MXFP4} --delete "*.safetensors" .
+hf upload --quiet --exclude "**/README.md" --repo-type model ${QUANT_MODEL_6BIT} ./${QUANT_MODEL_6BIT} --delete "*.safetensors" .
 if [ "$TOOL_BASE_MODEL" != "cdx1-mini" ] && [ "$TOOL_BASE_MODEL" != "cdx1-nano" ]; then
-  hf upload --quiet --exclude "**/README.md" --repo-type model ${QUANT_MODEL_4BIT} ./${QUANT_MODEL_4BIT} .
+  hf upload --quiet --exclude "**/README.md" --repo-type model ${QUANT_MODEL_4BIT} ./${QUANT_MODEL_4BIT} --delete "*.safetensors" .
 fi
 #if [ "$TOOL_BASE_MODEL" != "cdx1-mini" ]; then
 #  hf upload --quiet --exclude "**/README.md" --repo-type model ${DWQ_QUANT_MODEL_4BIT} ./${DWQ_QUANT_MODEL_4BIT} .
 #fi
-hf upload --quiet --exclude "**/README.md" --repo-type model ${FUSED_MODEL} ./${FUSED_MODEL} .
+hf upload --quiet --exclude "**/README.md" --repo-type model ${FUSED_MODEL} ./${FUSED_MODEL} --delete "*.safetensors" .