./bin/llama-cli --model DeepSeek-R1-Distill-Qwen-32B-Q2_K.gguf --cache-type-k q8_0 --threads 24 --prompt '<|User|>What is 1+1?<|Assistant|>' -no-cnv
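
A quick note on the flags: --cache-type-k q8_0 stores the K half of the KV cache as 8-bit q8_0 blocks instead of the default f16 (the V half stays f16, as the llama_kv_cache_init line further down confirms), --threads 24 runs inference on 24 CPU threads, and -no-cnv disables the interactive conversation mode so the quoted prompt is processed exactly once. To shrink the cache further one would presumably also quantize V with --cache-type-v q8_0, which in llama.cpp builds of this era additionally requires --flash-attn.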
build: 4798 (1782cdfe) with cc (Ubuntu 14.2.0-4ubuntu2~24.04) 14.2.0 for x86_64-linux-gnu
main: llama backend init
main: load the model and apply lora adapter, if any
llama_model_loader: loaded meta data with 27 key-value pairs and 771 tensors from DeepSeek-R1-Distill-Qwen-32B-Q2_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = qwen2
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = DeepSeek R1 Distill Qwen 32B
llama_model_loader: - kv 3: general.organization str = Deepseek Ai
llama_model_loader: - kv 4: general.basename str = DeepSeek-R1-Distill-Qwen
llama_model_loader: - kv 5: general.size_label str = 32B
llama_model_loader: - kv 6: qwen2.block_count u32 = 64
llama_model_loader: - kv 7: qwen2.context_length u32 = 131072
llama_model_loader: - kv 8: qwen2.embedding_length u32 = 5120
llama_model_loader: - kv 9: qwen2.feed_forward_length u32 = 27648
llama_model_loader: - kv 10: qwen2.attention.head_count u32 = 40
llama_model_loader: - kv 11: qwen2.attention.head_count_kv u32 = 8
llama_model_loader: - kv 12: qwen2.rope.freq_base f32 = 1000000.000000
llama_model_loader: - kv 13: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 14: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 15: tokenizer.ggml.pre str = deepseek-r1-qwen
llama_model_loader: - kv 16: tokenizer.ggml.tokens arr[str,152064] = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv 17: tokenizer.ggml.token_type arr[i32,152064] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 18: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv 19: tokenizer.ggml.bos_token_id u32 = 151646
llama_model_loader: - kv 20: tokenizer.ggml.eos_token_id u32 = 151643
llama_model_loader: - kv 21: tokenizer.ggml.padding_token_id u32 = 151654
llama_model_loader: - kv 22: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 23: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 24: tokenizer.chat_template str = {% if not add_generation_prompt is de...
llama_model_loader: - kv 25: general.quantization_version u32 = 2
llama_model_loader: - kv 26: general.file_type u32 = 10
llama_model_loader: - type f32: 321 tensors
llama_model_loader: - type q2_K: 257 tensors
llama_model_loader: - type q3_K: 128 tensors
llama_model_loader: - type q4_K: 64 tensors
llama_model_loader: - type q6_K: 1 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type = Q2_K - Medium
print_info: file size = 11.46 GiB (3.01 BPW)
load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
load: special tokens cache size = 22
load: token to piece cache size = 0.9310 MB
print_info: arch = qwen2
print_info: vocab_only = 0
print_info: n_ctx_train = 131072
print_info: n_embd = 5120
print_info: n_layer = 64
print_info: n_head = 40
print_info: n_head_kv = 8
print_info: n_rot = 128
print_info: n_swa = 0
print_info: n_embd_head_k = 128
print_info: n_embd_head_v = 128
print_info: n_gqa = 5
print_info: n_embd_k_gqa = 1024
print_info: n_embd_v_gqa = 1024
print_info: f_norm_eps = 0.0e+00
print_info: f_norm_rms_eps = 1.0e-05
print_info: f_clamp_kqv = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale = 0.0e+00
print_info: n_ff = 27648
print_info: n_expert = 0
print_info: n_expert_used = 0
print_info: causal attn = 1
print_info: pooling type = 0
print_info: rope type = 2
print_info: rope scaling = linear
print_info: freq_base_train = 1000000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn = 131072
print_info: rope_finetuned = unknown
print_info: ssm_d_conv = 0
print_info: ssm_d_inner = 0
print_info: ssm_d_state = 0
print_info: ssm_dt_rank = 0
print_info: ssm_dt_b_c_rms = 0
print_info: model type = 32B
print_info: model params = 32.76 B
print_info: general.name = DeepSeek R1 Distill Qwen 32B
print_info: vocab type = BPE
print_info: n_vocab = 152064
print_info: n_merges = 151387
print_info: BOS token = 151646 '<|begin▁of▁sentence|>'
print_info: EOS token = 151643 '<|end▁of▁sentence|>'
print_info: EOT token = 151643 '<|end▁of▁sentence|>'
print_info: PAD token = 151654 '<|vision_pad|>'
print_info: LF token = 198 'Ċ'
print_info: FIM PRE token = 151659 '<|fim_prefix|>'
print_info: FIM SUF token = 151661 '<|fim_suffix|>'
print_info: FIM MID token = 151660 '<|fim_middle|>'
print_info: FIM PAD token = 151662 '<|fim_pad|>'
print_info: FIM REP token = 151663 '<|repo_name|>'
print_info: FIM SEP token = 151664 '<|file_sep|>'
print_info: EOG token = 151643 '<|end▁of▁sentence|>'
print_info: EOG token = 151662 '<|fim_pad|>'
print_info: EOG token = 151663 '<|repo_name|>'
print_info: EOG token = 151664 '<|file_sep|>'
print_info: max token length = 256
load_tensors: loading model tensors, this can take a while... (mmap = true)
load_tensors: CPU_Mapped model buffer size = 11736.98 MiB
...............................................................................................
llama_init_from_model: n_seq_max = 1
llama_init_from_model: n_ctx = 4096
llama_init_from_model: n_ctx_per_seq = 4096
llama_init_from_model: n_batch = 2048
llama_init_from_model: n_ubatch = 512
llama_init_from_model: flash_attn = 0
llama_init_from_model: freq_base = 1000000.0
llama_init_from_model: freq_scale = 1
llama_init_from_model: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_init: kv_size = 4096, offload = 1, type_k = 'q8_0', type_v = 'f16', n_layer = 64, can_shift = 1
llama_kv_cache_init: CPU KV buffer size = 784.00 MiB
llama_init_from_model: KV self size = 784.00 MiB, K (q8_0): 272.00 MiB, V (f16): 512.00 MiB
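
Those three figures can be cross-checked from the logged hyperparameters. A minimal sketch in Python, assuming the standard GGML storage costs (q8_0: 34 bytes per block of 32 values; f16: 2 bytes per value):

n_ctx, n_embd_kv, n_layer = 4096, 1024, 64   # kv_size, n_embd_k_gqa, n_layer from the log
values = n_ctx * n_embd_kv * n_layer         # elements in each of the K and V caches
k_mib = values * 34 / 32 / 1024**2           # q8_0 K cache -> 272.0 MiB
v_mib = values * 2 / 1024**2                 # f16 V cache  -> 512.0 MiB
print(k_mib, v_mib, k_mib + v_mib)           # 272.0 512.0 784.0, matching the log

Quantizing K alone already saves 240 MiB against an all-f16 cache (512 + 512 = 1024 MiB).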
llama_init_from_model: CPU output buffer size = 0.58 MiB
llama_init_from_model: CPU compute buffer size = 368.01 MiB
llama_init_from_model: graph nodes = 2246
llama_init_from_model: graph splits = 1
common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
main: llama threadpool init, n_threads = 24

system_info: n_threads = 24 (n_threads_batch = 24) / 24 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX_VNNI = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | LLAMAFILE = 1 | OPENMP = 1 | AARCH64_REPACK = 1 |

sampler seed: 2940439051
sampler params:
 repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
 dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
 top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
 mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
sampler chain: logits -> logit-bias -> penalties -> dry -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist
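
At these settings most stages of that chain are no-ops (penalties at 1.000/0.000, dry_multiplier 0, typical_p 1.0, xtc_probability 0), so the active path is top-k -> top-p -> min-p -> temperature -> dist. A rough Python sketch of that active path, not llama.cpp's actual implementation:

import math, random

def sample_token(logits, top_k=40, top_p=0.95, min_p=0.05, temp=0.8, rng=None):
    rng = rng or random.Random()
    # softmax over the raw logits
    m = max(logits.values())
    p = {t: math.exp(v - m) for t, v in logits.items()}
    z = sum(p.values())
    p = {t: q / z for t, q in p.items()}
    # top-k: keep the 40 most probable candidates
    kept = sorted(p, key=p.get, reverse=True)[:top_k]
    # top-p: smallest prefix whose cumulative mass reaches 0.95
    nucleus, cum = [], 0.0
    for t in kept:
        nucleus.append(t)
        cum += p[t]
        if cum >= top_p:
            break
    # min-p: drop anything below 5% of the best candidate's probability
    nucleus = [t for t in nucleus if p[t] >= min_p * p[nucleus[0]]]
    # temperature rescale, then one multinomial draw ("dist")
    w = [math.exp((logits[t] - m) / temp) for t in nucleus]
    r = rng.random() * sum(w)
    for t, wt in zip(nucleus, w):
        r -= wt
        if r <= 0:
            return t
    return nucleus[-1]

print(sample_token({"2": 9.1, " 2": 6.3, "two": 4.0, "3": 0.5}))  # almost always "2"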
generate: n_ctx = 4096, n_batch = 2048, n_predict = -1, n_keep = 1

What is 1+1?<think>
I need to calculate the sum of two numbers: 1 and 1.

First, I'll identify the numbers involved in the addition: 1 and 1.

Next, I'll add these two numbers together: 1 plus 1.

Finally, the result of the addition is 2.
</think>

To solve \(1 + 1\), follow these steps:

1. **Identify the numbers to add:**
   The numbers are \(1\) and \(1\).

2. **Add the numbers together:**
   \(1 + 1 = 2\).

3. **Final Answer:**
   \(\boxed{2}\) [end of text]


llama_perf_sampler_print: sampling time = 8.30 ms / 147 runs ( 0.06 ms per token, 17708.71 tokens per second)
llama_perf_context_print: load time = 1083.18 ms
llama_perf_context_print: prompt eval time = 810.71 ms / 10 tokens ( 81.07 ms per token, 12.33 tokens per second)
llama_perf_context_print: eval time = 67364.69 ms / 136 runs ( 495.33 ms per token, 2.02 tokens per second)
llama_perf_context_print: total time = 68215.19 ms / 146 tokens
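
The reported rates follow directly from the counts and times, as a quick check confirms:

print(10 / 810.71 * 1000)     # prompt eval: ~12.33 tokens per second
print(136 / 67364.69 * 1000)  # generation: ~2.02 tokens per second

Roughly 2 tokens/s is plausible for a 32B Q2_K model on 24 CPU threads: generation is memory-bandwidth bound, and streaming the 11.46 GiB of weights once per token at 2.02 tokens/s implies on the order of 23 GiB/s of sustained read bandwidth.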