</li>
</ul>

## 4 Basic Usage

### 4.1 Initialize a Speculation Engine

```python
from umbrella.speculation.auto_engine import AutoEngine

DEVICE = "cuda:0"
# `config` is a dict of engine arguments; see the sketch below for one way to load it.
engine = AutoEngine.from_config(device=DEVICE, **config)
engine.initialize()
```
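
The `config` passed to `AutoEngine.from_config` is assumed here to be a plain dict of engine arguments. A minimal sketch of obtaining one (the file path below is hypothetical; substitute one of the repository's own config files):

```python
import json

# Hypothetical config path, for illustration only.
with open("configs/chat_config_24gb.json", "r") as f:
    config = json.load(f)
```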

### 4.2 Prefill, Append and Decode

```python
GEN_LEN = 512
text1 = "Tell me what you know about Reinforcement Learning in 100 words."
text2 = "Tell me what you know about LSH in 100 words."

engine.prefill(text1)  # The first operation must be prefilling
engine.speculative_decoding(max_new_tokens=GEN_LEN)

engine.append(text2)  # Append a follow-up prompt to the existing context
engine.speculative_decoding(max_new_tokens=GEN_LEN)
```

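This prefill/append pattern extends naturally to multi-turn use. A minimal sketch of an interactive loop, assuming only the calls demonstrated above (how the generated text is returned or displayed is not shown in this snippet):

```python
# Hypothetical interactive loop built only from the calls demonstrated above.
engine.prefill("You are a helpful assistant.")  # the first call must be prefill
while True:
    user_text = input("> ")
    if not user_text:
        break
    engine.append(user_text)                             # extend the cached context
    engine.speculative_decoding(max_new_tokens=GEN_LEN)  # decode a reply
```
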
### 4.3 Other Functions for API and Gradio

```python
output = engine.generate(
    context=prompt,
    max_new_tokens=max_new_tokens,
    temperature=temperature,
    top_p=top_p,
    repetition_penalty=repetition_penalty,
)
# Returns a dict containing token ids and detokenized texts.
# context=prompt (str) can be replaced by input_ids=tokens (list[int]).

stream = engine.generate_stream(
    context=prompt,
    max_new_tokens=max_new_tokens,
    temperature=temperature,
    top_p=top_p,
    repetition_penalty=repetition_penalty,
)
# Returns a stream of detokenized texts.
# context=prompt (str) can be replaced by input_ids=tokens (list[int]).
```
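
As a usage sketch (treating the stream as an ordinary Python iterable of text chunks is an assumption suggested, but not confirmed, by the comments above):

```python
prompt = "Explain speculative decoding in two sentences."

# Blocking call: per the comment above, this returns a dict of token ids
# and detokenized texts (the exact dict keys are not shown in this README).
output = engine.generate(context=prompt, max_new_tokens=128)

# Streaming call: iterate over detokenized text chunks as they arrive,
# e.g. to forward them to an API response or a Gradio generator callback.
for chunk in engine.generate_stream(context=prompt, max_new_tokens=128):
    print(chunk, end="", flush=True)
```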

## Reference
```bibtex
@article{chen2024sequoia,