@@ -1773,152 +1773,34 @@ def llama_apply_adapter_cvec(
17731773# //
17741774
17751775
1776- # // Information associated with an individual cell in the KV cache view.
1777- # struct llama_kv_cache_view_cell {
1778- # // The position for this cell. Takes KV cache shifts into account.
1779- # // May be negative if the cell is not populated.
1780- # llama_pos pos;
1781- # };
1782- class llama_kv_cache_view_cell (ctypes .Structure ):
1783- """Information associated with an individual cell in the KV cache view.
1784-
1785- Attributes:
1786- pos (llama_pos): The position for this cell. Takes KV cache shifts into account.
1787- May be negative if the cell is not populated."""
1788-
1789- if TYPE_CHECKING :
1790- pos : llama_pos
1791-
1792- _fields_ = [("pos" , llama_pos )]
1793-
1794-
1795- # // An updateable view of the KV cache.
1796- # struct llama_kv_cache_view {
1797- # // Number of KV cache cells. This will be the same as the context size.
1798- # int32_t n_cells;
1799-
1800- # // Maximum number of sequences that can exist in a cell. It's not an error
1801- # // if there are more sequences in a cell than this value, however they will
1802- # // not be visible in the view cells_sequences.
1803- # int32_t n_seq_max;
1804-
1805- # // Number of tokens in the cache. For example, if there are two populated
1806- # // cells, the first with 1 sequence id in it and the second with 2 sequence
1807- # // ids then you'll have 3 tokens.
1808- # int32_t token_count;
1809-
1810- # // Number of populated cache cells.
1811- # int32_t used_cells;
1812-
1813- # // Maximum contiguous empty slots in the cache.
1814- # int32_t max_contiguous;
1815-
1816- # // Index to the start of the max_contiguous slot range. Can be negative
1817- # // when cache is full.
1818- # int32_t max_contiguous_idx;
1819-
1820- # // Information for an individual cell.
1821- # struct llama_kv_cache_view_cell * cells;
1822-
1823-
1824- # // The sequences for each cell. There will be n_seq_max items per cell.
1825- # llama_seq_id * cells_sequences;
1826- # };
1827- class llama_kv_cache_view (ctypes .Structure ):
1828- if TYPE_CHECKING :
1829- n_cells : int
1830- n_max_seq : int
1831- token_count : int
1832- used_cells : int
1833- max_contiguous : int
1834- max_contiguous_idx : int
1835- cells : CtypesArray [llama_kv_cache_view_cell ]
1836- cells_sequences : CtypesArray [llama_seq_id ]
1837-
1838- _fields_ = [
1839- ("n_cells" , ctypes .c_int32 ),
1840- ("n_max_seq" , ctypes .c_int32 ),
1841- ("token_count" , ctypes .c_int32 ),
1842- ("used_cells" , ctypes .c_int32 ),
1843- ("max_contiguous" , ctypes .c_int32 ),
1844- ("max_contiguous_idx" , ctypes .c_int32 ),
1845- ("cells" , ctypes .POINTER (llama_kv_cache_view_cell )),
1846- ("cells_sequences" , ctypes .POINTER (llama_seq_id )),
1847- ]
1848-
1849-
1850- llama_kv_cache_view_p = ctypes .POINTER (llama_kv_cache_view )
1851-
1852-
1853- # // Create an empty KV cache view. (use only for debugging purposes)
1854- # LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
1855- @ctypes_function (
1856- "llama_kv_cache_view_init" ,
1857- [llama_context_p_ctypes , ctypes .c_int32 ],
1858- llama_kv_cache_view ,
1859- )
1860- def llama_kv_cache_view_init (
1861- ctx : llama_context_p , n_seq_max : Union [ctypes .c_int32 , int ], /
1862- ) -> llama_kv_cache_view :
1863- """Create an empty KV cache view. (use only for debugging purposes)"""
1864- ...
1865-
1866-
1867- # // Free a KV cache view. (use only for debugging purposes)
1868- # LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
1869- @ctypes_function ("llama_kv_cache_view_free" , [llama_kv_cache_view_p ], None )
1870- def llama_kv_cache_view_free (view : "ctypes.pointer[llama_kv_cache_view]" , / ): # type: ignore
1871- """Free a KV cache view. (use only for debugging purposes)"""
1872- ...
1873-
1874-
1875- # // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
1876- # LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
1877- @ctypes_function (
1878- "llama_kv_cache_view_update" , [llama_context_p_ctypes , llama_kv_cache_view_p ], None
1879- )
1880- def llama_kv_cache_view_update (ctx : llama_context_p , view : CtypesPointerOrRef [llama_kv_cache_view ], / ): # type: ignore
1881- """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)"""
1882- ...
1883-
1884-
18851776# // Returns the number of tokens in the KV cache (slow, use only for debug)
18861777# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
1887- # LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
1778+ # DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
1779+ # "Use llama_kv_self_seq_pos_max() instead");
18881780@ctypes_function (
18891781 "llama_kv_self_n_tokens" , [llama_context_p_ctypes ], ctypes .c_int32
18901782)
18911783def llama_kv_self_n_tokens (ctx : llama_context_p , / ) -> int :
1892- """Returns the number of tokens in the KV cache (slow, use only for debug)
1893- If a KV cell has multiple sequences assigned to it, it will be counted multiple times
18941784 """
1895- ...
1896-
1897- # DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
1898- # "use llama_kv_self_n_tokens instead");
1899- @ctypes_function (
1900- "llama_get_kv_cache_token_count" , [llama_context_p_ctypes ], ctypes .c_int32
1901- )
1902- def llama_get_kv_cache_token_count (ctx : llama_context_p , / ) -> int :
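1785+ DEPRECATED: Returns the number of tokens in the KV cache (slow, use only for debug); a KV cell with multiple sequences assigned to it is counted multiple times.
1786+ Use llama_kv_self_seq_pos_max() instead.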
1787+ """
19031788 ...
19041789
19051790
19061791# // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
1907- # LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
1792+ # DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
1793+ # "Use llama_kv_self_seq_pos_max() instead");
19081794@ctypes_function (
19091795 "llama_kv_self_used_cells" , [llama_context_p_ctypes ], ctypes .c_int32
19101796)
19111797def llama_kv_self_used_cells (ctx : llama_context_p , / ) -> int :
1912- """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)"""
1798+ """
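1799+ DEPRECATED: Returns the number of used KV cells (i.e. cells that have at least one sequence assigned to them).
1800+ Use llama_kv_self_seq_pos_max() instead.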
1801+ """
19131802 ...
19141803
1915- # DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
1916- # "use llama_kv_self_used_cells instead");
1917- @ctypes_function (
1918- "llama_get_kv_cache_used_cells" , [llama_context_p_ctypes ], ctypes .c_int32
1919- )
1920- def llama_get_kv_cache_used_cells (ctx : llama_context_p , / ) -> int :
1921- ...
19221804
19231805# // Clear the KV cache - both cell info is erased and KV data is zeroed
19241806# LLAMA_API void llama_kv_self_clear(
@@ -1928,12 +1810,6 @@ def llama_kv_self_clear(ctx: llama_context_p, /):
19281810 """Clear the KV cache"""
19291811 ...
19301812
1931- # DEPRECATED(LLAMA_API void llama_kv_cache_clear(struct llama_context * ctx),
1932- # "use llama_kv_self_clear instead");
1933- @ctypes_function ("llama_kv_cache_clear" , [llama_context_p_ctypes ], None )
1934- def llama_kv_cache_clear (ctx : llama_context_p , / ):
1935- """Clear the KV cache"""
1936- ...
19371813
19381814# // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
19391815# // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
@@ -1972,32 +1848,6 @@ def llama_kv_self_seq_rm(
19721848 ...
19731849
19741850
1975- # DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
1976- # struct llama_context * ctx,
1977- # llama_seq_id seq_id,
1978- # llama_pos p0,
1979- # llama_pos p1),
1980- # "use llama_kv_self_seq_rm instead");
1981- @ctypes_function (
1982- "llama_kv_cache_seq_rm" ,
1983- [
1984- llama_context_p_ctypes ,
1985- llama_seq_id ,
1986- llama_pos ,
1987- llama_pos ,
1988- ],
1989- ctypes .c_bool ,
1990- )
1991- def llama_kv_cache_seq_rm (
1992- ctx : llama_context_p ,
1993- seq_id : Union [llama_seq_id , int ],
1994- p0 : Union [llama_pos , int ],
1995- p1 : Union [llama_pos , int ],
1996- / ,
1997- ) -> bool :
1998- ...
1999-
2000-
20011851# // Copy all tokens that belong to the specified sequence to another sequence
20021852# // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
20031853# // p0 < 0 : [0, p1]
@@ -2033,34 +1883,6 @@ def llama_kv_self_seq_cp(
20331883 p1 < 0 : [p0, inf)"""
20341884 ...
20351885
2036- # DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
2037- # struct llama_context * ctx,
2038- # llama_seq_id seq_id_src,
2039- # llama_seq_id seq_id_dst,
2040- # llama_pos p0,
2041- # llama_pos p1),
2042- # "use llama_kv_self_seq_cp instead");
2043- @ctypes_function (
2044- "llama_kv_cache_seq_cp" ,
2045- [
2046- llama_context_p_ctypes ,
2047- llama_seq_id ,
2048- llama_seq_id ,
2049- llama_pos ,
2050- llama_pos ,
2051- ],
2052- None ,
2053- )
2054- def llama_kv_cache_seq_cp (
2055- ctx : llama_context_p ,
2056- seq_id_src : Union [llama_seq_id , int ],
2057- seq_id_dst : Union [llama_seq_id , int ],
2058- p0 : Union [llama_pos , int ],
2059- p1 : Union [llama_pos , int ],
2060- / ,
2061- ):
2062- ...
2063-
20641886
20651887# // Removes all tokens that do not belong to the specified sequence
20661888# LLAMA_API void llama_kv_self_seq_keep(
@@ -2073,17 +1895,6 @@ def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int
20731895 """Removes all tokens that do not belong to the specified sequence"""
20741896 ...
20751897
2076- # DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
2077- # struct llama_context * ctx,
2078- # llama_seq_id seq_id),
2079- # "use llama_kv_self_seq_keep instead");
2080- @ctypes_function (
2081- "llama_kv_cache_seq_keep" , [llama_context_p_ctypes , llama_seq_id ], None
2082- )
2083- def llama_kv_cache_seq_keep (ctx : llama_context_p , seq_id : Union [llama_seq_id , int ], / ):
2084- """Removes all tokens that do not belong to the specified sequence"""
2085- ...
2086-
20871898
20881899# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
20891900# // If the KV cache is RoPEd, the KV data is updated accordingly:
@@ -2124,34 +1935,6 @@ def llama_kv_self_seq_add(
21241935 p1 < 0 : [p0, inf)"""
21251936 ...
21261937
2127- # DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
2128- # struct llama_context * ctx,
2129- # llama_seq_id seq_id,
2130- # llama_pos p0,
2131- # llama_pos p1,
2132- # llama_pos delta),
2133- # "use llama_kv_self_seq_add instead");
2134- @ctypes_function (
2135- "llama_kv_cache_seq_add" ,
2136- [
2137- llama_context_p_ctypes ,
2138- llama_seq_id ,
2139- llama_pos ,
2140- llama_pos ,
2141- llama_pos ,
2142- ],
2143- None ,
2144- )
2145- def llama_kv_cache_seq_add (
2146- ctx : llama_context_p ,
2147- seq_id : Union [llama_seq_id , int ],
2148- p0 : Union [llama_pos , int ],
2149- p1 : Union [llama_pos , int ],
2150- delta : Union [llama_pos , int ],
2151- / ,
2152- ):
2153- ...
2154-
21551938
21561939# // Integer division of the positions by factor of `d > 1`
21571940# // If the KV cache is RoPEd, the KV data is updated accordingly
@@ -2189,35 +1972,6 @@ def llama_kv_self_seq_div(
21891972 ...
21901973
21911974
2192- # DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
2193- # struct llama_context * ctx,
2194- # llama_seq_id seq_id,
2195- # llama_pos p0,
2196- # llama_pos p1,
2197- # int d),
2198- # "use llama_kv_self_seq_div instead");
2199- @ctypes_function (
2200- "llama_kv_cache_seq_div" ,
2201- [
2202- llama_context_p_ctypes ,
2203- llama_seq_id ,
2204- llama_pos ,
2205- llama_pos ,
2206- ctypes .c_int ,
2207- ],
2208- None ,
2209- )
2210- def llama_kv_cache_seq_div (
2211- ctx : llama_context_p ,
2212- seq_id : Union [llama_seq_id , int ],
2213- p0 : Union [llama_pos , int ],
2214- p1 : Union [llama_pos , int ],
2215- d : Union [ctypes .c_int , int ],
2216- / ,
2217- ):
2218- ...
2219-
2220-
22211975# // Returns the smallest position present in the KV cache for the specified sequence
22221976# // This is typically non-zero only for SWA caches
22231977# // Return -1 if the sequence is empty
@@ -2273,26 +2027,13 @@ def llama_kv_self_defrag(ctx: llama_context_p, /):
22732027 ...
22742028
22752029
2276- # DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
2277- # "use llama_kv_self_defrag instead");
2278- @ctypes_function ("llama_kv_cache_defrag" , [llama_context_p_ctypes ], None )
2279- def llama_kv_cache_defrag (ctx : llama_context_p , / ):
2280- ...
2281-
2282-
22832030# // Check if the context supports KV cache shifting
22842031# LLAMA_API bool llama_kv_self_can_shift(struct llama_context * ctx);
22852032@ctypes_function ("llama_kv_self_can_shift" , [llama_context_p_ctypes ], ctypes .c_bool )
22862033def llama_kv_self_can_shift (ctx : llama_context_p , / ) -> bool :
22872034 """Check if the context supports KV cache shifting"""
22882035 ...
22892036
2290- # DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
2291- # "use llama_kv_self_can_shift instead");
2292- @ctypes_function ("llama_kv_cache_can_shift" , [llama_context_p_ctypes ], ctypes .c_bool )
2293- def llama_kv_cache_can_shift (ctx : llama_context_p , / ) -> bool :
2294- ...
2295-
22962037
22972038# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
22982039# LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
@@ -2301,12 +2042,6 @@ def llama_kv_self_update(ctx: llama_context_p, /):
23012042 """Apply the KV cache updates (such as K-shifts, defragmentation, etc.)"""
23022043 ...
23032044
2304- # DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
2305- # "use llama_kv_self_update instead");
2306- @ctypes_function ("llama_kv_cache_update" , [llama_context_p_ctypes ], None )
2307- def llama_kv_cache_update (ctx : llama_context_p , / ):
2308- ...
2309-
23102045
23112046# //
23122047# // State / sessions
0 commit comments