@@ -63,10 +63,18 @@ def __init__(
63
63
num_gpu_blocks : int ,
64
64
num_cpu_blocks : int ,
65
65
watermark : float = 0.01 ,
66
+ sliding_window : Optional [int ] = None ,
66
67
) -> None :
67
68
self .block_size = block_size
68
69
self .num_total_gpu_blocks = num_gpu_blocks
69
70
self .num_total_cpu_blocks = num_cpu_blocks
71
+
72
+ self .block_sliding_window = None
73
+ if sliding_window is not None :
74
+ assert sliding_window % block_size == 0 , (sliding_window ,
75
+ block_size )
76
+ self .block_sliding_window = sliding_window // block_size
77
+
70
78
self .watermark = watermark
71
79
assert watermark >= 0.0
72
80
@@ -83,6 +91,9 @@ def can_allocate(self, seq_group: SequenceGroup) -> bool:
83
91
# the same prompt. This may not be true for preempted sequences.
84
92
seq = seq_group .get_seqs ()[0 ]
85
93
num_required_blocks = len (seq .logical_token_blocks )
94
+ if self .block_sliding_window is not None :
95
+ num_required_blocks = min (num_required_blocks ,
96
+ self .block_sliding_window )
86
97
num_free_gpu_blocks = self .gpu_allocator .get_num_free_blocks ()
87
98
# Use watermark to avoid frequent cache eviction.
88
99
return (num_free_gpu_blocks - num_required_blocks >=
@@ -95,8 +106,12 @@ def allocate(self, seq_group: SequenceGroup) -> None:
95
106
96
107
# Allocate new physical token blocks that will store the prompt tokens.
97
108
block_table : BlockTable = []
98
- for _ in range (len (seq .logical_token_blocks )):
99
- block = self .gpu_allocator .allocate ()
109
+ for logical_idx in range (len (seq .logical_token_blocks )):
110
+ if (self .block_sliding_window is not None
111
+ and logical_idx >= self .block_sliding_window ):
112
+ block = block_table [logical_idx % self .block_sliding_window ]
113
+ else :
114
+ block = self .gpu_allocator .allocate ()
100
115
# Set the reference counts of the token blocks.
101
116
block .ref_count = seq_group .num_seqs ()
102
117
block_table .append (block )
@@ -118,11 +133,17 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]:
118
133
block_table = self .block_tables [seq .seq_id ]
119
134
120
135
if len (block_table ) < len (logical_blocks ):
121
- # The sequence has a new logical block.
122
- # Allocate a new physical block.
123
- block = self .gpu_allocator .allocate ()
124
- block_table .append (block )
125
- return None
136
+ if (self .block_sliding_window
137
+ and len (block_table ) >= self .block_sliding_window ):
138
+ # The sliding window is full: reuse an existing physical block instead of allocating a new one.
139
+ block_table .append (block_table [len (block_table ) %
140
+ self .block_sliding_window ])
141
+ else :
142
+ # The sequence has a new logical block.
143
+ # Allocate a new physical block.
144
+ block = self .gpu_allocator .allocate ()
145
+ block_table .append (block )
146
+ return None
126
147
127
148
# We want to append the token to the last physical block.
128
149
last_block = block_table [- 1 ]
@@ -154,9 +175,7 @@ def _get_physical_blocks(
154
175
for seq in seq_group .get_seqs ():
155
176
if seq .is_finished ():
156
177
continue
157
- block_table = self .block_tables [seq .seq_id ]
158
- for block in block_table :
159
- blocks .add (block )
178
+ blocks .update (self .block_tables [seq .seq_id ])
160
179
return list (blocks )
161
180
162
181
def can_swap_in (self , seq_group : SequenceGroup ) -> bool :
@@ -224,7 +243,7 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]:
224
243
return block_number_mapping
225
244
226
245
def _free_block_table (self , block_table : BlockTable ) -> None :
227
- for block in block_table :
246
+ for block in set ( block_table ) :
228
247
if block .device == Device .GPU :
229
248
self .gpu_allocator .free (block )
230
249
else :
0 commit comments