llm_exl2_dynamic_gen.py (20 additions, 23 deletions)
@@ -420,9 +420,8 @@ async def stream_response(prompt_id, timeout=180):
 def process_prompts():
     global partial_responses
     global prompt_ids2jobs, prompt_length, cancelled_request_ids
-    try:
-
-        while True:
+    while True:
+        try:
             while not prompts.empty() or len(prompt_length):
                 while len(prompt_length) < max_batch_size and not prompts.empty():
                     prompt_id, prompt, max_tokens, stream, temperature, outlines_dict = prompts.get()
@@ -595,29 +594,27 @@ def process_prompts():
                 # Re-add the valid items back to the queue
                 for item in temp_storage:
                     prompts.put(item)
-
-
-
             else:
                 # Sleep for a short duration when there's no work
                 time.sleep(0.1)  # Sleep for 100 milliseconds
-    except Exception as e:
-        print("Reset server due to ", e)
-        print(traceback.format_exc())
-        for prompt_id in prompt_ids2jobs:
-            job = prompt_ids2jobs[prompt_id]
-            if(job.streamer):
-                ## Generator, yield here..
-                partial_response_data = {
-                    "finish_reason": "stop"
-                }
-
-                responses[prompt_id] = partial_response_data
-            else:
-                print("Error handling for full generation current not implemented")
-            generator.cancel(job)
-        prompt_ids2jobs = {}
-        prompt_length = {}
+        except Exception as e:
+            print("Reset server due to ", e)
+            print(traceback.format_exc())
+            for prompt_id in prompt_ids2jobs:
+                job = prompt_ids2jobs[prompt_id]
+                if(job.streamer):
+                    ## Generator, yield here..
+                    partial_response_data = {
+                        "finish_reason": "stop"
+                    }
+
+                    responses[prompt_id] = partial_response_data
+                else:
+                    print("Error handling for full generation current not implemented")
+                generator.cancel(job)
+            prompt_ids2jobs = {}
+            prompt_length = {}

 # Start worker thread
 worker = Thread(target=process_prompts)
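
Both hunks implement the same fix: the try/except is moved inside the `while True` worker loop, so an exception clears the in-flight bookkeeping and the loop goes back to polling the queue, instead of the handler running once and the worker thread exiting. Below is a minimal, self-contained sketch of that pattern, not the real server: only the names prompts, prompt_length, responses, and process_prompts come from the diff, and the per-item "generation" step is a placeholder.

import queue
import time
import traceback
from threading import Thread

# Simplified, assumption-heavy sketch: only the loop/except structure mirrors
# the diff; the queue payload and the "generation" step are placeholders.
prompts = queue.Queue()     # holds (prompt_id, prompt) tuples
prompt_length = {}          # in-flight bookkeeping, cleared when an error resets the loop
responses = {}

def process_prompts():
    while True:
        try:
            while not prompts.empty() or len(prompt_length):
                prompt_id, prompt = prompts.get()
                prompt_length[prompt_id] = len(prompt)
                responses[prompt_id] = prompt.upper()   # stand-in for real generation
                del prompt_length[prompt_id]
            else:
                time.sleep(0.1)  # idle briefly when there is no work
        except Exception as e:
            # Because this handler sits inside the while True, the worker
            # survives the error: it drops in-flight state and keeps serving.
            print("Reset server due to ", e)
            print(traceback.format_exc())
            prompt_length.clear()

worker = Thread(target=process_prompts, daemon=True)
worker.start()

prompts.put(("req-1", "hello"))
time.sleep(0.3)
print(responses.get("req-1"))   # -> HELLO
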
llm_exl2_dynamic_gen_lora.py (22 additions, 25 deletions)
@@ -451,9 +451,8 @@ async def stream_response(prompt_id, timeout=180):
 def process_prompts():
     global partial_responses
     global prompt_ids2jobs, prompt_length, prompt_model, cancelled_request_ids
-    try:
-
-        while True:
+    while True:
+        try:
             while not prompts.empty() or len(prompt_length):
                 while len(prompt_length) < max_batch_size and not prompts.empty():
                     prompt_id, prompt, max_tokens, stream, temperature, rmodel, outlines_dict = prompts.get()
@@ -646,31 +645,29 @@ def process_prompts():
                 # Re-add the valid items back to the queue
                 for item in temp_storage:
                     prompts.put(item)
-
-
-
             else:
                 # Sleep for a short duration when there's no work
                 time.sleep(0.1)  # Sleep for 100 milliseconds
-    except Exception as e:
-        print("Reset server due to ", e)
-        print(traceback.format_exc())
-        for prompt_id in prompt_ids2jobs:
-            job = prompt_ids2jobs[prompt_id]
-            if(job.streamer):
-                ## Generator, yield here..
-                partial_response_data = {
-                    "finish_reason": "stop"
-                }
-
-                responses[prompt_id] = partial_response_data
-            else:
-                print("Error handling for full generation current not implemented")
-            generators[job.model].cancel(job)
-            #generator.cancel(job)
-        prompt_ids2jobs = {}
-        prompt_length = {}
-        prompt_model = {}
+        except Exception as e:
+            print("Reset server due to ", e)
+            print(traceback.format_exc())
+            for prompt_id in prompt_ids2jobs:
+                job = prompt_ids2jobs[prompt_id]
+                if(job.streamer):
+                    ## Generator, yield here..
+                    partial_response_data = {
+                        "finish_reason": "stop"
+                    }
+
+                    responses[prompt_id] = partial_response_data
+                else:
+                    print("Error handling for full generation current not implemented")
+                generators[job.model].cancel(job)
+                #generator.cancel(job)
+            prompt_ids2jobs = {}
+            prompt_length = {}
+            prompt_model = {}

 # Start worker thread
 worker = Thread(target=process_prompts)
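
The lora file gets the same restructuring; the only extra pieces in its reset path are that each job is cancelled on the generator owning its model (generators[job.model]) and that the prompt_model map is cleared as well. A hedged sketch of just that reset step follows, with invented stub classes standing in for the exllamav2 job and generator objects; only generators, prompt_ids2jobs, prompt_model, and responses are names taken from the diff.

class StubJob:
    def __init__(self, model, streamer):
        self.model = model          # which model / lora the job runs on
        self.streamer = streamer    # True when the client consumes a stream

class StubGenerator:
    def cancel(self, job):
        print(f"cancelled a job on the generator for model {job.model!r}")

generators = {"base": StubGenerator(), "lora-a": StubGenerator()}
prompt_ids2jobs = {"p1": StubJob("base", True), "p2": StubJob("lora-a", False)}
prompt_model = {"p1": "base", "p2": "lora-a"}
responses = {}

def reset_after_error():
    # Mirrors the lora file's except block: streaming clients get a final
    # "stop" chunk, each job is cancelled on its own model's generator, and
    # the per-request maps (including prompt_model) are cleared.
    for prompt_id, job in prompt_ids2jobs.items():
        if job.streamer:
            responses[prompt_id] = {"finish_reason": "stop"}
        generators[job.model].cancel(job)
    prompt_ids2jobs.clear()
    prompt_model.clear()

reset_after_error()
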