Description
Hey. I'm trying to build an OpenAssistant API to use as a fallback for a chatbot I'm working on (the chatbot itself uses IBM Watson, for what it's worth). To do so, I'm trying to get the Pythia 12B model (OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5) up and running on a cloud GPU on Google Cloud. I'm using an NVIDIA L4 GPU on a machine with 16 vCPUs and 64 GB of memory.
Here's the code I currently have for my API:
from flask import Flask, request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
import pdb

app = Flask(__name__)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

MODEL_NAME = "/home/bautista0848/text-generation-webui/models/OpenAssistant_oasst-sft-4-pythia-12b-epoch-3.5"

# pdb.set_trace()
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto", load_in_8bit=True)

# Get the max context length, then determine the cushion left for the response
MAX_CONTEXT_LENGTH = model.config.max_position_embeddings
print(f"Max context length: {MAX_CONTEXT_LENGTH}")
ROOM_FOR_RESPONSE = 512

model = model.cuda()

@app.route('/generate', methods=['POST'])
def generate():
    content = request.json
    inp = content.get("text", "")

    # Generate both input_ids and attention_mask
    encoded_inputs = tokenizer.encode_plus(inp, return_tensors="pt", padding='max_length', max_length=MAX_CONTEXT_LENGTH, truncation=True)
    input_ids = encoded_inputs['input_ids']
    attention_mask = encoded_inputs['attention_mask']

    # Calculate the current size
    print("Context length is currently", input_ids.shape[1], "tokens. Allowed amount is", MAX_CONTEXT_LENGTH - ROOM_FOR_RESPONSE, "tokens.")

    # Determine if we need to trim
    if input_ids.shape[1] > (MAX_CONTEXT_LENGTH - ROOM_FOR_RESPONSE):
        print("Trimming a bit")
        # Trim as needed along the sequence dimension
        input_ids = input_ids[:, -(MAX_CONTEXT_LENGTH - ROOM_FOR_RESPONSE):]
        attention_mask = attention_mask[:, -(MAX_CONTEXT_LENGTH - ROOM_FOR_RESPONSE):]  # trim attention_mask as well

    input_ids = input_ids.cuda()
    attention_mask = attention_mask.cuda()

    print("Min Value in input_ids: ", torch.min(input_ids))
    print("Max Value in input_ids: ", torch.max(input_ids))
    print("Contains NaN: ", torch.isnan(input_ids).any())
    print("Contains Inf: ", torch.isinf(input_ids).any())
    print("Min Value in attention_mask: ", torch.min(attention_mask))
    print("Max Value in attention_mask: ", torch.max(attention_mask))
    print("Contains NaN: ", torch.isnan(attention_mask).any())
    print("Contains Inf: ", torch.isinf(attention_mask).any())

    with torch.cuda.amp.autocast():
        output = model.generate(input_ids, attention_mask=attention_mask, max_length=2048, do_sample=True, early_stopping=True, num_return_sequences=1, eos_token_id=model.config.eos_token_id)

    decoded = tokenizer.decode(output[0], skip_special_tokens=False)
    return jsonify({'generated_text': decoded})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)  # Set the host to '0.0.0.0' to make it accessible from your local network
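In case it helps to reproduce, this is roughly how I hit the endpoint for a single request (just a stripped-down version of the full test file below; the IP is my VM's internal address, and the prompt text is only an example):

import requests

# One-off request against the Flask endpoint above; 10.128.0.2:5000 matches the
# address and port the app listens on in my setup.
resp = requests.post(
    "http://10.128.0.2:5000/generate",
    json={"text": "<|prompter|>Hello!<|endoftext|><|assistant|>"},
    timeout=600,
)
print(resp.status_code)
if resp.status_code == 200:
    print(resp.json()["generated_text"])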
I also created a file to test whether the API is working; it can be seen below:
import requests
import json
import colorama

SERVER_IP = "10.128.0.2"
URL = f"http://{SERVER_IP}:5000/generate"

USERTOKEN = "<|prompter|>"
ENDTOKEN = "<|endoftext|>"
ASSISTANTTOKEN = "<|assistant|>"

def prompt(inp):
    data = {"text": inp}
    headers = {'Content-type': 'application/json'}
    response = requests.post(URL, data=json.dumps(data), headers=headers)
    if response.status_code == 200:
        return response.json()["generated_text"]
    else:
        return "Error:", response.status_code

history = ""
while True:
    inp = input(">>> ")
    context = history + USERTOKEN + inp + ENDTOKEN + ASSISTANTTOKEN
    output = prompt(context)
    if isinstance(output, tuple):  # handle the error case
        print(f"Error: {output[1]}")
    else:
        history = output
        just_latest_asst_output = output.split(ASSISTANTTOKEN)[-1].split(ENDTOKEN)[0]
        # color just_latest_asst_output green in print:
        print(colorama.Fore.GREEN + just_latest_asst_output + colorama.Style.RESET_ALL)
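For reference, after the first user turn (with no prior history) the context string that gets sent to the API looks like this, assuming I'm assembling the OpenAssistant special tokens the way they're intended (the question here is just a made-up example):

# Example of the assembled prompt for a single user turn with an empty history.
inp = "What is the capital of France?"
context = "" + "<|prompter|>" + inp + "<|endoftext|>" + "<|assistant|>"
print(context)
# -> <|prompter|>What is the capital of France?<|endoftext|><|assistant|>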
The logs I'm getting for the error can be found below:
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
/home/bautista0848/text-generation-webui/venv2/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:318: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization
warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
[2023-06-06 18:02:26,392] ERROR in app: Exception on /generate [POST]
Traceback (most recent call last):
File "/home/bautista0848/text-generation-webui/venv2/lib/python3.10/site-packages/flask/app.py", line 2190, in wsgi_app
response = self.full_dispatch_request()
File "/home/bautista0848/text-generation-webui/venv2/lib/python3.10/site-packages/flask/app.py", line 1486, in full_dispatch_request
rv = self.handle_user_exception(e)
File "/home/bautista0848/text-generation-webui/venv2/lib/python3.10/site-packages/flask/app.py", line 1484, in full_dispatch_request
rv = self.dispatch_request()
File "/home/bautista0848/text-generation-webui/venv2/lib/python3.10/site-packages/flask/app.py", line 1469, in dispatch_request
return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
File "/home/bautista0848/text-generation-webui/app.py", line 58, in generate
output = model.generate(input_ids, attention_mask=attention_mask, max_length=2048, do_sample=True, early_stopping=True, num_return_sequences=1, eos_token_id=model.config.eos_token_id)
File "/home/bautista0848/text-generation-webui/venv2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/bautista0848/text-generation-webui/venv2/lib/python3.10/site-packages/transformers/generation/utils.py", line 1568, in generate
return self.sample(
File "/home/bautista0848/text-generation-webui/venv2/lib/python3.10/site-packages/transformers/generation/utils.py", line 2651, in sample
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
RuntimeError: probability tensor contains either `inf`, `nan` or element < 0
I have tried to debug what's going on by printing the values of both my "input_ids" and "attention_mask" tensors, as shown in this snippet from my API code:
input_ids = input_ids.cuda()
attention_mask = attention_mask.cuda()
print("Min Value in input_ids: ", torch.min(input_ids))
print("Max Value in input_ids: ", torch.max(input_ids))
print("Contains NaN: ", torch.isnan(input_ids).any())
print("Contains Inf: ", torch.isinf(input_ids).any())
print("Min Value in attention_mask: ", torch.min(attention_mask))
print("Max Value in attention_mask: ", torch.max(attention_mask))
print("Contains NaN: ", torch.isnan(attention_mask).any())
print("Contains Inf: ", torch.isinf(attention_mask).any())
The output I get is
Min Value in input_ids: tensor(1, device='cuda:0')
Max Value in input_ids: tensor(1, device='cuda:0')
Contains NaN: tensor(False, device='cuda:0')
Contains Inf: tensor(False, device='cuda:0')
Min Value in attention_mask: tensor(0, device='cuda:0')
Max Value in attention_mask: tensor(0, device='cuda:0')
Contains NaN: tensor(False, device='cuda:0')
Contains Inf: tensor(False, device='cuda:0')
Now, I don't think the min and max of either tensor should be identical, nor should the values be strictly 0 or 1, so I'm led to believe that something is going wrong when transferring my values to the GPU. If anyone can help me out, I would greatly appreciate it!
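One more check I'm planning to add right before the generate() call, just to see exactly what the model receives after the padding and trimming (this is only a debugging idea on my end, not something I've verified yet):

# Count how many non-padding tokens survive, and decode the batch back to text
# so I can see what the model is actually being fed.
print("Non-padding tokens:", int(attention_mask.sum().item()))
print("Decoded input:", tokenizer.decode(input_ids[0], skip_special_tokens=False))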