-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: chatgpt.py
More file actions
79 lines (69 loc) · 3.29 KB
/
chatgpt.py
File metadata and controls
79 lines (69 loc) · 3.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""Benchmark OpenAI model latency.

Sends a long numeric-sequence prompt to several OpenAI models and records,
per model, the average prompt length, completion length, wall-clock call
latency, and latency per generated token, written as TSV to ``openai.tsv``.

Requires a ``.env`` file containing an ``OPENAI`` API key.
"""
import os
import openai
import time
from dotenv import dotenv_values

config = dotenv_values(".env")
openai.api_key = config['OPENAI']

ITERS = 5          # samples averaged per (model, prompt) combination
MAX_RETRIES = 5    # attempts per call before giving up on a sample

# Models served by the chat endpoint; everything else uses classic completions.
CHAT_MODELS = {"gpt-3.5-turbo"}


def _timed_call(engine, prompt, max_tokens):
    """Call the correct OpenAI endpoint once; return (response, seconds).

    Fixes two defects of the original loop body: non-chat engines were first
    sent to the ChatCompletion endpoint (masked by a bare ``except:``), and a
    RateLimitError merely slept 10s and then fell through to read an
    undefined/stale ``response``. Here we dispatch explicitly and retry with
    backoff, raising if the rate limit persists.
    """
    for _attempt in range(MAX_RETRIES):
        try:
            start = time.time()
            if engine in CHAT_MODELS:
                response = openai.ChatCompletion.create(
                    model=engine,
                    messages=[{"role": "system", "content": prompt}],
                    max_tokens=max_tokens,
                    temperature=0.25,
                    top_p=1,
                    frequency_penalty=0.5,
                    presence_penalty=0,
                )
            else:
                response = openai.Completion.create(
                    engine=engine,
                    prompt=prompt,
                    max_tokens=max_tokens,
                    temperature=0.25,
                    top_p=1,
                    frequency_penalty=0.5,
                    presence_penalty=0,
                )
            return response, time.time() - start
        except openai.error.RateLimitError:
            # Back off and retry rather than continuing with no response.
            time.sleep(10)
    raise RuntimeError(f"rate-limited {MAX_RETRIES} times in a row for {engine}")


with open('openai.tsv', 'w') as f:
    f.write("model_name\tprompt_len\toutput_len\tcall_latency\tsecs_per_tok\n")
    for num1 in [256]:
        for num2 in [512]:
            # Prompt: list the first num1 even numbers, ask for num2 odd ones.
            prompt = (
                f"Here are the first {num1} even numbers: "
                + ', '.join(str(2 * i) for i in range(1, num1 + 1))
                + f"\nGive the first {num2} odd numbers:"
            )
            for engine in ["gpt-3.5-turbo", "text-babbage-001",
                           "text-davinci-003", "text-curie-001",
                           "davinci-instruct-beta"]:
                print(f"{engine}")
                avg_latency = 0.0
                avg_latency_per_token = 0.0
                avg_prompt_len = 0.0
                avg_output_len = 0.0
                for _ in range(ITERS):
                    response, latency = _timed_call(engine, prompt, num2 * 2)
                    prompt_len = response.usage.prompt_tokens
                    output_len = response.usage.completion_tokens
                    avg_latency += latency
                    # Guard against a zero-token completion, which divided
                    # by zero in the original.
                    if output_len:
                        avg_latency_per_token += latency / output_len
                    avg_prompt_len += prompt_len
                    avg_output_len += output_len
                f.write(
                    f"{engine}\t{avg_prompt_len/ITERS}\t{avg_output_len/ITERS}"
                    f"\t{avg_latency/ITERS}\t{avg_latency_per_token/ITERS}\n"
                )
                f.flush()  # keep partial results on disk if a later call dies