import requests
- import warnings
import re
- import rich
+ import logging
import gradio as gr
from urllib.parse import urljoin
from config import AppSettings

from langchain.schema import HumanMessage, AIMessage, SystemMessage
from langchain_openai import ChatOpenAI
+ import openai


- print("\nStarting app\n---------------\n")
+ logging.basicConfig()
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+
+ logger.info("Starting app")

settings = AppSettings.load("./settings.yml")
- print("App settings:")
- rich.print(settings)
+ logger.info("App settings: %s", settings)

backend_url = str(settings.backend_url)
backend_health_endpoint = urljoin(backend_url, "/health")
- backend_initialised = False
-
- # NOTE(sd109): The Mistral family of models explicitly require a chat
- # history of the form user -> ai -> user -> ... and so don't like having
- # a SystemPrompt at the beginning. Since these models seem to be the
- # best around right now, it makes sense to treat them as special and make
- # sure the web app works correctly with them. To do so, we detect when a
- # mistral model is specified using this regex and then handle it explicitly
- # when contructing the `context` list in the `inference` function below.
- MISTRAL_REGEX = re.compile(r".*mi(s|x)tral.*", re.IGNORECASE)
- IS_MISTRAL_MODEL = MISTRAL_REGEX.match(settings.model_name) is not None
- if IS_MISTRAL_MODEL:
-     print(
-         "Detected Mistral model - will alter LangChain conversation format appropriately."
-     )
+ BACKEND_INITIALISED = False
+
+ # # NOTE(sd109): The Mistral family of models explicitly require a chat
+ # # history of the form user -> ai -> user -> ... and so don't like having
+ # # a SystemPrompt at the beginning. Since these models seem to be the
+ # # best around right now, it makes sense to treat them as special and make
+ # # sure the web app works correctly with them. To do so, we detect when a
+ # # mistral model is specified using this regex and then handle it explicitly
+ # # when contructing the `context` list in the `inference` function below.
+ # MISTRAL_REGEX = re.compile(r".*mi(s|x)tral.*", re.IGNORECASE)
+ # IS_MISTRAL_MODEL = MISTRAL_REGEX.match(settings.model_name) is not None
+ # if IS_MISTRAL_MODEL:
+ #     print(
+ #         "Detected Mistral model - will alter LangChain conversation format appropriately."
+ #     )
+
+ # Some models disallow the 'system' role in their conversation history by raising errors in their chat prompt template, e.g. see
+ # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/blob/cf47bb3e18fe41a5351bc36eef76e9c900847c89/tokenizer_config.json#L42
+ # Detecting this ahead of time is difficult, so for now we use a global variable which records whether the API has
+ # responded with an HTTP 400 error, so that subsequent requests can be made with the system role replaced by a user message.
+ INCLUDE_SYSTEM_PROMPT = True

llm = ChatOpenAI(
    base_url=urljoin(backend_url, "v1"),
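(Aside, not part of this commit: the two urljoin calls above behave differently because "v1" is a relative reference while "/health" is an absolute one. A minimal illustration, using a made-up backend URL:

from urllib.parse import urljoin

# Relative reference: appended to the base URL's path
urljoin("http://backend.example:8000/", "v1")           # -> "http://backend.example:8000/v1"
# Absolute reference (leading slash): replaces the base path entirely
urljoin("http://backend.example:8000/api/", "/health")  # -> "http://backend.example:8000/health"
)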

def inference(latest_message, history):
    # Check backend health and warn the user on error
-     try:
-         response = requests.get(backend_health_endpoint, timeout=5)
-         if response.status_code == 200:
-             global backend_initialised
-             if not backend_initialised:
-                 # Record the fact that backend was up at one point so we know that
-                 # any future errors are not related to slow model initialisation
-                 backend_initialised = True
-         else:
-             # If the server's running (i.e. we get a response) but it's not an HTTP 200
-             # we just hope Kubernetes reconciles things for us eventually..
-             raise gr.Error("Backend unhealthy - please try again later")
-     except Exception as err:
-         warnings.warn(f"Error while checking backend health: {err}")
-         if backend_initialised:
-             # If backend was previously reachable then something unexpected has gone wrong
-             raise gr.Error("Backend unreachable")
-         else:
-             # In this case backend is probably still busy downloading model weights
-             raise gr.Error("Backend not ready yet - please try again later")
+     # try:
+     #     response = requests.get(backend_health_endpoint, timeout=5)
+     #     response_code = response.status_code
+     #     if response_code == 200:
+     #         global backend_initialised
+     #         if not backend_initialised:
+     #             # Record the fact that backend was up at one point so we know that
+     #             # any future errors are not related to slow model initialisation
+     #             backend_initialised = True
+     #     elif response_code >= 400 and response_code < 500:
+     #         logging.warn(f"Received HTTP {response_code} response from backend. Full response: {response.text}")
+     #     else:
+     #         # If the server's running (i.e. we get a response) but it's not an HTTP 200
+     #         # we just hope Kubernetes reconciles things for us eventually..
+     #         raise gr.Error("Backend unhealthy - please try again later")
+     # except Exception as err:
+     #     warnings.warn(f"Error while checking backend health: {err}")
+     #     if backend_initialised:
+     #         # If backend was previously reachable then something unexpected has gone wrong
+     #         raise gr.Error("Backend unreachable")
+     #     else:
+     #         # In this case backend is probably still busy downloading model weights
+     #         raise gr.Error("Backend not ready yet - please try again later")
+
+     # try:
+     #     # To handle Mistral models we have to add the model instruction to
+     #     # the first user message since Mistral requires user -> ai -> user
+     #     # chat format and therefore doesn't allow system prompts.
+     #     context = []
+     #     if not IS_MISTRAL_MODEL:
+     #         context.append(SystemMessage(content=settings.model_instruction))
+     #     for i, (human, ai) in enumerate(history):
+     #         if IS_MISTRAL_MODEL and i == 0:
+     #             context.append(
+     #                 HumanMessage(content=f"{settings.model_instruction}\n\n{human}")
+     #             )
+     #         else:
+     #             context.append(HumanMessage(content=human))
+     #         context.append(AIMessage(content=ai))
+     #     context.append(HumanMessage(content=latest_message))
+
+     #     response = ""
+     #     for chunk in llm.stream(context):
+     #         # print(chunk)
+     #         # NOTE(sd109): For some reason the '>' character breaks the UI
+     #         # so we need to escape it here.
+     #         # response += chunk.content.replace('>', '\>')
+     #         # UPDATE(sd109): Above bug seems to have been fixed as of gradio 4.15.0
+     #         # but keeping this note here incase we enounter it again
+     #         response += chunk.content
+     #         yield response
+
+     #     # For all other errors notify user and log a more detailed warning
+     # except Exception as err:
+     #     warnings.warn(f"Exception encountered while generating response: {err}")
+     #     raise gr.Error(
+     #         "Unknown error encountered - see application logs for more information."
+     #     )
+
+
+     # Allow mutating global variables
+     global BACKEND_INITIALISED, INCLUDE_SYSTEM_PROMPT

    try:
-         # To handle Mistral models we have to add the model instruction to
-         # the first user message since Mistral requires user -> ai -> user
-         # chat format and therefore doesn't allow system prompts.
-         context = []
-         if not IS_MISTRAL_MODEL:
-             context.append(SystemMessage(content=settings.model_instruction))
+         # Attempt to handle models which disallow system prompts
+         # Construct conversation history for model prompt
+         if INCLUDE_SYSTEM_PROMPT:
+             context = [SystemMessage(content=settings.model_instruction)]
+         else:
+             context = []
        for i, (human, ai) in enumerate(history):
-             if IS_MISTRAL_MODEL and i == 0:
-                 context.append(
-                     HumanMessage(content=f"{settings.model_instruction}\n\n{human}")
-                 )
-             else:
-                 context.append(HumanMessage(content=human))
-             context.append(AIMessage(content=ai))
+             if not INCLUDE_SYSTEM_PROMPT and i == 0:
+                 # Mimic system prompt by prepending it to first human message
+                 human = f"{settings.model_instruction}\n\n{human}"
+             context.append(HumanMessage(content=human))
+             context.append(AIMessage(content=(ai or "")))
        context.append(HumanMessage(content=latest_message))

        response = ""
        for chunk in llm.stream(context):
-             # print(chunk)
+
+             # If this is our first successful response from the backend
+             # then update the status variable
+             if not BACKEND_INITIALISED and len(response) > 0:
+                 BACKEND_INITIALISED = True
+
            # NOTE(sd109): For some reason the '>' character breaks the UI
            # so we need to escape it here.
            # response += chunk.content.replace('>', '\>')
@@ -99,12 +153,31 @@ def inference(latest_message, history):
            response += chunk.content
            yield response

-     # For all other errors notify user and log a more detailed warning
+     except openai.BadRequestError as err:
+         logger.error("Received BadRequestError from backend API: %s", err)
+         message = err.response.json()['message']
+         if INCLUDE_SYSTEM_PROMPT:
+             INCLUDE_SYSTEM_PROMPT = False
+             # TODO: Somehow retry same inference step without system prompt
+             pass
+         ui_message = f"API Error received. This usually means the chosen LLM uses an incompatible prompt format. Error message was: {message}"
+         raise gr.Error(ui_message)
+
+     except openai.APIConnectionError as err:
+         if not BACKEND_INITIALISED:
+             logger.info("Backend API not yet ready")
+             gr.Info("Backend not ready - model may still be initialising - please try again later")
+         else:
+             logger.error("Failed to connect to backend API: %s", err)
+             gr.Warning("Failed to connect to backend API")
+
+     except openai.InternalServerError as err:
+         gr.Warning("Internal server error encountered in backend API - see API logs for details.")
+
+     # Catch-all for unexpected exceptions
    except Exception as err:
-         warnings.warn(f"Exception encountered while generating response: {err}")
-         raise gr.Error(
-             "Unknown error encountered - see application logs for more information."
-         )
+         logger.error("Unexpected error during inference: %s", err)
+         raise gr.Error("Unexpected error encountered - see logs for details.")


# UI colour theming
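(Note, not part of this commit: the TODO in the BadRequestError handler above could eventually be addressed by retrying the stream once with the system prompt disabled. A rough sketch under that assumption; build_context is a hypothetical helper that assembles the message list exactly as the try block above does:

def stream_with_fallback(latest_message, history):
    """Hypothetical wrapper: retry once without the system prompt on HTTP 400."""
    global INCLUDE_SYSTEM_PROMPT
    try:
        yield from llm.stream(build_context(latest_message, history))
    except openai.BadRequestError:
        if not INCLUDE_SYSTEM_PROMPT:
            raise  # already retried without a system prompt, so give up
        INCLUDE_SYSTEM_PROMPT = False
        # Rebuild the context with the instruction folded into the first user
        # message, as the loop above does, and retry the same inference step.
        yield from llm.stream(build_context(latest_message, history))

The caller would still accumulate chunk.content into a running response string, as the existing streaming loop does.)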
@@ -146,5 +219,5 @@ def inference(latest_message, history):
    theme=theme,
    css=css_overrides,
) as app:
-     # app.launch(server_name="0.0.0.0")
+     # app.launch(server_name="0.0.0.0") # Do we need this for k8s service?
    app.launch()
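(On the question in the comment above: Gradio binds to 127.0.0.1 by default unless the GRADIO_SERVER_NAME environment variable overrides it, so inside a container the UI would not be reachable through a Kubernetes Service without binding to all interfaces. A minimal sketch; the explicit port is an assumption, 7860 is simply Gradio's default:

# Sketch only, not part of this diff: make the app reachable from outside the
# container, e.g. via a Kubernetes Service targeting port 7860.
app.launch(server_name="0.0.0.0", server_port=7860)
)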