Skip to content

Commit a8f6fa0

Browse files
committed
much cleaner sandboxing mechanism, and by default increases response time, fixing code issues
1 parent 9ab0c84 commit a8f6fa0

File tree

9 files changed

+135
-103
lines changed

9 files changed

+135
-103
lines changed

.env.template

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
11
# Provide frontend configuration settings from environment variables
2-
SHOW_KEYS_ENABLED=true
3-
SESSION_ENABLED=true # if true, the session id will be used to track the user's session
2+
# You can override these settings when launching the app as well:
3+
# python -m data_formulator -p 5000 --exec-python-in-subprocess --disable-display-keys
4+
5+
DISABLE_DISPLAY_KEYS=false # if true, the display keys will not be shown in the frontend
6+
EXEC_PYTHON_IN_SUBPROCESS=false # if true, the python code will be executed in a subprocess to avoid crashing the main app, but it will increase the time of response

py-src/data_formulator/agent_routes.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
mimetypes.add_type('application/javascript', '.mjs')
1111

1212
import flask
13-
from flask import request, session, jsonify, Blueprint
13+
from flask import request, session, jsonify, Blueprint, current_app
1414
import logging
1515

1616
import json
@@ -321,10 +321,10 @@ def derive_data():
321321

322322
if mode == "recommendation":
323323
# now it's in recommendation mode
324-
agent = SQLDataRecAgent(client=client, conn=conn) if language == "sql" else PythonDataRecAgent(client=client)
324+
agent = SQLDataRecAgent(client=client, conn=conn) if language == "sql" else PythonDataRecAgent(client=client, exec_python_in_subprocess=current_app.config['CLI_ARGS']['exec_python_in_subprocess'])
325325
results = agent.run(input_tables, instruction)
326326
else:
327-
agent = SQLDataTransformationAgent(client=client, conn=conn) if language == "sql" else PythonDataTransformationAgent(client=client)
327+
agent = SQLDataTransformationAgent(client=client, conn=conn) if language == "sql" else PythonDataTransformationAgent(client=client, exec_python_in_subprocess=current_app.config['CLI_ARGS']['exec_python_in_subprocess'])
328328
results = agent.run(input_tables, instruction, [field['name'] for field in new_fields], prev_messages)
329329

330330
repair_attempts = 0
@@ -382,7 +382,7 @@ def refine_data():
382382
conn = db_manager.get_connection(session['session_id']) if language == "sql" else None
383383

384384
# always resort to the data transform agent
385-
agent = SQLDataTransformationAgent(client=client, conn=conn) if language == "sql" else PythonDataTransformationAgent(client=client)
385+
agent = SQLDataTransformationAgent(client=client, conn=conn) if language == "sql" else PythonDataTransformationAgent(client=client, exec_python_in_subprocess=current_app.config['CLI_ARGS']['exec_python_in_subprocess'])
386386
results = agent.followup(input_tables, dialog, [field['name'] for field in output_fields], new_instruction)
387387

388388
repair_attempts = 0

py-src/data_formulator/agents/agent_py_concept_derive.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,9 @@ def derive_new_column(df):
9090

9191
class PyConceptDeriveAgent(object):
9292

93-
def __init__(self, client):
93+
def __init__(self, client, exec_python_in_subprocess=False):
9494
self.client = client
95+
self.exec_python_in_subprocess = exec_python_in_subprocess
9596

9697
def run(self, input_table, input_fields, output_field, description):
9798
"""derive a new concept based on input table, input fields, and output field name, (and description)
@@ -131,7 +132,7 @@ def run(self, input_table, input_fields, output_field, description):
131132
if len(code_blocks) > 0:
132133
code_str = code_blocks[-1]
133134
try:
134-
result = py_sandbox.derive_df_in_sandbox2020(code_str, output_field, input_table['rows'])
135+
result = py_sandbox.run_derive_concept(code_str, output_field, input_table['rows'], self.exec_python_in_subprocess)
135136

136137
if result['status'] == 'ok':
137138
result['content'] = {

py-src/data_formulator/agents/agent_py_data_rec.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,12 @@
22
# Licensed under the MIT License.
33

44
import json
5+
import pandas as pd
56

67
from data_formulator.agents.agent_utils import extract_json_objects, generate_data_summary, extract_code_from_gpt_response
7-
88
import data_formulator.py_sandbox as py_sandbox
99

1010
import traceback
11-
12-
1311
import logging
1412

1513
logger = logging.getLogger(__name__)
@@ -129,9 +127,10 @@ def transform_data(df):
129127

130128
class PythonDataRecAgent(object):
131129

132-
def __init__(self, client, system_prompt=None):
130+
def __init__(self, client, system_prompt=None, exec_python_in_subprocess=False):
133131
self.client = client
134132
self.system_prompt = system_prompt if system_prompt is not None else SYSTEM_PROMPT
133+
self.exec_python_in_subprocess = exec_python_in_subprocess
135134

136135
def process_gpt_response(self, input_tables, messages, response):
137136
"""process gpt response to handle execution"""
@@ -160,12 +159,13 @@ def process_gpt_response(self, input_tables, messages, response):
160159
code_str = code_blocks[-1]
161160

162161
try:
163-
result = py_sandbox.run_transform_in_sandbox2020(code_str, [t['rows'] for t in input_tables])
162+
result = py_sandbox.run_transform_in_sandbox2020(code_str, [pd.DataFrame.from_records(t['rows']) for t in input_tables], self.exec_python_in_subprocess)
164163
result['code'] = code_str
165164

166165
if result['status'] == 'ok':
166+
result_df = result['content']
167167
result['content'] = {
168-
'rows': json.loads(result['content']),
168+
'rows': result_df.to_dict(orient='records'),
169169
}
170170
else:
171171
logger.info(result['content'])

py-src/data_formulator/agents/agent_py_data_transform.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from data_formulator.agents.agent_utils import extract_json_objects, generate_data_summary, extract_code_from_gpt_response
88
import data_formulator.py_sandbox as py_sandbox
9+
import pandas as pd
910

1011
import logging
1112

@@ -184,9 +185,10 @@ def transform_data(df):
184185

185186
class PythonDataTransformationAgent(object):
186187

187-
def __init__(self, client, system_prompt=None):
188+
def __init__(self, client, system_prompt=None, exec_python_in_subprocess=False):
188189
self.client = client
189190
self.system_prompt = system_prompt if system_prompt is not None else SYSTEM_PROMPT
191+
self.exec_python_in_subprocess = exec_python_in_subprocess
190192

191193
def process_gpt_response(self, input_tables, messages, response):
192194
"""process gpt response to handle execution"""
@@ -216,13 +218,16 @@ def process_gpt_response(self, input_tables, messages, response):
216218
code_str = code_blocks[-1]
217219

218220
try:
219-
result = py_sandbox.run_transform_in_sandbox2020(code_str, [t['rows'] for t in input_tables])
221+
result = py_sandbox.run_transform_in_sandbox2020(code_str, [pd.DataFrame.from_records(t['rows']) for t in input_tables], self.exec_python_in_subprocess)
220222
result['code'] = code_str
221223

224+
print(f"result: {result}")
225+
222226
if result['status'] == 'ok':
223227
# parse the content
228+
result_df = result['content']
224229
result['content'] = {
225-
'rows': json.loads(result['content']),
230+
'rows': result_df.to_dict(orient='records'),
226231
}
227232
else:
228233
logger.info(result['content'])

py-src/data_formulator/app.py

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,17 +38,23 @@
3838
app = Flask(__name__, static_url_path='', static_folder=os.path.join(APP_ROOT, "dist"))
3939
app.secret_key = secrets.token_hex(16) # Generate a random secret key for sessions
4040

41+
# Load env files early
42+
load_dotenv(os.path.join(APP_ROOT, "..", "..", 'api-keys.env'))
43+
load_dotenv(os.path.join(APP_ROOT, 'api-keys.env'))
44+
load_dotenv(os.path.join(APP_ROOT, '.env'))
45+
46+
# Store runtime options at app level, defaulting to values from environment variables
47+
app.config['CLI_ARGS'] = {
48+
'exec_python_in_subprocess': os.environ.get('EXEC_PYTHON_IN_SUBPROCESS', 'false').lower() == 'true',
49+
'disable_display_keys': os.environ.get('DISABLE_DISPLAY_KEYS', 'false').lower() == 'true'
50+
}
51+
4152
# register blueprints
4253
app.register_blueprint(tables_bp)
4354
app.register_blueprint(agent_bp)
4455

4556
print(APP_ROOT)
4657

47-
# Load the single environment file
48-
load_dotenv(os.path.join(APP_ROOT, "..", "..", 'api-keys.env'))
49-
load_dotenv(os.path.join(APP_ROOT, 'api-keys.env'))
50-
load_dotenv(os.path.join(APP_ROOT, '.env'))
51-
5258
# Configure root logger for general application logging
5359
logging.basicConfig(
5460
level=logging.INFO,
@@ -68,7 +74,6 @@
6874
logger.info("Application level log") # General application logging
6975
app.logger.info("Flask specific log") # Web request related logging
7076

71-
7277
@app.route('/api/vega-datasets')
7378
def get_example_dataset_list():
7479
dataset_names = vega_data.list_datasets()
@@ -218,10 +223,11 @@ def get_session_id():
218223

219224
@app.route('/api/app-config', methods=['GET'])
220225
def get_app_config():
221-
"""Provide frontend configuration settings from environment variables"""
222-
226+
"""Provide frontend configuration settings from CLI arguments"""
227+
args = app.config['CLI_ARGS']
223228
config = {
224-
"SHOW_KEYS_ENABLED": os.getenv("SHOW_KEYS_ENABLED", "true").lower() == "true",
229+
"EXEC_PYTHON_IN_SUBPROCESS": args['exec_python_in_subprocess'],
230+
"DISABLE_DISPLAY_KEYS": args['disable_display_keys'],
225231
"SESSION_ID": session.get('session_id', None)
226232
}
227233
return flask.jsonify(config)
@@ -230,11 +236,21 @@ def get_app_config():
230236
def parse_args() -> argparse.Namespace:
231237
parser = argparse.ArgumentParser(description="Data Formulator")
232238
parser.add_argument("-p", "--port", type=int, default=5000, help="The port number you want to use")
239+
parser.add_argument("-e", "--exec-python-in-subprocess", action='store_true', default=False,
240+
help="Whether to execute python in subprocess, it makes the app more secure (reducing the chance for the model to access the local machine), but increases the time of response")
241+
parser.add_argument("-d", "--disable-display-keys", action='store_true', default=False,
242+
help="Whether disable displaying keys in the frontend UI, recommended to turn on if you host the app not just for yourself.")
233243
return parser.parse_args()
234244

235245

236246
def run_app():
237247
args = parse_args()
248+
# Make the parsed CLI args available to routes via app.config['CLI_ARGS']
249+
# NOTE: this replaces the env-file-derived settings unconditionally; an omitted flag defaults to False and still overrides the env value
250+
app.config['CLI_ARGS'] = {
251+
'exec_python_in_subprocess': args.exec_python_in_subprocess,
252+
'disable_display_keys': args.disable_display_keys
253+
}
238254

239255
url = "http://localhost:{0}".format(args.port)
240256
threading.Timer(2, lambda: webbrowser.open(url, new=2)).start()

0 commit comments

Comments
 (0)