much cleaner sandboxing mechanism, and by default improves response time (subprocess execution is opt-in), fixing code issues

Chenglong-MS · Chenglong-MS · commit a8f6fa0a5cc5 · 2025-04-17T16:47:16.000-07:00
diff --git a/.env.template b/.env.template
@@ -1,3 +1,6 @@
 # Provide frontend configuration settings from environment variables
-SHOW_KEYS_ENABLED=true
-SESSION_ENABLED=true # if true, the session id will be used to track the user's session
+# You can also override these settings with flags when launching the app (the flags take no value):
+#   python -m data_formulator -p 5000 --exec-python-in-subprocess --disable-display-keys
+
+DISABLE_DISPLAY_KEYS=false # if true, keys will not be displayed in the frontend UI (recommended when hosting the app for others)
+EXEC_PYTHON_IN_SUBPROCESS=false # if true, python code is executed in a subprocess so it cannot crash the main app, at the cost of a longer response time
diff --git a/py-src/data_formulator/agent_routes.py b/py-src/data_formulator/agent_routes.py
@@ -10,7 +10,7 @@
 mimetypes.add_type('application/javascript', '.mjs')
 
 import flask
-from flask import request, session, jsonify, Blueprint
+from flask import request, session, jsonify, Blueprint, current_app
 import logging
 
 import json
@@ -321,10 +321,10 @@ def derive_data():
 
         if mode == "recommendation":
             # now it's in recommendation mode
-            agent = SQLDataRecAgent(client=client, conn=conn) if language == "sql" else PythonDataRecAgent(client=client)
+            agent = SQLDataRecAgent(client=client, conn=conn) if language == "sql" else PythonDataRecAgent(client=client, exec_python_in_subprocess=current_app.config['CLI_ARGS']['exec_python_in_subprocess'])
             results = agent.run(input_tables, instruction)
         else:
-            agent = SQLDataTransformationAgent(client=client, conn=conn) if language == "sql" else PythonDataTransformationAgent(client=client)
+            agent = SQLDataTransformationAgent(client=client, conn=conn) if language == "sql" else PythonDataTransformationAgent(client=client, exec_python_in_subprocess=current_app.config['CLI_ARGS']['exec_python_in_subprocess'])
             results = agent.run(input_tables, instruction, [field['name'] for field in new_fields], prev_messages)
 
         repair_attempts = 0
@@ -382,7 +382,7 @@ def refine_data():
         conn = db_manager.get_connection(session['session_id']) if language == "sql" else None
 
         # always resort to the data transform agent       
-        agent = SQLDataTransformationAgent(client=client, conn=conn) if language == "sql" else PythonDataTransformationAgent(client=client)
+        agent = SQLDataTransformationAgent(client=client, conn=conn) if language == "sql" else PythonDataTransformationAgent(client=client, exec_python_in_subprocess=current_app.config['CLI_ARGS']['exec_python_in_subprocess'])
         results = agent.followup(input_tables, dialog, [field['name'] for field in output_fields], new_instruction)
 
         repair_attempts = 0
diff --git a/py-src/data_formulator/agents/agent_py_concept_derive.py b/py-src/data_formulator/agents/agent_py_concept_derive.py
@@ -90,8 +90,9 @@ def derive_new_column(df):
 
 class PyConceptDeriveAgent(object):
 
-    def __init__(self, client):
+    def __init__(self, client, exec_python_in_subprocess=False):
         self.client = client
+        self.exec_python_in_subprocess = exec_python_in_subprocess
 
     def run(self, input_table, input_fields, output_field, description):
         """derive a new concept based on input table, input fields, and output field name, (and description)
@@ -131,7 +132,7 @@ def run(self, input_table, input_fields, output_field, description):
             if len(code_blocks) > 0:
                 code_str = code_blocks[-1]
                 try:
-                    result =  py_sandbox.derive_df_in_sandbox2020(code_str, output_field, input_table['rows'])
+                    result =  py_sandbox.run_derive_concept(code_str, output_field, input_table['rows'], self.exec_python_in_subprocess)
 
                     if result['status'] == 'ok':
                         result['content'] = {
diff --git a/py-src/data_formulator/agents/agent_py_data_rec.py b/py-src/data_formulator/agents/agent_py_data_rec.py
@@ -2,14 +2,12 @@
 # Licensed under the MIT License.
 
 import json
+import pandas as pd
 
 from data_formulator.agents.agent_utils import extract_json_objects, generate_data_summary, extract_code_from_gpt_response
-
 import data_formulator.py_sandbox as py_sandbox
 
 import traceback
-
-
 import logging
 
 logger = logging.getLogger(__name__)
@@ -129,9 +127,10 @@ def transform_data(df):
 
 class PythonDataRecAgent(object):
 
-    def __init__(self, client, system_prompt=None):
+    def __init__(self, client, system_prompt=None, exec_python_in_subprocess=False):
         self.client = client
         self.system_prompt = system_prompt if system_prompt is not None else SYSTEM_PROMPT
+        self.exec_python_in_subprocess = exec_python_in_subprocess
 
     def process_gpt_response(self, input_tables, messages, response):
         """process gpt response to handle execution"""
@@ -160,12 +159,13 @@ def process_gpt_response(self, input_tables, messages, response):
                 code_str = code_blocks[-1]
 
                 try:
-                    result = py_sandbox.run_transform_in_sandbox2020(code_str, [t['rows'] for t in input_tables])
+                    result = py_sandbox.run_transform_in_sandbox2020(code_str, [pd.DataFrame.from_records(t['rows']) for t in input_tables], self.exec_python_in_subprocess)
                     result['code'] = code_str
 
                     if result['status'] == 'ok':
+                        result_df = result['content']
                         result['content'] = {
-                            'rows': json.loads(result['content']),
+                            'rows': result_df.to_dict(orient='records'),
                         }
                     else:
                         logger.info(result['content'])
diff --git a/py-src/data_formulator/agents/agent_py_data_transform.py b/py-src/data_formulator/agents/agent_py_data_transform.py
@@ -6,6 +6,7 @@
 
 from data_formulator.agents.agent_utils import extract_json_objects, generate_data_summary, extract_code_from_gpt_response
 import data_formulator.py_sandbox as py_sandbox
+import pandas as pd
 
 import logging
 
@@ -184,9 +185,10 @@ def transform_data(df):
 
 class PythonDataTransformationAgent(object):
 
-    def __init__(self, client, system_prompt=None):
+    def __init__(self, client, system_prompt=None, exec_python_in_subprocess=False):
         self.client = client
         self.system_prompt = system_prompt if system_prompt is not None else SYSTEM_PROMPT
+        self.exec_python_in_subprocess = exec_python_in_subprocess
 
     def process_gpt_response(self, input_tables, messages, response):
         """process gpt response to handle execution"""
@@ -216,13 +218,16 @@ def process_gpt_response(self, input_tables, messages, response):
                 code_str = code_blocks[-1]
 
                 try:
-                    result = py_sandbox.run_transform_in_sandbox2020(code_str, [t['rows'] for t in input_tables])
+                    result = py_sandbox.run_transform_in_sandbox2020(code_str, [pd.DataFrame.from_records(t['rows']) for t in input_tables], self.exec_python_in_subprocess)
                     result['code'] = code_str
 
+                    print(f"result: {result}")
+
                     if result['status'] == 'ok':
                         # parse the content
+                        result_df = result['content']
                         result['content'] = {
-                            'rows': json.loads(result['content']),
+                            'rows': result_df.to_dict(orient='records'),
                         }
                     else:
                         logger.info(result['content'])
diff --git a/py-src/data_formulator/app.py b/py-src/data_formulator/app.py
@@ -38,17 +38,23 @@
 app = Flask(__name__, static_url_path='', static_folder=os.path.join(APP_ROOT, "dist"))
 app.secret_key = secrets.token_hex(16)  # Generate a random secret key for sessions
 
+# Load env files early
+load_dotenv(os.path.join(APP_ROOT, "..", "..", 'api-keys.env'))
+load_dotenv(os.path.join(APP_ROOT, 'api-keys.env'))
+load_dotenv(os.path.join(APP_ROOT, '.env'))
+
+# Store runtime settings at app level so routes can read them (defaults come from environment variables)
+app.config['CLI_ARGS'] = {
+    'exec_python_in_subprocess': os.environ.get('EXEC_PYTHON_IN_SUBPROCESS', 'false').lower() == 'true',
+    'disable_display_keys': os.environ.get('DISABLE_DISPLAY_KEYS', 'false').lower() == 'true'       
+}
+
 # register blueprints
 app.register_blueprint(tables_bp)
 app.register_blueprint(agent_bp)
 
 print(APP_ROOT)
 
-# Load the single environment file
-load_dotenv(os.path.join(APP_ROOT, "..", "..", 'api-keys.env'))
-load_dotenv(os.path.join(APP_ROOT, 'api-keys.env'))
-load_dotenv(os.path.join(APP_ROOT, '.env'))
-
 # Configure root logger for general application logging
 logging.basicConfig(
     level=logging.INFO,
@@ -68,7 +74,6 @@
 logger.info("Application level log")  # General application logging
 app.logger.info("Flask specific log") # Web request related logging
 
-
 @app.route('/api/vega-datasets')
 def get_example_dataset_list():
     dataset_names = vega_data.list_datasets()
@@ -218,10 +223,11 @@ def get_session_id():
 
 @app.route('/api/app-config', methods=['GET'])
 def get_app_config():
-    """Provide frontend configuration settings from environment variables"""
-    
+    """Provide frontend configuration settings from CLI arguments"""
+    args = app.config['CLI_ARGS']
     config = {
-        "SHOW_KEYS_ENABLED": os.getenv("SHOW_KEYS_ENABLED", "true").lower() == "true",
+        "EXEC_PYTHON_IN_SUBPROCESS": args['exec_python_in_subprocess'],
+        "DISABLE_DISPLAY_KEYS": args['disable_display_keys'],
         "SESSION_ID": session.get('session_id', None)
     }
     return flask.jsonify(config)
@@ -230,11 +236,21 @@ def get_app_config():
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(description="Data Formulator")
     parser.add_argument("-p", "--port", type=int, default=5000, help="The port number you want to use")
+    parser.add_argument("-e", "--exec-python-in-subprocess", action='store_true', default=False,
+        help="Whether to execute python in subprocess, it makes the app more secure (reducing the chance for the model to access the local machine), but increases the time of response")
+    parser.add_argument("-d", "--disable-display-keys", action='store_true', default=False,
+        help="Whether disable displaying keys in the frontend UI, recommended to turn on if you host the app not just for yourself.")
     return parser.parse_args()
 
 
 def run_app():
     args = parse_args()
+    # Add this line to make args available to routes
+    # override the args from the env file
+    app.config['CLI_ARGS'] = {
+        'exec_python_in_subprocess': args.exec_python_in_subprocess,
+        'disable_display_keys': args.disable_display_keys
+    }
 
     url = "http://localhost:{0}".format(args.port)
     threading.Timer(2, lambda: webbrowser.open(url, new=2)).start()
diff --git a/py-src/data_formulator/py_sandbox.py b/py-src/data_formulator/py_sandbox.py
diff --git a/src/views/ConceptCard.tsx b/src/views/ConceptCard.tsx
diff --git a/src/views/ModelSelectionDialog.tsx b/src/views/ModelSelectionDialog.tsx