
Commit 2731397
committed: ready for version 0.2
1 parent 44b6373 commit 2731397

11 files changed: +227 -124 lines

.env.template

Lines changed: 2 additions & 1 deletion
````diff
@@ -6,7 +6,8 @@ DISABLE_DISPLAY_KEYS=false # if true, the display keys will not be shown in the
 EXEC_PYTHON_IN_SUBPROCESS=false # if true, the python code will be executed in a subprocess to avoid crashing the main app, but it will increase the time of response
 
 # External atabase connection settings
-# check https://duckdb.org/docs/stable/extensions/mysql.html and https://duckdb.org/docs/stable/extensions/postgres.html
+# check https://duckdb.org/docs/stable/extensions/mysql.html
+# and https://duckdb.org/docs/stable/extensions/postgres.html
 USE_EXTERNAL_DB=false # if true, the app will use an external database instead of the one in the app
 DB_NAME=mysql_db # the name to refer to this database connection
 DB_TYPE=mysql # mysql or postgresql
````
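These variables feed DuckDB's MySQL and Postgres extensions (the docs linked in the comment above). As a rough sketch of what the attach flow looks like on the DuckDB side, with placeholder host/user/database values rather than anything this commit configures:

```python
import duckdb

# Sketch of an external-database attach in DuckDB; host, user, port and
# database names here are placeholders, not values from this commit.
con = duckdb.connect()
con.execute("INSTALL mysql")
con.execute("LOAD mysql")
con.execute(
    "ATTACH 'host=localhost user=root port=3306 database=mydb' "
    "AS mysql_db (TYPE mysql)"
)
print(con.execute("SHOW DATABASES").fetchall())
```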

py-src/data_formulator/agents/agent_sql_data_rec.py

Lines changed: 11 additions & 10 deletions
````diff
@@ -4,7 +4,7 @@
 import json
 
 from data_formulator.agents.agent_utils import extract_json_objects, extract_code_from_gpt_response
-from data_formulator.agents.agent_sql_data_transform import get_sql_table_statistics_str
+from data_formulator.agents.agent_sql_data_transform import get_sql_table_statistics_str, sanitize_table_name
 
 import random
 import string
````
````diff
@@ -64,6 +64,10 @@
 3. The [OUTPUT] must only contain two items:
     - a json object (wrapped in ```json```) representing the refined goal (including "mode", "recommendation", "output_fields", "chart_type", "visualization_fields")
     - a sql query block (wrapped in ```sql```) representing the transformation code, do not add any extra text explanation.
+
+some notes:
+    - in DuckDB, you escape a single quote within a string by doubling it ('') rather than using a backslash (\').
+    - in DuckDB, you need to use proper date functions to perform date operations.
 '''
 
 example = """
````
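Both new prompt notes match standard DuckDB behavior, which can be checked directly (the literals below are made up for illustration):

```python
import duckdb

con = duckdb.connect()

# Single quotes inside DuckDB string literals are escaped by doubling them.
print(con.execute("SELECT 'it''s a test' AS s").fetchone())  # ("it's a test",)

# Date arithmetic goes through date functions/casts, not string manipulation.
print(con.execute("SELECT date_part('year', DATE '2024-03-01')").fetchone())  # (2024,)
```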
````diff
@@ -167,21 +171,17 @@ def process_gpt_response(self, input_tables, messages, response):
                 row_count = self.conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
 
                 # Only limit to 5000 if there are more rows
-                if row_count > 5000:
-                    query_output = self.conn.execute(f"SELECT * FROM {table_name} LIMIT 5000").fetch_df()
-                else:
-                    query_output = self.conn.execute(f"SELECT * FROM {table_name}").fetch_df()
-                    self.conn.execute(f"DROP VIEW {table_name}")
+                query_output = self.conn.execute(f"SELECT * FROM {table_name} LIMIT 5000").fetch_df()
 
                 result = {
                     "status": "ok",
                     "code": code_str,
                     "content": {
-                        'rows': query_output.to_dict('records'),
+                        'rows': json.loads(query_output.to_json(orient='records')),
                         'virtual': {
                             'table_name': table_name,
                             'row_count': row_count
-                        } if row_count > 5000 else None
+                        }
                     },
                 }
             except Exception as e:
````
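The `to_dict('records')` to `json.loads(df.to_json(...))` switch in this hunk is about JSON safety: pandas' `to_json` renders NaN as null and timestamps as epoch milliseconds, while raw `to_dict` records can contain objects the `json` module refuses. A minimal sketch of the difference (invented dataframe):

```python
import json
import pandas as pd

df = pd.DataFrame({
    "x": [1.0, float("nan")],
    "t": pd.to_datetime(["2024-01-01", "2024-01-02"]),
})

# to_dict('records') keeps Timestamp objects, which json.dumps rejects,
# and it would emit non-standard NaN for missing floats.
try:
    json.dumps(df.to_dict("records"))
except TypeError as e:
    print("to_dict path fails:", e)

# The to_json round trip produces plain JSON types (null, numbers).
rows = json.loads(df.to_json(orient="records"))
print(rows)  # [{'x': 1.0, 't': 1704067200000}, {'x': None, 't': 1704153600000}]
```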
````diff
@@ -211,8 +211,9 @@ def process_gpt_response(self, input_tables, messages, response):
     def run(self, input_tables, description, n=1):
         data_summary = ""
         for table in input_tables:
-            table_summary_str = get_sql_table_statistics_str(self.conn, table['name'])
-            data_summary += f"[TABLE {table['name']}]\n\n{table_summary_str}\n\n"
+            table_name = sanitize_table_name(table['name'])
+            table_summary_str = get_sql_table_statistics_str(self.conn, table_name)
+            data_summary += f"[TABLE {table_name}]\n\n{table_summary_str}\n\n"
 
         user_query = f"[CONTEXT]\n\n{data_summary}\n\n[GOAL]\n\n{description}\n\n[OUTPUT]\n"
 
````

py-src/data_formulator/agents/agent_sql_data_transform.py

Lines changed: 44 additions & 16 deletions
````diff
@@ -9,13 +9,13 @@
 import pandas as pd
 
 import logging
-
+import re
 # Replace/update the logger configuration
 logger = logging.getLogger(__name__)
 
 SYSTEM_PROMPT = '''You are a data scientist to help user to transform data that will be used for visualization.
 The user will provide you information about what data would be needed, and your job is to create a sql query based on the input data summary, transformation instruction and expected fields.
-The users' instruction includes "expected fields" that the user want for visualization, and natural language instructions "goal" that describe what data is needed.
+The users' instruction includes "visualization_fields" that the user want for visualization, and natural language instructions "goal" that describe what data is needed.
 
 **Important:**
 - NEVER make assumptions or judgments about a person's gender, biological sex, sexuality, religion, race, nationality, ethnicity, political stance, socioeconomic status, mental health, invisible disabilities, medical conditions, personality type, social impressions, emotional state, and cognitive state.
````
````diff
@@ -24,15 +24,22 @@
 
 Concretely, you should first refine users' goal and then create a sql query in the [OUTPUT] section based off the [CONTEXT] and [GOAL]:
 
-1. First, refine users' [GOAL]. The main objective in this step is to check if "visualization_fields" provided by the user are sufficient to achieve their "goal". Concretely:
-    (1) based on the user's "goal", elaborate the goal into a "detailed_instruction".
+1. First, refine users' [GOAL]. The main objective in this step is to decide data transformation based on the user's goal.
+Concretely:
+    (1) based on the user's "goal" and provided "visualization_fields", elaborate the goal into a "detailed_instruction".
+        - first elaborate which fields the user wants to visualize based on "visualization_fields";
+        - then, elaborate the goal into a "detailed_instruction" contextualized with the provided "visualization_fields".
+            * note: try to distinguish whether the user wants to fitler the data with some conditions, or they want to aggregate data based on some fields.
+            * e.g., filter data to show all items from top 20 categories based on their average values, is different from showing the top 20 categories with their average values
     (2) determine "output_fields", the desired fields that the output data should have to achieve the user's goal, it's a good idea to include intermediate fields here.
-    (2) now, determine whether the user has provided sufficient fields in "visualization_fields" that are needed to achieve their goal:
-        - if the user's "visualization_fields" are sufficient, simply copy it.
+        - note: when the user asks for filtering the data, include all fields that are needed to filter the data in "output_fields" (as well as other fields the user asked for or necessary in computation).
+    (3) now, determine whether the user has provided sufficient fields in "visualization_fields" that are needed to achieve their goal:
+        - if the user's "visualization_fields" are sufficient, simply copy it from user input.
         - if the user didn't provide sufficient fields in "visualization_fields", add missing fields in "visualization_fields" (ordered them based on whether the field will be used in x,y axes or legends);
         - "visualization_fields" should only include fields that will be visualized (do not include other intermediate fields from "output_fields")
         - when adding new fields to "visualization_fields", be efficient and add only a minimal number of fields that are needed to achive the user's goal. generally, the total number of fields in "visualization_fields" should be no more than 3 for x,y,legend.
-
+        - if the user's goal is to filter the data, include all fields that are needed to filter the data in "output_fields" (as well as other fields the user asked for or necessary in computation).
+        - all existing fields user provided in "visualization_fields" should be included in "visualization_fields" list.
 Prepare the result in the following json format:
 
 ```
````
````diff
@@ -52,6 +59,10 @@
 3. The [OUTPUT] must only contain two items:
     - a json object (wrapped in ```json```) representing the refined goal (including "detailed_instruction", "output_fields", "visualization_fields" and "reason")
     - a sql query block (wrapped in ```sql```) representing the transformation code, do not add any extra text explanation.
+
+some notes:
+    - in DuckDB, you escape a single quote within a string by doubling it ('') rather than using a backslash (\').
+    - in DuckDB, you need to use proper date functions to perform date operations.
 '''
 
 EXAMPLE='''
````
````diff
@@ -104,6 +115,15 @@
 ```
 '''
 
+def sanitize_table_name(table_name: str) -> str:
+    """Sanitize table name to be used in SQL queries"""
+    # Replace spaces with underscores
+    sanitized_name = table_name.replace(" ", "_")
+    sanitized_name = sanitized_name.replace("-", "_")
+    # Allow alphanumeric, underscore, dot, dash, and dollar sign
+    sanitized_name = re.sub(r'[^a-zA-Z0-9_\.$]', '', sanitized_name)
+    return sanitized_name
+
 class SQLDataTransformationAgent(object):
 
     def __init__(self, client, conn, system_prompt=None):
````
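For reference, here is what the new helper does to a couple of inputs (a standalone copy of the committed logic, with invented names). Note that dashes are already replaced with underscores before the regex runs, so the character class never actually needs to keep them:

```python
import re

def sanitize_table_name(table_name: str) -> str:
    # Standalone copy of the committed helper, for illustration.
    sanitized_name = table_name.replace(" ", "_").replace("-", "_")
    return re.sub(r'[^a-zA-Z0-9_\.$]', '', sanitized_name)

print(sanitize_table_name("sales report-2024"))   # sales_report_2024
print(sanitize_table_name("weird (name)!.csv"))   # weird_name.csv
```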
````diff
@@ -156,17 +176,16 @@ def process_gpt_sql_response(self, response, messages):
                     query_output = self.conn.execute(f"SELECT * FROM {table_name} LIMIT 5000").fetch_df()
                 else:
                     query_output = self.conn.execute(f"SELECT * FROM {table_name}").fetch_df()
-                    self.conn.execute(f"DROP VIEW {table_name}")
 
                 result = {
                     "status": "ok",
                     "code": query_str,
                     "content": {
-                        'rows': query_output.to_dict('records'),
+                        'rows': json.loads(query_output.to_json(orient='records')),
                         'virtual': {
                             'table_name': table_name,
                             'row_count': row_count
-                        } if row_count > 5000 else None
+                        }
                     },
                 }
 
````
````diff
@@ -205,19 +224,24 @@ def run(self, input_tables, description, expected_fields: list[str], prev_messag
         """
 
         for table in input_tables:
-            table_name = table['name']
+            table_name = sanitize_table_name(table['name'])
+
             # Check if table exists in the connection
             try:
                 self.conn.execute(f"DESCRIBE {table_name}")
             except Exception:
                 # Table doesn't exist, create it from the dataframe
                 df = pd.DataFrame(table['rows'])
+
                 # Register the dataframe as a temporary view
-                self.conn.register(f'df_temp_{table_name}', df)
+                self.conn.register(f'df_temp', df)
                 # Create a permanent table from the temporary view
-                self.conn.execute(f"CREATE VIEW {table_name} AS SELECT * FROM df_temp_{table_name}")
+                self.conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df_temp")
                 # Drop the temporary view
-                self.conn.execute(f"DROP VIEW df_temp_{table_name}")
+                self.conn.execute(f"DROP VIEW df_temp")
+
+                r = self.conn.execute(f"SELECT * FROM {table_name} LIMIT 10").fetch_df()
+                print(r)
                 # Log the creation of the table
                 logger.info(f"Created table {table_name} from dataframe")
 
````
````diff
@@ -232,8 +256,9 @@ def run(self, input_tables, description, expected_fields: list[str], prev_messag
 
         data_summary = ""
         for table in input_tables:
-            table_summary_str = get_sql_table_statistics_str(self.conn, table['name'])
-            data_summary += f"[TABLE {table['name']}]\n\n{table_summary_str}\n\n"
+            table_name = sanitize_table_name(table['name'])
+            table_summary_str = get_sql_table_statistics_str(self.conn, table_name)
+            data_summary += f"[TABLE {table_name}]\n\n{table_summary_str}\n\n"
 
         goal = {
             "instruction": description,
````
````diff
@@ -276,6 +301,9 @@ def followup(self, input_tables, dialog, output_fields: list[str], new_instructi
 
 
 def get_sql_table_statistics_str(conn, table_name: str) -> str:
+    """Get a string representation of the table statistics"""
+
+    table_name = sanitize_table_name(table_name)
 
     # Get column information
     columns = conn.execute(f"DESCRIBE {table_name}").fetchall()
````

py-src/data_formulator/db_manager.py

Lines changed: 4 additions & 5 deletions
````diff
@@ -12,12 +12,11 @@ class DuckDBManager:
     def __init__(self, external_db_connections: Dict[str, Dict[str, Any]]):
         # Store session db file paths
         self._db_files: Dict[str, str] = {}
-        # Track which extensions have been installed for which db files
-        self._installed_extensions: Dict[str, List[str]] = {}
 
-        # external db connections
+        # External db connections and tracking of installed extensions
         self._external_db_connections: Dict[str, Dict[str, Any]] = external_db_connections
-
+        self._installed_extensions: Dict[str, List[str]] = {}
+
     @contextmanager
     def connection(self, session_id: str) -> ContextManager[duckdb.DuckDBPyConnection]:
         """Get a DuckDB connection as a context manager that will be closed when exiting the context"""
````
````diff
@@ -34,7 +33,7 @@ def get_connection(self, session_id: str) -> duckdb.DuckDBPyConnection:
         """Internal method to get or create a DuckDB connection for a session"""
         # Get or create the db file path for this session
         if session_id not in self._db_files or self._db_files[session_id] is None:
-            db_file = os.path.join(tempfile.gettempdir(), f"df_{session_id}.db")
+            db_file = os.path.join(tempfile.gettempdir(), f"df_{session_id}.duckdb")
             print(f"=== Creating new db file: {db_file}")
             self._db_files[session_id] = db_file
             # Initialize extension tracking for this file
````

py-src/data_formulator/tables_routes.py

Lines changed: 51 additions & 23 deletions
````diff
@@ -41,35 +41,52 @@ def list_tables():
     try:
         result = []
         with db_manager.connection(session['session_id']) as db:
-            table_metadata_list = db.execute("SELECT database_name, schema_name, table_name, schema_name==current_schema() as is_current_schema FROM duckdb_tables() WHERE internal=False").fetchall()
+            table_metadata_list = db.execute("""
+                SELECT database_name, schema_name, table_name, schema_name==current_schema() as is_current_schema, 'table' as object_type
+                FROM duckdb_tables()
+                WHERE internal=False
+                UNION ALL
+                SELECT database_name, schema_name, view_name as table_name, schema_name==current_schema() as is_current_schema, 'view' as object_type
+                FROM duckdb_views()
+                WHERE view_name NOT LIKE 'duckdb_%' AND view_name NOT LIKE 'sqlite_%' AND view_name NOT LIKE 'pragma_%'
+            """).fetchall()
 
-            print(f"table_metadata_list: {table_metadata_list}")
+
             for table_metadata in table_metadata_list:
-                [database_name, schema_name, table_name, is_current_schema] = table_metadata
-
+                [database_name, schema_name, table_name, is_current_schema, object_type] = table_metadata
                 table_name = table_name if is_current_schema else '.'.join([database_name, schema_name, table_name])
-                # Get column information
-                columns = db.execute(f"DESCRIBE {table_name}").fetchall()
-                # Get row count
-                row_count = db.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
-                sample_rows = db.execute(f"SELECT * FROM {table_name} LIMIT 1000").fetchdf()
+                if database_name in ['system', 'temp']:
+                    continue
+
 
-                # Check if this is a view or a table
+                print(f"table_metadata: {table_metadata}")
+
                 try:
-                    # Get both view existence and source in one query
-                    view_info = db.execute(f"SELECT view_name, sql FROM duckdb_views() WHERE view_name = '{table_name}'").fetchone()
-                    view_source = view_info[1] if view_info else None
+                    # Get column information
+                    columns = db.execute(f"DESCRIBE {table_name}").fetchall()
+                    # Get row count
+                    row_count = db.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
+                    sample_rows = db.execute(f"SELECT * FROM {table_name} LIMIT 1000").fetchdf()
+
+                    # Check if this is a view or a table
+                    try:
+                        # Get both view existence and source in one query
+                        view_info = db.execute(f"SELECT view_name, sql FROM duckdb_views() WHERE view_name = '{table_name}'").fetchone()
+                        view_source = view_info[1] if view_info else None
+                    except Exception as e:
+                        # If the query fails, assume it's a regular table
+                        view_source = None
+
+                    result.append({
+                        "name": table_name,
+                        "columns": [{"name": col[0], "type": col[1]} for col in columns],
+                        "row_count": row_count,
+                        "sample_rows": json.loads(sample_rows.to_json(orient='records')),
+                        "view_source": view_source
+                    })
                 except Exception as e:
-                    # If the query fails, assume it's a regular table
-                    view_source = None
-
-                result.append({
-                    "name": table_name,
-                    "columns": [{"name": col[0], "type": col[1]} for col in columns],
-                    "row_count": row_count,
-                    "sample_rows": json.loads(sample_rows.to_json(orient='records')),
-                    "view_source": view_source
-                })
+                    logger.error(f"Error getting table metadata for {table_name}: {str(e)}")
+                    continue
 
         return jsonify({
             "status": "success",
````
157174
with db_manager.connection(session['session_id']) as db:
158175
# Get valid column names
159176
columns = [col[0] for col in db.execute(f"DESCRIBE {table_id}").fetchall()]
177+
178+
print(f"columns: {columns}")
160179

161180
# Filter order_by_fields to only include valid column names
162181
valid_order_by_fields = [field for field in order_by_fields if field in columns]
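The surrounding context shows the route's allow-list pattern: only names that actually came back from `DESCRIBE` survive the filter, so user-supplied field names are never interpolated into SQL unchecked. A tiny sketch with invented values:

```python
# Only names present in the DESCRIBE output pass through the filter.
columns = ["date", "sales", "region"]            # from DESCRIBE (invented)
order_by_fields = ["sales", "1; DROP TABLE t"]   # user-supplied (invented)
valid_order_by_fields = [f for f in order_by_fields if f in columns]
print(valid_order_by_fields)  # ['sales']
```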
````diff
@@ -168,11 +187,16 @@
 
             query, output_column_names = assemble_query(valid_aggregate_fields_and_functions, valid_select_fields, columns, table_id)
 
+            print(f"query: {query}")
+            print(f"output_column_names: {output_column_names}")
+
             # Modify the original query to include the count:
             count_query = f"SELECT *, COUNT(*) OVER () as total_count FROM ({query}) as subq LIMIT 1"
             result = db.execute(count_query).fetchone()
             total_row_count = result[-1] if result else 0
 
+            print(f"total_row_count: {total_row_count}")
+
             # Add ordering and limit to the main query
             if method == 'random':
                 query += f" ORDER BY RANDOM() LIMIT {sample_size}"
````
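The `COUNT(*) OVER ()` trick in the context above gets the total result size from a single probe row instead of a second full scan of the query. A self-contained sketch (table invented):

```python
import duckdb

con = duckdb.connect()
con.execute("CREATE TABLE items AS SELECT * FROM range(42) t(i)")  # invented

query = "SELECT i FROM items WHERE i % 2 = 0"
# COUNT(*) OVER () attaches the full result size to every row; LIMIT 1
# keeps the probe to a single transferred row.
count_query = f"SELECT *, COUNT(*) OVER () as total_count FROM ({query}) as subq LIMIT 1"
row = con.execute(count_query).fetchone()
print(row[-1])  # 21
```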
````diff
@@ -191,8 +215,12 @@
             else:
                 query += f" ORDER BY ROWID DESC LIMIT {sample_size}"
 
+            print(f"query: {query}")
+
             result = db.execute(query).fetchdf()
 
+            print(f"result: {result}")
+
             return jsonify({
                 "status": "success",
                 "rows": json.loads(result.to_json(orient='records')),
````
