Skip to content

Commit f1c2b33

Browse files
authored
Merge branch 'dev' into adding-support-for-mssql
2 parents 6d88c8c + 3b98198 commit f1c2b33

24 files changed

+1051
-547
lines changed

local_server.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,6 @@
55
# export http_proxy=http://127.0.0.1:7890
66
# export https_proxy=http://127.0.0.1:7890
77

8-
env FLASK_APP=py-src/data_formulator/app.py FLASK_RUN_PORT=5000 FLASK_RUN_HOST=0.0.0.0 flask run
8+
#env FLASK_APP=py-src/data_formulator/app.py FLASK_RUN_PORT=5000 FLASK_RUN_HOST=0.0.0.0 flask run
9+
export FLASK_RUN_PORT=5000
10+
python -m py-src.data_formulator.app --port ${FLASK_RUN_PORT} --dev

package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77
"@emotion/react": "^11.14.0",
88
"@emotion/styled": "^11.14.0",
99
"@fontsource/roboto": "^4.5.5",
10-
"@mui/icons-material": "^5.14.0",
11-
"@mui/material": "^7.0.2",
10+
"@mui/icons-material": "^7.1.1",
11+
"@mui/material": "^7.1.1",
1212
"@reduxjs/toolkit": "^1.8.6",
1313
"@types/dompurify": "^3.0.5",
1414
"@types/validator": "^13.12.2",

py-src/data_formulator/agent_routes.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -425,8 +425,6 @@ def request_code_expl():
425425
if request.is_json:
426426
logger.info("# request data: ")
427427
content = request.get_json()
428-
token = content["token"]
429-
430428
client = get_client(content['model'])
431429

432430
# each table is a dict with {"name": xxx, "rows": [...]}

py-src/data_formulator/app.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,17 @@
2929
from dotenv import load_dotenv
3030
import secrets
3131
import base64
32-
APP_ROOT = Path(os.path.join(Path(__file__).parent)).absolute()
32+
APP_ROOT = Path(Path(__file__).parent).absolute()
3333

3434
import os
3535

3636
# blueprints
3737
from data_formulator.tables_routes import tables_bp
3838
from data_formulator.agent_routes import agent_bp
39+
from data_formulator.sse_routes import sse_bp
3940

41+
import queue
42+
from typing import Dict, Any
4043

4144
app = Flask(__name__, static_url_path='', static_folder=os.path.join(APP_ROOT, "dist"))
4245
app.secret_key = secrets.token_hex(16) # Generate a random secret key for sessions
@@ -65,6 +68,7 @@ def default(self, obj):
6568
# register blueprints
6669
app.register_blueprint(tables_bp)
6770
app.register_blueprint(agent_bp)
71+
app.register_blueprint(sse_bp)
6872

6973
print(APP_ROOT)
7074

@@ -252,6 +256,8 @@ def parse_args() -> argparse.Namespace:
252256
help="Whether to execute python in subprocess, it makes the app more secure (reducing the chance for the model to access the local machine), but increases the time of response")
253257
parser.add_argument("-d", "--disable-display-keys", action='store_true', default=False,
254258
help="Whether disable displaying keys in the frontend UI, recommended to turn on if you host the app not just for yourself.")
259+
parser.add_argument("--dev", action='store_true', default=False,
260+
help="Launch the app in development mode (prevents the app from opening the browser automatically)")
255261
return parser.parse_args()
256262

257263

@@ -264,11 +270,12 @@ def run_app():
264270
'disable_display_keys': args.disable_display_keys
265271
}
266272

267-
url = "http://localhost:{0}".format(args.port)
268-
threading.Timer(2, lambda: webbrowser.open(url, new=2)).start()
273+
if not args.dev:
274+
url = "http://localhost:{0}".format(args.port)
275+
threading.Timer(2, lambda: webbrowser.open(url, new=2)).start()
269276

270277
app.run(host='0.0.0.0', port=args.port, threaded=True)
271-
278+
272279
if __name__ == '__main__':
273280
#app.run(debug=True, host='127.0.0.1', port=5000)
274281
#use 0.0.0.0 for public

py-src/data_formulator/data_loader/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,16 @@
44
from data_formulator.data_loader.kusto_data_loader import KustoDataLoader
55
from data_formulator.data_loader.s3_data_loader import S3DataLoader
66
from data_formulator.data_loader.azure_blob_data_loader import AzureBlobDataLoader
7+
from data_formulator.data_loader.postgresql_data_loader import PostgreSQLDataLoader
78

89
DATA_LOADERS = {
910
"mysql": MySQLDataLoader,
1011
"mssql": MSSQLDataLoader,
1112
"kusto": KustoDataLoader,
1213
"s3": S3DataLoader,
1314
"azure_blob": AzureBlobDataLoader,
15+
"postgresql": PostgreSQLDataLoader
1416
}
1517

16-
__all__ = ["ExternalDataLoader", "MySQLDataLoader", "MSSQLDataLoader", "KustoDataLoader", "S3DataLoader", "AzureBlobDataLoader", "DATA_LOADERS"]
18+
__all__ = ["ExternalDataLoader", "MySQLDataLoader", "MSSQLDataLoader", "KustoDataLoader", "S3DataLoader", "AzureBlobDataLoader","PostgreSQLDataLoader","DATA_LOADERS"]
19+
py-src/data_formulator/data_loader/postgresql_data_loader.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
import json
2+
3+
import pandas as pd
4+
import duckdb
5+
6+
from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name
7+
from typing import Dict, Any, List
8+
9+
class PostgreSQLDataLoader(ExternalDataLoader):
    """Expose a PostgreSQL database to DuckDB via DuckDB's ``postgres`` extension.

    On construction the database is attached to the supplied DuckDB connection
    under the alias ``mypostgresdb``; every other method queries through that
    alias.
    """

    @staticmethod
    def list_params() -> List[Dict[str, Any]]:
        """Return the descriptors of the connection parameters this loader accepts."""
        return [
            {"name": "user", "type": "string", "required": True, "default": "postgres", "description": "PostgreSQL username"},
            {"name": "password", "type": "string", "required": False, "default": "", "description": "leave blank for no password"},
            {"name": "host", "type": "string", "required": True, "default": "localhost", "description": "PostgreSQL host"},
            {"name": "port", "type": "string", "required": False, "default": "5432", "description": "PostgreSQL port"},
            {"name": "database", "type": "string", "required": True, "default": "postgres", "description": "PostgreSQL database name"},
        ]

    @staticmethod
    def auth_instructions() -> str:
        """Return the help text describing what the user must provide."""
        return "Provide your PostgreSQL connection details. The user must have SELECT permissions on the tables you want to access."

    @staticmethod
    def _conn_value(value: Any) -> str:
        """Render one value of a libpq ``keyword=value`` connection string.

        Plain values are emitted unchanged. Empty values and values containing
        whitespace, single quotes or backslashes are wrapped in single quotes
        with ``\\`` and ``'`` backslash-escaped, as libpq requires.
        """
        text = str(value)
        if text and not any(ch.isspace() or ch in "'\\" for ch in text):
            return text
        escaped = text.replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    @staticmethod
    def _quote_identifier(identifier: Any) -> str:
        """Return ``identifier`` as a double-quoted SQL identifier."""
        return '"' + str(identifier).replace('"', '""') + '"'

    @classmethod
    def _qualified_name(cls, table_name: str) -> str:
        """Quote each part of a dotted ``catalog.schema.table`` name.

        A name that already contains a double quote is assumed to have been
        quoted by the caller and is returned untouched. Only the first two
        dots are treated as separators, so a table name that itself contains
        a dot stays in one piece.
        """
        if '"' in table_name:
            return table_name
        return ".".join(cls._quote_identifier(part) for part in table_name.split(".", 2))

    def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
        """Attach the PostgreSQL database described by ``params`` as ``mypostgresdb``.

        Args:
            params: Connection settings; ``host``, ``user`` and ``database`` are
                read unconditionally, ``port`` and ``password`` are optional.
            duck_db_conn: DuckDB connection the database is attached to.

        Raises:
            KeyError: If ``host``, ``user`` or ``database`` is missing.
            Exception: Any error raised while installing/loading the extension
                or attaching the database is printed and re-raised.
        """
        self.params = params
        self.duck_db_conn = duck_db_conn

        try:
            # Install and load the Postgres extension
            self.duck_db_conn.install_extension("postgres")
            self.duck_db_conn.load_extension("postgres")

            # Build the libpq connection string. A blank port falls back to the
            # default instead of producing an invalid "port=" entry.
            port = self.params.get('port') or '5432'
            password = self.params.get('password')
            parts = [
                f"host={self._conn_value(self.params['host'])}",
                f"port={self._conn_value(port)}",
                f"user={self._conn_value(self.params['user'])}",
            ]
            if password:
                parts.append(f"password={self._conn_value(password)}")
            parts.append(f"dbname={self._conn_value(self.params['database'])}")
            attach_string = " ".join(parts)

            # Best-effort detach of a previous attachment; failing here just
            # means nothing was attached under this alias.
            try:
                self.duck_db_conn.execute("DETACH mypostgresdb;")
            except Exception:
                pass

            # The connection string is embedded in a SQL string literal, so any
            # single quote in it must be doubled.
            attach_literal = attach_string.replace("'", "''")
            self.duck_db_conn.execute(f"ATTACH '{attach_literal}' AS mypostgresdb (TYPE postgres);")
            print(f"Successfully connected to PostgreSQL database: {self.params['database']}")

        except Exception as e:
            print(f"Failed to connect to PostgreSQL: {e}")
            raise

    def _table_metadata(self, quoted_name: str) -> Dict[str, Any]:
        """Collect columns, up to 10 sample rows and the row count of one table."""
        columns_df = self.duck_db_conn.execute(f"DESCRIBE {quoted_name}").df()
        columns = [
            {'name': col_name, 'type': col_type}
            for col_name, col_type in zip(columns_df['column_name'], columns_df['column_type'])
        ]

        sample_df = self.duck_db_conn.execute(f"SELECT * FROM {quoted_name} LIMIT 10").df()
        # Round-trip through JSON so the rows are plain JSON-compatible values.
        sample_rows = json.loads(sample_df.to_json(orient="records"))

        row_count = self.duck_db_conn.execute(f"SELECT COUNT(*) FROM {quoted_name}").fetchone()[0]

        return {
            "row_count": row_count,
            "columns": columns,
            "sample_rows": sample_rows,
        }

    def list_tables(self):
        """List the base tables of the attached database with their metadata.

        Returns:
            A list of ``{"name": "mypostgresdb.<schema>.<table>", "metadata": {...}}``
            dicts, where metadata holds ``row_count``, ``columns`` and
            ``sample_rows``. Tables that fail to be inspected are skipped; if
            the listing query itself fails an empty list is returned.
        """
        try:
            # Query tables through DuckDB's attached PostgreSQL connection
            tables_df = self.duck_db_conn.execute("""
                SELECT table_schema as schemaname, table_name as tablename
                FROM mypostgresdb.information_schema.tables
                WHERE table_schema NOT IN ('information_schema', 'pg_catalog', 'pg_toast')
                AND table_schema NOT LIKE '%_intern%'
                AND table_schema NOT LIKE '%timescaledb%'
                AND table_name NOT LIKE '%/%'
                AND table_type = 'BASE TABLE'
                ORDER BY table_schema, table_name
            """).fetch_df()

            print(f"Found tables: {tables_df}")

            results = []

            for schema, table_name in tables_df.values:
                # Name reported to callers (unchanged format).
                full_table_name = f"mypostgresdb.{schema}.{table_name}"
                # Quoted form used in SQL so names with special characters work.
                quoted_name = ".".join(
                    self._quote_identifier(part)
                    for part in ("mypostgresdb", schema, table_name)
                )

                try:
                    table_metadata = self._table_metadata(quoted_name)
                except Exception as e:
                    print(f"Error processing table {full_table_name}: {e}")
                    continue

                results.append({
                    "name": full_table_name,
                    "metadata": table_metadata
                })

            return results

        except Exception as e:
            print(f"Error listing tables: {e}")
            return []

    def ingest_data(self, table_name: str, name_as: str | None = None, size: int = 1000000):
        """Copy up to ``size`` rows of a PostgreSQL table into DuckDB's ``main`` schema.

        Args:
            table_name: Dotted table name as reported by ``list_tables``.
            name_as: Target table name; defaults to the last dotted component
                of ``table_name``. It is passed through ``sanitize_table_name``.
            size: Maximum number of rows to copy.

        Raises:
            ValueError: If ``size`` cannot be converted to an integer.
        """
        if name_as is None:
            name_as = table_name.split('.')[-1]

        name_as = sanitize_table_name(name_as)

        # Coerce to int so nothing but a number can reach the LIMIT clause.
        row_limit = int(size)
        source = self._qualified_name(table_name)

        self.duck_db_conn.execute(f"""
            CREATE OR REPLACE TABLE main.{name_as} AS
            SELECT * FROM {source}
            LIMIT {row_limit}
        """)

    def view_query_sample(self, query: str) -> List[Dict[str, Any]]:
        """Run ``query`` and return its first 10 rows as JSON-compatible records."""
        sample_df = self.duck_db_conn.execute(query).df().head(10)
        return json.loads(sample_df.to_json(orient="records"))

    def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
        """Run ``query``, store the result in DuckDB as ``name_as`` and return it.

        The DataFrame is stored through ``self.ingest_df_to_duckdb``, which is
        not defined in this class (presumably inherited from
        ``ExternalDataLoader`` -- confirm).
        """
        df = self.duck_db_conn.execute(query).df()
        self.ingest_df_to_duckdb(df, name_as)
        return df

0 commit comments

Comments
 (0)