1+ #[ cfg( feature = "integration" ) ]
2+ pub mod sqllogictest {
3+ use async_trait:: async_trait;
4+ use datafusion:: arrow:: array:: RecordBatch ;
5+ use datafusion:: arrow:: datatypes:: { DataType , Field , Schema } ;
6+ use datafusion:: execution:: context:: SessionContext ;
7+ use crate :: test_utils:: localhost:: start_localhost_context;
8+ use crate :: DefaultSessionBuilder ;
9+ use sqllogictest:: { AsyncDB , DefaultColumnType , DBOutput } ;
10+ use std:: sync:: Arc ;
11+
/// sqllogictest database adapter that runs statements against a distributed
/// DataFusion cluster started on localhost (see [`Self::new`]).
pub struct DataFusionDistributedDB {
    // Session context wired to the distributed localhost workers; all SQL in
    // `AsyncDB::run` is executed through it.
    ctx: SessionContext,
}
15+
16+ impl DataFusionDistributedDB {
17+ pub async fn new ( num_nodes : usize ) -> Self {
18+ // Start distributed context with specified number of nodes
19+ let ( ctx, _guard) = start_localhost_context ( num_nodes, DefaultSessionBuilder ) . await ;
20+
21+ // Use existing parquet tables from the test_utils
22+ use crate :: test_utils:: parquet:: register_parquet_tables;
23+ register_parquet_tables ( & ctx) . await . unwrap ( ) ;
24+
25+ // Keep the guard alive by forgetting it (for CLI purposes)
26+ std:: mem:: forget ( _guard) ;
27+
28+ Self { ctx }
29+ }
30+
31+ async fn handle_explain_analyze ( & mut self , sql : & str ) -> Result < DBOutput < DefaultColumnType > , datafusion:: error:: DataFusionError > {
32+ // For now, just treat it as a regular EXPLAIN
33+ // TODO: Implement proper distributed EXPLAIN ANALYZE with metrics
34+ let explain_sql = sql. replace ( "EXPLAIN ANALYZE" , "EXPLAIN" ) ;
35+ let df = self . ctx . sql ( & explain_sql) . await ?;
36+ let batches = df. collect ( ) . await ?;
37+ self . convert_batches_to_output ( batches)
38+ }
39+
40+ // TODO: Implement proper metrics obfuscation for EXPLAIN ANALYZE
41+ #[ allow( dead_code) ]
42+ fn obfuscate_metrics ( & self , plan_str : & str ) -> String {
43+ use regex:: Regex ;
44+
45+ let mut obfuscated = plan_str. to_string ( ) ;
46+
47+ // Replace timestamps with <TIMESTAMP>
48+ let timestamp_regex = Regex :: new ( r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+" ) . unwrap ( ) ;
49+ obfuscated = timestamp_regex. replace_all ( & obfuscated, "<TIMESTAMP>" ) . to_string ( ) ;
50+
51+ // Replace durations (e.g., "123.456ms", "1.234s") with <DURATION>
52+ let duration_regex = Regex :: new ( r"\d+\.\d+[μmn]?s" ) . unwrap ( ) ;
53+ obfuscated = duration_regex. replace_all ( & obfuscated, "<DURATION>" ) . to_string ( ) ;
54+
55+ // Replace memory sizes (e.g., "1.2MB", "345KB") with <MEMORY>
56+ let memory_regex = Regex :: new ( r"\d+\.\d+[KMGT]?B" ) . unwrap ( ) ;
57+ obfuscated = memory_regex. replace_all ( & obfuscated, "<MEMORY>" ) . to_string ( ) ;
58+
59+ // Replace row counts and other large numbers with <COUNT>
60+ let count_regex = Regex :: new ( r"rows=\d+" ) . unwrap ( ) ;
61+ obfuscated = count_regex. replace_all ( & obfuscated, "rows=<COUNT>" ) . to_string ( ) ;
62+
63+ obfuscated
64+ }
65+
66+ fn convert_batches_to_output ( & self , batches : Vec < RecordBatch > ) -> Result < DBOutput < DefaultColumnType > , datafusion:: error:: DataFusionError > {
67+ if batches. is_empty ( ) {
68+ return Ok ( DBOutput :: Rows {
69+ types : vec ! [ ] ,
70+ rows : vec ! [ ] ,
71+ } ) ;
72+ }
73+
74+ let num_columns = batches[ 0 ] . num_columns ( ) ;
75+ let column_types = vec ! [ DefaultColumnType :: Text ; num_columns] ; // Everything as text
76+
77+ let mut rows = Vec :: new ( ) ;
78+ for batch in batches {
79+ for row_idx in 0 ..batch. num_rows ( ) {
80+ let mut row = Vec :: new ( ) ;
81+ for col_idx in 0 ..batch. num_columns ( ) {
82+ let column = batch. column ( col_idx) ;
83+ let value = datafusion:: arrow:: util:: display:: array_value_to_string ( column, row_idx)
84+ . map_err ( |e| datafusion:: error:: DataFusionError :: ArrowError ( Box :: new ( e) , None ) ) ?;
85+ row. push ( value) ;
86+ }
87+ rows. push ( row) ;
88+ }
89+ }
90+
91+ Ok ( DBOutput :: Rows {
92+ types : column_types,
93+ rows,
94+ } )
95+ }
96+ }
97+
98+ #[ async_trait]
99+ impl AsyncDB for DataFusionDistributedDB {
100+ type Error = datafusion:: error:: DataFusionError ;
101+ type ColumnType = DefaultColumnType ;
102+
103+ async fn run ( & mut self , sql : & str ) -> Result < DBOutput < Self :: ColumnType > , Self :: Error > {
104+ let sql = sql. trim ( ) ;
105+
106+ // Handle different types of SQL statements
107+ if sql. to_uppercase ( ) . starts_with ( "CREATE" ) ||
108+ sql. to_uppercase ( ) . starts_with ( "INSERT" ) ||
109+ sql. to_uppercase ( ) . starts_with ( "DROP" ) {
110+ // For DDL/DML statements, just return an empty result
111+ return Ok ( DBOutput :: StatementComplete ( 0 ) ) ;
112+ }
113+
114+ // Handle EXPLAIN ANALYZE
115+ if sql. to_uppercase ( ) . starts_with ( "EXPLAIN ANALYZE" ) {
116+ return self . handle_explain_analyze ( sql) . await ;
117+ }
118+
119+ // Handle regular EXPLAIN - use distributed optimizer
120+ if sql. to_uppercase ( ) . starts_with ( "EXPLAIN" ) {
121+ let query = sql. trim_start_matches ( "EXPLAIN" ) . trim ( ) ;
122+ let df = self . ctx . sql ( query) . await ?;
123+ let physical_plan = df. create_physical_plan ( ) . await ?;
124+
125+ // Apply distributed optimizer to get the distributed plan
126+ use crate :: DistributedPhysicalOptimizerRule ;
127+ use datafusion:: physical_optimizer:: PhysicalOptimizerRule ;
128+ use datafusion:: physical_plan:: displayable;
129+
130+ let physical_distributed = DistributedPhysicalOptimizerRule :: default ( )
131+ . with_network_shuffle_tasks ( 2 )
132+ . with_network_coalesce_tasks ( 2 )
133+ . optimize ( physical_plan, & Default :: default ( ) ) ?;
134+
135+ let physical_distributed_str = displayable ( physical_distributed. as_ref ( ) )
136+ . indent ( true )
137+ . to_string ( ) ;
138+
139+ // Create a RecordBatch with the plan string
140+ use datafusion:: arrow:: array:: { ArrayRef , StringArray } ;
141+ use datafusion:: arrow:: datatypes:: { DataType , Field , Schema } ;
142+
143+ let lines: Vec < String > = physical_distributed_str. lines ( ) . map ( |s| s. to_string ( ) ) . collect ( ) ;
144+ let schema = Arc :: new ( Schema :: new ( vec ! [ Field :: new( "plan" , DataType :: Utf8 , false ) ] ) ) ;
145+ let batch = RecordBatch :: try_new (
146+ schema,
147+ vec ! [ Arc :: new( StringArray :: from( lines) ) as ArrayRef ] ,
148+ ) ?;
149+
150+ return self . convert_batches_to_output ( vec ! [ batch] ) ;
151+ }
152+
153+ // Execute query
154+ let df = self . ctx . sql ( sql) . await ?;
155+ let batches = df. collect ( ) . await ?;
156+
157+ self . convert_batches_to_output ( batches)
158+ }
159+
160+ fn engine_name ( & self ) -> & str {
161+ "datafusion-distributed"
162+ }
163+ }
164+ }
0 commit comments