README.md: 35 additions & 13 deletions
@@ -1,4 +1,4 @@
-# DataFusion Distributed
+# Distributed DataFusion
 
 [![Apache licensed][license-badge]][license-url]
@@ -7,7 +7,7 @@
 
 ## Overview
 
-DataFusion Distributed is a distributed execution framework that enables DataFusion DataFrame and SQL queries to run in a distributed fashion. This project provides the infrastructure to scale DataFusion workloads across multiple nodes in a cluster.
+Distributed DataFusion is a distributed execution framework that enables DataFusion DataFrame and SQL queries to run in a distributed fashion. This project provides the infrastructure to scale DataFusion workloads across multiple nodes in a cluster.
 
 This is an open source version of the distributed DataFusion prototype, extracted from DataDog's internal implementation and made available to the community.
@@ -100,20 +100,42 @@ protoc --version
 ./build.sh --release
 ```
 
+**Clean Rebuild**: If you need to completely clean and rebuild (removes all build artifacts):
+
+```bash
+# Clean rebuild in debug mode
+./clean_and_build.sh
+
+# Clean rebuild in release mode (optimized)
+./clean_and_build.sh --release
+```
+
 #### Using Cargo Directly
 
-To build the project in debug mode:
+You can also build the project directly with Cargo (the build.rs script will automatically handle Protocol Buffer compilation):
 
 ```bash
+# Build in debug mode
 cargo build
 ```
 
-To build the project in release mode (optimized):
+```bash
+# Build in release mode (optimized)
+cargo build --release
+```
+
+**Clean Build Artifacts**: To clean previous build artifacts before rebuilding:
 
 ```bash
+# Clean all build artifacts (removes target/ directory contents)
+cargo clean
+
+# Then rebuild
 cargo build --release
 ```
 
+**Note**: Both the `build.sh` script and `cargo` automatically invoke `build.rs`, which handles Protocol Buffer compilation before building the main crate. The main advantage of using `./build.sh` is the user-friendly output and usage examples it provides.
+
 ### Running Tests
 
 Run all tests:
@@ -203,10 +225,10 @@ In separate terminal windows, start two workers:
 To make your cluster aware of specific table schemas, you’ll need to define a new environment variable, DFRAY_TABLES, when starting each worker and proxy. This variable should specify tables whose data is stored in Parquet files. For example, the following setup registers two tables, customer and nation, along with their corresponding data sources.
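The example setup referenced above is not included in this excerpt. A minimal sketch of what such an environment setup might look like, assuming a comma-separated `table:path` format for DFRAY_TABLES (the exact syntax and the file paths here are assumptions, not taken from the project):

```shell
# Hypothetical DFRAY_TABLES format: comma-separated "table:path" pairs.
# The actual syntax expected by the workers and proxy may differ.
export DFRAY_TABLES="customer:/data/tpch/customer.parquet,nation:/data/tpch/nation.parquet"

# The same variable must be set for every worker and for the proxy,
# so each process registers the same customer and nation tables.
echo "$DFRAY_TABLES"
```

Setting the variable once in a shared startup script keeps the worker and proxy table registrations consistent.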