Replies: 7 comments 2 replies
-
Forgot to show the results:

PostgreSQL connection pool created successfully
-
Hey @djouallah, thanks for reporting. First things first, I think you raise a valid point regarding the performance of inlining. Now a couple of comments:

**Regarding your script**

I get different timings. This could be related to OS and platform, but I sit around 60 transactions/s with 1 worker and 80 with two. Another thing I notice is that you do the timing in the main program rather than timing each thread. If you just want to measure the throughput of one connection, it is better to time inside each thread.

**Regarding DuckLake's performance**

An insertion via Data Inlining in DuckLake will never match a plain Postgres table. For every insertion you do in DuckLake, you are creating a new snapshot, which requires a bunch of queries and subsequent updates/inserts to tables.

**A few other notes**

If we can get closer to pure Postgres performance by reducing some of the overhead (let's say we get 3x slower or even less), we would be pretty happy. I think inlining will not only improve insertion speed to lakehouses, but most importantly it solves the small-files problem and large compaction jobs. Also, we may consider testing other catalogs in the future (maybe variations of Postgres) that can scale better and provide a better baseline. Thanks!
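To illustrate the point about timing inside each thread rather than in the main program, here is a minimal sketch. `do_insert` is a hypothetical stand-in for the real per-transaction DuckLake/Postgres insert, not actual DuckLake API:

```python
import threading
import time

def run_worker(n_ops, do_insert, results, idx):
    # Time the loop inside the worker itself, so each thread
    # reports the throughput of its own connection.
    start = time.perf_counter()
    for _ in range(n_ops):
        do_insert()
    elapsed = time.perf_counter() - start
    results[idx] = n_ops / elapsed  # transactions/s for this thread

def benchmark(n_workers, n_ops, do_insert):
    results = [0.0] * n_workers
    threads = [
        threading.Thread(target=run_worker, args=(n_ops, do_insert, results, i))
        for i in range(n_workers)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return results

if __name__ == "__main__":
    # Dummy insert simulating ~1 ms of work per transaction.
    dummy_insert = lambda: time.sleep(0.001)
    for i, tps in enumerate(benchmark(2, 50, dummy_insert)):
        print(f"worker {i}: {tps:.0f} tx/s")
```

Timing in the main thread folds in thread startup/join overhead and measures aggregate wall time; timing per thread isolates the throughput of each connection.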
-
Moving this to a discussion since it is not a bug.
-
Now I have a bigger problem: conflicts and data loss under heavy write use cases.
-
I get it, data inlining works only with INSERT, not UPDATE and DELETE!
-
Actually, transaction conflicts are generally a problem that DuckLake has because of the way we handle snapshotting. We are looking into this. But this should already be a bit better in 1.4.4, since we've reduced the number of roundtrips to Postgres (less time for writers to grab conflicting snapshot ids). Data loss is another thing, though. I wouldn't expect any data loss; I'm not sure what you mean there.
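Since a conflicting snapshot commit surfaces as a failed transaction rather than silent data loss, one common application-side mitigation is to retry the whole transaction with backoff. A minimal sketch, where `TransactionConflict` and the `commit` callable are hypothetical placeholders (not DuckLake API):

```python
import random
import time

class TransactionConflict(Exception):
    """Placeholder for the conflict error a snapshot commit can raise."""

def commit_with_retry(commit, max_retries=5, base_delay=0.05):
    # Retry a commit on snapshot conflicts with exponential backoff plus
    # jitter. Re-raises after the last attempt, so the caller sees the
    # failure instead of silently losing the write.
    for attempt in range(max_retries + 1):
        try:
            return commit()
        except TransactionConflict:
            if attempt == max_retries:
                raise
            delay = base_delay * (2 ** attempt) + random.uniform(0, base_delay)
            time.sleep(delay)
```

The jitter spreads retries of concurrent writers apart, which matters precisely in the heavy-write scenario described above.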
-
@guillesd Sorry, I misspoke. What I meant is that if DuckLake is used as a system of record, some transactions under heavy write load may fail to be committed at all. |

-
What happens?
Testing data inlining in a local Postgres: a plain heap table is around 10x faster compared to DuckLake.
To Reproduce
OS:
windows
DuckDB Version:
1.4.3
DuckLake Version:
0.2
DuckDB Client:
python
Hardware:
No response
Full Name:
mim
Affiliation:
personal
What is the latest build you tested with? If possible, we recommend testing with the latest nightly build.
I have not tested with any build
Did you include all relevant data sets for reproducing the issue?
No - Other reason (please specify in the issue body)
Did you include all code required to reproduce the issue?
Did you include all relevant configuration (e.g., CPU architecture, Python version, Linux distribution) to reproduce the issue?