Skip to content

Conversation

@codeflash-ai
Copy link

@codeflash-ai codeflash-ai bot commented Jun 23, 2025

📄 23,047% (230.47x) speedup for find_leaf_nodes in src/dsa/nodes.py

⏱️ Runtime: 189 milliseconds → 815 microseconds (best of 475 runs)

📝 Explanation and details

Here's a faster version of your function. The original code checks every edge for every node, which can be slow for large graphs (O(N×E)).
This rewrite first collects all node IDs that appear as a source in any edge (O(E)). Then, it performs a single pass over nodes (O(N)) to check which nodes are not in this source set. The new overall complexity is O(N+E).

This is much faster, especially when the number of nodes or edges grows large.
All comments are preserved as before.

Correctness verification report:

| Test | Status |
| --- | --- |
| ⚙️ Existing Unit Tests | 🔘 None Found |
| 🌀 Generated Regression Tests | ✅ 51 Passed |
| ⏪ Replay Tests | 🔘 None Found |
| 🔎 Concolic Coverage Tests | ✅ 2 Passed |
| 📊 Tests Coverage | 100.0% |
🌀 Generated Regression Tests and Runtime
import pytest  # used for our unit tests
from src.dsa.nodes import find_leaf_nodes

# ----------------------
# unit tests
# ----------------------

# 1. BASIC TEST CASES

def test_single_node_no_edges():
    """A lone node with no edges has no outgoing edge, so it is a leaf."""
    nodes = [{"id": 1}]
    edges = []
    codeflash_output = find_leaf_nodes(nodes, edges) # 417ns -> 625ns (33.3% slower)
    assert codeflash_output == [{"id": 1}]

def test_two_nodes_one_edge():
    """With a single edge 1 -> 2, only node 2 lacks an outgoing edge."""
    nodes = [{"id": 1}, {"id": 2}]
    edges = [{"source": 1, "target": 2}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 750ns -> 708ns (5.93% faster)
    assert codeflash_output == [{"id": 2}]

def test_three_nodes_chain():
    """In the chain 1 -> 2 -> 3, only the tail node 3 is a leaf."""
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}]
    edges = [{"source": 1, "target": 2}, {"source": 2, "target": 3}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 959ns -> 791ns (21.2% faster)
    assert codeflash_output == [{"id": 3}]

def test_three_nodes_star():
    """In the star 1 -> 2, 1 -> 3, both spokes (2 and 3) are leaves."""
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}]
    edges = [{"source": 1, "target": 2}, {"source": 1, "target": 3}]
    codeflash_output = find_leaf_nodes(nodes, edges); result = codeflash_output # 1.04μs -> 833ns (25.1% faster)
    assert result == [{"id": 2}, {"id": 3}]

def test_no_edges_multiple_nodes():
    """Without any edges, every node is a leaf and input order is kept."""
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}]
    edges = []
    codeflash_output = find_leaf_nodes(nodes, edges) # 542ns -> 667ns (18.7% slower)
    assert codeflash_output == [{"id": 1}, {"id": 2}, {"id": 3}]

def test_all_nodes_have_outgoing_edges():
    """A two-node cycle gives every node an outgoing edge, so no leaves."""
    nodes = [{"id": 1}, {"id": 2}]
    edges = [{"source": 1, "target": 2}, {"source": 2, "target": 1}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 791ns -> 708ns (11.7% faster)
    assert codeflash_output == []

def test_multiple_leaves_and_nonleaves():
    """Tree 1 -> 2, 1 -> 3, 2 -> 4, 3 -> 5: the leaves are 4 and 5."""
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}, {"id": 4}, {"id": 5}]
    edges = [
        {"source": 1, "target": 2},
        {"source": 1, "target": 3},
        {"source": 2, "target": 4},
        {"source": 3, "target": 5},
    ]
    codeflash_output = find_leaf_nodes(nodes, edges) # 1.58μs -> 1.00μs (58.3% faster)
    assert codeflash_output == [{"id": 4}, {"id": 5}]

# 2. EDGE TEST CASES

def test_empty_nodes_and_edges():
    """No nodes and no edges must yield an empty list of leaves."""
    codeflash_output = find_leaf_nodes([], []) # 292ns -> 375ns (22.1% slower)
    assert codeflash_output == []

def test_nodes_but_edges_with_unknown_ids():
    """Edges that reference ids absent from `nodes` must not affect the result."""
    nodes = [{"id": 1}]
    edges = [{"source": 2, "target": 3}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 542ns -> 667ns (18.7% slower)
    # Node 1 is never a source, so it stays a leaf.
    assert codeflash_output == [{"id": 1}]

def test_duplicate_edges():
    """Repeating the same edge must not change which nodes are leaves."""
    nodes = [{"id": 1}, {"id": 2}]
    edges = [{"source": 1, "target": 2}, {"source": 1, "target": 2}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 833ns -> 750ns (11.1% faster)
    assert codeflash_output == [{"id": 2}]

def test_self_loop():
    """A self-loop counts as an outgoing edge, so the node is not a leaf."""
    nodes = [{"id": 1}]
    edges = [{"source": 1, "target": 1}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 542ns -> 625ns (13.3% slower)
    assert codeflash_output == []

def test_multiple_self_loops_and_other_edges():
    """Nodes 1 and 2 are edge sources (self-loops included); only 3 is a leaf."""
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}]
    edges = [
        {"source": 1, "target": 1},
        {"source": 1, "target": 2},
        {"source": 2, "target": 2},
    ]
    codeflash_output = find_leaf_nodes(nodes, edges) # 1.04μs -> 833ns (25.1% faster)
    assert codeflash_output == [{"id": 3}]

def test_disconnected_nodes():
    """A node touched by no edge at all (3) is a leaf, as is the edge target (2)."""
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}]
    edges = [{"source": 1, "target": 2}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 958ns -> 791ns (21.1% faster)
    assert codeflash_output == [{"id": 2}, {"id": 3}]

def test_edge_with_missing_source_key():
    """An edge dict lacking the 'source' key must surface as a KeyError."""
    malformed_edges = [{"target": 1}]
    single_node = [{"id": 1}]
    with pytest.raises(KeyError):
        find_leaf_nodes(single_node, malformed_edges)

def test_edge_with_missing_target_key():
    """A missing 'target' key is irrelevant: only 'source' decides leaf status."""
    nodes = [{"id": 1}, {"id": 2}]
    edges = [{"source": 1}]
    # Node 1 has outgoing edge, node 2 is leaf
    codeflash_output = find_leaf_nodes(nodes, edges) # 750ns -> 708ns (5.93% faster)
    assert codeflash_output == [{"id": 2}]

def test_nodes_with_nonint_ids():
    """String ids work like integer ids: with a -> b, only "b" is a leaf."""
    nodes = [{"id": "a"}, {"id": "b"}]
    edges = [{"source": "a", "target": "b"}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 875ns -> 667ns (31.2% faster)
    assert codeflash_output == [{"id": "b"}]

def test_nodes_with_mixed_id_types():
    """Ids are matched exactly: integer 1 and string "1" are different nodes."""
    nodes = [{"id": 1}, {"id": "1"}]
    edges = [{"source": 1, "target": "1"}]
    # "1" is leaf, 1 is not
    codeflash_output = find_leaf_nodes(nodes, edges) # 834ns -> 708ns (17.8% faster)
    assert codeflash_output == [{"id": "1"}]

def test_duplicate_nodes():
    """Duplicate node entries are each checked; both share the self-loop source id."""
    nodes = [{"id": 1}, {"id": 1}]
    edges = [{"source": 1, "target": 1}]
    # Both have outgoing edge, so no leaves
    codeflash_output = find_leaf_nodes(nodes, edges) # 708ns -> 708ns (0.000% faster)
    assert codeflash_output == []

def test_nodes_with_extra_keys():
    """Extra node attributes must be carried through unchanged in the output."""
    nodes = [{"id": 1, "name": "A"}, {"id": 2, "name": "B"}]
    edges = [{"source": 1, "target": 2}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 791ns -> 667ns (18.6% faster)
    assert codeflash_output == [{"id": 2, "name": "B"}]

def test_edges_with_extra_keys():
    """Extra edge attributes (e.g. a weight) must not influence leaf detection."""
    nodes = [{"id": 1}, {"id": 2}]
    edges = [{"source": 1, "target": 2, "weight": 5}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 708ns -> 708ns (0.000% faster)
    assert codeflash_output == [{"id": 2}]

def test_edge_case_large_id_values():
    """Very large integer ids are handled like any other id."""
    nodes = [{"id": 10**18}, {"id": 10**18 + 1}]
    edges = [{"source": 10**18, "target": 10**18 + 1}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 875ns -> 750ns (16.7% faster)
    assert codeflash_output == [{"id": 10**18 + 1}]

# 3. LARGE SCALE TEST CASES

def test_large_linear_chain():
    """Chain of 1000 nodes 1 -> 2 -> ... -> 1000: only the last node is a leaf."""
    N = 1000
    nodes = [{"id": i} for i in range(1, N+1)]
    edges = [{"source": i, "target": i+1} for i in range(1, N)]
    codeflash_output = find_leaf_nodes(nodes, edges); result = codeflash_output # 16.1ms -> 61.8μs (25868% faster)
    assert result == [{"id": N}]

def test_large_star_graph():
    """Hub node 0 points at nodes 1..999, so every node except 0 is a leaf."""
    N = 1000
    nodes = [{"id": i} for i in range(N)]
    edges = [{"source": 0, "target": i} for i in range(1, N)]
    codeflash_output = find_leaf_nodes(nodes, edges); result = codeflash_output # 31.0ms -> 59.1μs (52382% faster)
    expected = [{"id": i} for i in range(1, N)]
    assert result == expected

def test_large_fully_connected_graph():
    """Every node points at every other node, so no node is a leaf."""
    N = 100
    nodes = [{"id": i} for i in range(N)]
    edges = []
    for i in range(N):
        for j in range(N):
            if i != j:
                edges.append({"source": i, "target": j})
    # All nodes have outgoing edges, so no leaves
    codeflash_output = find_leaf_nodes(nodes, edges) # 14.8ms -> 224μs (6504% faster)
    assert codeflash_output == []

def test_large_sparse_graph_with_isolated_nodes():
    """A 900-node chain plus 100 isolated nodes: chain tail and isolated nodes are leaves."""
    N = 1000
    chain_nodes = [{"id": i} for i in range(900)]
    isolated_nodes = [{"id": i} for i in range(900, N)]
    nodes = chain_nodes + isolated_nodes
    edges = [{"source": i, "target": i+1} for i in range(899)]
    codeflash_output = find_leaf_nodes(nodes, edges); result = codeflash_output # 15.9ms -> 60.4μs (26193% faster)
    # Last of chain and all isolated nodes are leaves
    expected = [{"id": 899}] + [{"id": i} for i in range(900, N)]
    assert result == expected

def test_large_graph_with_many_leaves():
    """Node 1 points at 2..1000 (1 -> 2, ..., 1 -> 1000); the leaves are 2..1000."""
    N = 1000
    nodes = [{"id": i} for i in range(1, N+1)]
    edges = [{"source": 1, "target": i} for i in range(2, N+1)]
    codeflash_output = find_leaf_nodes(nodes, edges); result = codeflash_output # 31.0ms -> 58.4μs (52978% faster)
    expected = [{"id": i} for i in range(2, N+1)]
    assert result == expected
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

import random

# imports
import pytest  # used for our unit tests
from src.dsa.nodes import find_leaf_nodes

# unit tests

# -------------------------
# BASIC TEST CASES
# -------------------------

def test_empty_graph():
    """An empty graph (no nodes, no edges) has no leaves."""
    codeflash_output = find_leaf_nodes([], []) # 250ns -> 417ns (40.0% slower)
    assert codeflash_output == []

def test_no_edges_all_leaf():
    """With no edges, all nodes are returned as leaves in input order."""
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}]
    edges = []
    codeflash_output = find_leaf_nodes(nodes, edges) # 625ns -> 708ns (11.7% slower)
    assert codeflash_output == [{"id": 1}, {"id": 2}, {"id": 3}]

def test_single_node_no_edges():
    """A single node without edges is a leaf."""
    nodes = [{"id": 42}]
    edges = []
    codeflash_output = find_leaf_nodes(nodes, edges) # 458ns -> 583ns (21.4% slower)
    assert codeflash_output == [{"id": 42}]

def test_single_node_self_loop():
    """A single node with a self-loop is an edge source, hence not a leaf."""
    nodes = [{"id": 1}]
    edges = [{"source": 1, "target": 1}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 542ns -> 625ns (13.3% slower)
    assert codeflash_output == []

def test_simple_chain():
    """In the chain 1 -> 2 -> 3, only node 3 is a leaf."""
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}]
    edges = [{"source": 1, "target": 2}, {"source": 2, "target": 3}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 958ns -> 791ns (21.1% faster)
    assert codeflash_output == [{"id": 3}]

def test_simple_tree():
    """A small tree rooted at 1 has leaves 2, 4 and 5."""
    #      1
    #    /   \
    #   2     3
    #        / \
    #       4   5
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}, {"id": 4}, {"id": 5}]
    edges = [
        {"source": 1, "target": 2},
        {"source": 1, "target": 3},
        {"source": 3, "target": 4},
        {"source": 3, "target": 5},
    ]
    # Leaves: 2, 4, 5
    expected = [{"id": 2}, {"id": 4}, {"id": 5}]
    codeflash_output = find_leaf_nodes(nodes, edges); result = codeflash_output # 1.54μs -> 1.00μs (54.2% faster)
    assert result == expected

def test_multiple_disconnected_components():
    """Two separate chains (1 -> 2 and 3 -> 4 -> 5) each contribute their tail."""
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}, {"id": 4}, {"id": 5}]
    edges = [
        {"source": 1, "target": 2},
        {"source": 3, "target": 4},
        {"source": 4, "target": 5},
    ]
    # Leaves: 2, 5
    expected = [{"id": 2}, {"id": 5}]
    codeflash_output = find_leaf_nodes(nodes, edges); result = codeflash_output # 1.42μs -> 917ns (54.4% faster)
    assert result == expected

# -------------------------
# EDGE TEST CASES
# -------------------------

def test_all_nodes_have_outgoing_edges():
    """A three-node cycle gives every node an outgoing edge, so no leaves."""
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}]
    edges = [
        {"source": 1, "target": 2},
        {"source": 2, "target": 3},
        {"source": 3, "target": 1}
    ]
    codeflash_output = find_leaf_nodes(nodes, edges) # 958ns -> 792ns (21.0% faster)
    assert codeflash_output == []

def test_node_with_incoming_but_no_outgoing():
    """Node 2 only receives an edge and emits none, so it is a leaf."""
    nodes = [{"id": 1}, {"id": 2}]
    edges = [{"source": 1, "target": 2}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 791ns -> 708ns (11.7% faster)
    assert codeflash_output == [{"id": 2}]

def test_node_with_self_loop_and_outgoing():
    """Node 1 has a self-loop and an edge to 2; only node 2 is a leaf."""
    nodes = [{"id": 1}, {"id": 2}]
    edges = [{"source": 1, "target": 1}, {"source": 1, "target": 2}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 834ns -> 750ns (11.2% faster)
    assert codeflash_output == [{"id": 2}]

def test_node_with_only_incoming_edges():
    """Node 3 is the target of two edges and the source of none, so it is a leaf."""
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}]
    edges = [{"source": 1, "target": 3}, {"source": 2, "target": 3}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 959ns -> 792ns (21.1% faster)
    assert codeflash_output == [{"id": 3}]

def test_duplicate_edges():
    """Duplicate edges must not affect leaf detection."""
    nodes = [{"id": 1}, {"id": 2}]
    edges = [{"source": 1, "target": 2}, {"source": 1, "target": 2}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 792ns -> 750ns (5.60% faster)
    assert codeflash_output == [{"id": 2}]

def test_edges_with_nonexistent_nodes():
    """Edges touching an id (3) missing from `nodes` still count by their source."""
    nodes = [{"id": 1}, {"id": 2}]
    edges = [{"source": 1, "target": 3}, {"source": 3, "target": 2}]
    # Only node 2 has no outgoing edge from itself
    codeflash_output = find_leaf_nodes(nodes, edges) # 834ns -> 750ns (11.2% faster)
    assert codeflash_output == [{"id": 2}]

def test_nodes_with_additional_attributes():
    """Extra node attributes must be preserved on the returned leaf nodes."""
    nodes = [{"id": 1, "name": "A"}, {"id": 2, "name": "B"}]
    edges = [{"source": 1, "target": 2}]
    codeflash_output = find_leaf_nodes(nodes, edges); result = codeflash_output # 750ns -> 708ns (5.93% faster)
    assert result == [{"id": 2, "name": "B"}]

def test_unordered_nodes_and_edges():
    """Listing nodes out of id order must not change which node is a leaf."""
    nodes = [{"id": 3}, {"id": 1}, {"id": 2}]
    edges = [{"source": 1, "target": 2}, {"source": 2, "target": 3}]
    # Only node 3 is a leaf
    codeflash_output = find_leaf_nodes(nodes, edges) # 1.04μs -> 833ns (25.1% faster)
    assert codeflash_output == [{"id": 3}]

def test_isolated_nodes():
    """Nodes with no edges at all are all reported as leaves."""
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}]
    edges = []
    # All nodes are leaves
    codeflash_output = find_leaf_nodes(nodes, edges) # 583ns -> 708ns (17.7% slower)
    assert codeflash_output == [{"id": 1}, {"id": 2}, {"id": 3}]

def test_large_ids():
    """Large integer ids are matched like any other id."""
    nodes = [{"id": 10**9}, {"id": 10**9 + 1}]
    edges = [{"source": 10**9, "target": 10**9 + 1}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 833ns -> 750ns (11.1% faster)
    assert codeflash_output == [{"id": 10**9 + 1}]

def test_negative_ids():
    """Negative ids are valid: with -1 -> -2, only node -2 is a leaf."""
    nodes = [{"id": -1}, {"id": -2}]
    edges = [{"source": -1, "target": -2}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 750ns -> 958ns (21.7% slower)
    assert codeflash_output == [{"id": -2}]

def test_string_ids():
    """String ids are valid: with a -> b, only node "b" is a leaf."""
    nodes = [{"id": "a"}, {"id": "b"}]
    edges = [{"source": "a", "target": "b"}]
    codeflash_output = find_leaf_nodes(nodes, edges) # 834ns -> 708ns (17.8% faster)
    assert codeflash_output == [{"id": "b"}]

# -------------------------
# LARGE SCALE TEST CASES
# -------------------------

def test_large_linear_chain():
    """Large chain 0 -> 1 -> ... -> 999: only the final node is a leaf."""
    N = 1000
    nodes = [{"id": i} for i in range(N)]
    edges = [{"source": i, "target": i+1} for i in range(N-1)]
    # Only the last node is a leaf
    codeflash_output = find_leaf_nodes(nodes, edges) # 16.2ms -> 61.1μs (26350% faster)
    assert codeflash_output == [{"id": N - 1}]

def test_large_star_graph():
    """One central node (0) with edges to all others: all nodes except 0 are leaves."""
    N = 1000
    nodes = [{"id": i} for i in range(N)]
    edges = [{"source": 0, "target": i} for i in range(1, N)]
    # All nodes except 0 are leaves
    expected = [{"id": i} for i in range(1, N)]
    codeflash_output = find_leaf_nodes(nodes, edges); result = codeflash_output # 30.9ms -> 58.7μs (52618% faster)
    assert result == expected

def test_large_forest():
    """Ten independent 100-node chains: the leaves are exactly the ten chain tails."""
    forest_size = 10
    tree_size = 100
    nodes = []
    edges = []
    expected_leaves = []
    for t in range(forest_size):
        offset = t * tree_size
        for i in range(tree_size):
            nodes.append({"id": offset + i})
            if i < tree_size - 1:
                edges.append({"source": offset + i, "target": offset + i + 1})
            else:
                # The last node of each chain has no outgoing edge.
                expected_leaves.append({"id": offset + i})
    codeflash_output = find_leaf_nodes(nodes, edges); result = codeflash_output # 16.0ms -> 63.2μs (25185% faster)
    assert result == expected_leaves

def test_large_sparse_graph():
    """1000 nodes with 10 seeded random edges: leaves are nodes never used as a source."""
    N = 1000
    nodes = [{"id": i} for i in range(N)]
    random.seed(42)  # fixed seed keeps the generated edges reproducible
    edges = []
    sources = set()
    for _ in range(10):
        src = random.randint(0, N-1)
        tgt = random.randint(0, N-1)
        edges.append({"source": src, "target": tgt})
        sources.add(src)
    # Leaves are nodes that are not sources in any edge
    expected = [n for n in nodes if n["id"] not in sources]
    codeflash_output = find_leaf_nodes(nodes, edges); result = codeflash_output # 399μs -> 41.6μs (860% faster)
    assert result == expected

def test_large_graph_all_leaves():
    """1000 nodes and no edges: every node is a leaf, in input order."""
    N = 1000
    nodes = [{"id": i} for i in range(N)]
    edges = []
    codeflash_output = find_leaf_nodes(nodes, edges) # 38.7μs -> 33.6μs (15.3% faster)
    assert codeflash_output == nodes

def test_large_graph_no_leaves():
    """A 1000-node cycle gives every node an outgoing edge, so there are no leaves."""
    N = 1000
    nodes = [{"id": i} for i in range(N)]
    edges = [{"source": i, "target": (i+1)%N} for i in range(N)]
    codeflash_output = find_leaf_nodes(nodes, edges) # 16.2ms -> 61.4μs (26332% faster)
    assert codeflash_output == []
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

from src.dsa.nodes import find_leaf_nodes

def test_find_leaf_nodes():
    """Concolic case: the node id '\\x01' never appears as an edge source, so it is a leaf."""
    # The edge sources are '' (the same dict listed twice) and '\x00'; the
    # 'id' key on an edge dict is not a source and must be ignored.
    result = find_leaf_nodes([{'id': '\x01'}], [(v1 := {'source': '', 'id': '\x01'}), v1, {'source': '\x00'}])
    assert result == [{'id': '\x01'}]

def test_find_leaf_nodes_2():
    """Concolic case: the empty-string id is itself an edge source, so there is no leaf."""
    result = find_leaf_nodes([{'id': ''}], [{'source': ''}])
    assert result == []

To edit these changes git checkout codeflash/optimize-find_leaf_nodes-mc8qkoyv and push.

Codeflash

Here's a faster version of your function. The original code checks every edge for every node, which can be slow for large graphs (O(N×E)).  
This rewrite first collects all node IDs that appear as a source in any edge (O(E)). Then, it performs a single pass over nodes (O(N)) to check which nodes are not in this source set. The new overall complexity is O(N+E).


This is much faster, especially when the number of nodes or edges grows large.  
All comments are preserved as before.
@codeflash-ai codeflash-ai bot added the ⚡️ codeflash Optimization PR opened by Codeflash AI label Jun 23, 2025
@codeflash-ai codeflash-ai bot requested a review from KRRT7 June 23, 2025 06:49
@KRRT7 KRRT7 closed this Jun 23, 2025
@codeflash-ai codeflash-ai bot deleted the codeflash/optimize-find_leaf_nodes-mc8qkoyv branch June 23, 2025 23:31
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

⚡️ codeflash Optimization PR opened by Codeflash AI

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant