Skip to content

Conversation

@codeflash-ai
Copy link

@codeflash-ai codeflash-ai bot commented Jun 27, 2025

📄 16,012% (160.12x) speedup for find_last_node in src/dsa/nodes.py

⏱️ Runtime : 37.9 milliseconds → 235 microseconds (best of 498 runs)

📝 Explanation and details

Here’s a rewritten, optimized version of the given program. The original code is O(n·m), where n is the number of nodes and m is the number of edges, because for each node, it checks all edges.
We can make it O(n + m) by first collecting, in a set, the ids of all nodes that appear as "source" in any edge. The last node is then the first node (in list order) whose "id" is not in this set, or None if every node is a source.

Key improvement:

  • Turns the repeated search of all edges per node into a set lookup, which is O(1) per node.
  • This reduces complexity from O(n·m) to O(n + m).
  • The result is identical to your original code.

Correctness verification report:

Test Status
⚙️ Existing Unit Tests 🔘 None Found
🌀 Generated Regression Tests 20 Passed
⏪ Replay Tests 🔘 None Found
🔎 Concolic Coverage Tests 🔘 None Found
📊 Tests Coverage 100.0%
🌀 Generated Regression Tests and Runtime
import pytest  # used for our unit tests
from src.dsa.nodes import find_last_node

# unit tests

# -------------------- BASIC TEST CASES --------------------

def test_single_node_no_edges():
    """A lone node with no edges is the expected last node."""
    graph_nodes = [{"id": "A"}]
    graph_edges = []
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_two_nodes_one_edge():
    """A -> B: only A is a source, so B is the expected last node."""
    graph_nodes = [{"id": node_id} for node_id in ("A", "B")]
    graph_edges = [{"source": "A", "target": "B"}]
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_three_nodes_linear_chain():
    """Linear chain A -> B -> C: C is the expected last node."""
    graph_nodes = [{"id": node_id} for node_id in ("A", "B", "C")]
    graph_edges = [
        {"source": src, "target": dst}
        for src, dst in (("A", "B"), ("B", "C"))
    ]
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_multiple_last_nodes_returns_first():
    """With no edges, both nodes qualify; the first one in the list is expected."""
    graph_nodes = [{"id": "A"}, {"id": "B"}]
    graph_edges = []
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_branching_graph():
    """A -> B and A -> C: B and C both lack outgoing edges; B (listed first) is expected."""
    graph_nodes = [{"id": node_id} for node_id in ("A", "B", "C")]
    graph_edges = [{"source": "A", "target": dst} for dst in ("B", "C")]
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

# -------------------- EDGE TEST CASES --------------------

def test_empty_nodes_and_edges():
    """An empty graph (no nodes, no edges) is expected to yield None."""
    graph_nodes = []
    graph_edges = []
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_edges_but_no_nodes():
    """Edges without any nodes to examine are expected to yield None."""
    graph_nodes = []
    graph_edges = [{"source": "A", "target": "B"}]
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_isolated_and_connected_nodes():
    """B -> A: A is listed first and is never a source, so A is the expected last node.

    Note that A is not isolated in the strict sense: it is the target of the
    single edge. It simply has no outgoing edge.
    """
    graph_nodes = [{"id": "A"}, {"id": "B"}]
    graph_edges = [{"source": "B", "target": "A"}]
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_cycle_graph():
    """Cycle A -> B -> C -> A: every node is a source, so no last node is expected."""
    graph_nodes = [{"id": node_id} for node_id in ("A", "B", "C")]
    graph_edges = [
        {"source": src, "target": dst}
        for src, dst in (("A", "B"), ("B", "C"), ("C", "A"))
    ]
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_node_with_multiple_outgoing_edges():
    """A -> B, A -> C, C -> D: B and D lack outgoing edges; B (listed first) is expected."""
    graph_nodes = [{"id": node_id} for node_id in ("A", "B", "C", "D")]
    graph_edges = [
        {"source": src, "target": dst}
        for src, dst in (("A", "B"), ("A", "C"), ("C", "D"))
    ]
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_edge_with_nonexistent_nodes():
    """An edge between ids absent from the node list should not affect the listed node."""
    graph_nodes = [{"id": "A"}]
    # Neither "B" nor "C" appears in graph_nodes.
    graph_edges = [{"source": "B", "target": "C"}]
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_duplicate_node_ids():
    """Two nodes sharing id "A" and no edges: the first entry is expected back."""
    graph_nodes = [{"id": "A"}, {"id": "A"}]
    graph_edges = []
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_node_with_self_loop():
    """A -> A: the self-loop makes A a source, so A should not be the last node."""
    graph_nodes = [{"id": "A"}, {"id": "B"}]
    graph_edges = [{"source": "A", "target": "A"}]
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_all_nodes_with_outgoing_edges():
    """A -> B and B -> A: both nodes are sources, so None is expected."""
    graph_nodes = [{"id": "A"}, {"id": "B"}]
    graph_edges = [
        {"source": src, "target": dst}
        for src, dst in (("A", "B"), ("B", "A"))
    ]
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output


def test_missing_source_key_in_edge():
    """An edge dict lacking the 'source' key is expected to raise KeyError."""
    lone_node = [{"id": "A"}]
    malformed_edges = [{}]  # the only edge has no keys at all
    with pytest.raises(KeyError):
        find_last_node(lone_node, malformed_edges)

# -------------------- LARGE SCALE TEST CASES --------------------

def test_large_linear_chain():
    """1000-node chain 0 -> 1 -> ... -> 999: only the final list entry is never a source."""
    chain_length = 1000
    graph_nodes = [{"id": str(idx)} for idx in range(chain_length)]
    graph_edges = [
        {"source": str(idx), "target": str(idx + 1)}
        for idx in range(chain_length - 1)
    ]
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_large_branching_graph():
    """A root fans out to 999 leaves; the first leaf in the list is the expected result."""
    total_nodes = 1000
    leaf_ids = [f"leaf_{i}" for i in range(total_nodes - 1)]
    graph_nodes = [{"id": "root"}, *({"id": leaf} for leaf in leaf_ids)]
    graph_edges = [{"source": "root", "target": leaf} for leaf in leaf_ids]
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_large_graph_all_nodes_with_outgoing_edges():
    """A 1000-node cycle: every node is a source, so None is expected."""
    ring_size = 1000
    graph_nodes = [{"id": str(idx)} for idx in range(ring_size)]
    # Node idx points at its successor, wrapping around so the last node points at "0".
    graph_edges = [
        {"source": str(idx), "target": str((idx + 1) % ring_size)}
        for idx in range(ring_size)
    ]
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_large_graph_with_isolated_node():
    """An edge-free "iso" node is listed ahead of a long chain and is the expected result.

    The edges start at "1", so the chain runs 1 -> 2 -> ... -> 998 and node "0"
    is also left without any edge.
    """
    total_nodes = 1000
    numbered = [{"id": str(idx)} for idx in range(total_nodes - 1)]
    graph_nodes = [{"id": "iso"}] + numbered
    graph_edges = [
        {"source": str(idx), "target": str(idx + 1)}
        for idx in range(1, total_nodes - 2)
    ]
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_large_graph_with_duplicate_ids():
    """500 nodes with id "A" followed by 500 with id "B", and no edges at all."""
    per_label = 500
    graph_nodes = [
        {"id": label}
        for label in ("A", "B")
        for _ in range(per_label)
    ]
    graph_edges = []
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

import pytest  # used for our unit tests
from src.dsa.nodes import find_last_node

# unit tests

# -------------------- BASIC TEST CASES --------------------

def test_single_node_no_edges():
    """A lone node with no edges is the expected last node."""
    graph_nodes = [{"id": "A"}]
    graph_edges = []
    # Measured: 1.12μs -> 1.00μs (12.5% faster)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_two_nodes_one_edge():
    """A -> B: only A is a source, so B is the expected last node."""
    graph_nodes = [{"id": node_id} for node_id in ("A", "B")]
    graph_edges = [{"source": "A", "target": "B"}]
    # Measured: 1.71μs -> 1.08μs (57.8% faster)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_three_nodes_linear_chain():
    """Linear chain A -> B -> C: C is the expected last node."""
    graph_nodes = [{"id": node_id} for node_id in ("A", "B", "C")]
    graph_edges = [
        {"source": src, "target": dst}
        for src, dst in (("A", "B"), ("B", "C"))
    ]
    # Measured: 2.12μs -> 1.12μs (88.9% faster)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_multiple_last_nodes_returns_first():
    """With no edges, both nodes qualify; the first one in the list is expected."""
    graph_nodes = [{"id": "A"}, {"id": "B"}]
    graph_edges = []
    # Measured: 1.04μs -> 916ns (13.6% faster)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_branching_graph():
    """A -> B and A -> C: B and C both lack outgoing edges; B (listed first) is expected."""
    graph_nodes = [{"id": node_id} for node_id in ("A", "B", "C")]
    graph_edges = [{"source": "A", "target": dst} for dst in ("B", "C")]
    # Measured: 1.62μs -> 1.04μs (56.0% faster)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

# -------------------- EDGE TEST CASES --------------------

def test_empty_nodes_and_edges():
    """An empty graph (no nodes, no edges) is expected to yield None."""
    graph_nodes = []
    graph_edges = []
    # Measured: 500ns -> 583ns (14.2% slower)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_edges_but_no_nodes():
    """Edges without any nodes to examine are expected to yield None."""
    graph_nodes = []
    graph_edges = [{"source": "A", "target": "B"}]
    # Measured: 417ns -> 625ns (33.3% slower)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_isolated_and_connected_nodes():
    """B -> A: A is listed first and is never a source, so A is the expected last node.

    Note that A is not isolated in the strict sense: it is the target of the
    single edge. It simply has no outgoing edge.
    """
    graph_nodes = [{"id": "A"}, {"id": "B"}]
    graph_edges = [{"source": "B", "target": "A"}]
    # Measured: 1.21μs -> 1.00μs (20.8% faster)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_cycle_graph():
    """Cycle A -> B -> C -> A: every node is a source, so no last node is expected."""
    graph_nodes = [{"id": node_id} for node_id in ("A", "B", "C")]
    graph_edges = [
        {"source": src, "target": dst}
        for src, dst in (("A", "B"), ("B", "C"), ("C", "A"))
    ]
    # Measured: 2.12μs -> 875ns (143% faster)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_node_with_multiple_outgoing_edges():
    """A -> B, A -> C, C -> D: B and D lack outgoing edges; B (listed first) is expected."""
    graph_nodes = [{"id": node_id} for node_id in ("A", "B", "C", "D")]
    graph_edges = [
        {"source": src, "target": dst}
        for src, dst in (("A", "B"), ("A", "C"), ("C", "D"))
    ]
    # Measured: 1.71μs -> 1.08μs (57.7% faster)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_edge_with_nonexistent_nodes():
    """An edge between ids absent from the node list should not affect the listed node."""
    graph_nodes = [{"id": "A"}]
    # Neither "B" nor "C" appears in graph_nodes.
    graph_edges = [{"source": "B", "target": "C"}]
    # Measured: 1.12μs -> 1.00μs (12.5% faster)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_duplicate_node_ids():
    """Two nodes sharing id "A" and no edges: the first entry is expected back."""
    graph_nodes = [{"id": "A"}, {"id": "A"}]
    graph_edges = []
    # Measured: 1.08μs -> 916ns (18.2% faster)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_node_with_self_loop():
    """A -> A: the self-loop makes A a source, so A should not be the last node."""
    graph_nodes = [{"id": "A"}, {"id": "B"}]
    graph_edges = [{"source": "A", "target": "A"}]
    # Measured: 1.58μs -> 1.00μs (58.3% faster)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_all_nodes_with_outgoing_edges():
    """A -> B and B -> A: both nodes are sources, so None is expected."""
    graph_nodes = [{"id": "A"}, {"id": "B"}]
    graph_edges = [
        {"source": src, "target": dst}
        for src, dst in (("A", "B"), ("B", "A"))
    ]
    # Measured: 1.67μs -> 792ns (110% faster)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output


def test_missing_source_key_in_edge():
    """An edge dict lacking the 'source' key is expected to raise KeyError."""
    lone_node = [{"id": "A"}]
    malformed_edges = [{}]  # the only edge has no keys at all
    with pytest.raises(KeyError):
        find_last_node(lone_node, malformed_edges)

# -------------------- LARGE SCALE TEST CASES --------------------

def test_large_linear_chain():
    """1000-node chain 0 -> 1 -> ... -> 999: only the final list entry is never a source."""
    chain_length = 1000
    graph_nodes = [{"id": str(idx)} for idx in range(chain_length)]
    graph_edges = [
        {"source": str(idx), "target": str(idx + 1)}
        for idx in range(chain_length - 1)
    ]
    # Measured: 19.0ms -> 81.5μs (23167% faster)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_large_branching_graph():
    """A root fans out to 999 leaves; the first leaf in the list is the expected result."""
    total_nodes = 1000
    leaf_ids = [f"leaf_{i}" for i in range(total_nodes - 1)]
    graph_nodes = [{"id": "root"}, *({"id": leaf} for leaf in leaf_ids)]
    graph_edges = [{"source": "root", "target": leaf} for leaf in leaf_ids]
    # Measured: 36.5μs -> 17.8μs (105% faster)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_large_graph_all_nodes_with_outgoing_edges():
    """A 1000-node cycle: every node is a source, so None is expected."""
    ring_size = 1000
    graph_nodes = [{"id": str(idx)} for idx in range(ring_size)]
    # Node idx points at its successor, wrapping around so the last node points at "0".
    graph_edges = [
        {"source": str(idx), "target": str((idx + 1) % ring_size)}
        for idx in range(ring_size)
    ]
    # Measured: 18.8ms -> 79.7μs (23513% faster)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_large_graph_with_isolated_node():
    """An edge-free "iso" node is listed ahead of a long chain and is the expected result.

    The edges start at "1", so the chain runs 1 -> 2 -> ... -> 998 and node "0"
    is also left without any edge.
    """
    total_nodes = 1000
    numbered = [{"id": str(idx)} for idx in range(total_nodes - 1)]
    graph_nodes = [{"id": "iso"}] + numbered
    graph_edges = [
        {"source": str(idx), "target": str(idx + 1)}
        for idx in range(1, total_nodes - 2)
    ]
    # Measured: 38.5μs -> 41.5μs (7.04% slower)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output

def test_large_graph_with_duplicate_ids():
    """500 nodes with id "A" followed by 500 with id "B", and no edges at all."""
    per_label = 500
    graph_nodes = [
        {"id": label}
        for label in ("A", "B")
        for _ in range(per_label)
    ]
    graph_edges = []
    # Measured: 1.21μs -> 1.08μs (11.5% faster)
    codeflash_output = find_last_node(graph_nodes, graph_edges)
    result = codeflash_output
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

To edit these changes, run `git checkout codeflash/optimize-find_last_node-mce60hq1`, commit your edits, and push.

Codeflash

Here’s a rewritten, **optimized** version of the given program. The original code is **O(n·m)**, where `n` is the number of nodes and `m` is the number of edges, because for each node, it checks all edges.  
We can make it **O(n + m)** by first collecting, in a set, the ids of all nodes that appear as `"source"` in any edge. The last node is then the **first** node (in list order) whose `"id"` is **not** in this set, or `None` if every node is a source.



**Key improvement:**  
- Turns the repeated search of all edges per node into a set lookup, which is O(1) per node.  
- This reduces complexity from O(n·m) to O(n + m).  
- The result is identical to your original code.
@codeflash-ai codeflash-ai bot added the ⚡️ codeflash Optimization PR opened by Codeflash AI label Jun 27, 2025
@codeflash-ai codeflash-ai bot requested a review from misrasaurabh1 June 27, 2025 02:00
@codeflash-ai codeflash-ai bot deleted the codeflash/optimize-find_last_node-mce60hq1 branch June 27, 2025 02:01
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

⚡️ codeflash Optimization PR opened by Codeflash AI

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant