Skip to content

Conversation

@codeflash-ai
Copy link

@codeflash-ai codeflash-ai bot commented May 27, 2025

📄 20,523% (205.23x) speedup for find_last_node in src/dsa/nodes.py

⏱️ Runtime: 48.6 milliseconds → 236 microseconds (best of 822 runs)

📝 Explanation and details

Here’s a faster version. The key optimization:

  • Build a set of all "source" IDs up front, O(E), and then scan nodes and find the first whose "id" is not in that set, O(N).
    This avoids the O(N*E) behavior of the original function.

Correctness verification report:

Test Status
⚙️ Existing Unit Tests 🔘 None Found
🌀 Generated Regression Tests 37 Passed
⏪ Replay Tests 🔘 None Found
🔎 Concolic Coverage Tests 🔘 None Found
📊 Tests Coverage 100.0%
🌀 Generated Regression Tests Details
import pytest  # used for our unit tests
from src.dsa.nodes import find_last_node

# unit tests

# ------------------------
# Basic Test Cases
# ------------------------

def test_single_node_no_edges():
    """A lone node with no edges is itself the last node."""
    nodes = [{"id": 1, "label": "A"}]
    edges = []
    codeflash_output = find_last_node(nodes, edges)
    # With no edges, no id can appear as a source, so the only node is returned.
    assert codeflash_output == {"id": 1, "label": "A"}

def test_two_nodes_one_edge():
    """With a single edge A -> B, B (no outgoing edges) is the last node."""
    nodes = [{"id": "A"}, {"id": "B"}]
    edges = [{"source": "A", "target": "B"}]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": "B"}

def test_three_nodes_linear_chain():
    """In the chain A -> B -> C, only C has no outgoing edge."""
    nodes = [{"id": "A"}, {"id": "B"}, {"id": "C"}]
    edges = [{"source": "A", "target": "B"}, {"source": "B", "target": "C"}]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": "C"}

def test_multiple_end_nodes():
    """When several nodes have no outgoing edges, the first in list order wins."""
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}]
    edges = [{"source": 1, "target": 2}]
    codeflash_output = find_last_node(nodes, edges)
    # Nodes 2 and 3 both qualify; 2 comes first in ``nodes``.
    assert codeflash_output == {"id": 2}

# ------------------------
# Edge Test Cases
# ------------------------

def test_no_nodes():
    """An empty node list yields None."""
    nodes = []
    edges = []
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output is None

def test_no_edges_multiple_nodes():
    """With no edges every node qualifies, so the first node is returned."""
    nodes = [{"id": "A"}, {"id": "B"}]
    edges = []
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": "A"}

def test_all_nodes_with_outgoing_edges():
    """If every node is the source of some edge, there is no last node."""
    nodes = [{"id": 1}, {"id": 2}]
    edges = [{"source": 1, "target": 2}, {"source": 2, "target": 1}]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output is None

def test_cycle():
    """In the cycle A -> B -> C -> A every node has an outgoing edge: expect None."""
    nodes = [{"id": "A"}, {"id": "B"}, {"id": "C"}]
    edges = [
        {"source": "A", "target": "B"},
        {"source": "B", "target": "C"},
        {"source": "C", "target": "A"},
    ]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output is None

def test_edges_with_nonexistent_nodes():
    """Edges whose endpoints are not in ``nodes`` do not disqualify any listed node."""
    nodes = [{"id": "X"}]
    edges = [{"source": "Y", "target": "Z"}]
    codeflash_output = find_last_node(nodes, edges)
    # "X" never appears as a source, so it is the last node.
    assert codeflash_output == {"id": "X"}

def test_node_with_multiple_outgoing_edges():
    """Node 1 fans out to 2 and 3; the first leaf in list order (2) is returned."""
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}]
    edges = [{"source": 1, "target": 2}, {"source": 1, "target": 3}]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": 2}

def test_node_with_self_loop():
    """A self-loop counts as an outgoing edge, so the looping node is skipped."""
    nodes = [{"id": "A"}, {"id": "B"}]
    edges = [{"source": "A", "target": "A"}]
    codeflash_output = find_last_node(nodes, edges)
    # Only B is never a source.
    assert codeflash_output == {"id": "B"}

def test_duplicate_node_ids():
    """With duplicate ids, the first node lacking an outgoing edge is returned."""
    nodes = [{"id": 1}, {"id": 1}, {"id": 2}]
    edges = [{"source": 2, "target": 1}]
    codeflash_output = find_last_node(nodes, edges)
    # Neither id=1 node is a source; the first of them comes first in ``nodes``.
    assert codeflash_output == {"id": 1}


def test_nodes_with_extra_fields():
    """Extra keys on a node dict are preserved in the returned node."""
    nodes = [{"id": "A", "foo": 123}, {"id": "B", "bar": 456}]
    edges = [{"source": "A", "target": "B"}]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": "B", "bar": 456}

# ------------------------
# Large Scale Test Cases
# ------------------------

def test_large_linear_chain():
    """A 1000-node chain 0 -> 1 -> ... -> 999 ends at the node with id N - 1."""
    N = 1000
    nodes = [{"id": i} for i in range(N)]
    edges = [{"source": i, "target": i + 1} for i in range(N - 1)]
    codeflash_output = find_last_node(nodes, edges)
    # Ids 0..N-2 are all sources; only N-1 is not.
    assert codeflash_output == {"id": N - 1}

def test_large_star_topology():
    """Root 0 points at 999 leaves; the first leaf in list order (id 1) is returned."""
    N = 1000
    nodes = [{"id": i} for i in range(N)]
    edges = [{"source": 0, "target": i} for i in range(1, N)]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": 1}

def test_large_no_edges():
    """1000 string-id nodes and no edges: the first node is returned."""
    N = 1000
    nodes = [{"id": str(i)} for i in range(N)]
    edges = []
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": "0"}

def test_large_all_nodes_with_outgoing_edges():
    """A 1000-node ring gives every node an outgoing edge, so the result is None."""
    N = 1000
    nodes = [{"id": i} for i in range(N)]
    edges = [{"source": i, "target": (i + 1) % N} for i in range(N)]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output is None

def test_large_multiple_end_nodes():
    """Ids 0..499 are sources and ids 500..999 are not; node 500 is returned."""
    N = 1000
    nodes = [{"id": i} for i in range(N)]
    edges = [{"source": i, "target": (i + 1) % 500} for i in range(500)]
    codeflash_output = find_last_node(nodes, edges)
    # nodes[500:] have no outgoing edges; nodes[500] is the first of them.
    assert codeflash_output == {"id": 500}
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

import pytest  # used for our unit tests
from src.dsa.nodes import find_last_node

# unit tests

# -------------------
# Basic Test Cases
# -------------------

def test_single_node_no_edges():
    """A lone node with no edges is itself the last node."""
    nodes = [{"id": 1, "value": "A"}]
    edges = []
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": 1, "value": "A"}

def test_two_nodes_one_edge():
    """Node 1 points to node 2, so node 2 (no outgoing edges) is the last node."""
    nodes = [{"id": 1, "value": "A"}, {"id": 2, "value": "B"}]
    edges = [{"source": 1, "target": 2}]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": 2, "value": "B"}

def test_three_nodes_linear_chain():
    """In the chain n1 -> n2 -> n3, n3 is the last node."""
    nodes = [{"id": "n1"}, {"id": "n2"}, {"id": "n3"}]
    edges = [{"source": "n1", "target": "n2"}, {"source": "n2", "target": "n3"}]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": "n3"}

def test_multiple_last_nodes_returns_first():
    """Two nodes lack outgoing edges; the first in list order is returned."""
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}]
    edges = [{"source": 1, "target": 2}]
    codeflash_output = find_last_node(nodes, edges)
    # Nodes 2 and 3 both qualify; 2 comes first.
    assert codeflash_output == {"id": 2}

# -------------------
# Edge Test Cases
# -------------------

def test_empty_nodes_and_edges():
    """No nodes and no edges: the result is None."""
    codeflash_output = find_last_node([], [])
    assert codeflash_output is None

def test_edges_without_nodes():
    """Edges alone cannot produce a last node: an empty node list yields None."""
    nodes = []
    edges = [{"source": 1, "target": 2}]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output is None

def test_nodes_with_self_loops():
    """A self-loop makes node 1 a source, so node 2 is the last node."""
    nodes = [{"id": 1}, {"id": 2}]
    edges = [{"source": 1, "target": 1}]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": 2}

def test_cycle_graph():
    """Every node in the cycle 1 -> 2 -> 3 -> 1 has an outgoing edge: expect None."""
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}]
    edges = [
        {"source": 1, "target": 2},
        {"source": 2, "target": 3},
        {"source": 3, "target": 1},
    ]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output is None

def test_disconnected_nodes():
    """A disconnected node does not take priority over an earlier sink node."""
    nodes = [{"id": "A"}, {"id": "B"}, {"id": "C"}]
    edges = [{"source": "A", "target": "B"}]
    codeflash_output = find_last_node(nodes, edges)
    # B (a sink) and C (disconnected) both lack outgoing edges; B comes first
    # in ``nodes``, so B is returned rather than C.
    assert codeflash_output == {"id": "B"}

def test_node_with_multiple_outgoing_edges():
    """Node 1 has two outgoing edges; nodes 2, 3 and 4 have none, and 2 is first."""
    nodes = [{"id": 1}, {"id": 2}, {"id": 3}, {"id": 4}]
    edges = [{"source": 1, "target": 2}, {"source": 1, "target": 3}]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": 2}

def test_edge_with_nonexistent_source():
    """An edge whose source is not a listed node leaves all listed nodes eligible."""
    nodes = [{"id": 10}, {"id": 20}]
    edges = [{"source": 99, "target": 10}]
    codeflash_output = find_last_node(nodes, edges)
    # Neither 10 nor 20 is a source, so the first node is returned.
    assert codeflash_output == {"id": 10}

def test_edge_with_nonexistent_target():
    """An edge to an unknown target still marks its source as having an outgoing edge."""
    nodes = [{"id": 1}, {"id": 2}]
    edges = [{"source": 1, "target": 99}]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": 2}

def test_node_id_with_none():
    """A node whose id is None is treated like any other id value."""
    nodes = [{"id": None}, {"id": 1}]
    edges = [{"source": 1, "target": None}]
    codeflash_output = find_last_node(nodes, edges)
    # None appears only as a target, never as a source, so that node is returned.
    assert codeflash_output == {"id": None}

def test_duplicate_node_ids():
    """Both id=1 duplicates count as sources, leaving node 2 as the last node."""
    nodes = [{"id": 1}, {"id": 1}, {"id": 2}]
    edges = [{"source": 1, "target": 2}]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": 2}

# -------------------
# Large Scale Test Cases
# -------------------

def test_large_linear_chain():
    """A 1000-node linear chain ends at the node with id n - 1."""
    n = 1000
    nodes = [{"id": i} for i in range(n)]
    edges = [{"source": i, "target": i + 1} for i in range(n - 1)]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": n - 1}

def test_large_star_graph():
    """The central node 0 points at 999 leaves; the first leaf (id 1) is returned."""
    n = 1000
    nodes = [{"id": 0}] + [{"id": i} for i in range(1, n)]
    edges = [{"source": 0, "target": i} for i in range(1, n)]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": 1}

def test_large_disconnected_nodes():
    """1000 nodes and no edges: the first node is returned."""
    n = 1000
    nodes = [{"id": i} for i in range(n)]
    edges = []
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": 0}

def test_large_complete_graph():
    """In a complete directed graph every node is a source, so the result is None."""
    n = 100
    nodes = [{"id": i} for i in range(n)]
    edges = [{"source": i, "target": j} for i in range(n) for j in range(n) if i != j]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output is None

def test_performance_with_sparse_edges():
    """1000 nodes with only 10 edges (sources 0..9): node 10 is the first non-source."""
    n = 1000
    nodes = [{"id": i} for i in range(n)]
    # The edges are deterministic: node i points to node i + 1 for i in 0..9.
    edges = [{"source": i, "target": (i + 1) % n} for i in range(10)]
    codeflash_output = find_last_node(nodes, edges)
    assert codeflash_output == {"id": 10}
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

To edit these changes, run `git checkout codeflash/optimize-find_last_node-mb73154c` and push.

Codeflash

Here’s a faster version. The key optimization:  
- Build a set of all "source" IDs up front, O(E), and then scan nodes and find the first whose "id" is not in that set, O(N).  
This avoids the O(N*E) behavior of the original function.
@codeflash-ai codeflash-ai bot added the ⚡️ codeflash Optimization PR opened by Codeflash AI label May 27, 2025
@codeflash-ai codeflash-ai bot requested a review from aseembits93 May 27, 2025 22:22
@KRRT7 KRRT7 closed this Jun 4, 2025
@codeflash-ai codeflash-ai bot deleted the codeflash/optimize-find_last_node-mb73154c branch June 4, 2025 07:33
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

⚡️ codeflash Optimization PR opened by Codeflash AI

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant