diff --git a/invokeai/app/services/invocation_stats/invocation_stats_common.py b/invokeai/app/services/invocation_stats/invocation_stats_common.py
index f4c906a58f7..344dc57a559 100644
--- a/invokeai/app/services/invocation_stats/invocation_stats_common.py
+++ b/invokeai/app/services/invocation_stats/invocation_stats_common.py
@@ -58,10 +58,10 @@ class InvocationStatsSummary:
     def __str__(self) -> str:
         _str = ""
         _str = f"Graph stats: {self.graph_stats.graph_execution_state_id}\n"
-        _str += f"{'Node':>30} {'Calls':>7} {'Seconds':>9} {'VRAM Used':>10}\n"
+        _str += f"{'Node':>30} {'Calls':>7} {'Seconds':>9} {'VRAM Change':+>10}\n"
 
         for summary in self.node_stats:
-            _str += f"{summary.node_type:>30} {summary.num_calls:>7} {summary.time_used_seconds:>8.3f}s {summary.peak_vram_gb:>9.3f}G\n"
+            _str += f"{summary.node_type:>30} {summary.num_calls:>7} {summary.time_used_seconds:>8.3f}s {summary.peak_vram_gb:+10.3f}G\n"
 
         _str += f"TOTAL GRAPH EXECUTION TIME: {self.graph_stats.execution_time_seconds:7.3f}s\n"
 
diff --git a/invokeai/app/services/invocation_stats/invocation_stats_default.py b/invokeai/app/services/invocation_stats/invocation_stats_default.py
index 0219d5036ea..85d77e3b44f 100644
--- a/invokeai/app/services/invocation_stats/invocation_stats_default.py
+++ b/invokeai/app/services/invocation_stats/invocation_stats_default.py
@@ -52,8 +52,9 @@ def collect_stats(self, invocation: BaseInvocation, graph_execution_state_id: st
         # Record state before the invocation.
         start_time = time.time()
         start_ram = psutil.Process().memory_info().rss
-        if torch.cuda.is_available():
-            torch.cuda.reset_peak_memory_stats()
+
+        # Remember current VRAM usage
+        vram_in_use = torch.cuda.memory_allocated() if torch.cuda.is_available() else 0.0
 
         assert services.model_manager.load is not None
         services.model_manager.load.ram_cache.stats = self._cache_stats[graph_execution_state_id]
@@ -62,14 +63,16 @@ def collect_stats(self, invocation: BaseInvocation, graph_execution_state_id: st
             # Let the invocation run.
             yield None
         finally:
-            # Record state after the invocation.
+            # Record delta VRAM
+            delta_vram_gb = ((torch.cuda.memory_allocated() - vram_in_use) / GB) if torch.cuda.is_available() else 0.0
+
             node_stats = NodeExecutionStats(
                 invocation_type=invocation.get_type(),
                 start_time=start_time,
                 end_time=time.time(),
                 start_ram_gb=start_ram / GB,
                 end_ram_gb=psutil.Process().memory_info().rss / GB,
-                peak_vram_gb=torch.cuda.max_memory_allocated() / GB if torch.cuda.is_available() else 0.0,
+                peak_vram_gb=delta_vram_gb,
            )
 
             self._stats[graph_execution_state_id].add_node_execution_stats(node_stats)
@@ -81,6 +84,8 @@ def get_stats(self, graph_execution_state_id: str) -> InvocationStatsSummary:
         graph_stats_summary = self._get_graph_summary(graph_execution_state_id)
         node_stats_summaries = self._get_node_summaries(graph_execution_state_id)
         model_cache_stats_summary = self._get_model_cache_summary(graph_execution_state_id)
+        # Note: We use memory_allocated() here (not memory_reserved()) because we want to show
+        # the current actively-used VRAM, not the total reserved memory including PyTorch's cache.
         vram_usage_gb = torch.cuda.memory_allocated() / GB if torch.cuda.is_available() else None
 
         return InvocationStatsSummary(
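
For context, a minimal standalone sketch of the measurement pattern the new collect_stats code uses: record torch.cuda.memory_allocated() before the node runs and report the signed difference afterwards. The measure_vram_delta helper, the kept list, and the GB constant below are illustrative stand-ins rather than InvokeAI code, and the usage lines assume a CUDA device.

import torch

GB = 2**30  # assumed to match the service's GB constant (bytes per GiB)


def measure_vram_delta(fn) -> float:
    """Run fn() and return the net change in allocated VRAM, in GB.

    torch.cuda.memory_allocated() counts tensor memory that is currently
    live, so allocations served from PyTorch's caching allocator are
    included, while memory that is merely reserved (cached) is not.
    """
    if not torch.cuda.is_available():
        fn()
        return 0.0
    before = torch.cuda.memory_allocated()
    fn()
    return (torch.cuda.memory_allocated() - before) / GB


# The signed result is what the new report line renders with "{:+10.3f}";
# a node that frees more VRAM than it allocates would show a negative value.
kept = []
delta_gb = measure_vram_delta(lambda: kept.append(torch.zeros((4096, 4096), device="cuda")))
print(f"{delta_gb:+10.3f}G")  # roughly "    +0.062G" for a 64 MiB float32 tensor
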
diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache.py b/invokeai/backend/model_manager/load/model_cache/model_cache.py
index 33818af6100..fc48217e787 100644
--- a/invokeai/backend/model_manager/load/model_cache/model_cache.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_cache.py
@@ -240,6 +240,9 @@ def stats(self) -> Optional[CacheStats]:
     def stats(self, stats: CacheStats) -> None:
         """Set the CacheStats object for collecting cache statistics."""
         self._stats = stats
+        # Populate the cache size in the stats object when it's set
+        if self._stats is not None:
+            self._stats.cache_size = self._ram_cache_size_bytes
 
     def _record_activity(self) -> None:
         """Record model activity and reset the timeout timer if configured.
diff --git a/tests/app/services/invocation_stats/__init__.py b/tests/app/services/invocation_stats/__init__.py
new file mode 100644
index 00000000000..36d7115b98c
--- /dev/null
+++ b/tests/app/services/invocation_stats/__init__.py
@@ -0,0 +1 @@
+# Tests for invocation stats service
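
The tests in the new file below lean on a property of PyTorch's caching allocator: deleting a tensor returns its memory to PyTorch's cache, so torch.cuda.memory_allocated() drops while torch.cuda.memory_reserved() typically stays put, and a later allocation can be served from the cache without reserving new VRAM. That is why the delta above is computed from memory_allocated(): nodes that reuse cached blocks still register a positive change. A rough standalone illustration, assuming a CUDA device (exact values depend on allocator state):

import torch

assert torch.cuda.is_available(), "illustration assumes a CUDA device"

t = torch.zeros((5000, 5000), device="cuda")   # ~100 MB of float32
print(torch.cuda.memory_allocated())           # includes the ~100 MB tensor
print(torch.cuda.memory_reserved())            # >= memory_allocated()

del t                                          # freed, but the block stays in PyTorch's cache
print(torch.cuda.memory_allocated())           # drops back down
print(torch.cuda.memory_reserved())            # typically unchanged

t2 = torch.zeros((4000, 4000), device="cuda")  # ~64 MB, can be served from the cached block
print(torch.cuda.memory_allocated())           # rises again even though no new VRAM was reserved

torch.cuda.empty_cache()                       # release cached blocks back to the driver
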
+ """ + from invokeai.app.services.invocation_stats.invocation_stats_default import InvocationStatsService + + # Create a minimal mock invoker with only what we need + mock_invoker = Mock() + mock_invoker.services.model_manager.load.ram_cache.stats = Mock() + + stats_service = InvocationStatsService() + stats_service.start(mock_invoker) + + graph_execution_state_id = "test_graph_id_reuse" + + tensor1 = None + tensor2 = None + try: + # First invocation: allocate and then free some VRAM + mock_invocation_1 = Mock() + mock_invocation_1.get_type.return_value = "first_gpu_node" + + with stats_service.collect_stats(mock_invocation_1, graph_execution_state_id): + tensor1 = torch.zeros((5000, 5000), device="cuda") # ~100MB + + # Free the tensor but keep it in PyTorch's cache + del tensor1 + tensor1 = None + + # Second invocation: allocate memory that fits in the cache + # This should still show VRAM usage even though it's reusing cached memory + mock_invocation_2 = Mock() + mock_invocation_2.get_type.return_value = "second_gpu_node_reuses_cache" + + with stats_service.collect_stats(mock_invocation_2, graph_execution_state_id): + tensor2 = torch.zeros((4000, 4000), device="cuda") # ~64MB, fits in cached space + + # Get the stats + summary = stats_service.get_stats(graph_execution_state_id) + + # Find stats for each node type + node_stats_dict = {stat.node_type: stat for stat in summary.node_stats} + + # Both nodes should show VRAM usage + assert node_stats_dict["first_gpu_node"].peak_vram_gb > 0.05, ( + f"First GPU node should show VRAM usage, got {node_stats_dict['first_gpu_node'].peak_vram_gb:.3f}G" + ) + + # This is the critical test - the second node should NOT show 0 even though + # it's reusing cached memory + assert node_stats_dict["second_gpu_node_reuses_cache"].peak_vram_gb > 0.03, ( + f"Second GPU node should show VRAM usage even when reusing cache, " + f"got {node_stats_dict['second_gpu_node_reuses_cache'].peak_vram_gb:.3f}G" + ) + finally: + # Clean up + if tensor1 is not None: + del tensor1 + if tensor2 is not None: + del tensor2 + torch.cuda.empty_cache() + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +def test_vram_tracking_with_gpu_operations(): + """Test that nodes with GPU operations report correct VRAM usage.""" + from invokeai.app.services.invocation_stats.invocation_stats_default import InvocationStatsService + + # Create a minimal mock invoker with only what we need + mock_invoker = Mock() + mock_invoker.services.model_manager.load.ram_cache.stats = Mock() + + stats_service = InvocationStatsService() + stats_service.start(mock_invoker) + + # Create a mock invocation + mock_invocation = Mock() + mock_invocation.get_type.return_value = "test_gpu_node" + + graph_execution_state_id = "test_graph_id_2" + + test_tensor = None + try: + # Collect stats for a node that allocates VRAM + with stats_service.collect_stats(mock_invocation, graph_execution_state_id): + # Allocate a significant amount of VRAM + test_tensor = torch.zeros((10000, 10000), device="cuda") # ~400MB + + # Get the stats + summary = stats_service.get_stats(graph_execution_state_id) + node_stats = summary.node_stats[0] + + # The peak VRAM should reflect the allocation we made (roughly 400MB = 0.4GB) + assert node_stats.peak_vram_gb > 0.1, ( + f"Expected significant VRAM usage for node with GPU operations, but got {node_stats.peak_vram_gb:.3f}G" + ) + finally: + # Clean up + if test_tensor is not None: + del test_tensor + torch.cuda.empty_cache() + + +@pytest.mark.skipif(not 
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_vram_tracking_multiple_invocations():
+    """Test that VRAM tracking works correctly across multiple invocations."""
+    from invokeai.app.services.invocation_stats.invocation_stats_default import InvocationStatsService
+
+    # Create a minimal mock invoker with only what we need
+    mock_invoker = Mock()
+    mock_invoker.services.model_manager.load.ram_cache.stats = Mock()
+
+    stats_service = InvocationStatsService()
+    stats_service.start(mock_invoker)
+
+    graph_execution_state_id = "test_graph_id_3"
+
+    tensor1 = None
+    tensor2 = None
+    try:
+        # First invocation: allocate some VRAM
+        mock_invocation_1 = Mock()
+        mock_invocation_1.get_type.return_value = "gpu_node"
+
+        with stats_service.collect_stats(mock_invocation_1, graph_execution_state_id):
+            tensor1 = torch.zeros((5000, 5000), device="cuda")  # ~100MB
+
+        # Second invocation: no GPU operations (this is the key test)
+        mock_invocation_2 = Mock()
+        mock_invocation_2.get_type.return_value = "cpu_node"
+
+        with stats_service.collect_stats(mock_invocation_2, graph_execution_state_id):
+            # No GPU operations, but VRAM is still allocated from previous invocation
+            pass
+
+        # Third invocation: more GPU operations
+        mock_invocation_3 = Mock()
+        mock_invocation_3.get_type.return_value = "another_gpu_node"
+
+        with stats_service.collect_stats(mock_invocation_3, graph_execution_state_id):
+            tensor2 = torch.zeros((5000, 5000), device="cuda")  # ~100MB
+
+        # Get the stats
+        summary = stats_service.get_stats(graph_execution_state_id)
+
+        # Find stats for each node type
+        node_stats_dict = {stat.node_type: stat for stat in summary.node_stats}
+
+        # First node should show VRAM usage
+        assert node_stats_dict["gpu_node"].peak_vram_gb > 0.05, (
+            f"First GPU node should show VRAM usage, got {node_stats_dict['gpu_node'].peak_vram_gb:.3f}G"
+        )
+
+        # Second node (CPU-only) should show minimal or zero VRAM usage
+        # This is the critical test - it should NOT show the VRAM from the previous node
+        assert node_stats_dict["cpu_node"].peak_vram_gb < 0.01, (
+            f"CPU node should show near-zero VRAM usage even with prior allocations, "
+            f"got {node_stats_dict['cpu_node'].peak_vram_gb:.3f}G"
+        )
+
+        # Third node should show VRAM usage
+        assert node_stats_dict["another_gpu_node"].peak_vram_gb > 0.05, (
+            f"Third GPU node should show VRAM usage, got {node_stats_dict['another_gpu_node'].peak_vram_gb:.3f}G"
+        )
+    finally:
+        # Clean up
+        if tensor1 is not None:
+            del tensor1
+        if tensor2 is not None:
+            del tensor2
+        torch.cuda.empty_cache()
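
For reference when reading the assertion thresholds in these tests, the expected tensor sizes work out as follows, assuming the default float32 dtype (4 bytes per element) and a GiB-based GB constant like the service's:

GB = 2**30  # assumed to match the service's GB constant

for shape in [(1000, 1000), (4000, 4000), (5000, 5000), (10000, 10000)]:
    n_bytes = shape[0] * shape[1] * 4  # float32
    print(f"{shape}: {n_bytes / GB:.3f} GB")

# (1000, 1000):   0.004 GB  (the dummy tensor allocated outside collect_stats)
# (4000, 4000):   0.060 GB  -> above the 0.03 floor asserted for the cache-reuse node
# (5000, 5000):   0.093 GB  -> above the 0.05 floor asserted for the ~100MB GPU nodes
# (10000, 10000): 0.373 GB  -> above the 0.1 floor asserted in test_vram_tracking_with_gpu_operations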