diff --git a/.gitignore b/.gitignore
index d883573..ba3e687 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,6 +38,7 @@ htmlcov/
 nosetests.xml
 coverage.xml
 *.coveragerc
+coverage.json
 
 # IDEs
 .vscode/
@@ -79,3 +80,6 @@ repomix/repomix-output.*
 .agentready-config.yaml
 !.agentready-config.example.yaml
 docs/_site/
+
+# Claude
+.claude/settings.local.json
diff --git a/tests/unit/test_cli_benchmark.py b/tests/unit/test_cli_benchmark.py
new file mode 100644
index 0000000..7797c5c
--- /dev/null
+++ b/tests/unit/test_cli_benchmark.py
@@ -0,0 +1,620 @@
+"""Unit tests for benchmark CLI commands.
+
+Test Strategy:
+    - Uses Click's CliRunner with isolated filesystem for CLI command testing
+    - Mocks external dependencies (_real_tbench_result, compare_assessor_impact)
+    - Uses actual data models (HarborComparison, HarborRunMetrics) for type safety
+    - Tests both high-level commands (benchmark, validate_assessor) and internal helpers (_run_tbench)
+    - Covers CLI argument parsing, validation, and error handling
+
+Coverage Target:
+    - Achieves 80% coverage of cli/benchmark.py
+    - All commands (benchmark, validate-assessor) tested
+    - Helper functions (_run_tbench) tested independently
+    - Edge cases: missing API keys, invalid inputs, file system operations
+
+Test Fixtures:
+    - runner: Click test runner for CLI command invocation
+    - temp_repo: Temporary git repository structure
+    - mock_tbench_result: Mock Terminal-Bench evaluation result
+    - mock_comparison: Harbor comparison for assessor validation testing
+
+Note on Directory Creation:
+    Tests create output directories explicitly before invocation to match
+    real-world usage where the CLI creates directories on demand.
+"""
+
+import tempfile
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+from click.testing import CliRunner
+
+from agentready.cli.benchmark import (
+    DEFAULT_PHASE1_TASKS,
+    _run_tbench,
+    benchmark,
+    validate_assessor,
+)
+from agentready.models.harbor import HarborComparison, HarborRunMetrics
+
+
+@pytest.fixture
+def runner():
+    """Create Click test runner."""
+    return CliRunner()
+
+
+@pytest.fixture
+def temp_repo():
+    """Create a temporary git repository."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        repo_path = Path(tmpdir)
+        (repo_path / ".git").mkdir()
+        yield repo_path
+
+
+@pytest.fixture
+def mock_tbench_result():
+    """Create mock Terminal-Bench result."""
+    result = MagicMock()
+    result.score = 75.5
+    result.task_solved = 10
+    result.resolved_trials = 10
+    result.unresolved_trials = 0
+    result.pass_at_1 = 0.90
+    result.trajectory_path = "/path/to/trajectory.json"
+    return result
+
+
+@pytest.fixture
+def mock_comparison():
+    """Create mock Harbor comparison for assessor validation.
+
+    Simulates assessor A/B test results showing:
+    - Baseline (assessor fails): 50% success rate
+    - Treatment (assessor passes): 100% success rate
+    - Impact: +50pp success rate when assessor criteria met
+    """
+    # Baseline: assessor forced to fail
+    without_metrics = HarborRunMetrics(
+        run_id="without_20240101_120000",
+        agent_file_enabled=False,
+        task_results=[],
+        success_rate=50.0,
+        completion_rate=100.0,
+        avg_duration_sec=12.5,
+        total_tasks=2,
+        successful_tasks=1,
+        failed_tasks=1,
+        timed_out_tasks=0,
+    )
+
+    # Treatment: assessor passes normally
+    with_metrics = HarborRunMetrics(
+        run_id="with_20240101_120000",
+        agent_file_enabled=True,
+        task_results=[],
+        success_rate=100.0,
+        completion_rate=100.0,
+        avg_duration_sec=10.0,
+        total_tasks=2,
+        successful_tasks=2,
+        failed_tasks=0,
+        timed_out_tasks=0,
+    )
+
+    return HarborComparison(
+        created_at="2024-01-01T12:00:00",  # Fixed timestamp for test determinism
+        without_agent=without_metrics,
+        with_agent=with_metrics,
+        deltas={
+            "success_rate_delta": 50.0,
+            "avg_duration_delta_sec": -2.5,
+            "avg_duration_delta_pct": -20.0,
+        },
+        statistical_significance={
+            "success_rate_significant": True,
+            "duration_significant": False,
+        },
+        per_task_comparison=[],
+    )
+
+
+class TestBenchmarkCommand:
+    """Test benchmark CLI command."""
+
+    @patch("agentready.cli.benchmark._run_tbench")
+    def test_benchmark_basic_execution(self, mock_run, runner, temp_repo):
+        """Test basic benchmark command execution."""
+        result = runner.invoke(
+            benchmark,
+            [str(temp_repo), "--harness", "tbench", "--subset", "smoketest"],
+        )
+
+        # Should succeed
+        assert result.exit_code == 0
+        mock_run.assert_called_once()
+
+    @patch("agentready.cli.benchmark._run_tbench")
+    def test_benchmark_defaults_to_current_dir(self, mock_run, runner):
+        """Test benchmark defaults to current directory."""
+        with runner.isolated_filesystem():
+            Path(".git").mkdir()
+
+            result = runner.invoke(
+                benchmark,
+                ["--subset", "smoketest"],
+            )
+
+            # Should use current directory
+            assert result.exit_code == 0
+            mock_run.assert_called_once()
+
+    @patch("agentready.cli.benchmark._run_tbench")
+    def test_benchmark_with_verbose_flag(self, mock_run, runner, temp_repo):
+        """Test benchmark command with verbose output."""
+        result = runner.invoke(
+            benchmark,
+            [str(temp_repo), "--verbose", "--subset", "smoketest"],
+        )
+
+        assert result.exit_code == 0
+        # Verbose flag passed to _run_tbench
+        _, _, _, verbose, _, _, _ = mock_run.call_args[0]
+        assert verbose is True
+
+    @patch("agentready.cli.benchmark._run_tbench")
+    def test_benchmark_with_custom_timeout(self, mock_run, runner, temp_repo):
+        """Test benchmark with custom timeout."""
+        result = runner.invoke(
+            benchmark,
+            [str(temp_repo), "--timeout", "7200", "--subset", "smoketest"],
+        )
+
+        assert result.exit_code == 0
+        _, _, _, _, timeout, _, _ = mock_run.call_args[0]
+        assert timeout == 7200
+
+    @patch("agentready.cli.benchmark._run_tbench")
+    def test_benchmark_with_output_dir(self, mock_run, runner, temp_repo):
+        """Test benchmark with custom output directory."""
+        result = runner.invoke(
+            benchmark,
+            [
+                str(temp_repo),
+                "--output-dir",
+                "/custom/output",
+                "--subset",
+                "smoketest",
+            ],
+        )
+
+        assert result.exit_code == 0
+        _, _, _, _, _, output_dir, _ = mock_run.call_args[0]
+        assert output_dir == "/custom/output"
+
+    @patch("agentready.cli.benchmark._run_tbench")
+    def test_benchmark_skip_preflight(self, mock_run, runner, temp_repo):
+        """Test benchmark with skip-preflight flag."""
+        result = runner.invoke(
+            benchmark,
+            [str(temp_repo), "--skip-preflight", "--subset", "smoketest"],
+        )
+
+        assert result.exit_code == 0
+        _, _, _, _, _, _, skip_preflight = mock_run.call_args[0]
+        assert skip_preflight is True
+
+    def test_benchmark_unknown_harness(self, runner, temp_repo):
+        """Test benchmark with unknown harness."""
+        result = runner.invoke(
+            benchmark,
+            [str(temp_repo), "--harness", "unknown"],
+        )
+
+        # Should fail (but unknown won't be accepted by Click's Choice validation)
+        assert result.exit_code != 0
+
+    @patch("agentready.cli.benchmark._run_tbench")
+    def test_benchmark_with_model_selection(self, mock_run, runner, temp_repo):
+        """Test benchmark with different models."""
+        result = runner.invoke(
+            benchmark,
+            [
+                str(temp_repo),
+                "--model",
+                "claude-sonnet-4-5",
+                "--subset",
+                "smoketest",
+            ],
+        )
+
+        assert result.exit_code == 0
+        _, _, model, _, _, _, _ = mock_run.call_args[0]
+        assert model == "claude-sonnet-4-5"
+
+
+class TestRunTbench:
+    """Test _run_tbench internal function."""
+
+    @patch("agentready.cli.benchmark._real_tbench_result")
+    @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"})
+    def test_run_tbench_smoketest(self, mock_result, tmp_path, mock_tbench_result):
+        """Test running tbench with smoketest subset."""
+        mock_result.return_value = mock_tbench_result
+
+        # Create mock repository
+        repo_path = tmp_path / "repo"
+        repo_path.mkdir()
+
+        # Should not raise
+        _run_tbench(
+            repo_path=repo_path,
+            subset="smoketest",
+            model="claude-haiku-4-5",
+            verbose=False,
+            timeout=3600,
+            output_dir=None,
+            skip_preflight=True,  # Skip preflight to avoid dependencies
+        )
+
+        # Should call _real_tbench_result
+        mock_result.assert_called_once()
+
+    @patch("agentready.cli.benchmark._real_tbench_result")
+    @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"})
+    def test_run_tbench_full_subset(self, mock_result, tmp_path, mock_tbench_result):
+        """Test running tbench with full subset."""
+        mock_result.return_value = mock_tbench_result
+
+        repo_path = tmp_path / "repo"
+        repo_path.mkdir()
+
+        _run_tbench(
+            repo_path=repo_path,
+            subset="full",
+            model="claude-haiku-4-5",
+            verbose=False,
+            timeout=3600,
+            output_dir=None,
+            skip_preflight=True,
+        )
+
+        mock_result.assert_called_once()
+
+    @patch("agentready.cli.benchmark.click.echo")
+    @patch("agentready.cli.benchmark.click.Abort")
+    def test_run_tbench_invalid_subset(self, mock_abort, mock_echo, tmp_path):
+        """Test tbench with invalid subset."""
+        repo_path = tmp_path / "repo"
+        repo_path.mkdir()
+
+        with pytest.raises(Exception):
+            _run_tbench(
+                repo_path=repo_path,
+                subset="invalid",
+                model="claude-haiku-4-5",
+                verbose=False,
+                timeout=3600,
+                output_dir=None,
+                skip_preflight=True,
+            )
+
+    @patch.dict("os.environ", {}, clear=True)
+    @patch("agentready.cli.benchmark.click.echo")
+    @patch("agentready.cli.benchmark.click.Abort")
+    def test_run_tbench_missing_api_key(self, mock_abort, mock_echo, tmp_path):
+        """Test tbench fails without API key."""
+        repo_path = tmp_path / "repo"
+        repo_path.mkdir()
+
+        with pytest.raises(Exception):
+            _run_tbench(
+                repo_path=repo_path,
+                subset="smoketest",
+                model="claude-haiku-4-5",
+                verbose=False,
+                timeout=3600,
+                output_dir=None,
+                skip_preflight=True,
+            )
+
+    @patch("agentready.cli.benchmark._real_tbench_result")
+    @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"})
+    def test_run_tbench_defaults_to_full(
+        self, mock_result, tmp_path, mock_tbench_result
+    ):
+        """Test tbench defaults to full subset when None specified."""
+        mock_result.return_value = mock_tbench_result
+
+        repo_path = tmp_path / "repo"
+        repo_path.mkdir()
+
+        _run_tbench(
+            repo_path=repo_path,
+            subset=None,  # Should default to 'full'
+            model="claude-haiku-4-5",
+            verbose=False,
+            timeout=3600,
+            output_dir=None,
+            skip_preflight=True,
+        )
+
+        # Check that HarborConfig was created with smoketest=False
+        mock_result.assert_called_once()
+        harbor_config = mock_result.call_args[0][1]
+        assert harbor_config.smoketest is False
+
+    @patch("agentready.cli.benchmark._real_tbench_result")
+    @patch("agentready.cli.benchmark.click.echo")
+    @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"})
+    def test_run_tbench_exception_handling(self, mock_echo, mock_result, tmp_path):
+        """Test tbench handles exceptions gracefully."""
+        mock_result.side_effect = Exception("Benchmark error")
+
+        repo_path = tmp_path / "repo"
+        repo_path.mkdir()
+
+        with pytest.raises(Exception):
+            _run_tbench(
+                repo_path=repo_path,
+                subset="smoketest",
+                model="claude-haiku-4-5",
+                verbose=False,
+                timeout=3600,
+                output_dir=None,
+                skip_preflight=True,
+            )
+
+
+class TestValidateAssessorCommand:
+    """Test validate-assessor CLI command."""
+
+    @patch("agentready.cli.benchmark.AssessorStateToggler")
+    def test_list_assessors(self, mock_toggler_class, runner):
+        """Test --list-assessors flag."""
+        mock_toggler = MagicMock()
+        mock_toggler.list_supported_assessors.return_value = [
+            "claude_md_file",
+            "readme_structure",
+            "test_coverage",
+        ]
+        mock_toggler_class.return_value = mock_toggler
+
+        result = runner.invoke(validate_assessor, ["--list-assessors"])
+
+        # Should succeed
+        assert result.exit_code == 0
+        assert "claude_md_file" in result.output
+        assert "readme_structure" in result.output
+        assert "test_coverage" in result.output
+
+    def test_validate_missing_assessor_flag(self, runner):
+        """Test validate-assessor without --assessor flag."""
+        result = runner.invoke(validate_assessor, [])
+
+        # Should fail
+        assert result.exit_code != 0
+        assert "Missing required option" in result.output
+
+    @patch("agentready.cli.benchmark.compare_assessor_impact")
+    @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"})
+    def test_validate_assessor_basic(self, mock_compare, runner, mock_comparison):
+        """Test basic assessor validation."""
+        mock_compare.return_value = mock_comparison
+
+        with runner.isolated_filesystem():
+            # Create output directory structure
+            Path(".agentready/validations/claude_md_file").mkdir(
+                parents=True, exist_ok=True
+            )
+
+            result = runner.invoke(
+                validate_assessor,
+                ["--assessor", "claude_md_file", "--smoketest"],
+            )
+
+            # Should succeed
+            assert result.exit_code == 0
+            assert "Results saved" in result.output
+            mock_compare.assert_called_once()
+
+    @patch("agentready.cli.benchmark.compare_assessor_impact")
+    @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"})
+    def test_validate_assessor_with_custom_tasks(
+        self, mock_compare, runner, mock_comparison
+    ):
+        """Test validation with custom tasks."""
+        mock_compare.return_value = mock_comparison
+
+        with runner.isolated_filesystem():
+            # Create output directory structure
+            Path(".agentready/validations/readme_structure").mkdir(
+                parents=True, exist_ok=True
+            )
+
+            result = runner.invoke(
+                validate_assessor,
+                [
+                    "--assessor",
+                    "readme_structure",
+                    "--tasks",
+                    "adaptive-rejection-sampler",
+                    "--tasks",
+                    "async-http-client",
+                ],
+            )
+
+            assert result.exit_code == 0
+            # Check that custom tasks were passed
+            _, kwargs = mock_compare.call_args
+            assert kwargs["task_names"] == [
+                "adaptive-rejection-sampler",
+                "async-http-client",
+            ]
+
+    @patch("agentready.cli.benchmark.compare_assessor_impact")
+    @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"})
+    def test_validate_assessor_with_runs(self, mock_compare, runner, mock_comparison):
+        """Test validation with custom number of runs."""
+        mock_compare.return_value = mock_comparison
+
+        with runner.isolated_filesystem():
+            # Create output directory structure
+            Path(".agentready/validations/test_coverage").mkdir(
+                parents=True, exist_ok=True
+            )
+
+            result = runner.invoke(
+                validate_assessor,
+                ["--assessor", "test_coverage", "--runs", "5", "--smoketest"],
+            )
+
+            assert result.exit_code == 0
+            _, kwargs = mock_compare.call_args
+            assert kwargs["runs_per_task"] == 5
+
+    @patch("agentready.cli.benchmark.compare_assessor_impact")
+    @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"})
+    def test_validate_assessor_default_tasks(
+        self, mock_compare, runner, mock_comparison
+    ):
+        """Test validation uses default Phase 1 tasks."""
+        mock_compare.return_value = mock_comparison
+
+        with runner.isolated_filesystem():
+            # Create output directory structure
+            Path(".agentready/validations/claude_md_file").mkdir(
+                parents=True, exist_ok=True
+            )
+
+            result = runner.invoke(
+                validate_assessor,
+                ["--assessor", "claude_md_file"],
+            )
+
+            assert result.exit_code == 0
+            # Should use DEFAULT_PHASE1_TASKS
+            _, kwargs = mock_compare.call_args
+            assert kwargs["task_names"] == DEFAULT_PHASE1_TASKS
+
+    @patch("agentready.cli.benchmark.compare_assessor_impact")
+    @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"})
+    def test_validate_assessor_smoketest_mode(
+        self, mock_compare, runner, mock_comparison
+    ):
+        """Test smoketest mode uses single task."""
+        mock_compare.return_value = mock_comparison
+
+        with runner.isolated_filesystem():
+            # Create output directory structure
+            Path(".agentready/validations/claude_md_file").mkdir(
+                parents=True, exist_ok=True
+            )
+
+            result = runner.invoke(
+                validate_assessor,
+                ["--assessor", "claude_md_file", "--smoketest"],
+            )
+
+            assert result.exit_code == 0
+            # Smoketest should use only 1 task
+            _, kwargs = mock_compare.call_args
+            assert kwargs["task_names"] == ["adaptive-rejection-sampler"]
+
+    @patch.dict("os.environ", {}, clear=True)
+    def test_validate_assessor_missing_api_key(self, runner):
+        """Test validation fails without API key."""
+        result = runner.invoke(
+            validate_assessor,
+            ["--assessor", "claude_md_file"],
+        )
+
+        # Should fail
+        assert result.exit_code != 0
+        assert "ANTHROPIC_API_KEY" in result.output
+
+    @patch("agentready.cli.benchmark.compare_assessor_impact")
+    @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"})
+    def test_validate_assessor_value_error(self, mock_compare, runner):
+        """Test validation handles unsupported assessor."""
+        mock_compare.side_effect = ValueError("Unsupported assessor")
+
+        result = runner.invoke(
+            validate_assessor,
+            ["--assessor", "invalid_assessor", "--smoketest"],
+        )
+
+        # Should fail gracefully
+        assert result.exit_code != 0
+        assert "Error:" in result.output
+
+    @patch("agentready.cli.benchmark.compare_assessor_impact")
+    @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"})
+    def test_validate_assessor_creates_output_files(
+        self, mock_compare, runner, mock_comparison
+    ):
+        """Test validation creates JSON and Markdown files."""
+        mock_compare.return_value = mock_comparison
+
+        with runner.isolated_filesystem():
+            output_dir = Path("output")
+            # Create output directory structure
+            output_dir.mkdir(parents=True, exist_ok=True)
+
+            result = runner.invoke(
+                validate_assessor,
+                [
+                    "--assessor",
+                    "claude_md_file",
+                    "--output-dir",
+                    str(output_dir),
+                    "--smoketest",
+                ],
+            )
+
+            assert result.exit_code == 0
+            # Check files were created
+            assert (output_dir / "claude_md_file.json").exists()
+            assert (output_dir / "claude_md_file.md").exists()
+
+    @patch("agentready.cli.benchmark.compare_assessor_impact")
+    @patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"})
+    def test_validate_assessor_concurrent_flag(
+        self, mock_compare, runner, mock_comparison
+    ):
+        """Test validation with concurrent tasks."""
+        mock_compare.return_value = mock_comparison
+
+        with runner.isolated_filesystem():
+            # Create output directory structure
+            Path(".agentready/validations/claude_md_file").mkdir(
+                parents=True, exist_ok=True
+            )
+
+            result = runner.invoke(
+                validate_assessor,
+                ["--assessor", "claude_md_file", "--concurrent", "5", "--smoketest"],
+            )
+
+            assert result.exit_code == 0
+            _, kwargs = mock_compare.call_args
+            assert kwargs["n_concurrent"] == 5
+
+
+class TestPhase1Tasks:
+    """Test DEFAULT_PHASE1_TASKS constant."""
+
+    def test_phase1_tasks_defined(self):
+        """Test that Phase 1 tasks are defined."""
+        assert len(DEFAULT_PHASE1_TASKS) == 8
+        assert "adaptive-rejection-sampler" in DEFAULT_PHASE1_TASKS
+        assert "async-http-client" in DEFAULT_PHASE1_TASKS
+
+    def test_phase1_tasks_diversity(self):
+        """Test that Phase 1 tasks cover diverse categories."""
+        # Just check that we have a good variety
+        assert all(isinstance(task, str) for task in DEFAULT_PHASE1_TASKS)
+        assert all("-" in task for task in DEFAULT_PHASE1_TASKS)
diff --git a/tests/unit/test_cli_harbor.py b/tests/unit/test_cli_harbor.py
new file mode 100644
index 0000000..6a82ff1
--- /dev/null
+++ b/tests/unit/test_cli_harbor.py
@@ -0,0 +1,725 @@
+"""Unit tests for Harbor CLI commands.
+
+Test Strategy:
+    - Uses Click's CliRunner with isolated filesystem for CLI command testing
+    - Mocks external dependencies (HarborRunner, AgentFileToggler, parse_harbor_results)
+    - Uses actual data models (HarborComparison, HarborRunMetrics) for type safety
+    - Covers success paths, error handling, and edge cases
+    - Helper functions tested independently from CLI commands
+
+Coverage Target:
+    - Achieves 96% coverage of cli/harbor.py
+    - All commands (compare, list, view) tested
+    - Helper functions (_run_benchmark_phase, _generate_reports, _create_latest_symlinks) tested
+    - Error conditions and validation logic covered
+
+Test Fixtures:
+    - runner: Click test runner for CLI command invocation
+    - temp_repo: Temporary git repository with agent file structure
+    - mock_task_results: Sample Harbor task results with realistic data
+    - mock_comparison: Complete Harbor comparison object for testing report generation
+"""
+
+import json
+import tempfile
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+from click.testing import CliRunner
+
+from agentready.cli.harbor import (
+    _create_latest_symlinks,
+    _generate_reports,
+    _run_benchmark_phase,
+    compare,
+    harbor_cli,
+    list_comparisons,
+    view_comparison,
+)
+from agentready.models.harbor import (
+    HarborComparison,
+    HarborRunMetrics,
+    HarborTaskResult,
+)
+
+
+@pytest.fixture
+def runner():
+    """Create Click test runner."""
+    return CliRunner()
+
+
+@pytest.fixture
+def temp_repo():
+    """Create a temporary git repository with agent file."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        repo_path = Path(tmpdir)
+        (repo_path / ".git").mkdir()
+
+        # Create agent file
+        agent_dir = repo_path / ".claude" / "agents"
+        agent_dir.mkdir(parents=True)
+        (agent_dir / "doubleagent.md").write_text("# Agent file content")
+
+        yield repo_path
+
+
+@pytest.fixture
+def mock_task_results():
+    """Create mock Harbor task results."""
+    return [
+        HarborTaskResult(
+            task_name="test-task-1",
+            trial_name="trial_1",
+            success=True,
+            duration_sec=10.5,
+            agent_result={"status": "success"},
+            verifier_result={"passed": True},
+            exception_info=None,
+            started_at="2024-01-01T12:00:00",
+            finished_at="2024-01-01T12:00:10",
+        ),
+        HarborTaskResult(
+            task_name="test-task-2",
+            trial_name="trial_2",
+            success=True,
+            duration_sec=15.2,
+            agent_result={"status": "success"},
+            verifier_result={"passed": True},
+            exception_info=None,
+            started_at="2024-01-01T12:01:00",
+            finished_at="2024-01-01T12:01:15",
+        ),
+    ]
+
+
+@pytest.fixture
+def mock_comparison():
+    """Create mock Harbor comparison.
+
+    Simulates an A/B test comparison showing:
+    - Baseline (without agent): 50% success rate, 12.5s avg duration
+    - Treatment (with agent): 100% success rate, 10.0s avg duration
+    - Delta: +50pp success rate improvement, -2.5s duration improvement
+    """
+    # Baseline metrics (agent disabled)
+    without_metrics = HarborRunMetrics(
+        run_id="without_20240101_120000",
+        agent_file_enabled=False,
+        task_results=[],
+        success_rate=50.0,
+        completion_rate=100.0,
+        avg_duration_sec=12.5,
+        total_tasks=2,
+        successful_tasks=1,
+        failed_tasks=1,
+        timed_out_tasks=0,
+    )
+
+    # Treatment metrics (agent enabled)
+    with_metrics = HarborRunMetrics(
+        run_id="with_20240101_120000",
+        agent_file_enabled=True,
+        task_results=[],
+        success_rate=100.0,
+        completion_rate=100.0,
+        avg_duration_sec=10.0,
+        total_tasks=2,
+        successful_tasks=2,
+        failed_tasks=0,
+        timed_out_tasks=0,
+    )
+
+    # Comparison with deltas and statistical significance
+    return HarborComparison(
+        created_at="2024-01-01T12:00:00",  # Fixed timestamp for determinism
+        without_agent=without_metrics,
+        with_agent=with_metrics,
+        deltas={
+            "success_rate_delta": 50.0,  # 50 percentage point improvement
+            "avg_duration_delta_sec": -2.5,  # 2.5 second improvement
+            "avg_duration_delta_pct": -20.0,  # 20% faster
+        },
+        statistical_significance={
+            "success_rate_significant": True,
+            "duration_significant": False,
+        },
+        per_task_comparison=[],
+    )
+
+
+class TestRunBenchmarkPhase:
+    """Test _run_benchmark_phase helper function."""
+
+    @patch("agentready.cli.harbor.click.echo")
+    def test_run_without_agent(self, mock_echo, tmp_path):
+        """Test running benchmark phase without agent."""
+        mock_runner = MagicMock()
+        mock_toggler = MagicMock()
+
+        output_dir = tmp_path / "output"
+        output_dir.mkdir()
+
+        result = _run_benchmark_phase(
+            runner=mock_runner,
+            toggler=mock_toggler,
+            phase_name="WITHOUT agent",
+            run_number=1,
+            output_dir=output_dir,
+            task_list=["task1", "task2"],
+            model="anthropic/claude-sonnet-4-5",
+            verbose=False,
+            disable_agent=True,
+        )
+
+        # Should use context manager for agent toggling
+        mock_toggler.temporarily_disabled.assert_called_once()
+
+        # Should run benchmark
+        assert mock_runner.run_benchmark.called
+
+        # Should return output directory
+        assert result == output_dir
+
+    @patch("agentready.cli.harbor.click.echo")
+    def test_run_with_agent(self, mock_echo, tmp_path):
+        """Test running benchmark phase with agent."""
+        mock_runner = MagicMock()
+        mock_toggler = MagicMock()
+
+        output_dir = tmp_path / "output"
+        output_dir.mkdir()
+
+        result = _run_benchmark_phase(
+            runner=mock_runner,
+            toggler=mock_toggler,
+            phase_name="WITH agent",
+            run_number=2,
+            output_dir=output_dir,
+            task_list=["task1"],
+            model="anthropic/claude-sonnet-4-5",
+            verbose=True,
+            disable_agent=False,
+        )
+
+        # Should NOT use context manager when agent enabled
+        mock_toggler.temporarily_disabled.assert_not_called()
+
+        # Should run benchmark with verbose
+        mock_runner.run_benchmark.assert_called_once_with(
+            task_names=["task1"],
+            output_dir=output_dir,
+            model="anthropic/claude-sonnet-4-5",
+            verbose=True,
+        )
+
+        assert result == output_dir
+
+    @patch("agentready.cli.harbor.click.echo")
+    @patch("agentready.cli.harbor.click.Abort")
+    def test_run_handles_exception(self, mock_abort, mock_echo, tmp_path):
+        """Test benchmark phase handles exceptions."""
+        mock_runner = MagicMock()
+        mock_runner.run_benchmark.side_effect = Exception("Benchmark failed")
+        mock_toggler = MagicMock()
+
+        with pytest.raises(Exception):
+            _run_benchmark_phase(
+                runner=mock_runner,
+                toggler=mock_toggler,
+                phase_name="TEST",
+                run_number=1,
+                output_dir=tmp_path,
+                task_list=["task1"],
+                model="anthropic/claude-sonnet-4-5",
+                verbose=False,
+                disable_agent=False,
+            )
+
+
+class TestGenerateReports:
+    """Test _generate_reports helper function."""
+
+    @patch("agentready.cli.harbor.generate_dashboard")
+    @patch("agentready.cli.harbor.generate_markdown_report")
+    @patch("agentready.cli.harbor._create_latest_symlinks")
+    @patch("agentready.cli.harbor.click.echo")
+    def test_generates_all_formats(
+        self,
+        mock_echo,
+        mock_symlinks,
+        mock_markdown,
+        mock_dashboard,
+        tmp_path,
+        mock_comparison,
+    ):
+        """Test report generation creates JSON, Markdown, and HTML."""
+        run_dir = tmp_path / "run_123"
+        run_dir.mkdir()
+        output_dir = tmp_path
+
+        paths = _generate_reports(
+            comparison=mock_comparison,
+            run_dir=run_dir,
+            output_dir=output_dir,
+            timestamp="20240101_120000",
+        )
+
+        # Should generate all three formats
+        assert "json" in paths
+        assert "markdown" in paths
+        assert "html" in paths
+
+        # JSON file should exist
+        assert paths["json"].exists()
+
+        # Should call generators
+        mock_markdown.assert_called_once()
+        mock_dashboard.assert_called_once()
+        mock_symlinks.assert_called_once()
+
+    @patch("agentready.cli.harbor.generate_dashboard")
+    @patch("agentready.cli.harbor.generate_markdown_report")
+    @patch("agentready.cli.harbor._create_latest_symlinks")
+    @patch("agentready.cli.harbor.click.echo")
+    def test_json_content_valid(
+        self,
+        mock_echo,
+        mock_symlinks,
+        mock_markdown,
+        mock_dashboard,
+        tmp_path,
+        mock_comparison,
+    ):
+        """Test JSON report contains valid comparison data."""
+        run_dir = tmp_path / "run_123"
+        run_dir.mkdir()
+
+        paths = _generate_reports(
+            comparison=mock_comparison,
+            run_dir=run_dir,
+            output_dir=tmp_path,
+            timestamp="20240101_120000",
+        )
+
+        # Read and validate JSON
+        with open(paths["json"]) as f:
+            data = json.load(f)
+
+        assert "created_at" in data
+        assert "without_agent" in data
+        assert "with_agent" in data
+        assert "deltas" in data
+
+
+class TestCreateLatestSymlinks:
+    """Test _create_latest_symlinks helper function."""
+
+    @patch("agentready.cli.harbor.click.echo")
+    def test_creates_symlinks(self, mock_echo, tmp_path):
+        """Test symlink creation for latest comparison."""
+        # Create source files
+        run_dir = tmp_path / "run_123"
+        run_dir.mkdir()
+
+        json_file = run_dir / "comparison_123.json"
+        json_file.write_text("{}")
+
+        md_file = run_dir / "comparison_123.md"
+        md_file.write_text("# Report")
+
+        html_file = run_dir / "comparison_123.html"
+        html_file.write_text("")
+
+        paths = {
+            "json": json_file,
+            "markdown": md_file,
+            "html": html_file,
+        }
+
+        # Create symlinks
+        _create_latest_symlinks(paths, tmp_path)
+
+        # Verify symlinks exist
+        assert (tmp_path / "comparison_latest.json").is_symlink()
+        assert (tmp_path / "comparison_latest.md").is_symlink()
+        assert (tmp_path / "comparison_latest.html").is_symlink()
+
+    @patch("agentready.cli.harbor.click.echo")
+    def test_replaces_existing_symlinks(self, mock_echo, tmp_path):
+        """Test symlink replacement for updates."""
+        # Create old files
+        old_dir = tmp_path / "run_old"
+        old_dir.mkdir()
+        old_file = old_dir / "comparison_old.json"
+        old_file.write_text("{}")
+
+        # Create old symlink
+        old_symlink = tmp_path / "comparison_latest.json"
+        old_symlink.symlink_to(old_file.relative_to(tmp_path))
+
+        # Create new files
+        new_dir = tmp_path / "run_new"
+        new_dir.mkdir()
+        new_file = new_dir / "comparison_new.json"
+        new_file.write_text("{}")
+
+        paths = {"json": new_file}
+
+        # Update symlink
+        _create_latest_symlinks(paths, tmp_path)
+
+        # Symlink should point to new file
+        assert old_symlink.is_symlink()
+        assert old_symlink.resolve() == new_file.resolve()
+
+    @patch("agentready.cli.harbor.click.echo")
+    def test_handles_symlink_errors_gracefully(self, mock_echo, tmp_path):
+        """Test symlink creation handles errors gracefully."""
+        paths = {
+            "json": tmp_path / "nonexistent.json",
+        }
+
+        # Should not raise exception
+        _create_latest_symlinks(paths, tmp_path)
+
+
+class TestCompareCommand:
+    """Test harbor compare CLI command."""
+
+    @patch("agentready.cli.harbor.HarborRunner")
+    @patch("agentready.cli.harbor.AgentFileToggler")
+    @patch("agentready.cli.harbor._run_benchmark_phase")
+    @patch("agentready.cli.harbor.parse_harbor_results")
+    @patch("agentready.cli.harbor.compare_runs")
+    @patch("agentready.cli.harbor._generate_reports")
+    @patch("agentready.cli.harbor.DashboardGenerator")
+    def test_compare_basic_execution(
+        self,
+        mock_dashboard_gen,
+        mock_gen_reports,
+        mock_compare_runs,
+        mock_parse,
+        mock_run_phase,
+        mock_toggler,
+        mock_runner_class,
+        runner,
+        temp_repo,
+        mock_task_results,
+        mock_comparison,
+    ):
+        """Test basic compare command execution."""
+        # Setup mocks
+        mock_runner_class.return_value = MagicMock()
+        mock_run_phase.return_value = temp_repo / "results"
+        mock_parse.return_value = mock_task_results
+        mock_compare_runs.return_value = mock_comparison
+        mock_gen_reports.return_value = {"json": temp_repo / "comparison.json"}
+        mock_dashboard_gen.return_value.generate_summary_text.return_value = "Summary"
+
+        # Run command
+        result = runner.invoke(
+            compare,
+            [
+                "--task",
+                "test-task-1",
+                "--task",
+                "test-task-2",
+                "--agent-file",
+                str(temp_repo / ".claude/agents/doubleagent.md"),
+                "--output-dir",
+                str(temp_repo / "output"),
+            ],
+        )
+
+        # Should succeed
+        assert result.exit_code == 0
+        assert "Harbor Benchmark Comparison" in result.output
+        assert "Summary" in result.output
+
+        # Should run benchmarks twice (with and without agent)
+        assert mock_run_phase.call_count == 2
+
+    def test_compare_missing_agent_file(self, runner, temp_repo):
+        """Test compare command with missing agent file."""
+        result = runner.invoke(
+            compare,
+            [
+                "--task",
+                "test-task",
+                "--agent-file",
+                str(temp_repo / "nonexistent.md"),
+            ],
+        )
+
+        # Should fail (Click validates path before function runs)
+        assert result.exit_code != 0
+        assert "does not exist" in result.output
+
+    def test_compare_no_tasks_specified(self, runner, temp_repo):
+        """Test compare command without tasks."""
+        result = runner.invoke(
+            compare,
+            [
+                "--agent-file",
+                str(temp_repo / ".claude/agents/doubleagent.md"),
+            ],
+        )
+
+        # Should fail
+        assert result.exit_code != 0
+        assert "At least one task must be specified" in result.output
+
+    @patch("agentready.cli.harbor.HarborRunner")
+    def test_compare_harbor_not_installed(self, mock_runner_class, runner, temp_repo):
+        """Test compare command when Harbor not installed."""
+        from agentready.services.harbor.runner import HarborNotInstalledError
+
+        mock_runner_class.side_effect = HarborNotInstalledError("Harbor not found")
+
+        result = runner.invoke(
+            compare,
+            [
+                "--task",
+                "test-task",
+                "--agent-file",
+                str(temp_repo / ".claude/agents/doubleagent.md"),
+            ],
+        )
+
+        # Should fail gracefully
+        assert result.exit_code != 0
+        assert "Harbor not found" in result.output
+
+    @patch("agentready.cli.harbor.HarborRunner")
+    @patch("agentready.cli.harbor.AgentFileToggler")
+    @patch("agentready.cli.harbor._run_benchmark_phase")
+    @patch("agentready.cli.harbor.parse_harbor_results")
+    @patch("agentready.cli.harbor.compare_runs")
+    @patch("agentready.cli.harbor._generate_reports")
+    @patch("agentready.cli.harbor.DashboardGenerator")
+    @patch("webbrowser.open")
+    def test_compare_open_dashboard(
+        self,
+        mock_webbrowser_open,
+        mock_dashboard_gen,
+        mock_gen_reports,
+        mock_compare_runs,
+        mock_parse,
+        mock_run_phase,
+        mock_toggler,
+        mock_runner_class,
+        runner,
+        temp_repo,
+        mock_task_results,
+        mock_comparison,
+    ):
+        """Test compare command with --open-dashboard flag."""
+        # Setup mocks
+        mock_runner_class.return_value = MagicMock()
+        mock_run_phase.return_value = temp_repo / "results"
+        mock_parse.return_value = mock_task_results
+        mock_compare_runs.return_value = mock_comparison
+
+        html_path = temp_repo / "comparison.html"
+        html_path.write_text("")
+        mock_gen_reports.return_value = {"html": html_path}
+        mock_dashboard_gen.return_value.generate_summary_text.return_value = "Summary"
+
+        # Run command with open-dashboard flag
+        result = runner.invoke(
+            compare,
+            [
+                "--task",
+                "test-task",
+                "--agent-file",
+                str(temp_repo / ".claude/agents/doubleagent.md"),
+                "--open-dashboard",
+            ],
+        )
+
+        # Should succeed
+        assert result.exit_code == 0
+
+        # Should open browser
+        mock_webbrowser_open.assert_called_once()
+
+    @patch("agentready.cli.harbor.HarborRunner")
+    @patch("agentready.cli.harbor.AgentFileToggler")
+    @patch("agentready.cli.harbor._run_benchmark_phase")
+    @patch("agentready.cli.harbor.parse_harbor_results")
+    def test_compare_parse_results_failure(
+        self,
+        mock_parse,
+        mock_run_phase,
+        mock_toggler,
+        mock_runner_class,
+        runner,
+        temp_repo,
+    ):
+        """Test compare command handles result parsing errors."""
+        # Setup mocks
+        mock_runner_class.return_value = MagicMock()
+        mock_run_phase.return_value = temp_repo / "results"
+        mock_parse.side_effect = Exception("Parse error")
+
+        result = runner.invoke(
+            compare,
+            [
+                "--task",
+                "test-task",
+                "--agent-file",
+                str(temp_repo / ".claude/agents/doubleagent.md"),
+            ],
+        )
+
+        # Should fail gracefully
+        assert result.exit_code != 0
+        assert "Failed to parse results" in result.output
+
+
+class TestListComparisonsCommand:
+    """Test harbor list CLI command."""
+
+    def test_list_empty_directory(self, runner, tmp_path):
+        """Test list command with no comparisons."""
+        output_dir = tmp_path / "comparisons"
+        output_dir.mkdir()
+
+        result = runner.invoke(
+            list_comparisons,
+            ["--output-dir", str(output_dir)],
+        )
+
+        # Should succeed
+        assert result.exit_code == 0
+        assert "No comparisons found" in result.output
+
+    def test_list_with_comparisons(self, runner, tmp_path, mock_comparison):
+        """Test list command with existing comparisons."""
+        output_dir = tmp_path / "comparisons"
+        output_dir.mkdir()
+
+        # Create comparison files
+        run1 = output_dir / "run_20240101_120000"
+        run1.mkdir()
+        comp1 = run1 / "comparison_20240101_120000.json"
+        comp1.write_text(json.dumps(mock_comparison.to_dict()))
+
+        run2 = output_dir / "run_20240102_120000"
+        run2.mkdir()
+        comp2 = run2 / "comparison_20240102_120000.json"
+        comp2.write_text(json.dumps(mock_comparison.to_dict()))
+
+        result = runner.invoke(
+            list_comparisons,
+            ["--output-dir", str(output_dir)],
+        )
+
+        # Should succeed
+        assert result.exit_code == 0
+        assert "run_20240101_120000" in result.output
+        assert "run_20240102_120000" in result.output
+        assert "Success Δ:" in result.output
+        assert "Duration Δ:" in result.output
+
+    def test_list_nonexistent_directory(self, runner, tmp_path):
+        """Test list command with nonexistent directory."""
+        result = runner.invoke(
+            list_comparisons,
+            ["--output-dir", str(tmp_path / "nonexistent")],
+        )
+
+        # Should fail
+        assert result.exit_code != 0
+
+
+class TestViewComparisonCommand:
+    """Test harbor view CLI command."""
+
+    @patch("agentready.cli.harbor.DashboardGenerator")
+    def test_view_summary_format(
+        self, mock_dashboard_gen, runner, tmp_path, mock_comparison
+    ):
+        """Test view command with summary format."""
+        # Create comparison file
+        comp_file = tmp_path / "comparison.json"
+        comp_file.write_text(json.dumps(mock_comparison.to_dict()))
+
+        mock_dashboard_gen.return_value.generate_summary_text.return_value = (
+            "Test Summary"
+        )
+
+        result = runner.invoke(
+            view_comparison,
+            [str(comp_file), "--format", "summary"],
+        )
+
+        # Should succeed
+        assert result.exit_code == 0
+        assert "Test Summary" in result.output
+
+    def test_view_full_format(self, runner, tmp_path, mock_comparison):
+        """Test view command with full JSON format."""
+        # Create comparison file
+        comp_file = tmp_path / "comparison.json"
+        comp_file.write_text(json.dumps(mock_comparison.to_dict()))
+
+        result = runner.invoke(
+            view_comparison,
+            [str(comp_file), "--format", "full"],
+        )
+
+        # Should succeed
+        assert result.exit_code == 0
+        # Should output JSON
+        assert "without_agent" in result.output
+        assert "with_agent" in result.output
+
+    def test_view_nonexistent_file(self, runner, tmp_path):
+        """Test view command with nonexistent file."""
+        result = runner.invoke(
+            view_comparison,
+            [str(tmp_path / "nonexistent.json")],
+        )
+
+        # Should fail
+        assert result.exit_code != 0
+
+    def test_view_default_format(self, runner, tmp_path, mock_comparison):
+        """Test view command defaults to summary format."""
+        comp_file = tmp_path / "comparison.json"
+        comp_file.write_text(json.dumps(mock_comparison.to_dict()))
+
+        with patch("agentready.cli.harbor.DashboardGenerator") as mock_gen:
+            mock_gen.return_value.generate_summary_text.return_value = "Summary"
+
+            result = runner.invoke(
+                view_comparison,
+                [str(comp_file)],
+            )
+
+            # Should use summary format by default
+            assert result.exit_code == 0
+            mock_gen.return_value.generate_summary_text.assert_called_once()
+
+
+class TestHarborCLIGroup:
+    """Test harbor CLI group."""
+
+    def test_harbor_group_help(self, runner):
+        """Test harbor CLI group shows help."""
+        result = runner.invoke(harbor_cli, ["--help"])
+
+        assert result.exit_code == 0
+        assert "Harbor benchmark comparison commands" in result.output
+        assert "compare" in result.output
+        assert "list" in result.output
+        assert "view" in result.output
+
+    def test_harbor_group_has_commands(self):
+        """Test harbor CLI group has expected commands."""
+        assert "compare" in harbor_cli.commands
+        assert "list" in harbor_cli.commands
+        assert "view" in harbor_cli.commands