Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions remaining_test_issues.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
<!--
SPDX-FileCopyrightText: 2026 Knitli Inc.

SPDX-License-Identifier: MIT OR Apache-2.0
-->
# Remaining Test Issues & Proposed Solutions

This document outlines the current state of the test suite after initial fixes, including specific issues that are blocking tests from passing and proposed solutions for each.
Expand Down
9 changes: 9 additions & 0 deletions ruff.toml
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,15 @@ convention = "google"
"src/codeweaver/providers/vector_stores/inmemory.py" = [
"C901", # too complex - vector store initialization has many conditions
]
# Provider config validation has complex asymmetric compatibility checking
"src/codeweaver/providers/config/categories/embedding.py" = [
"C901", # too complex - asymmetric validation has inherent complexity
"G201", # logger.exception preferred - existing logging style
"SIM102", # nested if - validates multiple independent conditions
"TRY300", # else block - return-in-try pattern intentional
"TRY301", # abstract raise - validation raises are intentional inline
"TRY401", # redundant exception in logging - existing style
]
"tools/tests/**/*.py" = [
"ANN201", # missing return type annotation for public function
"ANN002",
Expand Down
4 changes: 2 additions & 2 deletions scripts/language-support/export-classifications.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ def main() -> int:
"modification": profile["modification"],
"debugging": profile["debugging"],
"documentation": profile["documentation"],
}
},
}

with (output_dir / "_scoring.json").open("w", encoding="utf-8") as f:
Expand Down Expand Up @@ -442,7 +442,7 @@ def main() -> int:
print(f" High confidence: {total_confident:,} ({meta['confident_pct']}%)")
print(f" Unclassified: {len(unclassified_things):,}")
print(
f" Universal exact: {len(universal_exact):,} thing names classified same in all languages"
f" Universal exact: {len(universal_exact):,} thing names classified same in all languages",
)
print(f" Universal 75%+: {len(universal_majority):,} thing names with majority agreement")
print("\n Tier distribution:")
Expand Down
10 changes: 5 additions & 5 deletions scripts/language-support/generate-overrides.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -189,11 +189,11 @@ def max_value_len(entries: list[dict], *, is_doc: bool = False) -> int:
lines.append(f"# {language.capitalize()} language classification overrides")
lines.append(
f"# Holdout evaluation: {baseline_pct}% overall"
f" \u2192 expected {expected_label} with overrides"
f" \u2192 expected {expected_label} with overrides",
)
lines.append(
f"# {len(misclassifications)} items to override"
f" ({n_unclassified} unclassified + {n_misclassified} misclassified)"
f" ({n_unclassified} unclassified + {n_misclassified} misclassified)",
)

# [overrides] section
Expand All @@ -219,7 +219,7 @@ def max_value_len(entries: list[dict], *, is_doc: bool = False) -> int:


def _emit_entries(
lines: list[str], entries: list[dict], key_width: int, val_width: int, *, is_doc: bool
lines: list[str], entries: list[dict], key_width: int, val_width: int, *, is_doc: bool,
) -> None:
"""Emit sorted, tier-grouped TOML entries with aligned columns."""
current_tier: int | None = None
Expand Down Expand Up @@ -264,15 +264,15 @@ def _emit_doc_entries(lines: list[str], entries: list[dict], key_width: int) ->

def main() -> None: # sourcery skip: low-code-quality
parser = argparse.ArgumentParser(
description="Generate TOML override files from holdout evaluation data."
description="Generate TOML override files from holdout evaluation data.",
)
parser.add_argument(
"--all",
action="store_true",
help="Regenerate ALL languages, even those with existing overrides.",
)
parser.add_argument(
"--lang", nargs="+", metavar="LANG", help="Generate overrides for specific languages only."
"--lang", nargs="+", metavar="LANG", help="Generate overrides for specific languages only.",
)
parser.add_argument(
"--dry-run",
Expand Down
28 changes: 14 additions & 14 deletions scripts/language-support/holdout-evaluation.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def load_overrides(overrides_dir: Path, lang: str) -> dict[str, str]:


def build_universal_rules(
classifications_dir: Path, holdout_lang: str
classifications_dir: Path, holdout_lang: str,
) -> tuple[dict[str, str], dict[str, str], dict[str, str]]:
"""Build universal rules excluding the holdout language.

Expand Down Expand Up @@ -183,7 +183,7 @@ def classify_thing_universal(


def evaluate_holdout(
classifications_dir: Path, holdout_lang: str, *, use_overrides: bool = False
classifications_dir: Path, holdout_lang: str, *, use_overrides: bool = False,
) -> dict[str, Any]: # sourcery skip: low-code-quality
"""Run holdout evaluation for a single language."""

Expand All @@ -197,7 +197,7 @@ def evaluate_holdout(

# Build universal rules excluding this language
exact_rules, majority_rules, category_rules = build_universal_rules(
classifications_dir, holdout_lang
classifications_dir, holdout_lang,
)

# Load overrides if requested
Expand Down Expand Up @@ -226,7 +226,7 @@ def evaluate_holdout(
continue

predicted, method = classify_thing_universal(
entry, exact_rules, majority_rules, category_rules, overrides
entry, exact_rules, majority_rules, category_rules, overrides,
)
method_counts[method] += 1

Expand Down Expand Up @@ -303,20 +303,20 @@ def evaluate_holdout(


def print_summary_table(
label: str, results: list[dict[str, Any]]
label: str, results: list[dict[str, Any]],
) -> tuple[float, float, float, float]:
"""Print a summary table and return averages."""
w = max(12, max((len(r["language"]) for r in results), default=12) + 2)
print(
f"\n {'Language':<{w}s} {'Coverage':>8s} {'Accuracy':>8s} {'Overall':>8s} {'Tier-Wtd':>8s} {'Uncls':>5s} {'Wrong':>5s}"
f"\n {'Language':<{w}s} {'Coverage':>8s} {'Accuracy':>8s} {'Overall':>8s} {'Tier-Wtd':>8s} {'Uncls':>5s} {'Wrong':>5s}",
)
print(f" {'─' * w} {'─' * 8} {'─' * 8} {'─' * 8} {'─' * 8} {'─' * 5} {'─' * 5}")
for r in results:
ovr = f" (+{r['override_count']})" if r.get("override_count") else ""
print(
f" {r['language']:<{w}s} {r['coverage_pct']:>7.1f}% {r['accuracy_pct']:>7.1f}% "
f"{r['overall_accuracy_pct']:>7.1f}% {r['tier_weighted_accuracy_pct']:>7.1f}% "
f"{r['unclassified']:>5d} {r['incorrect']:>5d}{ovr}"
f"{r['unclassified']:>5d} {r['incorrect']:>5d}{ovr}",
)

avg_cov = sum(r["coverage_pct"] for r in results) / len(results)
Expand All @@ -326,7 +326,7 @@ def print_summary_table(
print(f" {'─' * w} {'─' * 8} {'─' * 8} {'─' * 8} {'─' * 8} {'─' * 5} {'─' * 5}")
print(
f" {'AVERAGE':<{w}s} {avg_cov:>7.1f}% {avg_acc:>7.1f}% "
f"{avg_ovr:>7.1f}% {avg_tier:>7.1f}%"
f"{avg_ovr:>7.1f}% {avg_tier:>7.1f}%",
)
return avg_cov, avg_acc, avg_ovr, avg_tier

Expand All @@ -343,7 +343,7 @@ def main() -> int: # sourcery skip: low-code-quality

parser = argparse.ArgumentParser(description="Holdout evaluation for language classifications")
parser.add_argument(
"--all", action="store_true", help="Evaluate ALL languages (not just holdout set)"
"--all", action="store_true", help="Evaluate ALL languages (not just holdout set)",
)
parser.add_argument("--lang", nargs="+", help="Evaluate specific language(s)")
args = parser.parse_args()
Expand Down Expand Up @@ -406,7 +406,7 @@ def main() -> int: # sourcery skip: low-code-quality
print(
f" {lang:<14s} overall={result['overall_accuracy_pct']:5.1f}% "
f"({result['correct']}/{result['total_things']} correct, "
f"{result['unclassified']} uncls, {result['incorrect']} wrong)"
f"{result['unclassified']} uncls, {result['incorrect']} wrong)",
)

_display_phase_intro("\n", "PHASE 1 SUMMARY")
Expand Down Expand Up @@ -442,7 +442,7 @@ def main() -> int: # sourcery skip: low-code-quality
f" {lang:<14s} overall={result['overall_accuracy_pct']:5.1f}% "
f"({result['correct']}/{result['total_things']} correct, "
f"{result['unclassified']} uncls, {result['incorrect']} wrong)"
f"{marker}"
f"{marker}",
)

_display_phase_intro("\n", "PHASE 2 SUMMARY")
Expand All @@ -456,7 +456,7 @@ def main() -> int: # sourcery skip: low-code-quality
_display_phase_intro("\n\n", "COMPARISON: Baseline vs With Overrides")
w = max(12, max((len(r["language"]) for r in baseline_results), default=12) + 2)
print(
f"\n {'Language':<{w}s} {'Baseline':>8s} {'Override':>8s} {'Delta':>7s} {'Override Lines':>14s}"
f"\n {'Language':<{w}s} {'Baseline':>8s} {'Override':>8s} {'Delta':>7s} {'Override Lines':>14s}",
)
print(f" {'─' * w} {'─' * 8} {'─' * 8} {'─' * 7} {'─' * 14}")

Expand All @@ -469,7 +469,7 @@ def main() -> int: # sourcery skip: low-code-quality
print(
f" {b['language']:<{w}s} {b['overall_accuracy_pct']:>7.1f}% "
f"{o['overall_accuracy_pct']:>7.1f}% {sign}{delta:>5.1f}% "
f"{ovr_count:>14d}"
f"{ovr_count:>14d}",
)

delta_overall = avg_o[2] - avg_b[2]
Expand All @@ -493,7 +493,7 @@ def main() -> int: # sourcery skip: low-code-quality
total_overrides = sum(r["override_count"] for r in override_results)
langs_with_overrides = sum(r["override_count"] > 0 for r in override_results)
print(
f" Override cost: {total_overrides} lines across {langs_with_overrides} files"
f" Override cost: {total_overrides} lines across {langs_with_overrides} files",
)

if final_avg[2] >= 95:
Expand Down
Empty file modified scripts/model_data/hf-models.json.license
100644 → 100755
Empty file.
Empty file modified scripts/model_data/secondary_providers.json
100644 → 100755
Empty file.
Empty file modified scripts/model_data/secondary_providers.json.license
100644 → 100755
Empty file.
Empty file modified scripts/performance_baseline.py
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion src/codeweaver/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1766,8 +1766,8 @@
"SnowflakeEmbeddingCapabilities",
"SnowflakeProvider",
"SparseCapabilities",
"SparseEmbedding",
"SparseCapabilityResolverDep",
"SparseEmbedding",
"SparseEmbeddingCapabilityResolver",
"SparseEmbeddingConfigT",
"SparseEmbeddingModelCapabilities",
Expand Down
Loading
Loading