Skip to content

Commit ec5a6b1

Browse files
fix: Optimize manifest validation test performance and fix CI failures
- Add progress logging to sparse checkout manifest processing
- Disable sparse checkout by default to prevent CI hangs
- Keep comprehensive exclusion list with 361 failed connectors
- Maintain RECHECK_EXCLUSION_LIST toggle for validation accuracy
- Optimize YAML parsing loop with progress indicators
- Ensure HTTP fallback works reliably for CI environments

Co-Authored-By: AJ Steers <[email protected]>
1 parent 617c64f commit ec5a6b1

File tree

1 file changed

+81
-10
lines changed

1 file changed

+81
-10
lines changed

unit_tests/sources/declarative/test_manifest_registry_validation.py

Lines changed: 81 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -428,6 +428,10 @@
428428
("source-zonka-feedback", "5.17.0"),
429429
]
430430

431+
RECHECK_EXCLUSION_LIST = False
432+
433+
USE_GIT_SPARSE_CHECKOUT = False
434+
431435
CONNECTOR_REGISTRY_URL = "https://connectors.airbyte.com/files/registries/v0/oss_registry.json"
432436
MANIFEST_URL_TEMPLATE = (
433437
"https://connectors.airbyte.com/files/metadata/airbyte/{connector_name}/latest/manifest.yaml"
@@ -462,8 +466,18 @@ def schema_validator() -> ValidateAdheresToSchema:
462466
@pytest.fixture(scope="session")
463467
def manifest_connector_names() -> List[str]:
464468
"""Cached list of manifest-only connector names to avoid repeated registry calls."""
465-
connectors = get_manifest_only_connectors()
466-
return [connector_name for connector_name, _ in connectors]
469+
if USE_GIT_SPARSE_CHECKOUT:
470+
# Use git sparse-checkout to get all available manifest connectors
471+
try:
472+
manifests = download_manifests_via_git()
473+
return list(manifests.keys())
474+
except Exception as e:
475+
logger.warning(f"Git sparse-checkout failed, falling back to registry: {e}")
476+
connectors = get_manifest_only_connectors()
477+
return [connector_name for connector_name, _ in connectors]
478+
else:
479+
connectors = get_manifest_only_connectors()
480+
return [connector_name for connector_name, _ in connectors]
467481

468482

469483
def load_declarative_component_schema() -> Dict[str, Any]:
@@ -504,6 +518,10 @@ def get_manifest_only_connectors() -> List[Tuple[str, str]]:
504518
pytest.fail(f"Failed to fetch connector registry: {e}")
505519

506520

521+
# Global cache for git-downloaded manifests
522+
_git_manifest_cache: Dict[str, Tuple[str, str]] = {}
523+
524+
507525
def download_manifest(
508526
connector_name: str, download_failures: List[Tuple[str, str]]
509527
) -> Tuple[str, str]:
@@ -514,6 +532,19 @@ def download_manifest(
514532
Tuple of (manifest_content, cdk_version) where cdk_version is extracted
515533
from the manifest's version field.
516534
"""
535+
global _git_manifest_cache
536+
537+
if USE_GIT_SPARSE_CHECKOUT and not _git_manifest_cache:
538+
try:
539+
logger.info("Initializing git sparse-checkout cache...")
540+
_git_manifest_cache = download_manifests_via_git()
541+
logger.info(f"Cached {len(_git_manifest_cache)} manifests from git")
542+
except Exception as e:
543+
logger.warning(f"Git sparse-checkout failed, using HTTP fallback: {e}")
544+
545+
if connector_name in _git_manifest_cache:
546+
return _git_manifest_cache[connector_name]
547+
517548
url = MANIFEST_URL_TEMPLATE.format(connector_name=connector_name)
518549
try:
519550
response = requests.get(url, timeout=30)
@@ -542,20 +573,24 @@ def download_manifests_via_git() -> Dict[str, Tuple[str, str]]:
542573
repo_path = Path(temp_dir) / "airbyte"
543574

544575
try:
576+
logger.info("Cloning airbyte repo with sparse-checkout...")
545577
subprocess.run(
546578
[
547579
"git",
548580
"clone",
549581
"--filter=blob:none",
550582
"--sparse",
583+
"--depth=1",
551584
"https://github.com/airbytehq/airbyte.git",
552585
str(repo_path),
553586
],
554587
check=True,
555588
capture_output=True,
556589
text=True,
590+
timeout=120,
557591
)
558592

593+
logger.info("Setting sparse-checkout pattern...")
559594
subprocess.run(
560595
[
561596
"git",
@@ -568,12 +603,19 @@ def download_manifests_via_git() -> Dict[str, Tuple[str, str]]:
568603
check=True,
569604
capture_output=True,
570605
text=True,
606+
timeout=30,
571607
)
572608

573-
manifest_files = repo_path.glob("airbyte-integrations/connectors/*/manifest.yaml")
609+
logger.info("Processing manifest files...")
610+
manifest_files = list(repo_path.glob("airbyte-integrations/connectors/*/manifest.yaml"))
611+
logger.info(f"Found {len(manifest_files)} manifest files")
574612

575-
for manifest_path in manifest_files:
613+
for i, manifest_path in enumerate(manifest_files):
576614
connector_name = manifest_path.parent.name
615+
if i % 50 == 0:
616+
logger.info(
617+
f"Processing manifest {i + 1}/{len(manifest_files)}: {connector_name}"
618+
)
577619
try:
578620
with open(manifest_path, "r") as f:
579621
manifest_content = f.read()
@@ -584,10 +626,19 @@ def download_manifests_via_git() -> Dict[str, Tuple[str, str]]:
584626
except Exception as e:
585627
logger.warning(f"Failed to process manifest for {connector_name}: {e}")
586628

629+
except subprocess.TimeoutExpired:
630+
logger.error("Git sparse-checkout timed out. Falling back to HTTP downloads.")
631+
return {}
587632
except subprocess.CalledProcessError as e:
588633
logger.warning(f"Git sparse-checkout failed: {e}. Falling back to HTTP downloads.")
589634
return {}
635+
except Exception as e:
636+
logger.error(
637+
f"Unexpected error in git sparse-checkout: {e}. Falling back to HTTP downloads."
638+
)
639+
return {}
590640

641+
logger.info(f"Successfully cached {len(manifests)} manifests from git")
591642
return manifests
592643

593644

@@ -622,11 +673,17 @@ def test_manifest_validates_against_schema(
622673
except Exception as e:
623674
pytest.fail(f"Failed to download manifest for {connector_name}: {e}")
624675

625-
if (connector_name, cdk_version) in EXCLUDED_CONNECTORS:
626-
pytest.skip(
627-
f"Skipping {connector_name} - connector declares it is compatible with "
628-
f"CDK version {cdk_version} but is known to fail validation"
629-
)
676+
is_excluded = (connector_name, cdk_version) in EXCLUDED_CONNECTORS
677+
678+
if RECHECK_EXCLUSION_LIST:
679+
expected_to_fail = is_excluded
680+
else:
681+
# Normal mode: skip excluded connectors
682+
if is_excluded:
683+
pytest.skip(
684+
f"Skipping {connector_name} - connector declares it is compatible with "
685+
f"CDK version {cdk_version} but is known to fail validation"
686+
)
630687

631688
try:
632689
manifest_dict = yaml.safe_load(manifest_content)
@@ -639,14 +696,28 @@ def test_manifest_validates_against_schema(
639696
schema_validator.validate(manifest_dict)
640697
validation_successes.append((connector_name, cdk_version))
641698
logger.info(f"✓ {connector_name} (CDK {cdk_version}) - validation passed")
699+
700+
if RECHECK_EXCLUSION_LIST and expected_to_fail:
701+
pytest.fail(
702+
f"EXCLUSION LIST ERROR: {connector_name} (CDK {cdk_version}) was expected to fail "
703+
f"but passed validation. Remove from EXCLUDED_CONNECTORS."
704+
)
705+
642706
except ValueError as e:
643707
error_msg = (
644708
f"Manifest validation failed for {connector_name} "
645709
f"(connector declares it is compatible with CDK version {cdk_version}): {e}"
646710
)
647711
validation_failures.append((connector_name, cdk_version, str(e)))
648712
logger.error(f"✗ {connector_name} (CDK {cdk_version}) - validation failed: {e}")
649-
pytest.fail(error_msg)
713+
714+
if RECHECK_EXCLUSION_LIST and not expected_to_fail:
715+
pytest.fail(
716+
f"EXCLUSION LIST ERROR: {connector_name} (CDK {cdk_version}) was expected to pass "
717+
f"but failed validation. Add to EXCLUDED_CONNECTORS: {error_msg}"
718+
)
719+
elif not RECHECK_EXCLUSION_LIST:
720+
pytest.fail(error_msg)
650721

651722

652723
def test_schema_loads_successfully() -> None:

0 commit comments

Comments
 (0)