diff --git a/.github/workflows/code-formatting.yml b/.github/workflows/code-formatting.yml new file mode 100644 index 000000000..4131e9115 --- /dev/null +++ b/.github/workflows/code-formatting.yml @@ -0,0 +1,41 @@ +name: Code Formatting + +on: + workflow_call: + push: + branches: + - main + +jobs: + formatting-check: + name: Code Formatting Check + runs-on: ubuntu-latest + if: github.event.pull_request.draft == false + env: + ZENML_DEBUG: 1 + ZENML_ANALYTICS_OPT_IN: false + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Install latest ruff + run: pip install --upgrade ruff + + - name: Run formatting script + run: bash scripts/format.sh + + - name: Check for changes + id: git-check + run: | + git diff --exit-code || echo "changes=true" >> $GITHUB_OUTPUT + + - name: Fail if changes were made + if: steps.git-check.outputs.changes == 'true' + run: | + echo "::error::Formatting check failed. Please run 'scripts/format.sh' locally and commit the changes." + exit 1 diff --git a/.github/workflows/gpt4_summarizer.yml b/.github/workflows/gpt4_summarizer.yml deleted file mode 100644 index 6288cf74c..000000000 --- a/.github/workflows/gpt4_summarizer.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: Summarize Supabase with GPT-4 and ZenML - -on: - push: - branches: - - project/new-summary-project - -jobs: - build: - runs-on: ubuntu-latest - env: - ZENML_ANALYTICS_OPT_IN: false - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: 3.8 - - - name: Log into GCP - uses: 'google-github-actions/auth@v1' - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - - - name: Run ZenML Pipeline - run: | - cd supabase-openai-summary/src - pip install -r requirements.txt - zenml init - zenml integration install gcp slack -y - zenml connect --url ${{ secrets.ZENML_URL }} --username ${{ secrets.ZENML_USERNAME }} --password ${{ secrets.ZENML_PASSWORD }} - zenml stack set ${{ secrets.ZENML_STACK }} - python run.py diff --git a/.github/workflows/production_run_complete_llm.yml b/.github/workflows/production_run_complete_llm.yml deleted file mode 100644 index 223c09858..000000000 --- a/.github/workflows/production_run_complete_llm.yml +++ /dev/null @@ -1,60 +0,0 @@ -name: Production LLM-COMPLETE -on: - push: - branches: - - main - paths: - - 'llm-complete-guide/**' -concurrency: - # New commit on branch cancels running workflows of the same branch - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - run-production-workflow: - runs-on: ubuntu-latest - if: github.event.pull_request.draft == false - env: - ZENML_STORE_URL: ${{ secrets.ZENML_PROJECTS_HOST }} - ZENML_STORE_API_KEY: ${{ secrets.ZENML_PROJECTS_API_KEY }} - ZENML_PRODUCTION_STACK: b3951d43-0fb2-4d32-89c5-3399374e7c7e # Set this to your production stack ID - ZENML_GITHUB_SHA: ${{ github.event.pull_request.head.sha }} - ZENML_GITHUB_URL_PR: ${{ github.event.pull_request._links.html.href }} - ZENML_DEBUG: true - ZENML_ANALYTICS_OPT_IN: false - ZENML_LOGGING_VERBOSITY: INFO - ZENML_PROJECT_SECRET_NAME: llm-complete - ZENML_DISABLE_CLIENT_SERVER_MISMATCH_WARNING: True - ZENML_EVENT_SOURCE_ID: ae6ae536-d811-4838-a44b-744b768a0f31 # Set this to your preferred event source ID - ZENML_SERVICE_ACCOUNT_ID: fef76af2-382f-4ab2-9e6b-5eb85a303f0e # Set this to your service account ID or delete - - steps: - - name: Check out repository code - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - - name: Install requirements - working-directory: ./llm-complete-guide - run: | - pip3 install uv - uv pip install -r requirements.txt --system - uv pip install -r requirements-argilla.txt --system - zenml integration install gcp -y --uv - - - name: Connect to ZenML server - working-directory: ./llm-complete-guide - run: | - zenml init - - - name: Set stack (Production) - working-directory: ./llm-complete-guide - run: | - zenml stack set ${{ env.ZENML_PRODUCTION_STACK }} - - - name: Run pipeline, create pipeline, configure trigger (Production) - working-directory: ./llm-complete-guide - run: | - python gh_action_rag.py --no-cache --create-template --event-source-id ${{ env.ZENML_EVENT_SOURCE_ID }} --service-account-id ${{ env.ZENML_SERVICE_ACCOUNT_ID }} --config production/rag.yaml --zenml-model-version production diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index bad68db6f..23e271a8a 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -1,4 +1,4 @@ -name: Spell Checking +name: Pull Request Checks on: pull_request: @@ -25,3 +25,11 @@ jobs: markdown-link-check: uses: ./.github/workflows/markdown-link-check.yml if: github.event.pull_request.draft == false + + code-formatting-check: + uses: ./.github/workflows/code-formatting.yml + if: github.event.pull_request.draft == false + + readme-projects-check: + uses: ./.github/workflows/readme-projects-check.yml + if: github.event.pull_request.draft == false diff --git a/.github/workflows/readme-projects-check.yml b/.github/workflows/readme-projects-check.yml new file mode 100644 index 000000000..d82e2f3a2 --- /dev/null +++ b/.github/workflows/readme-projects-check.yml @@ -0,0 +1,21 @@ +name: README Projects Check + +on: + workflow_call: + +jobs: + readme-projects-check: + name: Check Projects in README + runs-on: ubuntu-latest + if: github.event.pull_request.draft == false + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Run README projects check + run: python3 scripts/check_readme_projects.py \ No newline at end of file diff --git a/.github/workflows/staging_run_complete_llm.yml b/.github/workflows/staging_run_complete_llm.yml deleted file mode 100644 index a1f831fa2..000000000 --- a/.github/workflows/staging_run_complete_llm.yml +++ /dev/null @@ -1,68 +0,0 @@ -name: Staging Trigger LLM-COMPLETE -on: - pull_request: - types: [opened, synchronize] - branches: [staging, main] -concurrency: - # New commit on branch cancels running workflows of the same branch - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - run-staging-workflow: - runs-on: ubuntu-latest - if: github.event.pull_request.draft == false - env: - ZENML_STORE_URL: ${{ secrets.ZENML_PROJECTS_HOST }} - ZENML_STORE_API_KEY: ${{ secrets.ZENML_PROJECTS_API_KEY }} - ZENML_STAGING_STACK : 67166d73-a44e-42f9-b67f-011e9afab9b5 # Set this to your staging stack ID - ZENML_GITHUB_SHA: ${{ github.event.pull_request.head.sha }} - ZENML_GITHUB_URL_PR: ${{ github.event.pull_request._links.html.href }} - ZENML_DEBUG: true - ZENML_ANALYTICS_OPT_IN: false - ZENML_LOGGING_VERBOSITY: INFO - ZENML_PROJECT_SECRET_NAME: llm-complete - ZENML_DISABLE_CLIENT_SERVER_MISMATCH_WARNING: True - - steps: - - name: Check out repository code - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - - name: Install requirements - working-directory: ./llm-complete-guide - run: | - pip3 install uv - uv pip install -r requirements.txt --system - uv pip install -r requirements-argilla.txt --system - zenml integration install aws s3 -y --uv - - - name: Connect to ZenML server - working-directory: ./llm-complete-guide - run: | - zenml init - - - name: List and describe ZenML projects - working-directory: ./llm-complete-guide - run: | - zenml project list || echo "Could not list projects" - zenml project describe || echo "Failed to describe project" - - - name: Register Set ZenML project - working-directory: ./llm-complete-guide - run: | - zenml project register llm-complete-guide || echo "Failed to register project" - zenml project set llm-complete-guide || echo "Failed to set project" - - - name: Set stack (Staging) - working-directory: ./llm-complete-guide - run: | - zenml stack set ${{ env.ZENML_STAGING_STACK }} - - - name: Run pipeline (Staging) - working-directory: ./llm-complete-guide - run: | - python gh_action_rag.py --no-cache --config staging/rag.yaml --zenml-model-version staging diff --git a/.gitignore b/.gitignore index 456f7e6e1..851a4f94b 100644 --- a/.gitignore +++ b/.gitignore @@ -167,3 +167,6 @@ finetuned-snowflake-arctic-embed-m-v1.5/ # ollama ignores nohup.out + +# Claude +.claude/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6ab572518..94b88803f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -99,10 +99,23 @@ the ["fork-and-pull" Git workflow](https://github.com/susam/gitpr) 4. Checkout the **main** branch <- `git checkout main`. 5. Create a branch locally off the **main** branch with a succinct but descriptive name. 6. Commit changes to the branch. -7. Push changes to your fork. -8. Open a PR in our repository to the `main` branch and +7. Format your code by running `bash scripts/format.sh` before committing. +8. Push changes to your fork. +9. Open a PR in our repository to the `main` branch and follow the PR template so that we can efficiently review the changes. +#### Code Formatting + +All code must pass our formatting checks before it can be merged. We use [ruff](https://github.com/astral-sh/ruff) for code formatting and linting. + +To format your code locally: +```bash +# Run from the project root +bash scripts/format.sh +``` + +Our CI pipeline automatically checks if your code is properly formatted. If the check fails, you'll need to run the formatting script locally and commit the changes before your PR can be merged. + ### 🚨 Reporting a Vulnerability If you think you have found a vulnerability, and even if you are not sure about it, diff --git a/README.md b/README.md index 0665a9584..5c5b033d8 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,15 @@ installation details. We welcome contributions from anyone to showcase your project built using ZenML. See our [contributing guide](./CONTRIBUTING.md) to start. +## Code Quality + +All code contributions must pass our automated code quality checks: +- **Code Formatting**: We use [ruff](https://github.com/astral-sh/ruff) for code formatting and linting +- **Spelling**: We check for typos and spelling errors +- **Markdown Links**: We verify that all links in documentation work properly + +Our CI pipeline will automatically check your PR for these issues. Remember to run `bash scripts/format.sh` locally before submitting your PR to ensure it passes the formatting checks. + # 🆘 Getting Help By far the easiest and fastest way to get help is to: diff --git a/databricks-production-qa-demo/steps/deployment/deployment_deploy.py b/databricks-production-qa-demo/steps/deployment/deployment_deploy.py index c8220afe9..b7407dcfb 100644 --- a/databricks-production-qa-demo/steps/deployment/deployment_deploy.py +++ b/databricks-production-qa-demo/steps/deployment/deployment_deploy.py @@ -31,14 +31,10 @@ @step(enable_cache=False) -def deployment_deploy() -> ( - Annotated[ - Optional[DatabricksDeploymentService], - ArtifactConfig( - name="databricks_deployment", is_deployment_artifact=True - ), - ] -): +def deployment_deploy() -> Annotated[ + Optional[DatabricksDeploymentService], + ArtifactConfig(name="databricks_deployment", is_deployment_artifact=True), +]: """Predictions step. This is an example of a predictions step that takes the data in and returns diff --git a/gamesense/steps/log_metadata.py b/gamesense/steps/log_metadata.py index 8398bccac..3c83a127a 100644 --- a/gamesense/steps/log_metadata.py +++ b/gamesense/steps/log_metadata.py @@ -15,7 +15,6 @@ # limitations under the License. # -from typing import Any, Dict from zenml import get_step_context, log_metadata, step @@ -33,9 +32,8 @@ def log_metadata_from_step_artifact( """ context = get_step_context() - metadata_dict: Dict[str, Any] = context.pipeline_run.steps[ - step_name - ].outputs[artifact_name] + # Access the artifact metadata but don't store the unused variable + _ = context.pipeline_run.steps[step_name].outputs[artifact_name] log_metadata( artifact_name=artifact_name, diff --git a/huggingface-sagemaker/steps/deploying/huggingface_deployment.py b/huggingface-sagemaker/steps/deploying/huggingface_deployment.py index 89d7305fe..33adcf81d 100644 --- a/huggingface-sagemaker/steps/deploying/huggingface_deployment.py +++ b/huggingface-sagemaker/steps/deploying/huggingface_deployment.py @@ -47,7 +47,9 @@ def deploy_to_huggingface( save_model_to_deploy.entrypoint() logger.info("Model saved locally. Pushing to HuggingFace...") - assert secret, "No secret found with name 'huggingface_creds'. Please create one with your `token`." + assert secret, ( + "No secret found with name 'huggingface_creds'. Please create one with your `token`." + ) token = secret.secret_values["token"] api = HfApi(token=token) diff --git a/huggingface-sagemaker/steps/promotion/promote_get_metrics.py b/huggingface-sagemaker/steps/promotion/promote_get_metrics.py index 93cebad1b..06473701c 100644 --- a/huggingface-sagemaker/steps/promotion/promote_get_metrics.py +++ b/huggingface-sagemaker/steps/promotion/promote_get_metrics.py @@ -27,12 +27,10 @@ @step -def promote_get_metrics() -> ( - Tuple[ - Annotated[Dict[str, Any], "latest_metrics"], - Annotated[Dict[str, Any], "current_metrics"], - ] -): +def promote_get_metrics() -> Tuple[ + Annotated[Dict[str, Any], "latest_metrics"], + Annotated[Dict[str, Any], "current_metrics"], +]: """Get metrics for comparison for promoting a model. This is an example of a metric retrieval step. It is used to retrieve diff --git a/llm-complete-guide/steps/eval_retrieval.py b/llm-complete-guide/steps/eval_retrieval.py index bf3594f8a..ebec42b50 100644 --- a/llm-complete-guide/steps/eval_retrieval.py +++ b/llm-complete-guide/steps/eval_retrieval.py @@ -275,9 +275,9 @@ def perform_small_retrieval_evaluation(use_reranking: bool) -> float: @step -def retrieval_evaluation_small() -> ( - Annotated[float, "small_failure_rate_retrieval"] -): +def retrieval_evaluation_small() -> Annotated[ + float, "small_failure_rate_retrieval" +]: """Executes the retrieval evaluation step without reranking. Returns: @@ -287,9 +287,9 @@ def retrieval_evaluation_small() -> ( @step -def retrieval_evaluation_small_with_reranking() -> ( - Annotated[float, "small_failure_rate_retrieval_reranking"] -): +def retrieval_evaluation_small_with_reranking() -> Annotated[ + float, "small_failure_rate_retrieval_reranking" +]: """Executes the retrieval evaluation step with reranking. Returns: diff --git a/llm-complete-guide/steps/hf_dataset_loader.py b/llm-complete-guide/steps/hf_dataset_loader.py index 5615ba4a4..0c7777573 100644 --- a/llm-complete-guide/steps/hf_dataset_loader.py +++ b/llm-complete-guide/steps/hf_dataset_loader.py @@ -23,9 +23,9 @@ @step(output_materializers=HFDatasetMaterializer) -def load_hf_dataset() -> ( - Tuple[Annotated[Dataset, "train"], Annotated[Dataset, "test"]] -): +def load_hf_dataset() -> Tuple[ + Annotated[Dataset, "train"], Annotated[Dataset, "test"] +]: train_dataset = load_dataset(DATASET_NAME_DEFAULT, split="train") test_dataset = load_dataset(DATASET_NAME_DEFAULT, split="test") return train_dataset, test_dataset diff --git a/llm-complete-guide/steps/populate_index.py b/llm-complete-guide/steps/populate_index.py index c3d197caa..009a83e6d 100644 --- a/llm-complete-guide/steps/populate_index.py +++ b/llm-complete-guide/steps/populate_index.py @@ -117,7 +117,6 @@ def extract_docs_stats( num_buckets = 10 bucket_size = (max_chunk_size - min_chunk_size) / num_buckets buckets = [0] * num_buckets - bucket_ranges = [] for size in chunk_sizes: bucket_index = min( diff --git a/llm-complete-guide/utils/llm_utils.py b/llm-complete-guide/utils/llm_utils.py index ca9e776b3..3d412af03 100644 --- a/llm-complete-guide/utils/llm_utils.py +++ b/llm-complete-guide/utils/llm_utils.py @@ -406,28 +406,6 @@ def get_topn_similar_docs_elasticsearch( """ index_name = "zenml_docs" - if only_urls: - source = ["url"] - elif include_metadata: - source = ["content", "url", "parent_section"] - else: - source = ["content"] - - query = { - "_source": source, - "query": { - "script_score": { - "query": {"match_all": {}}, - "script": { - "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0", - "params": {"query_vector": query_embedding}, - }, - } - }, - "size": n, - } - - # response = es_client.search(index=index_name, body=query) response = es_client.search( index=index_name, knn={ diff --git a/magic-photobooth/frontend.py b/magic-photobooth/frontend.py index ffbb8730a..be2d7d9ac 100644 --- a/magic-photobooth/frontend.py +++ b/magic-photobooth/frontend.py @@ -178,6 +178,7 @@ def inference_mode(): st.warning("No trained models available. Please train a model first.") return + # Model selection - value used in later operations selected_model = st.selectbox( "Choose a trained model", st.session_state.trained_models ) diff --git a/magic-photobooth/k8s_run.py b/magic-photobooth/k8s_run.py index 5f19c702e..4e1d96399 100644 --- a/magic-photobooth/k8s_run.py +++ b/magic-photobooth/k8s_run.py @@ -326,13 +326,11 @@ def generate_video_frames( settings={"orchestrator.kubernetes": kubernetes_settings}, enable_cache=False, ) -def image_to_video() -> ( - Tuple[ - Annotated[PILImage.Image, "generated_image"], - Annotated[bytes, "video_data"], - Annotated[HTMLString, "video_html"], - ] -): +def image_to_video() -> Tuple[ + Annotated[PILImage.Image, "generated_image"], + Annotated[bytes, "video_data"], + Annotated[HTMLString, "video_html"], +]: model_path = f"{TrainConfig().hf_username}/{TrainConfig().hf_repo_suffix}" pipe = AutoPipelineForText2Image.from_pretrained( diff --git a/magic-photobooth/modal_run.py b/magic-photobooth/modal_run.py index ee1409ff0..5cf49ced2 100644 --- a/magic-photobooth/modal_run.py +++ b/magic-photobooth/modal_run.py @@ -308,13 +308,11 @@ def generate_video_frames( settings={"step_operator.modal": modal_settings}, enable_cache=False, ) -def image_to_video() -> ( - Tuple[ - Annotated[PILImage.Image, "generated_image"], - Annotated[bytes, "video_data"], - Annotated[HTMLString, "video_html"], - ] -): +def image_to_video() -> Tuple[ + Annotated[PILImage.Image, "generated_image"], + Annotated[bytes, "video_data"], + Annotated[HTMLString, "video_html"], +]: model_path = f"{TrainConfig().hf_username}/{TrainConfig().hf_repo_suffix}" pipe = AutoPipelineForText2Image.from_pretrained( diff --git a/magic-photobooth/modal_run_using_azure_data.py b/magic-photobooth/modal_run_using_azure_data.py index 1c91fc16f..ff53fba5b 100644 --- a/magic-photobooth/modal_run_using_azure_data.py +++ b/magic-photobooth/modal_run_using_azure_data.py @@ -300,13 +300,11 @@ def generate_video_frames( settings={"step_operator.modal": modal_settings}, enable_cache=False, ) -def image_to_video() -> ( - Tuple[ - Annotated[PILImage.Image, "generated_image"], - Annotated[bytes, "video_data"], - Annotated[HTMLString, "video_html"], - ] -): +def image_to_video() -> Tuple[ + Annotated[PILImage.Image, "generated_image"], + Annotated[bytes, "video_data"], + Annotated[HTMLString, "video_html"], +]: model_path = f"{TrainConfig().hf_username}/{TrainConfig().hf_repo_suffix}" pipe = AutoPipelineForText2Image.from_pretrained( @@ -362,7 +360,7 @@ def image_to_video() -> ( enable_cache=False, ) def dreambooth_pipeline(): - data = load_data() + _ = load_data() # train_model(data, after="load_data") # batch_inference(after="train_model") # image_to_video(after="batch_inference") diff --git a/oncoclear/steps/model_promoter.py b/oncoclear/steps/model_promoter.py index ca73c472f..0c570488d 100644 --- a/oncoclear/steps/model_promoter.py +++ b/oncoclear/steps/model_promoter.py @@ -44,7 +44,7 @@ def model_promoter(accuracy: float, stage: str = "production") -> bool: if accuracy < 0.8: logger.info( - f"Model accuracy {accuracy*100:.2f}% is below 80% ! Not promoting model." + f"Model accuracy {accuracy * 100:.2f}% is below 80% ! Not promoting model." ) else: logger.info(f"Model promoted to {stage}!") diff --git a/retail-forecast/pipelines/training_pipeline.py b/retail-forecast/pipelines/training_pipeline.py index 047948383..26a68f7dd 100644 --- a/retail-forecast/pipelines/training_pipeline.py +++ b/retail-forecast/pipelines/training_pipeline.py @@ -12,14 +12,12 @@ @pipeline(name="retail_forecast_pipeline") -def training_pipeline() -> ( - Tuple[ - Annotated[Dict[str, float], "model_metrics"], - Annotated[HTMLString, "evaluation_report"], - Annotated[HTMLString, "forecast_dashboard"], - Annotated[HTMLString, "sales_visualization"], - ] -): +def training_pipeline() -> Tuple[ + Annotated[Dict[str, float], "model_metrics"], + Annotated[HTMLString, "evaluation_report"], + Annotated[HTMLString, "forecast_dashboard"], + Annotated[HTMLString, "sales_visualization"], +]: """Simple retail forecasting pipeline using Prophet. Steps: diff --git a/scripts/check-readme-projects.sh b/scripts/check-readme-projects.sh new file mode 100755 index 000000000..847ab2139 --- /dev/null +++ b/scripts/check-readme-projects.sh @@ -0,0 +1,3 @@ +#!/bin/bash +# Check if all project directories are listed in the README table +python scripts/check_readme_projects.py \ No newline at end of file diff --git a/scripts/check_readme_projects.py b/scripts/check_readme_projects.py new file mode 100644 index 000000000..1aacd8091 --- /dev/null +++ b/scripts/check_readme_projects.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +CI check to ensure all project directories are listed in the README table. + +This script checks for project directories (excluding certain directories like _assets, +scripts, etc.) and ensures that they are all referenced in the project table in the +main README.md file. + +Projects can be exempted from this check by adding them to the exempt_projects set +in the get_project_directories function. This is useful for work-in-progress projects, +internal projects, or projects that are not meant to be public-facing. +""" + +import os +import re +import sys +from pathlib import Path + + +def get_project_directories(repo_root): + """Get a list of project directories from the repository. + + Args: + repo_root: The root directory of the repository. + + Returns: + List of project directory names. + """ + # Directories to exclude (infrastructure, config, assets, etc.) + exclude_dirs = { + "_assets", + "scripts", + "assets", + ".git", + "__pycache__", + ".github", + "wandb", + } + + # Projects to exempt from README table requirement + # Add directories here that don't need to be in the README table + exempt_projects = { + # Work-in-progress or internal projects + "finscan", + "sonicscribe", + } + + project_dirs = [] + + for item in os.listdir(repo_root): + item_path = os.path.join(repo_root, item) + + # Check if the item is a directory and not in the exclude or exempt lists + if ( + os.path.isdir(item_path) + and item not in exclude_dirs + and item not in exempt_projects + and not item.startswith(".") + ): + # Skip directories that are Python package-related but not actual projects + if not item.startswith("__") and item != "venv" and item != "env": + project_dirs.append(item) + + return project_dirs + + +def get_readme_projects(readme_path): + """Extract project directories listed in the README table. + + Args: + readme_path: Path to the README.md file. + + Returns: + List of project directory names referenced in the README. + """ + with open(readme_path, "r") as f: + readme_content = f.read() + + # Find the project table + table_pattern = r"\| Project\s+\| Domain.*?\n(.*?)(?:\n\n|\n#)" + table_match = re.search(table_pattern, readme_content, re.DOTALL) + + if not table_match: + print("Error: Could not find project table in README.md") + return [] + + table_content = table_match.group(1) + + # Extract project links from the table + # The pattern looks for Markdown links like [ProjectName](directory) + link_pattern = r"\[.*?\]\((.*?)\)" + project_links = re.findall(link_pattern, table_content) + + # Convert links to directory names + readme_projects = [] + for link in project_links: + # Remove trailing slash if present + if link.endswith("/"): + link = link[:-1] + readme_projects.append(link) + + return readme_projects + + +def main(): + """Main function to run the check.""" + # Get the repository root + repo_root = Path(__file__).parent.parent.absolute() + + # Get project directories from the repository (already excludes exempted projects) + project_dirs = get_project_directories(repo_root) + + # Get projects listed in the README + readme_path = os.path.join(repo_root, "README.md") + readme_projects = get_readme_projects(readme_path) + + # Find missing projects + missing_projects = set(project_dirs) - set(readme_projects) + + if missing_projects: + print( + "Error: The following project directories are not listed in the README table:" + ) + for project in sorted(missing_projects): + print(f" - {project}") + print( + "\nTo exempt a project from this check, add it to the exempt_projects set in this script." + ) + return 1 + + print( + "Success: All required project directories are listed in the README table." + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/format.sh b/scripts/format.sh index c944f607c..a764e2a29 100755 --- a/scripts/format.sh +++ b/scripts/format.sh @@ -17,8 +17,12 @@ fi export ZENML_DEBUG=1 export ZENML_ANALYTICS_OPT_IN=false +# Print ruff version for debugging +echo "Using ruff version:" +ruff --version + # autoflake replacement: removes unused imports and variables -ruff check $SRC --select F401,F841 --fix --exclude "__init__.py" --exclude "llm-finetuning/" --exclude "sign-language-detection-yolov5/model.py" --isolated +ruff check $SRC --select F401,F841 --fix --exclude "__init__.py" --exclude "llm-finetuning/" --exclude "sign-language-detection-yolov5/model.py" --exclude "*.ipynb" --isolated # sorts imports ruff check $SRC --exclude "llm-finetuning/" --exclude "sign-language-detection-yolov5/model.py" --select I --fix --ignore D diff --git a/sign-language-detection-yolov5/steps/model_loader.py b/sign-language-detection-yolov5/steps/model_loader.py index 1b8abffb4..f416539f5 100644 --- a/sign-language-detection-yolov5/steps/model_loader.py +++ b/sign-language-detection-yolov5/steps/model_loader.py @@ -21,9 +21,9 @@ @step -def model_loader() -> ( - Tuple[Annotated[str, "model_path"], Annotated[torch.nn.Module, "model"]] -): +def model_loader() -> Tuple[ + Annotated[str, "model_path"], Annotated[torch.nn.Module, "model"] +]: """Loads the trained models from previous training pipeline runs.""" training_pipeline = Client().get_pipeline( "sign_language_detection_train_pipeline" diff --git a/vertex-registry-and-deployer/steps/model_promoter.py b/vertex-registry-and-deployer/steps/model_promoter.py index b9a2abe41..d87ab3709 100644 --- a/vertex-registry-and-deployer/steps/model_promoter.py +++ b/vertex-registry-and-deployer/steps/model_promoter.py @@ -44,7 +44,7 @@ def model_promoter(accuracy: float, stage: str = "production") -> bool: if accuracy < 0.8: logger.info( - f"Model accuracy {accuracy*100:.2f}% is below 80% ! Not promoting model." + f"Model accuracy {accuracy * 100:.2f}% is below 80% ! Not promoting model." ) else: logger.info(f"Model promoted to {stage}!") diff --git a/vertex-registry-and-deployer/steps/model_trainer.py b/vertex-registry-and-deployer/steps/model_trainer.py index eb4ed41e8..1a02e16f2 100644 --- a/vertex-registry-and-deployer/steps/model_trainer.py +++ b/vertex-registry-and-deployer/steps/model_trainer.py @@ -90,17 +90,17 @@ def model_trainer( # Evaluate the model trn_acc = model.score(X_trn, y_trn) tst_acc = model.score(X_tst, y_tst) - logger.info(f"Train accuracy={trn_acc*100:.2f}%") - logger.info(f"Test accuracy={tst_acc*100:.2f}%") + logger.info(f"Train accuracy={trn_acc * 100:.2f}%") + logger.info(f"Test accuracy={tst_acc * 100:.2f}%") messages = [] if trn_acc < min_train_accuracy: messages.append( - f"Train accuracy {trn_acc*100:.2f}% is below {min_train_accuracy*100:.2f}%!" + f"Train accuracy {trn_acc * 100:.2f}% is below {min_train_accuracy * 100:.2f}%!" ) if tst_acc < min_test_accuracy: messages.append( - f"Test accuracy {tst_acc*100:.2f}% is below {min_test_accuracy*100:.2f}%!" + f"Test accuracy {tst_acc * 100:.2f}% is below {min_test_accuracy * 100:.2f}%!" ) else: for message in messages: diff --git a/zencoder/pipelines/generate_code_dataset.py b/zencoder/pipelines/generate_code_dataset.py index 666230f08..800e3c467 100644 --- a/zencoder/pipelines/generate_code_dataset.py +++ b/zencoder/pipelines/generate_code_dataset.py @@ -33,4 +33,4 @@ def generate_code_dataset(dataset_id: str): # Link all the steps together by calling them and passing the output # of one step as the input of the next step. mirror_directory = mirror_repositories() - repo_id = prepare_dataset(mirror_directory, dataset_id) + prepare_dataset(mirror_directory, dataset_id) diff --git a/zencoder/steps/deployment.py b/zencoder/steps/deployment.py index 5db13ea9d..a62aaf1a4 100644 --- a/zencoder/steps/deployment.py +++ b/zencoder/steps/deployment.py @@ -38,7 +38,7 @@ def deploy_model_to_hf_hub(hf_endpoint_cfg: Optional[Dict] = None) -> None: hf_endpoint_cfg: The configuration for the Huggingface endpoint. """ - endpoint_name = None + # Endpoint name is managed by the HuggingFace service hf_endpoint_cfg = HuggingFaceServiceConfig(**hf_endpoint_cfg) secret = Client().get_secret("huggingface_creds") diff --git a/zencoder/test_starcoder_bigcode.py b/zencoder/test_starcoder_bigcode.py index bcdd00ad5..39f3245a3 100644 --- a/zencoder/test_starcoder_bigcode.py +++ b/zencoder/test_starcoder_bigcode.py @@ -1,13 +1,13 @@ -# Write a zenml pipeline that loads sklearn iris dataset and builds a sklearn classifier +# Write a zenml pipeline that loads sklearn iris dataset and builds a sklearn classifier from zenml.pipelines import pipeline -from zenml.steps.preprocesser import StandardPreprocesser -from zenml.steps.split import RandomSplit from zenml.steps.evaluator import TFMAEvaluator +from zenml.steps.preprocesser import StandardPreprocesser +from zenml.steps.preprocesser.standard_preprocesser.standard_preprocesser import ( + StandardPreprocesser, +) from zenml.steps.trainer import TFFeed -from zenml.steps.deployer import TFServingDeployer -from zenml.steps.preprocesser.standard_preprocesser.standard_preprocesser import \ - StandardPreprocesser + @pipeline def tf_mnist_pipeline(epochs: int = 5, lr: float = 0.001): @@ -15,16 +15,16 @@ def tf_mnist_pipeline(epochs: int = 5, lr: float = 0.001): # Link all the steps together by calling them and passing the output # of one step as the input -# x_train, x_test, y_train, y_test = RandomSplit(test_size=0.2)( -# dataset=iris_data_loader() -# ) + # x_train, x_test, y_train, y_test = RandomSplit(test_size=0.2)( + # dataset=iris_data_loader() + # ) x_train, x_test, y_train, y_test = StandardPreprocesser( test_size=0.2, random_state=42, - )( - dataset=iris_data_loader() - ) - model = TFFeed(epochs=epochs, lr=lr)( - x_train=x_train + )(dataset=iris_data_loader()) + model = TFFeed(epochs=epochs, lr=lr)(x_train=x_train, y_train=y_train) + + # Complete the pipeline with evaluation or deployment steps + metrics = TFMAEvaluator()(model=model, x_test=x_test, y_test=y_test) - + return model, metrics diff --git a/zencoder/test_zencoder.py b/zencoder/test_zencoder.py index 817032683..79733c1db 100644 --- a/zencoder/test_zencoder.py +++ b/zencoder/test_zencoder.py @@ -30,4 +30,4 @@ def trainer(df: pd.DataFrame) -> Any: @pipeline def sklearn_pipeline(): df = importer() - model = trainer(df) + trainer(df) diff --git a/zenml-support-agent/steps/agent_creator.py b/zenml-support-agent/steps/agent_creator.py index 1e6324c99..52a69530d 100644 --- a/zenml-support-agent/steps/agent_creator.py +++ b/zenml-support-agent/steps/agent_creator.py @@ -77,7 +77,8 @@ def agent_creator( human_message=SUFFIX, ) - agent_executor = AgentExecutor.from_agent_and_tools( + # Create but don't store the agent executor - just return the agent and tools + AgentExecutor.from_agent_and_tools( agent=my_agent, tools=tools, verbose=True,