diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 2ab89bcf1..381b2cb17 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -3,7 +3,7 @@ Please provide a short summary explaining the motivation behind these changes. # Checklist -- [ ] I have read the [contributing guidelines](CONTRIBUTING.md). +- [ ] I have read the [contributing guidelines](../CONTRIBUTING.md). - [ ] I have run the necessary tests and linters. - [ ] I have updated relevant documentation where applicable. diff --git a/end-to-end-computer-vision/utils/dataset_utils.py b/end-to-end-computer-vision/utils/dataset_utils.py index dba0cfcd8..4905e4712 100644 --- a/end-to-end-computer-vision/utils/dataset_utils.py +++ b/end-to-end-computer-vision/utils/dataset_utils.py @@ -31,8 +31,8 @@ logger = get_logger(__name__) -def load_images_from_folder(folder): - images = [] +def load_images_from_folder(folder: str) -> List[Image.Image]: + images: List[Image.Image] = [] for filename in os.listdir(folder): if ( filename.endswith(".png") @@ -45,7 +45,9 @@ def load_images_from_folder(folder): return images -def load_images_from_source(data_source, download_dir, filenames): +def load_images_from_source( + data_source: str, download_dir: str, filenames: List[str] +) -> None: total_images = len(filenames) for index, filename in enumerate(filenames): src_path = f"{data_source}/{filename}.png" diff --git a/llm-complete-guide/most_basic_rag_pipeline.py b/llm-complete-guide/most_basic_rag_pipeline.py index fd8349d3f..c73fcd170 100644 --- a/llm-complete-guide/most_basic_rag_pipeline.py +++ b/llm-complete-guide/most_basic_rag_pipeline.py @@ -17,23 +17,26 @@ import re import string +from typing import List from openai import OpenAI from utils.openai_utils import get_openai_api_key -def preprocess_text(text): +def preprocess_text(text: str) -> str: text = text.lower() text = text.translate(str.maketrans("", "", string.punctuation)) text = re.sub(r"\s+", " ", text).strip() return text -def tokenize(text): +def tokenize(text: str) -> List[str]: return preprocess_text(text).split() -def retrieve_relevant_chunks(query, corpus, top_n=2): +def retrieve_relevant_chunks( + query: str, corpus: List[str], top_n: int = 2 +) -> List[str]: query_tokens = set(tokenize(query)) similarities = [] for chunk in corpus: @@ -46,7 +49,7 @@ def retrieve_relevant_chunks(query, corpus, top_n=2): return [chunk for chunk, _ in similarities[:top_n]] -def answer_question(query, corpus, top_n=2): +def answer_question(query: str, corpus: List[str], top_n: int = 2) -> str: relevant_chunks = retrieve_relevant_chunks(query, corpus, top_n) if not relevant_chunks: return "I don't have enough information to answer the question." diff --git a/nightwatch-ai/src/pipelines/supabase_summary.py b/nightwatch-ai/src/pipelines/supabase_summary.py index cdba45409..184fca80b 100644 --- a/nightwatch-ai/src/pipelines/supabase_summary.py +++ b/nightwatch-ai/src/pipelines/supabase_summary.py @@ -13,13 +13,19 @@ # permissions and limitations under the License. +from typing import Any, Callable + from zenml.pipelines import pipeline pipeline_name = "daily_supabase_summary" @pipeline(name=pipeline_name) -def daily_supabase_summary(get_latest_data, generate_summary, report_summary): +def daily_supabase_summary( + get_latest_data: Callable[[], Any], + generate_summary: Callable[[Any], Any], + report_summary: Callable[[Any], Any], +) -> None: """Generates a summary of the latest data. 
Args: diff --git a/nightwatch-ai/src/run.py b/nightwatch-ai/src/run.py index 502aea88a..2db6cf494 100644 --- a/nightwatch-ai/src/run.py +++ b/nightwatch-ai/src/run.py @@ -21,7 +21,7 @@ from zenml.client import Client -def main(): +def main() -> None: if Client().active_stack.alerter is None: # we use a print alerter alerter = print_alerter() diff --git a/zencoder/steps/deployment.py b/zencoder/steps/deployment.py index a62aaf1a4..67dcdb926 100644 --- a/zencoder/steps/deployment.py +++ b/zencoder/steps/deployment.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional, cast +from typing import Dict, Optional, Tuple, cast from zenml import get_step_context, step from zenml.client import Client @@ -14,7 +14,23 @@ logger = get_logger(__name__) -def parse_huggingface_url(url): +def parse_huggingface_url(url: str) -> Tuple[str, str, str]: + """ + Parses a Hugging Face Hub URL to extract the namespace, repository, and revision. + + Args: + url: The Hugging Face Hub URL to parse. Expected format: + "https://huggingface.co/{namespace}/{repository}/tree/{revision}". + + Returns: + A tuple containing: + - namespace: The owner or organization of the repository. + - repository: The name of the repository. + - revision: The specific commit hash or branch name. + + Raises: + ValueError: If the URL does not match the expected format. 
+ """ # Split the URL into parts parts = url.split("/") diff --git a/zencoder/steps/trainer.py b/zencoder/steps/trainer.py index 5305a80ab..5b955ee6b 100644 --- a/zencoder/steps/trainer.py +++ b/zencoder/steps/trainer.py @@ -8,7 +8,7 @@ import functools import os import random -from typing import Optional, Tuple +from typing import List, Optional, Tuple import numpy as np import torch @@ -66,16 +66,16 @@ def get_fim_token_ids(tokenizer): ## Adapted from https://github.com/bigcode-project/Megatron-LM/blob/6c4bf908df8fd86b4977f54bf5b8bd4b521003d1/megatron/data/gpt_dataset.py def permute( - sample, - np_rng, - suffix_tok_id, - prefix_tok_id, - middle_tok_id, - pad_tok_id, - fim_rate=0.5, - fim_spm_rate=0.5, - truncate_or_pad=False, -): + sample: List[int], + np_rng: np.random.RandomState, + suffix_tok_id: Optional[int], + prefix_tok_id: Optional[int], + middle_tok_id: Optional[int], + pad_tok_id: Optional[int], + fim_rate: float = 0.5, + fim_spm_rate: float = 0.5, + truncate_or_pad: bool = False, +) -> Tuple[List[int], np.random.RandomState]: """ Take in a sample (list of tokens) and perform a FIM transformation on it with a probability of fim_rate, using two FIM modes: PSM and SPM (with a probability of fim_spm_rate).