From e60e2d37322543513da3c1efe68047cd088ab8d7 Mon Sep 17 00:00:00 2001 From: marwan37 Date: Thu, 20 Mar 2025 07:39:24 -0500 Subject: [PATCH 1/4] add TODO comment to fix preprocesser typo --- .typos.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.typos.toml b/.typos.toml index 75fc8b0c..235d748e 100644 --- a/.typos.toml +++ b/.typos.toml @@ -18,8 +18,6 @@ MDEyOk9yZ2FuaXphdGlvbjg4Njc2OTU1 = "MDEyOk9yZ2FuaXphdGlvbjg4Njc2OTU1" [default.extend-words] # Don't correct the surname "Teh" -preprocesser = "preprocesser" -Preprocesser = "Preprocesser" aks = "aks" hashi = "hashi" womens = "womens" @@ -48,6 +46,9 @@ huggingface = "huggingface" answerdotai = "answerdotai" preprocessor = "preprocessor" logits = "logits" +# TODO: Fix zenml library typo -- zenml.steps.preprocesser, Preprocesser +preprocesser = "preprocesser" +Preprocesser = "Preprocesser" [default] locale = "en-us" From db62c178518339b96f9f07ef51db204ca6a65c6f Mon Sep 17 00:00:00 2001 From: marwan37 Date: Sat, 22 Mar 2025 16:47:26 -0500 Subject: [PATCH 2/4] fix config references in classification pipeline --- research-radar/pipelines/classification.py | 41 ++++++++++------------ 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/research-radar/pipelines/classification.py b/research-radar/pipelines/classification.py index 743284c5..5366e302 100644 --- a/research-radar/pipelines/classification.py +++ b/research-radar/pipelines/classification.py @@ -15,9 +15,7 @@ # limitations under the License. # -""" -Pipeline for article classification and dataset processing. -""" +"""Pipeline for article classification and dataset processing.""" from typing import Dict, Optional @@ -38,8 +36,7 @@ @pipeline(enable_cache=False) def classification_pipeline(config: Optional[Dict] = None): - """ - Pipeline for article classification and dataset processing. + """Pipeline for article classification and dataset processing. Args: config: Pipeline configuration from base_config.yaml @@ -53,15 +50,15 @@ def classification_pipeline(config: Optional[Dict] = None): hf_token = get_hf_token() - pipeline_config = config.steps.classify - classification_type = pipeline_config.classification_type + classify_config = config["steps"]["classify"] + classification_type = classify_config["classification_type"] logger.log_classification_type(classification_type) dataset_path = ( - config.datasets.unclassified + config["datasets"]["unclassified"] if classification_type == "augmentation" - else config.datasets.composite + else config["datasets"]["composite"] ) articles = load_classification_dataset(dataset_path) @@ -69,25 +66,25 @@ def classification_pipeline(config: Optional[Dict] = None): classifications = classify_articles( articles=articles, hf_token=hf_token, - model_id=config.model_repo_ids.deepseek, - inference_params=pipeline_config.inference_params, + model_id=config["model_repo_ids"]["deepseek"], + inference_params=classify_config["inference_params"], classification_type=classification_type, - batch_config=pipeline_config.batch_processing, - parallel_config=pipeline_config.parallel_processing, - checkpoint_config=pipeline_config.checkpoint, + batch_config=classify_config["batch_processing"], + parallel_config=classify_config["parallel_processing"], + checkpoint_config=classify_config["checkpoint"], ) results_path = save_classifications( classifications=classifications, classification_type=classification_type, - model_id=config.model_repo_ids.deepseek, - inference_params=pipeline_config.inference_params, - batch_config=pipeline_config.batch_processing, - checkpoint_config=pipeline_config.checkpoint, + model_id=config["model_repo_ids"]["deepseek"], + inference_params=classify_config["inference_params"], + batch_config=classify_config["batch_processing"], + checkpoint_config=classify_config["checkpoint"], ) if classification_type == "evaluation": - base_dataset_path = config.datasets.composite + base_dataset_path = config["datasets"]["composite"] calculate_and_save_metrics_from_json( results_path=str(results_path), base_dataset_path=base_dataset_path, @@ -96,7 +93,7 @@ def classification_pipeline(config: Optional[Dict] = None): if classification_type == "augmentation": merge_classifications( results_path=results_path, - training_dataset_path=config.datasets.composite, - augmented_dataset_path=config.datasets.augmented, - source_dataset_path=config.datasets.unclassified, + training_dataset_path=config["datasets"]["composite"], + augmented_dataset_path=config["datasets"]["augmented"], + source_dataset_path=config["datasets"]["unclassified"], ) From b580f514e8082f734e443d22dd740980afec99ca Mon Sep 17 00:00:00 2001 From: marwan37 Date: Sat, 22 Mar 2025 16:58:05 -0500 Subject: [PATCH 3/4] undo addition of preprocesser to .typos.toml --- .typos.toml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.typos.toml b/.typos.toml index 235d748e..4e44f644 100644 --- a/.typos.toml +++ b/.typos.toml @@ -44,11 +44,7 @@ modernbert = "modernbert" deepseek = "deepseek" huggingface = "huggingface" answerdotai = "answerdotai" -preprocessor = "preprocessor" logits = "logits" -# TODO: Fix zenml library typo -- zenml.steps.preprocesser, Preprocesser -preprocesser = "preprocesser" -Preprocesser = "Preprocesser" [default] -locale = "en-us" +locale = "en-us" \ No newline at end of file From 47e2ad284d4118b992d687387336f93865f9c60b Mon Sep 17 00:00:00 2001 From: marwan37 Date: Sat, 22 Mar 2025 16:59:13 -0500 Subject: [PATCH 4/4] checkout main .typos.toml --- .typos.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.typos.toml b/.typos.toml index 4e44f644..2fa346b5 100644 --- a/.typos.toml +++ b/.typos.toml @@ -18,6 +18,8 @@ MDEyOk9yZ2FuaXphdGlvbjg4Njc2OTU1 = "MDEyOk9yZ2FuaXphdGlvbjg4Njc2OTU1" [default.extend-words] # Don't correct the surname "Teh" +preprocesser = "preprocesser" +Preprocesser = "Preprocesser" aks = "aks" hashi = "hashi" womens = "womens" @@ -44,6 +46,7 @@ modernbert = "modernbert" deepseek = "deepseek" huggingface = "huggingface" answerdotai = "answerdotai" +preprocessor = "preprocessor" logits = "logits" [default]