Closed
24 commits
085b380
Completed necessary updates to ymls
jcoo573 Dec 29, 2024
dc8a41b
Corrected error in conda.yml for numpy
jcoo573 Dec 30, 2024
fdb64d6
another correction to numpy in conda file
jcoo573 Dec 30, 2024
eaf2b1d
Updated main.py and config to correct basic_cleaning location
jcoo573 Dec 30, 2024
8541edf
Update main.py and test_data for test data step
jcoo573 Dec 30, 2024
b07531a
corrected code with a comma before kl_threshold
jcoo573 Dec 30, 2024
d73a72a
updated scipy dependency to >=1.7 to be compatible with python 3.10
jcoo573 Dec 30, 2024
994ff7f
updated pytest in conda to be >=7.0.0 to be compatible with python 3.10
jcoo573 Dec 30, 2024
a24a733
Correct clean_sample.csv name
jcoo573 Dec 30, 2024
92243ca
updated src to be cleaned_data.csv instead of clean_sample.csv
jcoo573 Dec 30, 2024
bc25ef7
grammar correction to reference file
jcoo573 Dec 30, 2024
71f773d
Update main.py for test split
jcoo573 Dec 30, 2024
140c835
Update run.py to train the random forest
jcoo573 Dec 30, 2024
d23aea6
Updated main.py to incorporate Random Forest training step
jcoo573 Dec 30, 2024
ecc2c08
add validation for data types due to error on mlflow run
jcoo573 Dec 30, 2024
f09fbf9
Updated to preprocess x value since mixed data types caused errors in…
jcoo573 Dec 30, 2024
8360c30
update main.py for test model
jcoo573 Dec 30, 2024
6c8f6e8
corrected test location
jcoo573 Dec 30, 2024
776a889
added .csv to prod reference
jcoo573 Dec 30, 2024
7638542
corrected issues with model name
jcoo573 Dec 30, 2024
722c22c
changed scikit learn dependency version to 1.3.2 to match what model …
jcoo573 Dec 30, 2024
fa530ad
Updated config for model parameters
jcoo573 Dec 30, 2024
69401ca
updated w&b link in readme
jcoo573 Dec 30, 2024
2adee50
udpated root readme with git hub and w&b urls
jcoo573 Dec 30, 2024
5 changes: 4 additions & 1 deletion README.md
@@ -49,6 +49,8 @@ cd Project-Build-an-ML-Pipeline-Starter
Commit and push to the repository often while you make progress towards the solution. Remember
to add meaningful commit messages.

GitHub link: https://github.com/jcoo573/Project-Build-an-ML-Pipeline-Starter.git

### Create environment
Make sure to have conda installed and ready, then create a new environment using the ``environment.yaml``
file provided in the root of the repository and activate it:
@@ -64,8 +66,9 @@ Let's make sure we are logged in to Weights & Biases. Get your API key from W&B
then paste your key into this command:

```bash
> wandb login [your API key]
> wandb login 8e55b75b9c081e2823114b0a2a621ce5718e946d
```
wandb workspace: https://wandb.ai/jcoo573-wgu/nyc_airbnb?nw=nwuserjcoo573

You should see a message similar to:
```
3 changes: 3 additions & 0 deletions components/README.md
@@ -10,3 +10,6 @@ then run::
> mlflow [url to this repo] -e help

to get a description of the commands.

# Weights and Biases
Weights & Biases public project: https://wandb.ai/jcoo573-wgu/nyc_airbnb?nw=nwuserjcoo573
2 changes: 2 additions & 0 deletions components/conda.yml
@@ -4,3 +4,5 @@ channels:
- defaults
dependencies:
- mlflow=2.8.1
- python=3.10
- numpy=1.24
1 change: 1 addition & 0 deletions components/get_data/conda.yml
@@ -6,6 +6,7 @@ dependencies:
- python=3.10.0
- pip=23.3.1
- requests=2.24.0
- numpy=1.24
- pyarrow
- pip:
- mlflow==2.8.1
2 changes: 1 addition & 1 deletion components/test_regression_model/conda.yml
@@ -6,7 +6,7 @@ dependencies:
- python=3.10.0
- pip=23.3.1
- requests=2.24.0
- scikit-learn=1.5.2
- scikit-learn=1.3.2
- pandas=2.1.3
- pip:
- mlflow==2.18.0
2 changes: 2 additions & 0 deletions conda.yml
@@ -7,6 +7,8 @@ dependencies:
- pyyaml
- hydra-core=1.3.2
- pip=23.3.1
- numpy=1.24
- pip:
- mlflow==2.8.1
- wandb==0.16.0

8 changes: 5 additions & 3 deletions config.yaml
@@ -1,5 +1,6 @@
main:
components_repository: "https://github.com/udacity/Project-Build-an-ML-Pipeline-Starter.git#components"
components_repository: "https://github.com/jcoo573/Project-Build-an-ML-Pipeline-Starter.git#components"
src_repository: "https://github.com/jcoo573/Project-Build-an-ML-Pipeline-Starter.git#src"
# All the intermediate files will be copied to this directory at the end of the run.
# Set this to null if you are running in prod
project_name: nyc_airbnb
@@ -26,8 +27,9 @@ modeling:
# NOTE: you can put here any parameter that is accepted by the constructor of
# RandomForestRegressor. This is a subsample, but more could be added:
random_forest:
n_estimators: 100
max_depth: 15
n_estimators: 200
max_depth: 50
random_state: 42
min_samples_split: 4
min_samples_leaf: 3
# Here -1 means all available cores
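For context, the `random_forest` block edited above is not consumed directly: in this starter the pipeline serializes it to a JSON file that is handed to the training step. The sketch below mirrors that pattern; the parameter values copy the config above, while the temp-file path and `n_jobs` entry are illustrative assumptions, not part of this diff.

```python
import json
import os
import tempfile

# Hypothetical mirror of the random_forest block from config.yaml above.
# n_jobs is an assumed extra entry; -1 conventionally means all cores.
rf_params = {
    "n_estimators": 200,
    "max_depth": 50,
    "random_state": 42,
    "min_samples_split": 4,
    "min_samples_leaf": 3,
    "n_jobs": -1,
}

# Write the parameters to a JSON file, as the starter's main.py does
# before passing rf_config to the train_random_forest step.
rf_config_path = os.path.join(tempfile.gettempdir(), "rf_config.json")
with open(rf_config_path, "w") as fp:
    json.dump(rf_params, fp)

# The training step would read the file back and unpack it into
# RandomForestRegressor(**loaded); here we only verify the round trip.
with open(rf_config_path) as fp:
    loaded = json.load(fp)
print(loaded["n_estimators"])  # 200
```

Keeping the parameters in `config.yaml` rather than hard-coding them in `run.py` lets Hydra override any of them from the command line without touching code.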
77 changes: 70 additions & 7 deletions main.py
@@ -54,19 +54,60 @@ def go(config: DictConfig):
##################
# Implement here #
##################
pass

_ = mlflow.run(
f"{config['main']['src_repository']}/basic_cleaning",
"main",
version='main',
env_manager="conda",
parameters={
"input_artifact": "sample.csv:latest",
"output_artifact": "cleaned_data.csv",
"output_type": "cleaned_data",
"output_description": "Dataset after basic cleaning",
"min_price": config["etl"]["min_price"],
"max_price": config["etl"]["max_price"]
},
)



if "data_check" in active_steps:
##################
# Implement here #
##################
pass
_ = mlflow.run(
f"{config['main']['src_repository']}/data_check",
"main",
version='main',
env_manager="conda",
parameters={
"csv": "cleaned_data.csv:latest",
"ref": "cleaned_data.csv:reference",
"kl_threshold": config["data_check"]["kl_threshold"],
"min_price": config["etl"]["min_price"],
"max_price": config["etl"]["max_price"]
},
)



if "data_split" in active_steps:
##################
# Implement here #
##################
pass
_ = mlflow.run(
f"{config['main']['components_repository']}/train_val_test_split",
"main",
version='main',
env_manager="conda",
parameters={
"input": "cleaned_data.csv:latest",
"test_size": config["modeling"]["test_size"],
"random_seed": config["modeling"]["random_seed"],
"stratify_by": config["modeling"]["stratify_by"],
},
)

if "train_random_forest" in active_steps:

@@ -81,16 +122,38 @@ def go(config: DictConfig):
##################
# Implement here #
##################

pass
_ = mlflow.run(
f"{config['main']['src_repository']}/train_random_forest",
"main",
version='main',
env_manager="conda",
parameters={
"trainval_artifact": "trainval_data.csv:latest",
"val_size": config["modeling"]["val_size"],
"random_seed": config["modeling"]["random_seed"],
"stratify_by": config["modeling"]["stratify_by"],
"rf_config": rf_config,
"max_tfidf_features": config["modeling"]["max_tfidf_features"],
"output_artifact": "random_forest_export"
},
)

if "test_regression_model" in active_steps:

##################
# Implement here #
##################

pass
_ = mlflow.run(
f"{config['main']['components_repository']}/test_regression_model",
"main",
version='main',
env_manager="conda",
parameters={
"mlflow_model": "random_forest_export:prod",
"test_dataset": "test_data.csv:latest",
},
)



if __name__ == "__main__":
Empty file added mlflow
Empty file.
1 change: 1 addition & 0 deletions src/basic_cleaning/conda.yml
@@ -6,6 +6,7 @@ dependencies:
- python=3.10.0
- pip=23.3.1
- pandas=2.1.3
- numpy=1.24
- pip:
- wandb==0.16.0

24 changes: 12 additions & 12 deletions src/basic_cleaning/run.py
@@ -53,43 +53,43 @@ def go(args):

parser.add_argument(
"--input_artifact",
type = ## INSERT TYPE HERE: str, float or int,
help = ## INSERT DESCRIPTION HERE,
type = str,
help = "Initial artifact to be cleaned",
required = True
)

parser.add_argument(
"--output_artifact",
type = ## INSERT TYPE HERE: str, float or int,
help = ## INSERT DESCRIPTION HERE,
type = str,
help = "Output artifact for cleaned data",
required = True
)

parser.add_argument(
"--output_type",
type = ## INSERT TYPE HERE: str, float or int,
help = ## INSERT DESCRIPTION HERE,
type = str,
help = "Type of the output dataset",
required = True
)

parser.add_argument(
"--output_description",
type = ## INSERT TYPE HERE: str, float or int,
help = ## INSERT DESCRIPTION HERE,
type = str,
help = "Description of the output dataset",
required = True
)

parser.add_argument(
"--min_price",
type = ## INSERT TYPE HERE: str, float or int,
help = ## INSERT DESCRIPTION HERE,
type = float,
help = "Minimum house price to be considered",
required = True
)

parser.add_argument(
"--max_price",
type = ## INSERT TYPE HERE: str, float or int,
help = ## INSERT DESCRIPTION HERE,
type = float,
help = "Maximum house price to be considered",
required = True
)

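The edits above fill in the `type` and `help` placeholders the starter left blank. A minimal, self-contained mirror of the pattern (only two of the six flags are reproduced, and the sample values are made up for illustration) shows why `type=float` matters for the price bounds:

```python
import argparse

# Sketch of the argument declarations completed in the diff above;
# argparse applies `type` to the raw string before storing it.
parser = argparse.ArgumentParser(description="basic_cleaning (sketch)")
parser.add_argument(
    "--min_price",
    type=float,
    help="Minimum price to be considered",
    required=True,
)
parser.add_argument(
    "--max_price",
    type=float,
    help="Maximum price to be considered",
    required=True,
)

# MLflow passes parameters as strings; type=float converts them here.
args = parser.parse_args(["--min_price", "10", "--max_price", "350"])
print(args.min_price, args.max_price)  # 10.0 350.0
```

Without `type=float`, the downstream comparison `df['price'].between(args.min_price, args.max_price)` would compare numbers against strings and fail.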
4 changes: 2 additions & 2 deletions src/data_check/conda.yml
@@ -5,8 +5,8 @@ channels:
dependencies:
- python=3.10.0
- pandas=2.1.3
- pytest=6.2.2
- scipy=1.5.2
- pytest>=7.0.0
- scipy>=1.7
- pip=23.3.1
- pip:
- mlflow==2.8.1
12 changes: 12 additions & 0 deletions src/data_check/test_data.py
@@ -63,3 +63,15 @@ def test_similar_neigh_distrib(data: pd.DataFrame, ref_data: pd.DataFrame, kl_th
########################################################
# Implement here test_row_count and test_price_range #
########################################################

# test row count
def test_row_count(data):
assert 15000 < data.shape[0] < 1000000

# test price range
def test_price_range(data, min_price, max_price):
assert data['price'].between(min_price, max_price).all(), (
f"Prices are not within the range {min_price} to {max_price}."
)

