debug

diamonwiggins · diamonwiggins · commit b8d1254409f7 · 2025-04-09T23:56:20.000-04:00
diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml
@@ -277,10 +277,15 @@ jobs:
         run: |
           cd applications/mlflow
           echo "Installing Python dependencies for tests..."
-          pip install mlflow pandas scikit-learn
+          pip install mlflow pandas scikit-learn requests urllib3
           
           echo "Running MLflow application tests against ${{ steps.expose-port.outputs.hostname }}"
-          python tests/mlflow_test.py ${{ steps.expose-port.outputs.hostname }} --protocol https
+          echo "This may take some time as it will retry connections for up to 2 minutes"
+          python tests/mlflow_test.py ${{ steps.expose-port.outputs.hostname }} \
+            --protocol https \
+            --connection-timeout 60 \
+            --debug
+        timeout-minutes: 5
 
       - name: Install troubleshoot
         run: curl -L https://github.com/replicatedhq/troubleshoot/releases/latest/download/support-bundle_linux_amd64.tar.gz | tar xzvf -
diff --git a/applications/mlflow/Makefile b/applications/mlflow/Makefile
@@ -143,7 +143,15 @@ test-replicated-helm-with-values: registry-login
 	MLFLOW_VALUES_ARGS=""; \
 	if [ -n "$$MLFLOW_VALUES" ]; then \
 		echo "Using MLflow values file: $$MLFLOW_VALUES"; \
+		# Check if values file exists
+		if [ ! -f "$$MLFLOW_VALUES" ]; then \
+			echo "ERROR: Values file '$$MLFLOW_VALUES' does not exist"; \
+			exit 1; \
+		fi; \
 		MLFLOW_VALUES_ARGS="--values $$MLFLOW_VALUES"; \
+		echo "Values args: $$MLFLOW_VALUES_ARGS"; \
+	else \
+		echo "No custom values file provided. Using default values."; \
 	fi; \
 	\
 	# Create namespace if it doesn't exist
@@ -163,6 +171,7 @@ test-replicated-helm-with-values: registry-login
 	# Install MLflow chart from Replicated registry with custom values
 	echo "Installing mlflow chart from Replicated registry with custom values..."; \
 	echo "Chart path: $$OCI_URL/mlflow"; \
+	echo "Using values args: $$MLFLOW_VALUES_ARGS"; \
 	helm upgrade --install mlflow-values-test $$OCI_URL/mlflow \
 		--namespace values-test \
 		$$MLFLOW_VALUES_ARGS \
diff --git a/applications/mlflow/tests/mlflow_test.py b/applications/mlflow/tests/mlflow_test.py
@@ -6,28 +6,104 @@
 import subprocess
 import mlflow
 from mlflow.models import infer_signature
+import requests
+import time
+import socket
+from urllib.parse import urlparse
+import logging
 
 import pandas as pd
 from sklearn import datasets
 from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import accuracy_score
 
-def run_mlflow_test(tracking_uri):
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+def check_server_connection(tracking_uri, timeout=30, retry_interval=5):
+    """
+    Poll the MLflow server until it answers or the timeout elapses.
+
+    Each attempt opens a TCP connection to the host, then requests the
+    server's /health route; the server counts as reachable on HTTP 200.
+
+    Args:
+        tracking_uri: The URI of the MLflow server
+        timeout: Maximum time in seconds to wait for the server
+        retry_interval: Interval in seconds between retries
+
+    Returns:
+        bool: True if the server is reachable, False if it did not
+            answer with HTTP 200 before the timeout elapsed
+    """
+    logger.info(f"Checking connection to MLflow server at {tracking_uri}")
+
+    # Probe the server's /health route. The experiments/list REST endpoint
+    # used previously was removed in MLflow 2.0 and would never return 200.
+    health_url = f"{tracking_uri.rstrip('/')}/health"
+
+    # Parse URL to get host and port for socket check
+    parsed_url = urlparse(tracking_uri)
+    host = parsed_url.hostname
+    port = parsed_url.port or (443 if parsed_url.scheme == 'https' else 80)
+
+    # Monotonic deadline so wall-clock adjustments cannot stretch the wait
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        try:
+            # TCP probe first; the with-block closes the socket (was leaked)
+            with socket.create_connection((host, port), timeout=5):
+                logger.info(f"Socket connection to {host}:{port} successful")
+            # verify=False: the endpoint may present a self-signed certificate
+            response = requests.get(health_url, timeout=5, verify=False)
+            if response.status_code == 200:
+                logger.info(f"MLflow server is reachable at {tracking_uri}")
+                return True
+            logger.warning(f"MLflow server returned status code {response.status_code}")
+        except requests.exceptions.RequestException as e:
+            # Must precede OSError: RequestException is an OSError subclass
+            logger.warning(f"HTTP request failed: {e}")
+        except OSError as e:
+            logger.warning(f"Socket connection failed: {e}")
+
+        # Skip the final sleep when it would overrun the deadline
+        if time.monotonic() + retry_interval >= deadline:
+            break
+        logger.info(f"Retrying in {retry_interval} seconds...")
+        time.sleep(retry_interval)
+
+    logger.error(f"Could not connect to MLflow server at {tracking_uri} after {timeout} seconds")
+    return False
+
+def run_mlflow_test(tracking_uri, connection_timeout=60):
     """
     Run MLflow test with the specified tracking URI
     
     Args:
         tracking_uri: The URI to use for the MLflow tracking server
+        connection_timeout: Timeout in seconds for server connection
         
     Returns:
         True if the test passed, False otherwise
     """
     try:
-        print(f"Setting MLflow tracking URI to: {tracking_uri}")
+        logger.info(f"Setting MLflow tracking URI to: {tracking_uri}")
+        
+        # Disable SSL warnings for self-signed certificates
+        import urllib3
+        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+        
+        # Check if the server is reachable before proceeding
+        if not check_server_connection(tracking_uri, timeout=connection_timeout):
+            logger.error("Failed to connect to MLflow server, aborting test")
+            return False
+        
         mlflow.set_tracking_uri(tracking_uri)
         
         # Load the Iris dataset
+        logger.info("Loading dataset and training model...")
         X, y = datasets.load_iris(return_X_y=True)
         
         # Split the data into training and test sets
@@ -39,7 +115,7 @@ def run_mlflow_test(tracking_uri):
         params = {
             "solver": "lbfgs",
             "max_iter": 1000,
-            "multi_class": "auto",
+            "multi_class": "auto",  # Deprecated but keeping for now
             "random_state": 8888,
         }
         
@@ -53,47 +129,69 @@ def run_mlflow_test(tracking_uri):
         # Calculate metrics
         accuracy = accuracy_score(y_test, y_pred)
         
-        print("Current tracking URI:", mlflow.get_tracking_uri())
+        logger.info(f"Current tracking URI: {mlflow.get_tracking_uri()}")
+        logger.info(f"Model trained with accuracy: {accuracy:.4f}")
         
         # Create a new MLflow Experiment
-        mlflow.set_experiment("MLflow CI Test")
+        logger.info("Creating MLflow experiment...")
+        experiment_name = "MLflow CI Test"
+        try:
+            experiment = mlflow.get_experiment_by_name(experiment_name)
+            if experiment is None:
+                experiment_id = mlflow.create_experiment(experiment_name)
+                logger.info(f"Created new experiment with ID: {experiment_id}")
+            else:
+                logger.info(f"Using existing experiment with ID: {experiment.experiment_id}")
+            mlflow.set_experiment(experiment_name)
+        except Exception as e:
+            logger.error(f"Failed to create or set experiment: {e}")
+            return False
         
         # Start an MLflow run
-        with mlflow.start_run():
-            # Log the hyperparameters
-            mlflow.log_params(params)
-            
-            # Log the loss metric
-            mlflow.log_metric("accuracy", accuracy)
-            
-            # Set a tag that we can use to remind ourselves what this run was for
-            mlflow.set_tag("Training Info", "CI Test for MLflow")
-            
-            # Infer the model signature
-            signature = infer_signature(X_train, lr.predict(X_train))
-            
-            # Log the model
-            model_info = mlflow.sklearn.log_model(
-                sk_model=lr,
-                artifact_path="iris_model",
-                registered_model_name="ci-test-model",
-                signature=signature
-            )
-            
-            print(f"Model URI: {model_info.model_uri}")
-            
-        # Load the model back for predictions as a generic Python Function model
+        logger.info("Starting MLflow run...")
         try:
-            loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
-            predictions = loaded_model.predict(X_test[:3])
-            print(f"Test predictions: {predictions}")
-            return True
+            with mlflow.start_run():
+                # Log the hyperparameters
+                mlflow.log_params(params)
+                
+                # Log the loss metric
+                mlflow.log_metric("accuracy", accuracy)
+                
+                # Set a tag that we can use to remind ourselves what this run was for
+                mlflow.set_tag("Training Info", "CI Test for MLflow")
+                
+                # Infer the model signature
+                signature = infer_signature(X_train, lr.predict(X_train))
+                
+                # Log the model
+                logger.info("Logging model to MLflow...")
+                model_info = mlflow.sklearn.log_model(
+                    sk_model=lr,
+                    artifact_path="iris_model",
+                    registered_model_name="ci-test-model",
+                    signature=signature
+                )
+                
+                logger.info(f"Model URI: {model_info.model_uri}")
+                
+            # Load the model back for predictions as a generic Python Function model
+            try:
+                logger.info("Loading model for predictions...")
+                loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
+                predictions = loaded_model.predict(X_test[:3])
+                logger.info(f"Test predictions: {predictions}")
+                return True
+            except Exception as e:
+                logger.error(f"Error loading model: {e}")
+                return False
         except Exception as e:
-            print(f"Error loading model: {e}")
+            logger.error(f"Error during MLflow run: {e}")
             return False
             
     except Exception as e:
-        print(f"Test failed with error: {e}")
+        logger.error(f"Test failed with error: {e}")
+        import traceback
+        logger.error(traceback.format_exc())
         return False
 
 def ensure_dependencies():
@@ -102,21 +200,29 @@ def ensure_dependencies():
         import mlflow
         import pandas
         import sklearn
+        import requests
     except ImportError:
-        print("Installing required dependencies...")
+        logger.info("Installing required dependencies...")
         subprocess.check_call([
             sys.executable, "-m", "pip", "install", 
-            "mlflow", "pandas", "scikit-learn"
+            "mlflow", "pandas", "scikit-learn", "requests"
         ])
 
 def main():
     parser = argparse.ArgumentParser(description="MLflow CI testing tool")
     parser.add_argument("hostname", help="Hostname of the MLflow server")
     parser.add_argument("--port", type=int, help="Port number (if not included in hostname)")
     parser.add_argument("--protocol", default="https", help="Protocol (http or https, default: https)")
+    parser.add_argument("--connection-timeout", type=int, default=60, 
+                        help="Timeout in seconds for server connection (default: 60)")
+    parser.add_argument("--debug", action="store_true", help="Enable debug logs")
     
     args = parser.parse_args()
     
+    # Set logging level based on debug flag
+    if args.debug:
+        logging.getLogger().setLevel(logging.DEBUG)
+    
     # Build the tracking URI
     tracking_uri = f"{args.protocol}://{args.hostname}"
     if args.port:
@@ -126,13 +232,14 @@ def main():
     ensure_dependencies()
     
     # Run the test
-    success = run_mlflow_test(tracking_uri)
+    logger.info(f"Starting MLflow test against server: {tracking_uri}")
+    success = run_mlflow_test(tracking_uri, connection_timeout=args.connection_timeout)
     
     if success:
-        print("✅ MLflow test completed successfully")
+        logger.info("✅ MLflow test completed successfully")
         sys.exit(0)
     else:
-        print("❌ MLflow test failed")
+        logger.error("❌ MLflow test failed")
         sys.exit(1)
 
 if __name__ == "__main__":