Skip to content

Commit 276e9ae

Browse files
MongoDB Update
1 parent aaf7857 commit 276e9ae

File tree

2 files changed

+278
-5
lines changed

2 files changed

+278
-5
lines changed

.github/workflows/guardian-pipeline-aws.yml

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,31 +4,39 @@ on:
44
push:
55
branches:
66
- main
7+
workflow_dispatch: # Allow manual trigger
78

89
jobs:
910
run-pipeline:
1011
runs-on: [self-hosted, Linux, X64, guardian, gpu] # must match the labels you set
1112
timeout-minutes: 500 # 8.33 hours
1213
steps:
14+
# 1. Checkout code
1315
- name: Check out code
1416
uses: actions/checkout@v4
1517

18+
# 2. Set up Python
1619
- name: Set up Python
1720
uses: actions/setup-python@v5
1821
with:
1922
python-version: '3.11'
2023

24+
# 3. Install system dependencies
2125
- name: Install system dependencies
2226
run: |
2327
sudo apt-get update
2428
sudo apt-get install -y libgl1-mesa-glx
2529
sudo apt-get install -y libglib2.0-0
2630
31+
# 4. Install Python dependencies
2732
- name: Install dependencies
2833
run: |
2934
python -m pip install --upgrade pip
3035
pip install -r requirements.txt
36+
# Ensure MongoDB dependencies are installed
37+
pip install pymongo gridfs
3138
39+
# 5. Setup dataset symlink (for self-hosted runner)
3240
- name: Setup dataset symlink
3341
run: |
3442
echo "🔗 Setting up dataset symlink for self-hosted runner..."
@@ -57,14 +65,29 @@ jobs:
5765
echo "⚠️ Absolute dataset path not found. Pipeline will need to download the dataset."
5866
fi
5967
68+
# 6. Create .secrets file with MongoDB URI if provided
69+
- name: Setup MongoDB secrets file
70+
run: |
71+
if [ -n "$MONGODB_URI" ]; then
72+
echo "MONGODB_URI=$MONGODB_URI" > .secrets
73+
echo "✅ Created .secrets file with MongoDB URI"
74+
else
75+
echo "⚠️ MONGODB_URI not provided, model distribution will not work"
76+
fi
77+
78+
# 7. Verify ClearML configuration
6079
- name: Verify ClearML Configuration
6180
env:
6281
CLEARML_API_ACCESS_KEY: ${{ secrets.CLEARML_API_ACCESS_KEY }}
6382
CLEARML_API_SECRET_KEY: ${{ secrets.CLEARML_API_SECRET_KEY }}
6483
CLEARML_API_HOST: ${{ secrets.CLEARML_API_HOST }}
84+
CLEARML_WEB_HOST: ${{ secrets.CLEARML_WEB_HOST }}
85+
CLEARML_FILES_HOST: ${{ secrets.CLEARML_FILES_HOST }}
6586
run: |
6687
echo "🔍 Checking ClearML configuration..."
6788
echo "CLEARML_API_HOST: ${CLEARML_API_HOST:-'Not Set'}"
89+
echo "CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-'Not Set'}"
90+
echo "CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-'Not Set'}"
6891
echo "CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY:+Set}"
6992
echo "CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY:+Set}"
7093
@@ -81,12 +104,55 @@ jobs:
81104
print('This may be expected if credentials are not configured')
82105
"
83106
107+
# 8. Verify MongoDB configuration
108+
- name: Verify MongoDB Configuration
109+
env:
110+
MONGODB_URI: ${{ secrets.MONGODB_URI }}
111+
run: |
112+
echo "🔍 Checking MongoDB configuration..."
113+
if [ -n "$MONGODB_URI" ]; then
114+
echo "✅ MongoDB URI is configured"
115+
python -c "
116+
try:
117+
from pymongo import MongoClient
118+
import os
119+
client = MongoClient(os.environ['MONGODB_URI'])
120+
db = client.admin
121+
server_info = db.command('serverStatus')
122+
print(f'✅ MongoDB connection successful - connected to version {server_info.get(\"version\", \"unknown\")}')
123+
print(f'✅ MongoDB server: {server_info.get(\"host\", \"unknown\")}')
124+
except Exception as e:
125+
print(f'❌ MongoDB connection failed: {e}')
126+
# Don't fail the workflow, we'll continue without MongoDB
127+
print('Will continue without MongoDB storage')
128+
"
129+
else
130+
echo "⚠️ MongoDB URI not configured. Model will not be stored in MongoDB."
131+
fi
132+
133+
# 9. Run the Guardian AI pipeline
84134
- name: Run ClearML pipeline
85135
env:
86136
CLEARML_API_ACCESS_KEY: ${{ secrets.CLEARML_API_ACCESS_KEY }}
87137
CLEARML_API_SECRET_KEY: ${{ secrets.CLEARML_API_SECRET_KEY }}
88138
CLEARML_API_HOST: ${{ secrets.CLEARML_API_HOST }}
139+
CLEARML_WEB_HOST: ${{ secrets.CLEARML_WEB_HOST }}
140+
CLEARML_FILES_HOST: ${{ secrets.CLEARML_FILES_HOST }}
141+
MONGODB_URI: ${{ secrets.MONGODB_URI }}
89142
run: |
143+
echo "🚀 Starting Guardian AI pipeline..."
90144
python Guardian_pipeline_github.py
145+
146+
# 10. Upload training artifacts
147+
- name: Upload artifacts
148+
if: always()
149+
uses: actions/upload-artifact@v3
150+
with:
151+
name: training-artifacts
152+
path: |
153+
*.png
154+
*.pth
155+
*.json
156+
retention-days: 7
91157

92158

Guardian_pipeline_github.py

Lines changed: 212 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1345,15 +1345,37 @@ def make_torch_dataset_for_loader(split_data, split_labels):
13451345
name="Deploy_Model_GitHub",
13461346
return_values=["deployment_status"],
13471347
cache=False,
1348-
packages=["clearml"]
1348+
packages=["clearml", "pymongo", "torch", "gridfs"]
13491349
)
13501350
def deploy_model_github(
13511351
best_model_id: str,
1352+
best_model_path: str,
13521353
test_accuracy: float,
1353-
min_accuracy_threshold: float = 85.0
1354+
min_accuracy_threshold: float = 85.0,
1355+
mongo_uri: str = None
13541356
):
1355-
"""Deploy the best model if it meets accuracy threshold."""
1357+
"""Deploy the best model if it meets accuracy threshold and save to MongoDB."""
13561358
from clearml import Model, Task
1359+
import os
1360+
import torch
1361+
import json
1362+
import logging
1363+
import shutil
1364+
import sys
1365+
1366+
# Add the current directory to the path for importing local modules
1367+
current_dir = os.path.dirname(os.path.abspath(__file__))
1368+
if current_dir not in sys.path:
1369+
sys.path.append(current_dir)
1370+
1371+
# Try to import the mongodb_model_distribution module
1372+
try:
1373+
from mongodb_model_distribution import GuardianModelDistribution
1374+
has_model_distribution = True
1375+
print("✅ Found mongodb_model_distribution module")
1376+
except ImportError:
1377+
has_model_distribution = False
1378+
print("⚠️ mongodb_model_distribution module not found. Will use basic MongoDB storage.")
13571379

13581380
task = Task.init(
13591381
project_name="Guardian_Training",
@@ -1385,20 +1407,153 @@ def deploy_model_github(
13851407
print(f"⚠️ Could not add tags: {tag_error}")
13861408
# Continue anyway - tags are not critical
13871409

1410+
# Get the best task to retrieve hyperparameters
1411+
best_task = Task.get_task(task_id=model.task)
1412+
if not best_task:
1413+
print("⚠️ Could not retrieve task for model hyperparameters")
1414+
hyperparams = {}
1415+
else:
1416+
hyperparams = best_task.get_parameters()
1417+
print(f"📋 Retrieved hyperparameters from task {best_task.id}")
1418+
13881419
# Update model metadata
13891420
try:
13901421
model.update_design(config_dict={
13911422
"deployment_status": "deployed",
13921423
"test_accuracy": test_accuracy,
13931424
"deployment_date": str(task.created),
13941425
"deployment_threshold": min_accuracy_threshold,
1395-
"deployed_by": "GitHub Actions"
1426+
"deployed_by": "GitHub Actions",
1427+
"mongodb_stored": False # Will update if MongoDB storage succeeds
13961428
})
13971429
print(f"📋 Updated model metadata")
13981430
except Exception as metadata_error:
13991431
print(f"⚠️ Could not update metadata: {metadata_error}")
14001432
# Continue anyway - metadata is not critical
14011433

1434+
# MongoDB integration - Store model weights and hyperparameters
1435+
if mongo_uri:
1436+
try:
1437+
print(f"🔄 Connecting to MongoDB for model storage...")
1438+
1439+
# Ensure the model path exists
1440+
if not os.path.exists(best_model_path):
1441+
model_path = model.get_local_copy()
1442+
print(f"📥 Model weights downloaded to {model_path}")
1443+
else:
1444+
model_path = best_model_path
1445+
print(f"📄 Using existing model weights at {model_path}")
1446+
1447+
# Load the checkpoint so its top-level keys can be recorded in the metadata
1448+
try:
1449+
checkpoint = torch.load(model_path, map_location='cpu')
1450+
print(f"✅ Model weights loaded successfully!")
1451+
except Exception as e:
1452+
print(f"⚠️ Error loading model weights: {e}")
1453+
checkpoint = {}
1454+
1455+
# Create model name from the model ID prefix and the integer test accuracy
1456+
model_name = f"guardian_model_{best_model_id[:8]}_{int(test_accuracy)}"
1457+
1458+
# Prepare model metadata for distribution
1459+
model_metadata = {
1460+
"model_id": best_model_id,
1461+
"test_accuracy": float(test_accuracy),
1462+
"deployment_date": str(task.created),
1463+
"training_task_id": str(best_task.id) if best_task else "unknown",
1464+
"architecture": model.get_model_design() or {},
1465+
"hyperparameters": hyperparams,
1466+
"checkpoint_keys": list(checkpoint.keys()) if checkpoint else [],
1467+
"input_size": hyperparams.get("General/input_size", {}).get("value", 34),
1468+
"hidden_size": hyperparams.get("General/hidden_size", {}).get("value", 256),
1469+
"num_layers": hyperparams.get("General/num_layers", {}).get("value", 4),
1470+
"num_classes": hyperparams.get("General/num_classes", {}).get("value", 3),
1471+
"framework": "PyTorch",
1472+
"model_type": "BiLSTM_ActionRecognition",
1473+
"description": "Guardian AI Action Recognition Model"
1474+
}
1475+
1476+
# Use the GuardianModelDistribution class if available
1477+
if has_model_distribution:
1478+
print("🔄 Using GuardianModelDistribution for model storage...")
1479+
distributor = GuardianModelDistribution(uri=mongo_uri)
1480+
1481+
if distributor.connect():
1482+
# Upload model using the distribution system
1483+
result = distributor.upload_model(
1484+
model_path=model_path,
1485+
model_metadata=model_metadata,
1486+
model_name=model_name
1487+
)
1488+
1489+
if result:
1490+
print(f"🗃️ Model uploaded to distribution system:")
1491+
print(f" Model Name: {result['model_name']}")
1492+
print(f" Document ID: {result['document_id']}")
1493+
print(f" Download Command: {result['download_command']}")
1494+
1495+
# Update model metadata to reflect MongoDB storage
1496+
model.update_design(config_dict={"mongodb_stored": True})
1497+
else:
1498+
print("❌ Failed to upload model to distribution system")
1499+
else:
1500+
print("❌ Failed to connect to MongoDB distribution system")
1501+
else:
1502+
# Fallback to basic MongoDB storage
1503+
from pymongo import MongoClient
1504+
import gridfs
1505+
1506+
# Connect to MongoDB
1507+
client = MongoClient(mongo_uri)
1508+
db = client.guardian_models
1509+
fs = gridfs.GridFS(db)
1510+
1511+
# Store the model weights
1512+
with open(model_path, 'rb') as f:
1513+
weights_file_id = fs.put(
1514+
f,
1515+
filename=f"{model_name}.pth",
1516+
metadata={
1517+
"model_id": best_model_id,
1518+
"accuracy": float(test_accuracy),
1519+
"deployment_date": str(task.created)
1520+
}
1521+
)
1522+
1523+
# Prepare model metadata and hyperparameters
1524+
model_info = {
1525+
"model_name": model_name,
1526+
"model_id": best_model_id,
1527+
"test_accuracy": float(test_accuracy),
1528+
"weights_file_id": weights_file_id,
1529+
"hyperparameters": hyperparams,
1530+
"deployment_date": str(task.created),
1531+
"deployment_status": "deployed",
1532+
"architecture": model.get_model_design() or {},
1533+
"checkpoint_keys": list(checkpoint.keys()) if checkpoint else [],
1534+
"file_size_mb": os.path.getsize(model_path) / (1024 * 1024),
1535+
"status": "available",
1536+
"download_count": 0,
1537+
"uploaded_at": str(task.created),
1538+
"file_id": weights_file_id
1539+
}
1540+
1541+
# Store model metadata
1542+
db.model_metadata.insert_one(model_info)
1543+
1544+
print(f"🗃️ Model weights and metadata saved to MongoDB")
1545+
print(f" Model Name: {model_name}")
1546+
print(f" File Size: {model_info['file_size_mb']:.2f} MB")
1547+
1548+
# Update model metadata to reflect MongoDB storage
1549+
model.update_design(config_dict={"mongodb_stored": True})
1550+
1551+
except Exception as mongo_error:
1552+
print(f"❌ MongoDB storage error: {mongo_error}")
1553+
logger.report_text(f"MongoDB storage failed: {mongo_error}")
1554+
else:
1555+
print("ℹ️ MongoDB URI not provided, skipping database storage")
1556+
14021557
logger.report_scalar("Deployment", "Status", 1, 0) # 1 = deployed
14031558
logger.report_scalar("Deployment", "Test_Accuracy", test_accuracy, 0)
14041559

@@ -1440,6 +1595,13 @@ def guardian_github_pipeline():
14401595
dataset_name = "Guardian_Dataset"
14411596
dataset_project = "Guardian_Training"
14421597

1598+
# Get MongoDB URI from environment variable
1599+
mongo_uri = os.environ.get("MONGODB_URI", None)
1600+
if mongo_uri:
1601+
logging.info("MongoDB URI configured for model storage")
1602+
else:
1603+
logging.warning("MongoDB URI not found in environment variables. Models will not be stored in MongoDB.")
1604+
14431605
# Multiple path options for your self-hosted runner
14441606
possible_paths = [
14451607
# Your absolute dataset path
@@ -1506,6 +1668,49 @@ def guardian_github_pipeline():
15061668
)
15071669
logging.info(f"HPO completed. Best task ID: {best_task_id}, Best model ID: {best_model_id}")
15081670

1671+
# Get the best model path from ClearML
1672+
try:
1673+
from clearml import Model
1674+
logging.info(f"Retrieving best model with ID: {best_model_id}")
1675+
1676+
# Create a specific path for the best model that includes the model ID
1677+
best_model_filename = f"best_bilstm_github_{best_model_id}.pth"
1678+
best_model_path = os.path.join(os.getcwd(), best_model_filename)
1679+
1680+
# Check if we already have this specific model
1681+
if os.path.exists(best_model_path):
1682+
logging.info(f"Best model already exists at {best_model_path}")
1683+
else:
1684+
# Download the model from ClearML by ID
1685+
best_model = Model(model_id=best_model_id)
1686+
downloaded_path = best_model.get_local_copy()
1687+
1688+
# If the downloaded path is different from our desired path, copy it
1689+
if downloaded_path != best_model_path:
1690+
shutil.copy2(downloaded_path, best_model_path)
1691+
logging.info(f"Copied best model from {downloaded_path} to {best_model_path}")
1692+
else:
1693+
logging.info(f"Downloaded best model to {best_model_path}")
1694+
1695+
# Verify the model file exists and has content
1696+
if not os.path.exists(best_model_path) or os.path.getsize(best_model_path) == 0:
1697+
logging.error(f"Best model file is missing or empty at {best_model_path}")
1698+
raise FileNotFoundError(f"Best model file not found: {best_model_path}")
1699+
1700+
# Verify the model file can be loaded (file integrity check)
1701+
try:
1702+
import torch
1703+
checkpoint = torch.load(best_model_path, map_location='cpu')
1704+
logging.info(f"Successfully verified model file integrity. Model contains {len(checkpoint)} keys.")
1705+
except Exception as e:
1706+
logging.error(f"Failed to load model for verification: {e}")
1707+
best_model = Model(model_id=best_model_id)
1708+
best_model_path = best_model.get_local_copy()
1709+
logging.warning(f"Re-downloaded model to {best_model_path} after verification failure")
1710+
except Exception as e:
1711+
logging.error(f"Failed to retrieve best model: {e}")
1712+
best_model_path = "" # Empty string if model download fails
1713+
15091714
# Step 5: Evaluate best model
15101715
logging.info("Starting model evaluation...")
15111716
test_accuracy = evaluate_model_github(
@@ -1523,8 +1728,10 @@ def guardian_github_pipeline():
15231728
try:
15241729
deployment_status = deploy_model_github(
15251730
best_model_id=best_model_id,
1731+
best_model_path=best_model_path,
15261732
test_accuracy=accuracy_value,
1527-
min_accuracy_threshold=85.0 # Deploy if accuracy >= 85%
1733+
min_accuracy_threshold=85.0, # Deploy if accuracy >= 85%
1734+
mongo_uri=mongo_uri
15281735
)
15291736
logging.info(f"Deployment completed. Status: {deployment_status}")
15301737
except Exception as e:

0 commit comments

Comments
 (0)