Symptoms:
- Pods stuck in `Pending` or `CrashLoopBackOff`
- `kubectl get pods` shows errors
Diagnosis:
kubectl describe pod <pod-name> -n fl-system
kubectl logs <pod-name> -n fl-system
Common Causes:
# Check node resources
kubectl top nodes
# Solution: Scale down or add nodes
kubectl scale deployment fl-client --replicas=2
# Check image
kubectl describe pod <pod-name> | grep -A 5 Events
# Solution: Verify image exists
docker images | grep fl-platform
# Check ConfigMap
kubectl get configmap fl-config -o yaml
# Solution: Update ConfigMap
kubectl edit configmap fl-config
kubectl rollout restart deployment/fl-server
Symptoms:
- Loss not decreasing
- Accuracy stuck at random
Diagnosis:
# Check training logs
kubectl logs deployment/fl-client -n fl-system | grep "loss"
# Check Grafana dashboard
kubectl port-forward svc/grafana 3000:3000
Solutions:
# Update values.yaml
config:
  learningRate: 0.001  # Try different values
# Check partition statistics
from fl.datasets.nih_chest_xray import PatientBasedPartitioner
partitioner = PatientBasedPartitioner(metadata_path, num_hospitals=10)
stats = partitioner.get_partition_stats(partitions)
print(stats)
# Reduce compression
config:
compression:
sparsity: 0.5 # Less aggressive
    quantization_bits: 16  # More bits
Symptoms:
- OOMKilled pods
- Slow training
Diagnosis:
kubectl top pods -n fl-system
kubectl describe pod <pod-name> | grep -A 5 "Limits"
Solutions:
# values.yaml
client:
resources:
limits:
      memory: "4Gi"  # Increase
config:
  batchSize: 16  # Smaller batches
# In model definition
model.gradient_checkpointing_enable()
Symptoms:
- Too many clients flagged as Byzantine
- Training unstable
Diagnosis:
# Check Grafana Byzantine dashboard
# Look at anomaly scores
Solutions:
# In robust_aggregation.py
byzantine = aggregator.detect_byzantine(
client_updates,
threshold=5.0 # Increase threshold
)
config:
  security:
    aggregation_method: "multi_krum"  # Try different method
Symptoms:
- DP training stops early
- Privacy errors
Diagnosis:
# Check privacy metrics
kubectl logs deployment/fl-client | grep "epsilon"
Solutions:
config:
privacy:
    target_epsilon: 10.0  # Higher budget
config:
  privacy:
    noise_multiplier: 0.5  # Less noise
config:
  num_server_rounds: 5  # Fewer rounds
Symptoms:
- Long round times
- High network latency
Diagnosis:
# Check network metrics
kubectl exec -it <pod-name> -- ping fl-server
# Check compression ratio
# View Grafana Communication dashboard
Solutions:
config:
compression:
sparsity: 0.95 # More aggressive
    quantization_bits: 4  # Fewer bits
# Implement async aggregation
# (Currently not implemented)
Symptoms:
- Cannot access MLflow UI
- Experiments not logged
Diagnosis:
kubectl get svc mlflow -n fl-system
kubectl logs statefulset/mlflow -n fl-system
Solutions:
kubectl port-forward svc/mlflow 5000:5000 -n fl-system
kubectl get pvc -n fl-system
kubectl describe pvc mlflow-data-mlflow-0
kubectl rollout restart statefulset/mlflow -n fl-system
Symptoms:
- Empty dashboards
- No data points
Diagnosis:
# Check Prometheus
kubectl port-forward svc/prometheus 9090:9090
# Visit http://localhost:9090
# Check metrics
curl http://localhost:9090/api/v1/query?query=fl_training_loss
Solutions:
# In training code
from fl.metrics import metrics_collector
metrics_collector.record_training_loss(client_id, round_num, loss)
kubectl get configmap prometheus-config -o yaml
kubectl rollout restart deployment/prometheus
- Use GPU
client:
resources:
limits:
      nvidia.com/gpu: 1
- Increase Workers
DataLoader(dataset, num_workers=4)
- Mixed Precision
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()
- Aggressive Compression
compression:
sparsity: 0.99
  quantization_bits: 2
- Reduce Model Size
# Use smaller model
from fl.models import TinyNet
- Gradient Accumulation
for i, batch in enumerate(dataloader):
    loss = train_step(batch)
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
- Clear Cache
import torch
torch.cuda.empty_cache()
# Get all resources
kubectl get all -n fl-system
# Describe deployment
kubectl describe deployment fl-server -n fl-system
# View logs (last 100 lines)
kubectl logs --tail=100 deployment/fl-client -n fl-system
# Follow logs
kubectl logs -f deployment/fl-server -n fl-system
# Execute command in pod
kubectl exec -it <pod-name> -n fl-system -- bash
# Check events
kubectl get events -n fl-system --sort-by='.lastTimestamp'
# Resource usage
kubectl top pods -n fl-system
kubectl top nodes
# Port forward multiple services
kubectl port-forward svc/mlflow 5000:5000 &
kubectl port-forward svc/grafana 3000:3000 &
kubectl port-forward svc/prometheus 9090:9090 &
-
Check Logs First
- Server logs: kubectl logs deployment/fl-server
- Client logs: kubectl logs deployment/fl-client
- MLflow logs: kubectl logs statefulset/mlflow
-
Check Metrics
- Grafana dashboards
- Prometheus queries
- MLflow experiments
-
Check Configuration
- ConfigMaps
- Secrets
- Helm values
-
GitHub Issues
- Search existing issues
- Create new issue with logs and config