Skip to content

Commit 12e7656

Browse files
committed
ISSUE-#15 - Add host component control tools for starting, stopping, and restarting components in mcp_main.py
- Introduced async functions: start_host_component, stop_host_component, and restart_host_component.
- Each function provides detailed control over individual host components in the Ambari cluster.
- Enhanced error handling and logging for better traceability.
- Related issue: ISSUE-#15
- Related PR: #26
- Affected file: src/mcp_ambari_api/mcp_main.py
1 parent 8634346 commit 12e7656

File tree

1 file changed

+311
-0
lines changed

1 file changed

+311
-0
lines changed

src/mcp_ambari_api/mcp_main.py

Lines changed: 311 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1372,6 +1372,317 @@ async def restart_service(service_name: str) -> str:
13721372
logger.error("Error occurred while restarting service '%s': %s", service_name, str(e))
13731373
return f"Error: Service '{service_name}' restart operation failed: {str(e)}"
13741374

1375+
@mcp.tool()
@log_tool
async def start_host_component(host_name: str, component_name: str) -> str:
    """
    Starts a specific component on a specific host in the Ambari cluster.

    [Tool Role]: Dedicated tool for starting individual host-level components,
    enabling fine-grained control without affecting other hosts or the overall service.

    [Core Functions]:
    - Start the specified component on the given host
    - Skip if the component is already in STARTED state
    - Return request information for progress tracking

    [Required Usage Scenarios]:
    - When a specific host's component is STOPPED but the overall service is STARTED
    - When users request to start a component on a specific host (e.g., "start DataNode on host-A")
    - When recovering a single component instance without restarting the entire service

    Args:
        host_name: Hostname where the component resides (e.g., "hdp-node-01.example.com")
        component_name: Name of the component to start (e.g., "DATANODE", "NODEMANAGER")

    Returns:
        Start operation result (success: request info, failure: error message).
        Failures are always reported as a string starting with "Error:"; this
        function does not raise.
    """
    cluster_name = AMBARI_CLUSTER_NAME

    # A blank name would collapse a path segment of the endpoint below, so the
    # URL would no longer address exactly one host component. Refuse up front
    # instead of sending a state change to an unintended resource.
    if not host_name or not host_name.strip() or not component_name or not component_name.strip():
        return "Error: Both 'host_name' and 'component_name' must be non-empty."

    try:
        endpoint = f"/clusters/{cluster_name}/hosts/{host_name}/host_components/{component_name}"

        # Read the current state first so an already-started component is a no-op.
        check = await make_ambari_request(endpoint)

        if check is None:
            return f"Error: Component '{component_name}' not found on host '{host_name}'."

        lookup_error = check.get("error")
        if lookup_error:
            # Surface the underlying error: the lookup can fail for reasons
            # other than a missing component, and "not found" would mislead.
            return f"Error: Unable to retrieve component '{component_name}' on host '{host_name}' - {lookup_error}"

        # `or {}` tolerates an explicit null "HostRoles" value.
        current_state = (check.get("HostRoles") or {}).get("state", "UNKNOWN")

        if current_state == "STARTED":
            return f"Component '{component_name}' on '{host_name}' is already STARTED. No action needed."

        payload = {
            "RequestInfo": {
                "context": f"Start {component_name} on {host_name} via MCP API"
            },
            "Body": {
                "HostRoles": {
                    "state": "STARTED"
                }
            }
        }

        response_data = await make_ambari_request(endpoint, method="PUT", data=payload)

        if response_data is None or response_data.get("error"):
            error_msg = response_data.get("error") if response_data else "Unknown error occurred"
            return f"Error: Failed to start '{component_name}' on '{host_name}' - {error_msg}"

        # No "Requests" entry means there is no request ID to track.
        request_info = response_data.get("Requests")
        if request_info is None:
            return f"Component '{component_name}' on '{host_name}' start command sent successfully. Previous state: {current_state}"

        request_id = request_info.get("id", "Unknown")
        request_status = request_info.get("status", "Unknown")
        request_href = response_data.get("href", "")

        result_lines = [
            f"START HOST COMPONENT: {component_name} on {host_name}",
            "",
            f"Cluster: {cluster_name}",
            f"Host: {host_name}",
            f"Component: {component_name}",
            f"Previous State: {current_state}",
            f"Request ID: {request_id}",
            f"Status: {request_status}",
            f"Monitor URL: {request_href}",
            "",
            "Use get_request_status(request_id) to track progress.",
        ]

        return "\n".join(result_lines)

    except Exception as e:
        # Tool boundary: log for traceability (as the other restart tools in
        # this file do) and return the failure as text instead of raising.
        logger.error("Error occurred while starting '%s' on '%s': %s", component_name, host_name, str(e))
        return f"Error: Exception occurred while starting '{component_name}' on '{host_name}' - {str(e)}"
1457+
1458+
@mcp.tool()
@log_tool
async def stop_host_component(host_name: str, component_name: str) -> str:
    """
    Stops a specific component on a specific host in the Ambari cluster.

    [Tool Role]: Dedicated tool for stopping individual host-level components,
    enabling fine-grained control without affecting other hosts or the overall service.

    [Core Functions]:
    - Stop the specified component on the given host
    - Skip if the component is already in INSTALLED (stopped) state
    - Return request information for progress tracking

    [Required Usage Scenarios]:
    - When users request to stop a component on a specific host (e.g., "stop DataNode on host-A")
    - When decommissioning or isolating a specific node's component
    - When troubleshooting a single component instance

    Args:
        host_name: Hostname where the component resides (e.g., "hdp-node-01.example.com")
        component_name: Name of the component to stop (e.g., "DATANODE", "NODEMANAGER")

    Returns:
        Stop operation result (success: request info, failure: error message).
        Failures are always reported as a string starting with "Error:"; this
        function does not raise.
    """
    cluster_name = AMBARI_CLUSTER_NAME

    # A blank name would collapse a path segment of the endpoint below, so the
    # URL would no longer address exactly one host component. Refuse up front
    # instead of sending a stop to an unintended resource.
    if not host_name or not host_name.strip() or not component_name or not component_name.strip():
        return "Error: Both 'host_name' and 'component_name' must be non-empty."

    try:
        endpoint = f"/clusters/{cluster_name}/hosts/{host_name}/host_components/{component_name}"

        # Read the current state first so an already-stopped component is a no-op.
        check = await make_ambari_request(endpoint)

        if check is None:
            return f"Error: Component '{component_name}' not found on host '{host_name}'."

        lookup_error = check.get("error")
        if lookup_error:
            # Surface the underlying error: the lookup can fail for reasons
            # other than a missing component, and "not found" would mislead.
            return f"Error: Unable to retrieve component '{component_name}' on host '{host_name}' - {lookup_error}"

        # `or {}` tolerates an explicit null "HostRoles" value.
        current_state = (check.get("HostRoles") or {}).get("state", "UNKNOWN")

        # Both states are treated as "not running", so there is nothing to stop.
        if current_state in ("INSTALLED", "INSTALL_FAILED"):
            return f"Component '{component_name}' on '{host_name}' is already stopped (state: {current_state}). No action needed."

        # The stop is requested by setting the target state to INSTALLED.
        payload = {
            "RequestInfo": {
                "context": f"Stop {component_name} on {host_name} via MCP API"
            },
            "Body": {
                "HostRoles": {
                    "state": "INSTALLED"
                }
            }
        }

        response_data = await make_ambari_request(endpoint, method="PUT", data=payload)

        if response_data is None or response_data.get("error"):
            error_msg = response_data.get("error") if response_data else "Unknown error occurred"
            return f"Error: Failed to stop '{component_name}' on '{host_name}' - {error_msg}"

        # No "Requests" entry means there is no request ID to track.
        request_info = response_data.get("Requests")
        if request_info is None:
            return f"Component '{component_name}' on '{host_name}' stop command sent successfully. Previous state: {current_state}"

        request_id = request_info.get("id", "Unknown")
        request_status = request_info.get("status", "Unknown")
        request_href = response_data.get("href", "")

        result_lines = [
            f"STOP HOST COMPONENT: {component_name} on {host_name}",
            "",
            f"Cluster: {cluster_name}",
            f"Host: {host_name}",
            f"Component: {component_name}",
            f"Previous State: {current_state}",
            f"Request ID: {request_id}",
            f"Status: {request_status}",
            f"Monitor URL: {request_href}",
            "",
            "Use get_request_status(request_id) to track progress.",
        ]

        return "\n".join(result_lines)

    except Exception as e:
        # Tool boundary: log for traceability (as the other restart tools in
        # this file do) and return the failure as text instead of raising.
        logger.error("Error occurred while stopping '%s' on '%s': %s", component_name, host_name, str(e))
        return f"Error: Exception occurred while stopping '{component_name}' on '{host_name}' - {str(e)}"
1540+
1541+
@mcp.tool()
@log_tool
async def restart_host_component(host_name: str, component_name: str) -> str:
    """
    Restarts a specific component on a specific host in the Ambari cluster (stop then start).

    [Tool Role]: Dedicated tool for restarting individual host-level components,
    enabling fine-grained control without affecting other hosts or the overall service.

    [Core Functions]:
    - Stop the specified component on the given host and wait for completion
      (the wait is bounded; a failed, aborted, timed-out or stalled stop is reported as an error)
    - Skip the wait when the stop call produced no request to track (component already stopped)
    - Start the component and return request information
    - Return clear success or error message for LLM automation

    [Required Usage Scenarios]:
    - When users request to restart a component on a specific host (e.g., "restart DataNode on host-A")
    - When recovering a stuck or malfunctioning single component instance
    - When applying config changes that require a component-level restart

    Args:
        host_name: Hostname where the component resides (e.g., "hdp-node-01.example.com")
        component_name: Name of the component to restart (e.g., "DATANODE", "NODEMANAGER")

    Returns:
        Restart operation result (success: request info, failure: error message).
        Failures are always reported as a string starting with "Error:"; this
        function does not raise.
    """
    # Pause between two polls of the stop request, in seconds.
    poll_interval_seconds = 2
    # Upper bound on the number of polls so a stalled stop request cannot block
    # the tool forever (450 polls with a 2 s pause is at least 15 minutes).
    # NOTE(review): the bound is a judgment call - tune it to the slowest
    # component stop expected in the cluster.
    max_stop_polls = 450
    # Terminal request states that mean the stop did not succeed.
    failed_stop_states = ("FAILED", "ABORTED", "TIMEDOUT")

    cluster_name = AMBARI_CLUSTER_NAME

    # A blank name would collapse a path segment of the endpoint below, so the
    # URL would no longer address exactly one host component. Refuse up front
    # instead of sending stop/start to an unintended resource.
    if not host_name or not host_name.strip() or not component_name or not component_name.strip():
        return "Error: Both 'host_name' and 'component_name' must be non-empty."

    try:
        endpoint = f"/clusters/{cluster_name}/hosts/{host_name}/host_components/{component_name}"
        check = await make_ambari_request(endpoint)

        if check is None:
            return f"Error: Component '{component_name}' not found on host '{host_name}'."

        lookup_error = check.get("error")
        if lookup_error:
            # Surface the underlying error: the lookup can fail for reasons
            # other than a missing component, and "not found" would mislead.
            return f"Error: Unable to retrieve component '{component_name}' on host '{host_name}' - {lookup_error}"

        # `or {}` tolerates an explicit null "HostRoles" value.
        initial_state = (check.get("HostRoles") or {}).get("state", "UNKNOWN")

        # Step 1: Stop (requested by setting the target state to INSTALLED)
        stop_payload = {
            "RequestInfo": {
                "context": f"Stop {component_name} on {host_name} via MCP API"
            },
            "Body": {
                "HostRoles": {
                    "state": "INSTALLED"
                }
            }
        }

        stop_response = await make_ambari_request(endpoint, method="PUT", data=stop_payload)

        if stop_response is None or stop_response.get("error"):
            error_msg = stop_response.get("error") if stop_response else "Unknown error occurred"
            return f"Error: Unable to stop '{component_name}' on '{host_name}'. {error_msg}"

        stop_requests = stop_response.get("Requests")
        if stop_requests is None:
            # No request was created, so there is nothing to poll. Polling with
            # the placeholder ID below could only fail, which made restarting
            # an already-stopped component impossible.
            stop_request_id = "N/A (already stopped)"
            stop_status = "SKIPPED (already stopped)"
        else:
            stop_request_id = stop_requests.get("id")
            if stop_request_id is None:
                return f"Error: Failed to retrieve stop request ID for '{component_name}' on '{host_name}'."

            # Step 2: Wait for stop to complete (bounded)
            status_endpoint = f"/clusters/{cluster_name}/requests/{stop_request_id}"
            for _ in range(max_stop_polls):
                status_response = await make_ambari_request(status_endpoint)

                if status_response is None or status_response.get("error"):
                    error_msg = status_response.get("error") if status_response else "Unknown error"
                    return f"Error: Unable to check stop status for '{component_name}' on '{host_name}'. {error_msg}"

                req_info = status_response.get("Requests") or {}
                req_status = req_info.get("request_status", "Unknown")

                if req_status == "COMPLETED":
                    break
                if req_status in failed_stop_states:
                    return f"Error: Stop operation for '{component_name}' on '{host_name}' failed with status '{req_status}'."

                # `or 0` keeps the %d format valid when the field is null/missing.
                progress_percent = req_info.get("progress_percent") or 0
                logger.info("Stopping '%s' on '%s'... Progress: %d%%", component_name, host_name, progress_percent)
                await asyncio.sleep(poll_interval_seconds)
            else:
                # for/else: reached only when the loop ran out of polls without
                # hitting `break`, i.e. the stop never reached a terminal state.
                return (
                    f"Error: Timed out waiting for '{component_name}' on '{host_name}' to stop "
                    f"(stop request ID: {stop_request_id}). "
                    f"Use get_request_status({stop_request_id}) to check progress."
                )
            stop_status = "COMPLETED"

        # Step 3: Start
        start_payload = {
            "RequestInfo": {
                "context": f"Start {component_name} on {host_name} via MCP API"
            },
            "Body": {
                "HostRoles": {
                    "state": "STARTED"
                }
            }
        }

        start_response = await make_ambari_request(endpoint, method="PUT", data=start_payload)

        if start_response is None or start_response.get("error"):
            error_msg = start_response.get("error") if start_response else "Unknown error occurred"
            return f"Error: Unable to start '{component_name}' on '{host_name}'. {error_msg}"

        start_requests = start_response.get("Requests")
        if start_requests is None:
            # No start request to track: point the caller at the host details instead.
            result_lines = [
                f"RESTART HOST COMPONENT: {component_name} on {host_name}",
                f"Stop Request ID: {stop_request_id}",
                "Start Request ID: N/A (immediate)",
                "",
                f"Cluster: {cluster_name}",
                f"Host: {host_name}",
                f"Component: {component_name}",
                f"Initial State: {initial_state}",
                f"Stop Status: {stop_status}",
                "Start Status: Command sent successfully",
                "",
                f"Next: get_host_details('{host_name}') to verify current state.",
            ]
            return "\n".join(result_lines)

        start_request_id = start_requests.get("id", "Unknown")
        start_status = start_requests.get("status", "Unknown")
        start_href = start_response.get("href", "")

        # Only suggest get_request_status when there is a real ID to pass to it.
        if start_request_id != "Unknown":
            next_step = f"Next: get_request_status({start_request_id}) for updates."
        else:
            next_step = f"Next: get_host_details('{host_name}') to verify state soon."

        result_lines = [
            f"RESTART HOST COMPONENT: {component_name} on {host_name}",
            f"Stop Request ID: {stop_request_id}",
            f"Start Request ID: {start_request_id}",
            "",
            f"Cluster: {cluster_name}",
            f"Host: {host_name}",
            f"Component: {component_name}",
            f"Initial State: {initial_state}",
            f"Stop Status: {stop_status}",
            f"Start Status: {start_status}",
            f"Start Monitor URL: {start_href}",
            "",
            next_step,
        ]
        return "\n".join(result_lines)

    except Exception as e:
        # Tool boundary: log for traceability and return the failure as text
        # instead of raising.
        logger.error("Error occurred while restarting '%s' on '%s': %s", component_name, host_name, str(e))
        return f"Error: Restart of '{component_name}' on '{host_name}' failed: {str(e)}"
1685+
13751686
@mcp.tool()
13761687
@log_tool
13771688
async def restart_all_services() -> str:

0 commit comments

Comments
 (0)