Skip to content

Commit 0228195

Browse files
author
Szabolcs Fazekas
committed
Add reroute command for shard allocation
- Add new 'reroute' command to support manual shard allocation - Support for allocating replica shards (safe operation) - Support for allocating empty primary shards with data loss confirmation - Interactive confirmation dialog for dangerous primary shard operations - Additional options: --explain and --retry-failed - Proper ATD type definitions for reroute operations - JSON serialization and HTTP request handling - Comprehensive help documentation with examples - Updated README.md with anonymized usage examples Usage: es reroute <cluster> -r <index>:<shard>:<node> # allocate replica es reroute <cluster> -p <index>:<shard>:<node> # allocate empty primary The command follows the same patterns as existing elasticsearch-cli commands and integrates seamlessly with the existing codebase.
1 parent 79ae974 commit 0228195

File tree

3 files changed

+216
-0
lines changed

3 files changed

+216
-0
lines changed

README.md

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,48 @@ Display shards which are not in `DONE` stage:
231231
es recovery cluster1.mydomain.com:9200 -e stage done
232232
```
233233

234+
## Shard allocation (reroute)
235+
236+
The `reroute` command allows manual allocation of shards to specific nodes.
237+
238+
### Allocate replica shard (safe operation)
239+
240+
Allocate replica shard 900 of index `myindex-2024.07.26` to node `data001-2`:
241+
242+
```
243+
es reroute cluster1 -r myindex-2024.07.26:900:data001-2
244+
```
245+
246+
### Allocate empty primary shard (WARNING: causes data loss!)
247+
248+
Allocate empty primary shard 0 of index `myindex-2024.07.26` to node `data001-2`:
249+
250+
```
251+
es reroute cluster1 -p myindex-2024.07.26:0:data001-2
252+
```
253+
254+
**Important**: Primary shard allocation will prompt for confirmation since it causes data loss for that shard.
255+
256+
### Additional options
257+
258+
Get detailed explanation of reroute decisions:
259+
260+
```
261+
es reroute cluster1 -r myindex-2024.07.26:900:data001-2 --explain
262+
```
263+
264+
Retry failed allocations:
265+
266+
```
267+
es reroute cluster1 -r myindex-2024.07.26:900:data001-2 --retry-failed
268+
```
269+
270+
### Get help
271+
272+
```
273+
es reroute --help
274+
```
275+
234276
## Get or set cluster settings
235277

236278
List all persistent and transient settings:

src/elastic.atd

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,3 +234,47 @@ type bulk_action = {
234234
inherit doc_id;
235235
?routing : string option;
236236
}
237+
238+
(* One entry of the "commands" list of a reroute request.
   Every field is optional; a field left unset is omitted from the JSON. *)
type reroute_command = {
  ?allocate_replica : allocate_replica option;
  ?allocate_empty_primary : allocate_empty_primary option;
  ?move : move_shard option;
  ?cancel : cancel_shard option;
}
244+
245+
(* Arguments of an "allocate_replica" command: which shard of which index
   to place on which node. *)
type allocate_replica = {
  index : string;
  shard : int;
  node : string;
}
250+
251+
(* Arguments of an "allocate_empty_primary" command.
   Same target description as a replica allocation, plus the explicit
   accept_data_loss acknowledgement flag. *)
type allocate_empty_primary = {
  index : string;
  shard : int;
  node : string;
  accept_data_loss : bool;
}
257+
258+
(* Arguments of a "move" command: a shard of an index together with a
   source node and a destination node. *)
type move_shard = {
  index : string;
  shard : int;
  from_node : string;
  to_node : string;
}
264+
265+
(* Arguments of a "cancel" command: a shard of an index and the node it
   refers to. *)
type cancel_shard = {
  index : string;
  shard : int;
  node : string;
}
270+
271+
(* Body of a reroute request: the list of commands plus optional flags.
   Optional flags left unset are omitted from the JSON. *)
type reroute_request = {
  commands : reroute_command list;
  ?dry_run : bool option;
  ?explain : bool option;
  ?retry_failed : bool option;
}
277+
278+
(* Response of a reroute request; only the "acknowledged" field is
   described here. *)
type reroute_response = {
  acknowledged : bool;
}

src/es.ml

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1068,6 +1068,75 @@ let refresh { verbose; _ } {
10681068
| Error error -> fail_lwt "refresh error:\n%s" error
10691069
| Ok result -> Lwt_io.printl result
10701070

1071+
(** A shard allocation requested on the command line. Both variants carry the
    index name, the shard number and the name of the target node. *)
type reroute_action =
  | AllocateReplica of {
      index : string;
      shard : int;
      node : string;
    }
  | AllocateEmptyPrimary of {
      index : string;
      shard : int;
      node : string;
    }
1074+
1075+
(** Arguments of the [reroute] command. *)
type reroute_args = {
  host : string; (** cluster name or address, resolved through the configuration *)
  actions : reroute_action list; (** allocations to request, in order *)
  explain : bool; (** send [explain=true] with the request *)
  retry_failed : bool; (** send [retry_failed=true] with the request *)
}
1081+
1082+
(** [confirm_data_loss index shard node] warns on stderr that allocating an
    empty primary for [shard] of [index] on [node] loses data, then reads one
    line from stdin.

    Resolves to [true] only when the answer, trimmed and lowercased, is
    exactly ["yes"]. Any other answer resolves to [false], and so does end of
    input (stdin closed or exhausted), so that a non-interactive run declines
    the destructive operation instead of failing with [End_of_file]. *)
let confirm_data_loss index shard node =
  let%lwt () =
    Lwt_io.eprintlf
      "WARNING: You are about to allocate an empty primary shard for index '%s', shard %d to node '%s'."
      index shard node
  in
  let%lwt () = Lwt_io.eprintl "This operation will result in DATA LOSS for this shard!" in
  let%lwt () = Lwt_io.eprint "Are you sure you want to continue? (yes/no): " in
  (* make sure the prompt is visible before blocking on stdin *)
  let%lwt () = Lwt_io.flush Lwt_io.stderr in
  (* [read_line_opt] yields [None] at end of input where [read_line] would raise *)
  match%lwt Lwt_io.read_line_opt Lwt_io.stdin with
  | None -> Lwt.return false
  | Some response ->
    let answer = String.lowercase_ascii (String.trim response) in
    Lwt.return (String.equal answer "yes")
1093+
1094+
(** [reroute common_args args] resolves the cluster named by [args.host] through
    the configuration, turns every requested action into a reroute command and
    posts the resulting commands to [_cluster/reroute].

    Empty primary allocations are only included after [confirm_data_loss]
    resolved to [true]; declined ones are dropped with a message on stderr.
    When no command remains, nothing is sent. The raw response is printed on
    stdout; a request error fails with a ["reroute error"] message. *)
let reroute { verbose; _ } { host; actions; explain; retry_failed; } =
  let config = Common.load_config () in
  let { Common.host; _ } = Common.get_cluster config host in
  (* converts one action into a command, or [None] when the user declined it *)
  let command_of_action = function
    | AllocateReplica { index; shard; node; } ->
      Lwt.return (Some {
        Elastic_t.allocate_replica = Some { index; shard; node; };
        allocate_empty_primary = None;
        move = None;
        cancel = None;
      })
    | AllocateEmptyPrimary { index; shard; node; } ->
      begin match%lwt confirm_data_loss index shard node with
      | false ->
        let%lwt () = Lwt_io.eprintl "Operation cancelled." in
        Lwt.return None
      | true ->
        Lwt.return (Some {
          Elastic_t.allocate_replica = None;
          allocate_empty_primary = Some { index; shard; node; accept_data_loss = true; };
          move = None;
          cancel = None;
        })
      end
  in
  (* query argument that is only present when the corresponding flag is set *)
  let flag name enabled = name, (if enabled then Some (Some "true") else None) in
  Lwt_main.run @@
  match%lwt Lwt_list.filter_map_s command_of_action actions with
  | [] -> Lwt_io.eprintl "No operations to perform."
  | commands ->
    let payload = {
      Elastic_t.commands;
      dry_run = None;
      explain = None;
      retry_failed = None;
    } in
    let body = (JSON (Elastic_j.string_of_reroute_request payload) : content_type) in
    let args = [
      "metric", Some (Some "none");
      flag "explain" explain;
      flag "retry_failed" retry_failed;
    ] in
    match%lwt request ~verbose ~body `POST host [ Some "_cluster"; Some "reroute"; ] args id with
    | Error error -> fail_lwt "reroute error:\n%s" error
    | Ok result -> Lwt_io.printl result
1139+
10711140
type aggregation_field = {
10721141
field : string;
10731142
}
@@ -1964,6 +2033,66 @@ let refresh_tool =
19642033
let man = [] in
19652034
info "refresh" ~doc ~sdocs:Manpage.s_common_options ~exits ~man
19662035

2036+
(** Command-line term of the [reroute] command.

    [-r]/[--allocate-replica] and [-p]/[--allocate-empty-primary] may each be
    repeated and take an [INDEX:SHARD:NODE] specification. Replica actions are
    passed to [reroute] before primary actions.

    @raise Failure when a specification does not have exactly three
    colon-separated parts, has an empty index or node, or has a shard that is
    not a non-negative integer. *)
let reroute_tool =
  let open Common_args in
  let%map common_args = common_args
  and host = host
  and allocate_replica =
    let doc = "allocate replica shard to node (format: INDEX:SHARD:NODE)" in
    Arg.(value & opt_all string [] & info [ "r"; "allocate-replica"; ] ~docv:"INDEX:SHARD:NODE" ~doc)
  and allocate_empty_primary =
    let doc = "allocate empty primary shard to node (format: INDEX:SHARD:NODE) - WARNING: CAUSES DATA LOSS!" in
    Arg.(value & opt_all string [] & info [ "p"; "allocate-empty-primary"; ] ~docv:"INDEX:SHARD:NODE" ~doc)
  and explain = Arg.(value & flag & info [ "e"; "explain"; ] ~doc:"explain the reroute decisions")
  and retry_failed = Arg.(value & flag & info [ "f"; "retry-failed"; ] ~doc:"retry failed allocations")
  in
  (* splits INDEX:SHARD:NODE; [None] when the specification is malformed *)
  let parse_allocation spec =
    match String.split_on_char ':' spec with
    | [ ""; _; _; ] | [ _; _; ""; ] -> None
    | [ index; shard_str; node; ] ->
      (match int_of_string_opt shard_str with
       | Some shard when shard >= 0 -> Some (index, shard, node)
       | Some _ | None -> None)
    | _ -> None
  in
  (* every specification must parse: a malformed one aborts instead of being skipped *)
  let actions_of ~kind ~make specs =
    List.map (fun spec ->
      match parse_allocation spec with
      | Some (index, shard, node) -> make index shard node
      | None -> failwith ("Invalid " ^ kind ^ " allocation format: " ^ spec)
    ) specs
  in
  let replica_actions =
    actions_of ~kind:"replica"
      ~make:(fun index shard node -> AllocateReplica { index; shard; node; })
      allocate_replica
  in
  let primary_actions =
    actions_of ~kind:"primary"
      ~make:(fun index shard node -> AllocateEmptyPrimary { index; shard; node; })
      allocate_empty_primary
  in
  reroute common_args {
    host;
    actions = replica_actions @ primary_actions;
    explain;
    retry_failed;
  }
2077+
2078+
(** Pairs the [reroute] term with its command info: name, one-line
    description, exit codes and man page. *)
let reroute_tool =
  reroute_tool,
  let open Term in
  let doc = "reroute shards (allocate replica or empty primary shards)" in
  let exits = default_exits in
  (* The examples use $(mname), the name of the main program: $(tname) is the
     name of this very command and would render as a doubled command name. *)
  let man = [
    `S Manpage.s_description;
    `P "The reroute command allows manual allocation of shards to specific nodes.";
    `P "Use -r to allocate replica shards (safe operation).";
    `P "Use -p to allocate empty primary shards (WARNING: causes data loss!).";
    `S Manpage.s_examples;
    `P "Allocate replica shard to a specific node:";
    `P "$(mname) reroute cluster -r myindex:900:mynode";
    `P "Allocate empty primary shard (with data loss confirmation):";
    `P "$(mname) reroute cluster -p myindex:0:mynode";
  ] in
  info "reroute" ~doc ~sdocs:Manpage.s_common_options ~exits ~man
2095+
19672096
let search_tool =
19682097
let aggregation =
19692098
let module Let_syntax =
@@ -2204,6 +2333,7 @@ let tools = [
22042333
put_tool;
22052334
recovery_tool;
22062335
refresh_tool;
2336+
reroute_tool;
22072337
search_tool;
22082338
settings_tool;
22092339
]

0 commit comments

Comments
 (0)