Skip to content

Commit c48298f

Browse files
authored
Merge pull request #187 from arangodb-helper/feature/leave-command
Added `arangodb remove starter` command
2 parents 6dc51f0 + ff12e46 commit c48298f

File tree

8 files changed

+366
-115
lines changed

8 files changed

+366
-115
lines changed

client/api.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,12 @@ type API interface {
5050
// With goodbye set, it will remove the peer slot for the starter.
5151
Shutdown(ctx context.Context, goodbye bool) error
5252

53+
// RemovePeer removes a peer with given ID from the starter cluster.
54+
// The removal tries to cleanout & properly shutdown servers first.
55+
// If that does not succeed, the operation returns an error,
56+
// unless force is set to true.
57+
RemovePeer(ctx context.Context, id string, force bool) error
58+
5359
// StartDatabaseUpgrade is called to start the upgrade process
5460
StartDatabaseUpgrade(ctx context.Context) error
5561

client/client.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
package client
2424

2525
import (
26+
"bytes"
2627
"context"
2728
"encoding/json"
2829
"io/ioutil"
@@ -198,6 +199,48 @@ func (c *client) Shutdown(ctx context.Context, goodbye bool) error {
198199
return nil
199200
}
200201

202+
// GoodbyeRequest is the JSON structure sent in the request to /goodbye.
type GoodbyeRequest struct {
	// SlaveID is the unique ID of the slave that should be removed.
	// The explicit tag pins the wire name to the one encoding/json already
	// derives from the field name, so the serialized form is unchanged.
	SlaveID string `json:"SlaveID"`
}
206+
207+
// RemovePeer removes a peer with given ID from the starter cluster.
208+
// The removal tries to cleanout & properly shutdown servers first.
209+
// If that does not succeed, the operation returns an error,
210+
// unless force is set to true.
211+
func (c *client) RemovePeer(ctx context.Context, id string, force bool) error {
212+
q := url.Values{}
213+
if force {
214+
q.Set("force", "true")
215+
}
216+
url := c.createURL("/goodbye", q)
217+
218+
input := GoodbyeRequest{
219+
SlaveID: id,
220+
}
221+
inputJSON, err := json.Marshal(input)
222+
if err != nil {
223+
return maskAny(err)
224+
}
225+
226+
req, err := http.NewRequest("POST", url, bytes.NewReader(inputJSON))
227+
if err != nil {
228+
return maskAny(err)
229+
}
230+
if ctx != nil {
231+
req = req.WithContext(ctx)
232+
}
233+
resp, err := c.client.Do(req)
234+
if err != nil {
235+
return maskAny(err)
236+
}
237+
if err := c.handleResponse(resp, "POST", url, nil); err != nil {
238+
return maskAny(err)
239+
}
240+
241+
return nil
242+
}
243+
201244
// StartDatabaseUpgrade is called to start the upgrade process
202245
func (c *client) StartDatabaseUpgrade(ctx context.Context) error {
203246
url := c.createURL("/database-auto-upgrade", nil)
Lines changed: 4 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,6 @@
1-
ArangoDB Starter Recovery Procedure
2-
===================================
1+
# ArangoDB Starter Administration
32

4-
This procedure is intended to recover a cluster (that was started with the ArangoDB
5-
_Starter_) when a machine of that cluster is broken without the possibility to recover
6-
it (e.g. complete HD failure). In this procedure it does not matter if a replacement
7-
machine uses the old or a new IP address.
3+
This chapter documents administering the _ArangoDB Starter_.
84

9-
To recover from this scenario, you must:
10-
- Create a new (replacement) machine with ArangoDB (including _Starter_) installed.
11-
- Create a file called `RECOVERY` in the directory you are going to use as data
12-
directory of the _Starter_ (the one that is passed via the option `--starter.data-dir`).
13-
This file must contain the IP address and port of the _Starter_ that has been
14-
broken (and will be replaced with this new machine).
15-
16-
E.g.
17-
18-
```bash
19-
echo "192.168.1.25:8528" > $DATADIR/RECOVERY
20-
```
21-
22-
After creating the `RECOVERY` file, start the _Starter_ using all the normal command
23-
line arguments.
24-
25-
The _Starter_ will now:
26-
1. Talk to the remaining _Starters_ to find the ID of the _Starter_ it replaces and
27-
use that ID to join the remaining _Starters_.
28-
1. Talk to the remaining _Agents_ to find the ID of the _Agent_ it replaces and
29-
adjust the command-line arguments of the _Agent_ (it will start) to use that ID.
30-
This is skipped if the _Starter_ was not running an _Agent_.
31-
1. Remove the `RECOVERY` file from the data directory.
32-
33-
The cluster will now recover automatically. It will, however, have one more _Coordinator_
34-
and one more _DBServer_ than expected. Exactly one _Coordinator_ and one _DBServer_ will
35-
be listed "red" in the web UI of the database. They will have to be removed manually
36-
using the ArangoDB Web UI.
5+
- [Remove a machine from the cluster](./Removal.md)
6+
- [Recover from a failed machine](./Recovery.md)
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# ArangoDB Starter Recovery Procedure
2+
3+
This procedure is intended to recover a cluster (that was started with the ArangoDB
4+
_Starter_) when a machine of that cluster is broken without the possibility to recover
5+
it (e.g. complete HD failure). In this procedure it does not matter if a replacement
6+
machine uses the old or a new IP address.
7+
8+
To recover from this scenario, you must:
9+
10+
- Create a new (replacement) machine with ArangoDB (including _Starter_) installed.
11+
- Create a file called `RECOVERY` in the directory you are going to use as data
12+
directory of the _Starter_ (the one that is passed via the option `--starter.data-dir`).
13+
This file must contain the IP address and port of the _Starter_ that has been
14+
broken (and will be replaced with this new machine).
15+
16+
E.g.
17+
18+
```bash
19+
echo "192.168.1.25:8528" > $DATADIR/RECOVERY
20+
```
21+
22+
After creating the `RECOVERY` file, start the _Starter_ using all the normal command
23+
line arguments.
24+
25+
The _Starter_ will now:
26+
27+
1. Talk to the remaining _Starters_ to find the ID of the _Starter_ it replaces and
28+
use that ID to join the remaining _Starters_.
29+
1. Talk to the remaining _Agents_ to find the ID of the _Agent_ it replaces and
30+
adjust the command-line arguments of the _Agent_ (it will start) to use that ID.
31+
This is skipped if the _Starter_ was not running an _Agent_.
32+
1. Remove the `RECOVERY` file from the data directory.
33+
34+
The cluster will now recover automatically. It will, however, have one more _Coordinator_
35+
and one more _DBServer_ than expected. Exactly one _Coordinator_ and one _DBServer_ will
36+
be listed "red" in the web UI of the database. They will have to be removed manually
37+
using the ArangoDB Web UI.
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# ArangoDB Starter Removal Procedure
2+
3+
This procedure is intended to remove a machine from a cluster
4+
(that was started with the ArangoDB _Starter_).
5+
6+
It is possible to run this procedure while the machine is still running
7+
or when it has already been removed.
8+
9+
It is not possible to remove a machine that has an agent on it!
10+
Use the [recovery procedure](./Recovery.md) if you have a failed machine
11+
with an agent on it.
12+
13+
Note that it is highly recommended to remove a machine while it is still running.
14+
15+
To remove a machine from a cluster, run the following command:
16+
17+
```bash
18+
arangodb remove starter --starter.endpoint=<endpoint> [--starter.id=<id>] [--force]
19+
```
20+
21+
Where `<endpoint>` is the endpoint of the starter that you want to remove,
22+
or the endpoint of one of the remaining starters. E.g. `http://localhost:8528`.
23+
24+
If you want to remove a machine that is no longer running, use the `--starter.id`
25+
option. Set it to the ID of the ArangoDB _Starter_ on the machine that you want to remove.
26+
27+
You can find this ID in a `setup.json` file in the data directory of one of
28+
the remaining ArangoDB _Starters_.
29+
30+
E.g.
31+
```json
32+
{
33+
...
34+
"peers": {
35+
"Peers": [
36+
{
37+
"ID": "21e42415",
38+
"Address": "10.21.56.123",
39+
"Port": 8528,
40+
"PortOffset": 0,
41+
"DataDir": "/mydata/server1",
42+
"HasAgent": true,
43+
"IsSecure": false
44+
},
45+
...
46+
}
47+
```
48+
49+
If the machine you want to remove has address `10.21.56.123` and was listening
50+
on port `8528`, use ID `21e42415`.
51+
52+
The `remove starter` command will attempt to clean out all data from the servers
53+
of the machine that you want to remove.
54+
This can take a long time.
55+
If the cleanout fails, the `remove starter` command will fail.
56+
57+
If you want to remove the machine even when the cleanout has failed, use
58+
the `--force` option.
59+
Note that this may lead to data loss!

remove.go

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Ewout Prangsma
21+
//
22+
23+
package main
24+
25+
import (
26+
"context"
27+
28+
"github.com/spf13/cobra"
29+
)
30+
31+
var (
32+
cmdRemove = &cobra.Command{
33+
Use: "remove",
34+
Short: "Remove something",
35+
Run: cmdShowUsage,
36+
}
37+
cmdRemoveStarter = &cobra.Command{
38+
Use: "starter",
39+
Short: "Remove a starter from the cluster",
40+
Run: cmdRemoveStarterRun,
41+
}
42+
removeStarterOptions struct {
43+
starterEndpoint string
44+
starterID string
45+
force bool
46+
}
47+
)
48+
49+
func init() {
50+
f := cmdRemoveStarter.Flags()
51+
f.StringVar(&removeStarterOptions.starterEndpoint, "starter.endpoint", "", "The endpoint of the starter to connect to. E.g. http://localhost:8528")
52+
f.StringVar(&removeStarterOptions.starterID, "starter.id", "", "The ID of the starter to remove")
53+
f.BoolVar(&removeStarterOptions.force, "force", false, "If set to true, the starter will be removed even if the servers cannot be properly shutdown")
54+
55+
cmdMain.AddCommand(cmdRemove)
56+
cmdRemove.AddCommand(cmdRemoveStarter)
57+
}
58+
59+
func cmdRemoveStarterRun(cmd *cobra.Command, args []string) {
60+
// Setup logging
61+
consoleOnly := true
62+
configureLogging(consoleOnly)
63+
64+
// Create starter client
65+
c := mustCreateStarterClient(removeStarterOptions.starterEndpoint)
66+
67+
// Fetch the ID of the starter for which the endpoint is given
68+
ctx := context.Background()
69+
info, err := c.ID(ctx)
70+
if err != nil {
71+
log.Fatal().Err(err).Msg("Failed to fetch ID from starter")
72+
}
73+
74+
// Compare ID with requested.
75+
if removeStarterOptions.starterID == "" || removeStarterOptions.starterID == info.ID {
76+
// Shutdown (with goodbye) the starter at given endpoint
77+
goodbye := true
78+
if err := c.Shutdown(ctx, goodbye); err != nil {
79+
log.Fatal().Err(err).Msg("Removing starter from cluster failed")
80+
} else {
81+
log.Info().Msg("Starter has been shutdown and removed from cluster")
82+
}
83+
} else {
84+
// Remove another starter from the cluster
85+
if err := c.RemovePeer(ctx, removeStarterOptions.starterID, removeStarterOptions.force); err != nil {
86+
log.Fatal().Err(err).Msg("Removing starter from cluster failed")
87+
} else {
88+
log.Info().Msg("Starter has been removed from cluster")
89+
}
90+
}
91+
}

0 commit comments

Comments
 (0)