Skip to content

Commit a8d948b

Browse files
committed
HBASE-27380 RitDuration histogram metric is broken
1 parent 01f93db commit a8d948b

File tree

4 files changed

+177
-24
lines changed

4 files changed

+177
-24
lines changed

hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -183,8 +183,6 @@ public static State convert(ClusterStatusProtos.RegionState.State protoState) {
183183
private final RegionInfo hri;
184184
private final ServerName serverName;
185185
private final State state;
186-
// The duration of region in transition
187-
private long ritDuration;
188186

189187
public static RegionState createForTesting(RegionInfo region, State state) {
190188
return new RegionState(region, state, EnvironmentEdgeManager.currentTime(), null);
@@ -195,16 +193,10 @@ public RegionState(RegionInfo region, State state, ServerName serverName) {
195193
}
196194

197195
public RegionState(RegionInfo region, State state, long stamp, ServerName serverName) {
198-
this(region, state, stamp, serverName, 0);
199-
}
200-
201-
public RegionState(RegionInfo region, State state, long stamp, ServerName serverName,
202-
long ritDuration) {
203196
this.hri = region;
204197
this.state = state;
205198
this.stamp = stamp;
206199
this.serverName = serverName;
207-
this.ritDuration = ritDuration;
208200
}
209201

210202
public State getState() {
@@ -223,19 +215,6 @@ public ServerName getServerName() {
223215
return serverName;
224216
}
225217

226-
public long getRitDuration() {
227-
return ritDuration;
228-
}
229-
230-
/**
231-
* Update the duration of region in transition
232-
* @param previousStamp previous RegionState's timestamp
233-
*/
234-
@InterfaceAudience.Private
235-
void updateRitDuration(long previousStamp) {
236-
this.ritDuration += (this.stamp - previousStamp);
237-
}
238-
239218
public boolean isClosing() {
240219
return state == State.CLOSING;
241220
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,8 @@ public AssignmentManager(MasterServices master, MasterRegion masterRegion) {
284284
DEFAULT_FORCE_REGION_RETAINMENT_WAIT_INTERVAL);
285285
forceRegionRetainmentRetries =
286286
conf.getInt(FORCE_REGION_RETAINMENT_RETRIES, DEFAULT_FORCE_REGION_RETAINMENT_RETRIES);
287+
288+
this.setRitDurationConsumer();
287289
}
288290

289291
private void mirrorMetaLocations() throws IOException, KeeperException {
@@ -759,6 +761,10 @@ private List<RegionInfo> getSystemTables(ServerName serverName) {
759761
return serverNode.getSystemRegionInfoList();
760762
}
761763

764+
private void setRitDurationConsumer() {
765+
regionInTransitionTracker.setRitDurationConsumer(metrics::updateRitDuration);
766+
}
767+
762768
private void preTransitCheck(RegionStateNode regionNode, RegionState.State[] expectedStates)
763769
throws HBaseIOException {
764770
if (regionNode.getProcedure() != null) {
@@ -1743,7 +1749,7 @@ private void update(final Collection<RegionState> regions, final long currentTim
17431749
ritsOverThreshold = new HashMap<String, RegionState>();
17441750
}
17451751
ritsOverThreshold.put(state.getRegion().getEncodedName(), state);
1746-
totalRITsTwiceThreshold += (ritTime > (ritThreshold * 2)) ? 1 : 0;
1752+
totalRITsTwiceThreshold += (ritTime > (ritThreshold * 2L)) ? 1 : 0;
17471753
}
17481754
if (oldestRITTime < ritTime) {
17491755
oldestRITTime = ritTime;

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionInTransitionTracker.java

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,15 @@
1919

2020
import java.util.ArrayList;
2121
import java.util.List;
22+
import java.util.concurrent.ConcurrentHashMap;
2223
import java.util.concurrent.ConcurrentSkipListMap;
24+
import java.util.function.Consumer;
2325
import org.apache.hadoop.hbase.TableName;
2426
import org.apache.hadoop.hbase.client.RegionInfo;
2527
import org.apache.hadoop.hbase.client.TableState;
2628
import org.apache.hadoop.hbase.master.RegionState;
2729
import org.apache.hadoop.hbase.master.TableStateManager;
30+
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
2831
import org.apache.yetus.audience.InterfaceAudience;
2932
import org.slf4j.Logger;
3033
import org.slf4j.LoggerFactory;
@@ -43,8 +46,11 @@ public class RegionInTransitionTracker {
4346

4447
private final ConcurrentSkipListMap<RegionInfo, RegionStateNode> regionInTransition =
4548
new ConcurrentSkipListMap<>(RegionInfo.COMPARATOR);
49+
private final ConcurrentHashMap<RegionInfo, Long> regionEnterTimestamp =
50+
new ConcurrentHashMap<>();
4651

4752
private TableStateManager tableStateManager;
53+
private Consumer<Long> ritDurationConsumer;
4854

4955
public boolean isRegionInTransition(final RegionInfo regionInfo) {
5056
return regionInTransition.containsKey(regionInfo);
@@ -131,15 +137,29 @@ public void handleRegionDelete(RegionInfo regionInfo) {
131137
}
132138

133139
private boolean addRegionInTransition(final RegionStateNode regionStateNode) {
134-
return regionInTransition.putIfAbsent(regionStateNode.getRegionInfo(), regionStateNode) == null;
140+
boolean added =
141+
regionInTransition.putIfAbsent(regionStateNode.getRegionInfo(), regionStateNode) == null;
142+
if (added) {
143+
regionEnterTimestamp.putIfAbsent(regionStateNode.getRegionInfo(),
144+
EnvironmentEdgeManager.currentTime());
145+
}
146+
return added;
135147
}
136148

137149
private boolean removeRegionInTransition(final RegionInfo regionInfo) {
138-
return regionInTransition.remove(regionInfo) != null;
150+
boolean removed = regionInTransition.remove(regionInfo) != null;
151+
if (removed) {
152+
Long enter = regionEnterTimestamp.remove(regionInfo);
153+
if (enter != null && ritDurationConsumer != null) {
154+
ritDurationConsumer.accept(EnvironmentEdgeManager.currentTime() - enter);
155+
}
156+
}
157+
return removed;
139158
}
140159

141160
public void stop() {
142161
regionInTransition.clear();
162+
regionEnterTimestamp.clear();
143163
}
144164

145165
public boolean hasRegionsInTransition() {
@@ -153,4 +173,8 @@ public List<RegionStateNode> getRegionsInTransition() {
153173
public void setTableStateManager(TableStateManager tableStateManager) {
154174
this.tableStateManager = tableStateManager;
155175
}
176+
177+
public void setRitDurationConsumer(Consumer<Long> ritDurationConsumer) {
178+
this.ritDurationConsumer = ritDurationConsumer;
179+
}
156180
}
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.master;
19+
20+
import static org.junit.jupiter.api.Assertions.assertEquals;
21+
22+
import org.apache.hadoop.conf.Configuration;
23+
import org.apache.hadoop.hbase.CompatibilityFactory;
24+
import org.apache.hadoop.hbase.HBaseTestingUtil;
25+
import org.apache.hadoop.hbase.HConstants;
26+
import org.apache.hadoop.hbase.ServerName;
27+
import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
28+
import org.apache.hadoop.hbase.TableName;
29+
import org.apache.hadoop.hbase.client.Put;
30+
import org.apache.hadoop.hbase.client.RegionInfo;
31+
import org.apache.hadoop.hbase.client.Table;
32+
import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
33+
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
34+
import org.apache.hadoop.hbase.test.MetricsAssertHelper;
35+
import org.apache.hadoop.hbase.testclassification.MasterTests;
36+
import org.apache.hadoop.hbase.testclassification.MediumTests;
37+
import org.apache.hadoop.hbase.util.Bytes;
38+
import org.apache.hadoop.hbase.util.TableDescriptorChecker;
39+
import org.junit.jupiter.api.AfterAll;
40+
import org.junit.jupiter.api.BeforeAll;
41+
import org.junit.jupiter.api.BeforeEach;
42+
import org.junit.jupiter.api.Tag;
43+
import org.junit.jupiter.api.Test;
44+
import org.junit.jupiter.api.TestInfo;
45+
import org.slf4j.Logger;
46+
import org.slf4j.LoggerFactory;
47+
48+
@Tag(MasterTests.TAG)
49+
@Tag(MediumTests.TAG)
50+
public class TestAssignmentManagerRitDurationMetrics {
51+
52+
private static final Logger LOG =
53+
LoggerFactory.getLogger(TestAssignmentManagerRitDurationMetrics.class);
54+
55+
private static final MetricsAssertHelper METRICS_HELPER =
56+
CompatibilityFactory.getInstance(MetricsAssertHelper.class);
57+
58+
private static SingleProcessHBaseCluster CLUSTER;
59+
private static HMaster MASTER;
60+
private static HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
61+
private static final int MSG_INTERVAL = 1000;
62+
63+
private String methodName;
64+
65+
@BeforeAll
66+
public static void startCluster() throws Exception {
67+
LOG.info("Starting cluster");
68+
Configuration conf = TEST_UTIL.getConfiguration();
69+
70+
// Enable sanity check for coprocessor, so that region reopen fails on the RS
71+
conf.setBoolean(TableDescriptorChecker.TABLE_SANITY_CHECKS, true);
72+
// set RIT stuck warning threshold to a small value
73+
conf.setInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 20);
74+
// set msgInterval to 1 second
75+
conf.setInt("hbase.regionserver.msginterval", MSG_INTERVAL);
76+
// set tablesOnMaster to none
77+
conf.set("hbase.balancer.tablesOnMaster", "none");
78+
// set client sync wait timeout to 5sec
79+
conf.setInt("hbase.client.sync.wait.timeout.msec", 5000);
80+
conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 1);
81+
conf.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 2500);
82+
// set a small interval for updating rit metrics
83+
conf.setInt(AssignmentManager.RIT_CHORE_INTERVAL_MSEC_CONF_KEY, MSG_INTERVAL);
84+
// set a small assign attempts for avoiding assert when retrying. (HBASE-20533)
85+
conf.setInt(AssignmentManager.ASSIGN_MAX_ATTEMPTS, 3);
86+
// keep rs online so it can report the failed opens.
87+
conf.setBoolean(CoprocessorHost.ABORT_ON_ERROR_KEY, false);
88+
89+
TEST_UTIL.startMiniCluster(2);
90+
CLUSTER = TEST_UTIL.getHBaseCluster();
91+
MASTER = CLUSTER.getMaster();
92+
// Disable sanity check for coprocessor, so that modify table runs on the HMaster
93+
MASTER.getConfiguration().setBoolean(TableDescriptorChecker.TABLE_SANITY_CHECKS, false);
94+
}
95+
96+
@AfterAll
97+
public static void after() throws Exception {
98+
LOG.info("AFTER {} <= IS THIS NULL?", TEST_UTIL);
99+
TEST_UTIL.shutdownMiniCluster();
100+
}
101+
102+
@BeforeEach
103+
public void setUp(TestInfo testInfo) throws Exception {
104+
methodName = testInfo.getTestMethod().get().getName();
105+
}
106+
107+
@Test
108+
public void testRitDurationHistogramMetric() throws Exception {
109+
final TableName tableName = TableName.valueOf(methodName);
110+
final byte[] family = Bytes.toBytes("family");
111+
try (Table table = TEST_UTIL.createTable(tableName, family)) {
112+
final byte[] row = Bytes.toBytes("row");
113+
final byte[] qualifier = Bytes.toBytes("qualifier");
114+
final byte[] value = Bytes.toBytes("value");
115+
116+
Put put = new Put(row);
117+
put.addColumn(family, qualifier, value);
118+
table.put(put);
119+
Thread.sleep(MSG_INTERVAL * 2);
120+
121+
MetricsAssignmentManagerSource amSource =
122+
MASTER.getAssignmentManager().getAssignmentManagerMetrics().getMetricsProcSource();
123+
long ritDurationNumOps = getRitCountFromRegionStates(amSource);
124+
125+
RegionInfo regionInfo = MASTER.getAssignmentManager().getRegionStates()
126+
.getRegionsOfTable(tableName).iterator().next();
127+
ServerName current =
128+
MASTER.getAssignmentManager().getRegionStates().getRegionServerOfRegion(regionInfo);
129+
ServerName target = MASTER.getServerManager().getOnlineServersList().stream()
130+
.filter(sn -> !sn.equals(current)).findFirst()
131+
.orElseThrow(() -> new IllegalStateException("Need at least two regionservers"));
132+
133+
TEST_UTIL.getAdmin().move(regionInfo.getEncodedNameAsBytes(), target);
134+
TEST_UTIL.waitFor(10_000, () -> getRitCountFromRegionStates(amSource) > ritDurationNumOps);
135+
TEST_UTIL.waitUntilNoRegionTransitScheduled();
136+
Thread.sleep(MSG_INTERVAL * 5);
137+
assertEquals(ritDurationNumOps + 1, getRitCountFromRegionStates(amSource));
138+
}
139+
}
140+
141+
private long getRitCountFromRegionStates(MetricsAssignmentManagerSource amSource) {
142+
return METRICS_HELPER.getCounter("ritDurationNumOps", amSource);
143+
}
144+
}

0 commit comments

Comments
 (0)