Skip to content

Commit 7ebc490

Browse files
committed
Merge branch 'dev-0.2.3-log-collector' of github.com:WeDataSphere/Streamis into dev-0.2.3-log-collector
2 parents 6f332d5 + 01af6cb commit 7ebc490

File tree

2 files changed

+134
-65
lines changed
  • streamis-jobmanager
    • streamis-job-manager/streamis-job-manager-service/src/main/scala/com/webank/wedatasphere/streamis/jobmanager/manager/service
    • streamis-jobmanager-server/src/main/java/com/webank/wedatasphere/streamis/jobmanager/restful/api

2 files changed

+134
-65
lines changed

streamis-jobmanager/streamis-job-manager/streamis-job-manager-service/src/main/scala/com/webank/wedatasphere/streamis/jobmanager/manager/service/TaskMonitorService.scala

Lines changed: 55 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package com.webank.wedatasphere.streamis.jobmanager.manager.service
1818
import java.util
1919
import java.util.Date
2020
import java.util.concurrent.{Future, TimeUnit}
21+
2122
import com.google.common.collect.Sets
2223
import com.webank.wedatasphere.streamis.jobmanager.launcher.JobLauncherAutoConfiguration
2324
import com.webank.wedatasphere.streamis.jobmanager.launcher.conf.JobConfKeyConstants
@@ -30,7 +31,6 @@ import com.webank.wedatasphere.streamis.jobmanager.manager.conf.JobConf
3031
import com.webank.wedatasphere.streamis.jobmanager.manager.dao.{StreamJobMapper, StreamTaskMapper}
3132
import com.webank.wedatasphere.streamis.jobmanager.manager.entity.{StreamJob, StreamTask}
3233
import com.webank.wedatasphere.streamis.jobmanager.manager.utils.StreamTaskUtils
33-
3434
import javax.annotation.{PostConstruct, PreDestroy, Resource}
3535
import org.apache.commons.lang.exception.ExceptionUtils
3636
import org.apache.linkis.common.exception.ErrorException
@@ -84,58 +84,64 @@ class TaskMonitorService extends Logging {
8484
return
8585
}
8686
streamTasks.filter(shouldMonitor).foreach { streamTask =>
87-
streamTask.setLastUpdateTime(new Date)
88-
streamTaskMapper.updateTask(streamTask)
8987
val job = streamJobMapper.getJobById(streamTask.getJobId)
90-
info(s"Try to update status of StreamJob-${job.getName}.")
91-
val retryHandler = new RetryHandler {}
92-
retryHandler.setRetryNum(3)
93-
retryHandler.setRetryMaxPeriod(2000)
94-
retryHandler.addRetryException(classOf[ErrorException])
95-
var jobInfo:JobInfo = null
96-
Utils.tryCatch {
97-
jobInfo = retryHandler.retry(refresh(streamTask, jobLaunchManager), s"Task-Monitor-${job.getName}")
98-
} { ex => {
99-
error(s"Fetch StreamJob-${job.getName} failed, maybe the Linkis cluster is wrong, please be noticed!", ex)
100-
val errorMsg = ExceptionUtils.getRootCauseMessage(ex)
101-
if (errorMsg != null && errorMsg.contains("Not exists EngineConn")) {
102-
streamTask.setStatus(JobConf.FLINK_JOB_STATUS_FAILED.getValue)
103-
streamTask.setErrDesc("Not exists EngineConn.")
104-
} else {
105-
// 连续三次还是出现异常,说明Linkis的Manager已经不能正常提供服务,告警并不再尝试获取状态,等待下次尝试
106-
val users = getAlertUsers(job)
107-
users.add(job.getCreateBy)
108-
alert(jobService.getAlertLevel(job), s"请求LinkisManager失败,Linkis集群出现异常,请关注!影响任务[${job.getName}]", users, streamTask)
109-
}
110-
}
111-
}
112-
streamTaskMapper.updateTask(streamTask)
113-
if(streamTask.getStatus == JobConf.FLINK_JOB_STATUS_FAILED.getValue) {
114-
warn(s"StreamJob-${job.getName} is failed, please be noticed.")
115-
var extraMessage = ""
116-
Option(jobInfo) match {
117-
case Some(flinkJobInfo: FlinkJobInfo) =>
118-
extraMessage = s",${flinkJobInfo.getApplicationId}"
119-
case _ =>
120-
}
121-
// Need to add restart feature if user sets the restart parameters.
122-
var alertMsg = s"Streamis 流式应用[${job.getName}${extraMessage}]已经失败, 请登陆Streamis查看应用日志."
123-
this.streamJobConfMapper.getRawConfValue(job.getId, JobConfKeyConstants.FAIL_RESTART_SWITCH.getValue) match {
124-
case "ON" =>
125-
alertMsg = s"${alertMsg} 现将自动拉起该应用"
126-
Utils.tryCatch{
127-
info(s"Start to reLaunch the StreamisJob [${job.getName}], now to submit and schedule it...")
128-
// Use submit user to start job
129-
val future: Future[String] = streamTaskService.asyncExecute(job.getId, 0L, job.getSubmitUser, true)
130-
}{
131-
case e:Exception =>
132-
warn(s"Fail to reLaunch the StreamisJob [${job.getName}]", e)
133-
}
134-
case _ =>
135-
}
88+
if(!JobConf.SUPPORTED_MANAGEMENT_JOB_TYPES.getValue.contains(job.getJobType)) {
13689
val userList = Sets.newHashSet(job.getSubmitUser, job.getCreateBy)
13790
userList.addAll(getAlertUsers(job))
91+
val alertMsg = s"Spark Streaming应用[${job.getName}]已经超过 ${Utils.msDurationToString(System.currentTimeMillis - streamTask.getLastUpdateTime.getTime)} 没有更新状态, 请及时确认应用是否正常!"
13892
alert(jobService.getAlertLevel(job), alertMsg, new util.ArrayList[String](userList), streamTask)
93+
} else {
94+
streamTask.setLastUpdateTime(new Date)
95+
streamTaskMapper.updateTask(streamTask)
96+
info(s"Try to update status of StreamJob-${job.getName}.")
97+
val retryHandler = new RetryHandler {}
98+
retryHandler.setRetryNum(3)
99+
retryHandler.setRetryMaxPeriod(2000)
100+
retryHandler.addRetryException(classOf[ErrorException])
101+
var jobInfo:JobInfo = null
102+
Utils.tryCatch {
103+
jobInfo = retryHandler.retry(refresh(streamTask, jobLaunchManager), s"Task-Monitor-${job.getName}")
104+
} { ex =>
105+
error(s"Fetch StreamJob-${job.getName} failed, maybe the Linkis cluster is wrong, please be noticed!", ex)
106+
val errorMsg = ExceptionUtils.getRootCauseMessage(ex)
107+
if (errorMsg != null && errorMsg.contains("Not exists EngineConn")) {
108+
streamTask.setStatus(JobConf.FLINK_JOB_STATUS_FAILED.getValue)
109+
streamTask.setErrDesc("Not exists EngineConn.")
110+
} else {
111+
// 连续三次还是出现异常,说明Linkis的Manager已经不能正常提供服务,告警并不再尝试获取状态,等待下次尝试
112+
val users = getAlertUsers(job)
113+
users.add(job.getCreateBy)
114+
alert(jobService.getAlertLevel(job), s"请求LinkisManager失败,Linkis集群出现异常,请关注!影响任务[${job.getName}]", users, streamTask)
115+
}
116+
}
117+
streamTaskMapper.updateTask(streamTask)
118+
if(streamTask.getStatus == JobConf.FLINK_JOB_STATUS_FAILED.getValue) {
119+
warn(s"StreamJob-${job.getName} is failed, please be noticed.")
120+
var extraMessage = ""
121+
Option(jobInfo) match {
122+
case Some(flinkJobInfo: FlinkJobInfo) =>
123+
extraMessage = s",${flinkJobInfo.getApplicationId}"
124+
case _ =>
125+
}
126+
// Need to add restart feature if user sets the restart parameters.
127+
var alertMsg = s"Streamis 流式应用[${job.getName}${extraMessage}]已经失败, 请登陆Streamis查看应用日志."
128+
this.streamJobConfMapper.getRawConfValue(job.getId, JobConfKeyConstants.FAIL_RESTART_SWITCH.getValue) match {
129+
case "ON" =>
130+
alertMsg = s"${alertMsg} 现将自动拉起该应用"
131+
Utils.tryCatch{
132+
info(s"Start to reLaunch the StreamisJob [${job.getName}], now to submit and schedule it...")
133+
// Use submit user to start job
134+
val future: Future[String] = streamTaskService.asyncExecute(job.getId, 0L, job.getSubmitUser, true)
135+
}{
136+
case e:Exception =>
137+
warn(s"Fail to reLaunch the StreamisJob [${job.getName}]", e)
138+
}
139+
case _ =>
140+
}
141+
val userList = Sets.newHashSet(job.getSubmitUser, job.getCreateBy)
142+
userList.addAll(getAlertUsers(job))
143+
alert(jobService.getAlertLevel(job), alertMsg, new util.ArrayList[String](userList), streamTask)
144+
}
139145
}
140146
}
141147
info("All StreamTasks status have updated.")

streamis-jobmanager/streamis-jobmanager-server/src/main/java/com/webank/wedatasphere/streamis/jobmanager/restful/api/JobRestfulApi.java

Lines changed: 79 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import com.webank.wedatasphere.streamis.jobmanager.exception.JobExceptionManager;
2323
import com.webank.wedatasphere.streamis.jobmanager.launcher.job.JobInfo;
2424
import com.webank.wedatasphere.streamis.jobmanager.launcher.job.manager.JobLaunchManager;
25+
import com.webank.wedatasphere.streamis.jobmanager.launcher.job.state.JobStateInfo;
2526
import com.webank.wedatasphere.streamis.jobmanager.launcher.linkis.entity.LogRequestPayload;
2627
import com.webank.wedatasphere.streamis.jobmanager.launcher.linkis.job.FlinkJobInfo;
2728
import com.webank.wedatasphere.streamis.jobmanager.manager.conf.JobConf;
@@ -51,6 +52,7 @@
5152
import javax.servlet.http.HttpServletRequest;
5253
import java.io.IOException;
5354
import java.util.*;
55+
import java.util.function.Function;
5456
import java.util.stream.Collectors;
5557

5658
@RequestMapping(path = "/streamis/streamJobManager/job")
@@ -277,6 +279,7 @@ public Message addTask(HttpServletRequest req,
277279
if(streamTask != null && JobConf.isRunning(streamTask.getStatus())) {
278280
LOG.warn("Streamis Job {} exists running task, update its status from Running to stopped at first.", jobName);
279281
streamTask.setStatus((Integer) JobConf.FLINK_JOB_STATUS_STOPPED().getValue());
282+
streamTask.setErrDesc("stopped by App's new task.");
280283
streamTaskService.updateTask(streamTask);
281284
} else if(streamTask == null) {
282285
// 这里取个巧,从该工程该用户有权限的Job中找到一个Flink的历史作业,作为这个Spark Streaming作业的jobId和jobInfo
@@ -306,21 +309,65 @@ public Message addTask(HttpServletRequest req,
306309
}
307310
streamTask.setStartTime(new Date());
308311
streamTask.setLastUpdateTime(new Date());
312+
StreamTask finalStreamTask = streamTask;
313+
return withFlinkJobInfo(jobName, streamTask.getLinkisJobInfo(), flinkJobInfo -> {
314+
flinkJobInfo.setApplicationId(appId);
315+
flinkJobInfo.setApplicationUrl(appUrl);
316+
flinkJobInfo.setName(jobName);
317+
flinkJobInfo.setStatus(JobConf.getStatusString((Integer) JobConf.FLINK_JOB_STATUS_RUNNING().getValue()));
318+
StreamTaskUtils.refreshInfo(finalStreamTask, flinkJobInfo);
319+
streamTaskService.updateTask(finalStreamTask);
320+
LOG.info("Streamis Job {} has added a new task successfully.", jobName);
321+
return Message.ok();
322+
});
323+
}
324+
325+
private Message withFlinkJobInfo(String jobName, String flinkJobInfoStr, Function<FlinkJobInfo, Message> flinkJobInfoFunction) {
309326
FlinkJobInfo flinkJobInfo;
310327
try {
311-
flinkJobInfo = DWSHttpClient.jacksonJson().readValue(streamTask.getLinkisJobInfo(), FlinkJobInfo.class);
328+
flinkJobInfo = DWSHttpClient.jacksonJson().readValue(flinkJobInfoStr, FlinkJobInfo.class);
312329
} catch (JsonProcessingException e) {
313-
LOG.error("Job {} deserialize the jobInfo from history Job failed!", jobName, e);
314-
return Message.error("Deserialize the jobInfo from history Job failed!");
315-
}
316-
flinkJobInfo.setApplicationId(appId);
317-
flinkJobInfo.setApplicationUrl(appUrl);
318-
flinkJobInfo.setName(jobName);
319-
flinkJobInfo.setStatus(JobConf.getStatusString((Integer) JobConf.FLINK_JOB_STATUS_RUNNING().getValue()));
320-
StreamTaskUtils.refreshInfo(streamTask, flinkJobInfo);
321-
streamTaskService.updateTask(streamTask);
322-
LOG.info("Streamis Job {} has added a new task successfully.", jobName);
323-
return Message.ok();
330+
LOG.error("Job {} deserialize the flinkJobInfo string to object failed!", jobName, e);
331+
return Message.error("Deserialize the flinkJobInfo string to object failed!");
332+
}
333+
return flinkJobInfoFunction.apply(flinkJobInfo);
334+
}
335+
336+
@RequestMapping(path = "/updateTask", method = RequestMethod.GET)
337+
public Message updateTask(HttpServletRequest req,
338+
@RequestParam(value = "jobName") String jobName,
339+
@RequestParam(value = "appId") String appId,
340+
@RequestParam(value = "metrics") String metrics) {
341+
String username = SecurityFilter.getLoginUsername(req);
342+
LOG.info("User {} try to update task for Streamis job {} with appId: {}.", username, jobName, appId);
343+
List<StreamJob> streamJobs = streamJobService.getJobByName(jobName);
344+
if(CollectionUtils.isEmpty(streamJobs)) {
345+
return Message.error("Not exits Streamis job " + jobName);
346+
} else if(streamJobs.size() > 1) {
347+
return Message.error("Too many Streamis Job named " + jobName + ", we cannot distinguish between them.");
348+
} else if(!"spark.jar".equals(streamJobs.get(0).getJobType())) {
349+
return Message.error("Only spark.jar Job support to update task.");
350+
}
351+
StreamTask streamTask = streamTaskService.getLatestTaskByJobId(streamJobs.get(0).getId());
352+
if(streamTask == null) {
353+
LOG.warn("Job {} is not exists running task, ignore to update its metrics.", jobName);
354+
return Message.ok("not exists running task, ignore it.");
355+
}
356+
return withFlinkJobInfo(jobName, streamTask.getLinkisJobInfo(), flinkJobInfo -> {
357+
if(!flinkJobInfo.getApplicationId().equals(appId)) {
358+
LOG.warn("Job {} with running task <appId: {}> is not equals to the request appId: {}, ignore to update its metrics.",
359+
jobName, flinkJobInfo.getApplicationId(), appId);
360+
return Message.ok("the request appId is not equals to the running task appId " + flinkJobInfo.getApplicationId());
361+
}
362+
JobStateInfo jobStateInfo = new JobStateInfo();
363+
jobStateInfo.setTimestamp(System.currentTimeMillis());
364+
jobStateInfo.setLocation(metrics);
365+
flinkJobInfo.setJobStates(new JobStateInfo[]{jobStateInfo});
366+
StreamTaskUtils.refreshInfo(streamTask, flinkJobInfo);
367+
streamTaskService.updateTask(streamTask);
368+
LOG.info("Streamis Job {} has updated the task metrics successfully.", jobName);
369+
return Message.ok();
370+
});
324371
}
325372

326373
@RequestMapping(path = "/stopTask", method = RequestMethod.GET)
@@ -345,13 +392,23 @@ public Message stopTask(HttpServletRequest req,
345392
// 如果存在正在运行的,将其停止掉
346393
StreamTask streamTask = streamTaskService.getLatestTaskByJobId(streamJobs.get(0).getId());
347394
if(streamTask != null && JobConf.isRunning(streamTask.getStatus())) {
348-
LOG.warn("Streamis Job {} is exists running task, update its status to stopped.", jobName);
349-
streamTask.setStatus((Integer) JobConf.FLINK_JOB_STATUS_STOPPED().getValue());
350-
streamTaskService.updateTask(streamTask);
395+
return withFlinkJobInfo(jobName, streamTask.getLinkisJobInfo(), flinkJobInfo -> {
396+
if(flinkJobInfo.getApplicationId().equals(appId)) {
397+
LOG.warn("Streamis Job {} is exists running task, update its status to stopped.", jobName);
398+
streamTask.setStatus((Integer) JobConf.FLINK_JOB_STATUS_STOPPED().getValue());
399+
streamTask.setErrDesc("stopped by App itself.");
400+
streamTaskService.updateTask(streamTask);
401+
return Message.ok();
402+
} else {
403+
LOG.warn("Job {} with running task <appId: {}> is not equals to the request appId: {}, ignore to stop it.",
404+
jobName, flinkJobInfo.getApplicationId(), appId);
405+
return Message.ok("the request appId is not equals to the running task appId " + flinkJobInfo.getApplicationId());
406+
}
407+
});
351408
} else {
352409
LOG.warn("Streamis Job {} is not exists running task, ignore to stop it.", jobName);
410+
return Message.ok();
353411
}
354-
return Message.ok();
355412
}
356413

357414
@RequestMapping(path = "/progress", method = RequestMethod.GET)
@@ -411,6 +468,12 @@ public Message getLog(HttpServletRequest req,
411468
logType = StringUtils.isBlank(logType) ? "client" : logType;
412469
String username = SecurityFilter.getLoginUsername(req);
413470
StreamJob streamJob = this.streamJobService.getJobById(jobId);
471+
if(streamJob == null) {
472+
return Message.error("not exists job " + jobId);
473+
} else if(!JobConf.SUPPORTED_MANAGEMENT_JOB_TYPES().getValue().contains(streamJob.getJobType()) &&
474+
"client".equals(logType)) {
475+
return Message.error("Job " + streamJob.getName() + " is not supported to get client logs.");
476+
}
414477
if (!streamJobService.hasPermission(streamJob, username) &&
415478
!this.privilegeService.hasAccessPrivilege(req, streamJob.getProjectName())) {
416479
return Message.error("Have no permission to fetch logs from StreamJob [" + jobId + "]");

0 commit comments

Comments
 (0)