This repository was archived by the owner on Jan 9, 2020. It is now read-only.
forked from apache/spark
-
Notifications
You must be signed in to change notification settings - Fork 117
java.net.SocketTimeoutException: timeout #465
Copy link
Copy link
Open
Description
Sometimes, submission fails with the following error:
2017-08-24T09:39:42.725176510+08:00 2017-08-24 01:39:42 INFO Client:54 -
2017-08-24T09:39:53.322499130+08:00 Exception in thread "main" io.fabric8.kubernetes.client.KubernetesClientException: An error has occurred.
2017-08-24T09:39:53.322644244+08:00 at io.fabric8.kubernetes.client.KubernetesClientException.launderThrowable(KubernetesClientException.java:61)
2017-08-24T09:39:53.322659443+08:00 at io.fabric8.kubernetes.client.KubernetesClientException.launderThrowable(KubernetesClientException.java:52)
2017-08-24T09:39:53.322731405+08:00 at io.fabric8.kubernetes.client.dsl.base.BaseOperation.list(BaseOperation.java:577)
2017-08-24T09:39:53.322736250+08:00 at io.fabric8.kubernetes.client.dsl.internal.WatchConnectionManager.<init>(WatchConnectionManager.java:84)
2017-08-24T09:39:53.322745350+08:00 at io.fabric8.kubernetes.client.dsl.base.BaseOperation.watch(BaseOperation.java:684)
2017-08-24T09:39:53.322772625+08:00 at io.fabric8.kubernetes.client.dsl.base.BaseOperation.watch(BaseOperation.java:672)
2017-08-24T09:39:53.322800199+08:00 at io.fabric8.kubernetes.client.dsl.base.BaseOperation.watch(BaseOperation.java:70)
2017-08-24T09:39:53.322821586+08:00 at org.apache.spark.deploy.kubernetes.submit.Client$$anonfun$run$10.apply(Client.scala:243)
2017-08-24T09:39:53.322844464+08:00 at org.apache.spark.deploy.kubernetes.submit.Client$$anonfun$run$10.apply(Client.scala:243)
2017-08-24T09:39:53.322871076+08:00 at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2493)
2017-08-24T09:39:53.322893440+08:00 at org.apache.spark.deploy.kubernetes.submit.Client.run(Client.scala:243)
2017-08-24T09:39:53.322922464+08:00 at org.apache.spark.deploy.kubernetes.submit.Client$$anonfun$run$13.apply(Client.scala:352)
2017-08-24T09:39:53.322946539+08:00 at org.apache.spark.deploy.kubernetes.submit.Client$$anonfun$run$13.apply(Client.scala:332)
2017-08-24T09:39:53.322970133+08:00 at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2494)
2017-08-24T09:39:53.322994373+08:00 at org.apache.spark.deploy.kubernetes.submit.Client$.run(Client.scala:332)
2017-08-24T09:39:53.323018669+08:00 at org.apache.spark.deploy.kubernetes.submit.Client$.main(Client.scala:294)
2017-08-24T09:39:53.323041296+08:00 at org.apache.spark.deploy.kubernetes.submit.Client.main(Client.scala)
2017-08-24T09:39:53.323065389+08:00 at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
2017-08-24T09:39:53.323090359+08:00 at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
2017-08-24T09:39:53.323114825+08:00 at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
2017-08-24T09:39:53.323138422+08:00 at java.lang.reflect.Method.invoke(Method.java:498)
2017-08-24T09:39:53.323176331+08:00 at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:761)
2017-08-24T09:39:53.323191406+08:00 at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:188)
2017-08-24T09:39:53.323214965+08:00 at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:213)
2017-08-24T09:39:53.323235089+08:00 at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:127)
2017-08-24T09:39:53.323254232+08:00 at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
2017-08-24T09:39:53.323466783+08:00 Caused by: java.net.SocketTimeoutException: timeout
2017-08-24T09:39:53.323483134+08:00 at okio.Okio$4.newTimeoutException(Okio.java:227)
2017-08-24T09:39:53.323506530+08:00 at okio.AsyncTimeout.exit(AsyncTimeout.java:284)
2017-08-24T09:39:53.323528792+08:00 at okio.AsyncTimeout$2.read(AsyncTimeout.java:240)
2017-08-24T09:39:53.323554349+08:00 at okio.RealBufferedSource.indexOf(RealBufferedSource.java:325)
2017-08-24T09:39:53.323579129+08:00 at okio.RealBufferedSource.indexOf(RealBufferedSource.java:314)
2017-08-24T09:39:53.323603403+08:00 at okio.RealBufferedSource.readUtf8LineStrict(RealBufferedSource.java:210)
2017-08-24T09:39:53.323626480+08:00 at okhttp3.internal.http1.Http1Codec.readResponseHeaders(Http1Codec.java:189)
2017-08-24T09:39:53.323666590+08:00 at okhttp3.internal.http.CallServerInterceptor.intercept(CallServerInterceptor.java:67)
2017-08-24T09:39:53.323689412+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
2017-08-24T09:39:53.323713552+08:00 at okhttp3.internal.connection.ConnectInterceptor.intercept(ConnectInterceptor.java:45)
2017-08-24T09:39:53.323736991+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
2017-08-24T09:39:53.323763306+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
2017-08-24T09:39:53.323793784+08:00 at okhttp3.internal.cache.CacheInterceptor.intercept(CacheInterceptor.java:93)
2017-08-24T09:39:53.323818061+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
2017-08-24T09:39:53.323842487+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
2017-08-24T09:39:53.323865645+08:00 at okhttp3.internal.http.BridgeInterceptor.intercept(BridgeInterceptor.java:93)
2017-08-24T09:39:53.323899622+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
2017-08-24T09:39:53.323914698+08:00 at okhttp3.internal.http.RetryAndFollowUpInterceptor.intercept(RetryAndFollowUpInterceptor.java:120)
2017-08-24T09:39:53.323936594+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
2017-08-24T09:39:53.323958773+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
2017-08-24T09:39:53.323981192+08:00 at io.fabric8.kubernetes.client.utils.HttpClientUtils$2.intercept(HttpClientUtils.java:93)
2017-08-24T09:39:53.324006089+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
2017-08-24T09:39:53.324026570+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
2017-08-24T09:39:53.324053485+08:00 at okhttp3.RealCall.getResponseWithInterceptorChain(RealCall.java:179)
2017-08-24T09:39:53.324078055+08:00 at okhttp3.RealCall.execute(RealCall.java:63)
2017-08-24T09:39:53.324101270+08:00 at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:239)
2017-08-24T09:39:53.324122713+08:00 at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:234)
2017-08-24T09:39:53.324159784+08:00 at io.fabric8.kubernetes.client.dsl.base.BaseOperation.list(BaseOperation.java:575)
2017-08-24T09:39:53.324185947+08:00 ... 23 more
2017-08-24T09:39:53.324344698+08:00 Caused by: java.net.SocketException: Socket closed
2017-08-24T09:39:53.324363440+08:00 at java.net.SocketInputStream.read(SocketInputStream.java:204)
2017-08-24T09:39:53.324388015+08:00 at java.net.SocketInputStream.read(SocketInputStream.java:141)
2017-08-24T09:39:53.324408854+08:00 at okio.Okio$2.read(Okio.java:138)
2017-08-24T09:39:53.324430997+08:00 at okio.AsyncTimeout$2.read(AsyncTimeout.java:236)
2017-08-24T09:39:53.324454744+08:00 ... 48 more
And sometimes, the driver fails with the following error:
2017-08-27 22:08:01 ERROR SparkContext: Error initializing SparkContext.
io.fabric8.kubernetes.client.KubernetesClientException: An error has occurred.
at io.fabric8.kubernetes.client.KubernetesClientException.launderThrowable(KubernetesClientException.java:61)
at io.fabric8.kubernetes.client.KubernetesClientException.launderThrowable(KubernetesClientException.java:52)
at io.fabric8.kubernetes.client.dsl.base.BaseOperation.list(BaseOperation.java:577)
at io.fabric8.kubernetes.client.dsl.internal.WatchConnectionManager.<init>(WatchConnectionManager.java:84)
at io.fabric8.kubernetes.client.dsl.base.BaseOperation.watch(BaseOperation.java:684)
at io.fabric8.kubernetes.client.dsl.base.BaseOperation.watch(BaseOperation.java:672)
at io.fabric8.kubernetes.client.dsl.base.BaseOperation.watch(BaseOperation.java:70)
at org.apache.spark.scheduler.cluster.kubernetes.KubernetesClusterSchedulerBackend.start(KubernetesClusterSchedulerBackend.scala:232)
at org.apache.spark.scheduler.TaskSchedulerImpl.start(TaskSchedulerImpl.scala:156)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:509)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
Caused by: java.net.SocketTimeoutException: timeout
at okio.Okio$4.newTimeoutException(Okio.java:227)
at okio.AsyncTimeout.exit(AsyncTimeout.java:284)
at okio.AsyncTimeout$2.read(AsyncTimeout.java:240)
at okio.RealBufferedSource.indexOf(RealBufferedSource.java:325)
at okio.RealBufferedSource.indexOf(RealBufferedSource.java:314)
at okio.RealBufferedSource.readUtf8LineStrict(RealBufferedSource.java:210)
at okhttp3.internal.http1.Http1Codec.readResponseHeaders(Http1Codec.java:189)
at okhttp3.internal.http.CallServerInterceptor.intercept(CallServerInterceptor.java:67)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
at okhttp3.internal.connection.ConnectInterceptor.intercept(ConnectInterceptor.java:45)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
at okhttp3.internal.cache.CacheInterceptor.intercept(CacheInterceptor.java:93)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
at okhttp3.internal.http.BridgeInterceptor.intercept(BridgeInterceptor.java:93)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
at okhttp3.internal.http.RetryAndFollowUpInterceptor.intercept(RetryAndFollowUpInterceptor.java:120)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
at io.fabric8.kubernetes.client.utils.HttpClientUtils$2.intercept(HttpClientUtils.java:93)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
at okhttp3.RealCall.getResponseWithInterceptorChain(RealCall.java:179)
at okhttp3.RealCall.execute(RealCall.java:63)
at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:239)
at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:234)
at io.fabric8.kubernetes.client.dsl.base.BaseOperation.list(BaseOperation.java:575)
... 12 more
Caused by: java.net.SocketException: Socket closed
at java.net.SocketInputStream.read(SocketInputStream.java:204)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at sun.security.ssl.InputRecord.readFully(InputRecord.java:465)
at sun.security.ssl.InputRecord.read(InputRecord.java:503)
at sun.security.ssl.SSLSocketImpl.readRecord(SSLSocketImpl.java:973)
at sun.security.ssl.SSLSocketImpl.readDataRecord(SSLSocketImpl.java:930)
at sun.security.ssl.AppInputStream.read(AppInputStream.java:105)
at okio.Okio$2.read(Okio.java:138)
at okio.AsyncTimeout$2.read(AsyncTimeout.java:236)
... 37 more
2017-08-27 22:08:01 WARN MetricsSystem: Stopping a MetricsSystem that is not running
It is obvious that the Kubernetes client
hit a read timeout, but there is no way to set that parameter.
I guess we can try to add some parameters in the function createKubernetesClient,
shown below:
/**
 * Builds a [[KubernetesClient]] for the given API server, taking credentials from
 * `sparkConf` keys that live under `kubernetesAuthConfPrefix`.
 *
 * An OAuth token may be given either as a file or as a literal value, but not both;
 * `OptionRequirements.requireNandDefined` is called with both options to enforce that.
 *
 * @param master URL of the Kubernetes API server
 * @param namespace namespace to set on the client configuration, if any
 * @param kubernetesAuthConfPrefix prefix under which the auth-related keys are looked up
 * @param sparkConf configuration the auth-related keys are read from
 * @param maybeServiceAccountToken token file used when no token file key is set in `sparkConf`
 * @param maybeServiceAccountCaCert CA cert file used when no CA cert key is set in `sparkConf`
 * @return a `DefaultKubernetesClient` whose HTTP client uses a dedicated dispatcher
 */
def createKubernetesClient(
    master: String,
    namespace: Option[String],
    kubernetesAuthConfPrefix: String,
    sparkConf: SparkConf,
    maybeServiceAccountToken: Option[File],
    maybeServiceAccountCaCert: Option[File]): KubernetesClient = {
  val tokenFileKey = s"$kubernetesAuthConfPrefix.$OAUTH_TOKEN_FILE_CONF_SUFFIX"
  val tokenValueKey = s"$kubernetesAuthConfPrefix.$OAUTH_TOKEN_CONF_SUFFIX"

  // An explicitly configured token file wins over the service-account token file.
  val tokenFile = sparkConf.getOption(tokenFileKey) match {
    case Some(path) => Some(new File(path))
    case None => maybeServiceAccountToken
  }
  val tokenValue = sparkConf.getOption(tokenValueKey)
  OptionRequirements.requireNandDefined(
    tokenFile,
    tokenValue,
    s"Cannot specify OAuth token through both a file $tokenFileKey and a" +
      s" value $tokenValueKey.")

  // An explicitly configured CA cert path wins over the service-account CA cert.
  val caCertPath = sparkConf.getOption(
      s"$kubernetesAuthConfPrefix.$CA_CERT_FILE_CONF_SUFFIX") match {
    case configured @ Some(_) => configured
    case None => maybeServiceAccountCaCert.map(cert => cert.getAbsolutePath)
  }
  val clientKeyPath = sparkConf.getOption(
    s"$kubernetesAuthConfPrefix.$CLIENT_KEY_FILE_CONF_SUFFIX")
  val clientCertPath = sparkConf.getOption(
    s"$kubernetesAuthConfPrefix.$CLIENT_CERT_FILE_CONF_SUFFIX")

  // Requests are dispatched on a daemon cached thread pool named "kubernetes-dispatcher".
  val requestDispatcher = new Dispatcher(
    ThreadUtils.newDaemonCachedThreadPool("kubernetes-dispatcher"))

  // NOTE(review): `withOption` is defined outside this snippet; presumably it applies the
  // given function only when the option is defined -- confirm against its definition.
  val baseBuilder = new ConfigBuilder()
    .withApiVersion("v1")
    .withMasterUrl(master)
    .withWebsocketPingInterval(0)
  val withTokenValue = baseBuilder.withOption(tokenValue) {
    (tok, builder) => builder.withOauthToken(tok)
  }
  val withTokenFile = withTokenValue.withOption(tokenFile) {
    (tokFile, builder) => builder.withOauthToken(Files.toString(tokFile, Charsets.UTF_8))
  }
  val withCaCert = withTokenFile.withOption(caCertPath) {
    (path, builder) => builder.withCaCertFile(path)
  }
  val withClientKey = withCaCert.withOption(clientKeyPath) {
    (path, builder) => builder.withClientKeyFile(path)
  }
  val withClientCert = withClientKey.withOption(clientCertPath) {
    (path, builder) => builder.withClientCertFile(path)
  }
  val withNamespace = withClientCert.withOption(namespace) {
    (ns, builder) => builder.withNamespace(ns)
  }
  val clientConfig = withNamespace.build()

  // Start from the stock HTTP client for this config and swap in our dispatcher.
  val dispatchingHttpClient = HttpClientUtils.createHttpClient(clientConfig)
    .newBuilder()
    .dispatcher(requestDispatcher)
    .build()
  new DefaultKubernetesClient(dispatchingHttpClient, clientConfig)
}
This means adding KUBERNETES_REQUEST_TIMEOUT_SYSTEM_PROPERTY and KUBERNETES_CONNECTION_TIMEOUT_SYSTEM_PROPERTY for this problem, along with KUBERNETES_WATCH_RECONNECT_INTERVAL_SYSTEM_PROPERTY
and KUBERNETES_WATCH_RECONNECT_LIMIT_SYSTEM_PROPERTY
for issue 428.
Metadata
Metadata
Assignees
Labels
No labels