Skip to content
This repository was archived by the owner on Jan 9, 2020. It is now read-only.

java.net.SocketTimeoutException: timeout #465

@duyanghao

Description

@duyanghao

Sometimes, the submission fails with the following error:

2017-08-24T09:39:42.725176510+08:00 2017-08-24 01:39:42 INFO Client:54 - 
2017-08-24T09:39:53.322499130+08:00 Exception in thread "main" io.fabric8.kubernetes.client.KubernetesClientException: An error has occurred.
2017-08-24T09:39:53.322644244+08:00 at io.fabric8.kubernetes.client.KubernetesClientException.launderThrowable(KubernetesClientException.java:61)
2017-08-24T09:39:53.322659443+08:00 at io.fabric8.kubernetes.client.KubernetesClientException.launderThrowable(KubernetesClientException.java:52)
2017-08-24T09:39:53.322731405+08:00 at io.fabric8.kubernetes.client.dsl.base.BaseOperation.list(BaseOperation.java:577)
2017-08-24T09:39:53.322736250+08:00 at io.fabric8.kubernetes.client.dsl.internal.WatchConnectionManager.<init>(WatchConnectionManager.java:84)
2017-08-24T09:39:53.322745350+08:00 at io.fabric8.kubernetes.client.dsl.base.BaseOperation.watch(BaseOperation.java:684)
2017-08-24T09:39:53.322772625+08:00 at io.fabric8.kubernetes.client.dsl.base.BaseOperation.watch(BaseOperation.java:672)
2017-08-24T09:39:53.322800199+08:00 at io.fabric8.kubernetes.client.dsl.base.BaseOperation.watch(BaseOperation.java:70)
2017-08-24T09:39:53.322821586+08:00 at org.apache.spark.deploy.kubernetes.submit.Client$$anonfun$run$10.apply(Client.scala:243)
2017-08-24T09:39:53.322844464+08:00 at org.apache.spark.deploy.kubernetes.submit.Client$$anonfun$run$10.apply(Client.scala:243)
2017-08-24T09:39:53.322871076+08:00 at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2493)
2017-08-24T09:39:53.322893440+08:00 at org.apache.spark.deploy.kubernetes.submit.Client.run(Client.scala:243)
2017-08-24T09:39:53.322922464+08:00 at org.apache.spark.deploy.kubernetes.submit.Client$$anonfun$run$13.apply(Client.scala:352)
2017-08-24T09:39:53.322946539+08:00 at org.apache.spark.deploy.kubernetes.submit.Client$$anonfun$run$13.apply(Client.scala:332)
2017-08-24T09:39:53.322970133+08:00 at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2494)
2017-08-24T09:39:53.322994373+08:00 at org.apache.spark.deploy.kubernetes.submit.Client$.run(Client.scala:332)
2017-08-24T09:39:53.323018669+08:00 at org.apache.spark.deploy.kubernetes.submit.Client$.main(Client.scala:294)
2017-08-24T09:39:53.323041296+08:00 at org.apache.spark.deploy.kubernetes.submit.Client.main(Client.scala)
2017-08-24T09:39:53.323065389+08:00 at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
2017-08-24T09:39:53.323090359+08:00 at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
2017-08-24T09:39:53.323114825+08:00 at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
2017-08-24T09:39:53.323138422+08:00 at java.lang.reflect.Method.invoke(Method.java:498)
2017-08-24T09:39:53.323176331+08:00 at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:761)
2017-08-24T09:39:53.323191406+08:00 at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:188)
2017-08-24T09:39:53.323214965+08:00 at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:213)
2017-08-24T09:39:53.323235089+08:00 at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:127)
2017-08-24T09:39:53.323254232+08:00 at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
2017-08-24T09:39:53.323466783+08:00 Caused by: java.net.SocketTimeoutException: timeout
2017-08-24T09:39:53.323483134+08:00 at okio.Okio$4.newTimeoutException(Okio.java:227)
2017-08-24T09:39:53.323506530+08:00 at okio.AsyncTimeout.exit(AsyncTimeout.java:284)
2017-08-24T09:39:53.323528792+08:00 at okio.AsyncTimeout$2.read(AsyncTimeout.java:240)
2017-08-24T09:39:53.323554349+08:00 at okio.RealBufferedSource.indexOf(RealBufferedSource.java:325)
2017-08-24T09:39:53.323579129+08:00 at okio.RealBufferedSource.indexOf(RealBufferedSource.java:314)
2017-08-24T09:39:53.323603403+08:00 at okio.RealBufferedSource.readUtf8LineStrict(RealBufferedSource.java:210)
2017-08-24T09:39:53.323626480+08:00 at okhttp3.internal.http1.Http1Codec.readResponseHeaders(Http1Codec.java:189)
2017-08-24T09:39:53.323666590+08:00 at okhttp3.internal.http.CallServerInterceptor.intercept(CallServerInterceptor.java:67)
2017-08-24T09:39:53.323689412+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
2017-08-24T09:39:53.323713552+08:00 at okhttp3.internal.connection.ConnectInterceptor.intercept(ConnectInterceptor.java:45)
2017-08-24T09:39:53.323736991+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
2017-08-24T09:39:53.323763306+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
2017-08-24T09:39:53.323793784+08:00 at okhttp3.internal.cache.CacheInterceptor.intercept(CacheInterceptor.java:93)
2017-08-24T09:39:53.323818061+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
2017-08-24T09:39:53.323842487+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
2017-08-24T09:39:53.323865645+08:00 at okhttp3.internal.http.BridgeInterceptor.intercept(BridgeInterceptor.java:93)
2017-08-24T09:39:53.323899622+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
2017-08-24T09:39:53.323914698+08:00 at okhttp3.internal.http.RetryAndFollowUpInterceptor.intercept(RetryAndFollowUpInterceptor.java:120)
2017-08-24T09:39:53.323936594+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
2017-08-24T09:39:53.323958773+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
2017-08-24T09:39:53.323981192+08:00 at io.fabric8.kubernetes.client.utils.HttpClientUtils$2.intercept(HttpClientUtils.java:93)
2017-08-24T09:39:53.324006089+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
2017-08-24T09:39:53.324026570+08:00 at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
2017-08-24T09:39:53.324053485+08:00 at okhttp3.RealCall.getResponseWithInterceptorChain(RealCall.java:179)
2017-08-24T09:39:53.324078055+08:00 at okhttp3.RealCall.execute(RealCall.java:63)
2017-08-24T09:39:53.324101270+08:00 at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:239)
2017-08-24T09:39:53.324122713+08:00 at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:234)
2017-08-24T09:39:53.324159784+08:00 at io.fabric8.kubernetes.client.dsl.base.BaseOperation.list(BaseOperation.java:575)
2017-08-24T09:39:53.324185947+08:00 ... 23 more
2017-08-24T09:39:53.324344698+08:00 Caused by: java.net.SocketException: Socket closed
2017-08-24T09:39:53.324363440+08:00 at java.net.SocketInputStream.read(SocketInputStream.java:204)
2017-08-24T09:39:53.324388015+08:00 at java.net.SocketInputStream.read(SocketInputStream.java:141)
2017-08-24T09:39:53.324408854+08:00 at okio.Okio$2.read(Okio.java:138)
2017-08-24T09:39:53.324430997+08:00 at okio.AsyncTimeout$2.read(AsyncTimeout.java:236)
2017-08-24T09:39:53.324454744+08:00 ... 48 more

And sometimes, the driver fails with the following error:

2017-08-27 22:08:01 ERROR SparkContext: Error initializing SparkContext.
io.fabric8.kubernetes.client.KubernetesClientException: An error has occurred.
   at io.fabric8.kubernetes.client.KubernetesClientException.launderThrowable(KubernetesClientException.java:61)
   at io.fabric8.kubernetes.client.KubernetesClientException.launderThrowable(KubernetesClientException.java:52)
   at io.fabric8.kubernetes.client.dsl.base.BaseOperation.list(BaseOperation.java:577)
   at io.fabric8.kubernetes.client.dsl.internal.WatchConnectionManager.<init>(WatchConnectionManager.java:84)
   at io.fabric8.kubernetes.client.dsl.base.BaseOperation.watch(BaseOperation.java:684)
   at io.fabric8.kubernetes.client.dsl.base.BaseOperation.watch(BaseOperation.java:672)
   at io.fabric8.kubernetes.client.dsl.base.BaseOperation.watch(BaseOperation.java:70)
   at org.apache.spark.scheduler.cluster.kubernetes.KubernetesClusterSchedulerBackend.start(KubernetesClusterSchedulerBackend.scala:232)
   at org.apache.spark.scheduler.TaskSchedulerImpl.start(TaskSchedulerImpl.scala:156)
   at org.apache.spark.SparkContext.<init>(SparkContext.scala:509)
   at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)

Caused by: java.net.SocketTimeoutException: timeout
   at okio.Okio$4.newTimeoutException(Okio.java:227)
   at okio.AsyncTimeout.exit(AsyncTimeout.java:284)
   at okio.AsyncTimeout$2.read(AsyncTimeout.java:240)
   at okio.RealBufferedSource.indexOf(RealBufferedSource.java:325)
   at okio.RealBufferedSource.indexOf(RealBufferedSource.java:314)
   at okio.RealBufferedSource.readUtf8LineStrict(RealBufferedSource.java:210)
   at okhttp3.internal.http1.Http1Codec.readResponseHeaders(Http1Codec.java:189)
   at okhttp3.internal.http.CallServerInterceptor.intercept(CallServerInterceptor.java:67)
   at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
   at okhttp3.internal.connection.ConnectInterceptor.intercept(ConnectInterceptor.java:45)
   at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
   at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
   at okhttp3.internal.cache.CacheInterceptor.intercept(CacheInterceptor.java:93)
   at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
   at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
   at okhttp3.internal.http.BridgeInterceptor.intercept(BridgeInterceptor.java:93)
   at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
   at okhttp3.internal.http.RetryAndFollowUpInterceptor.intercept(RetryAndFollowUpInterceptor.java:120)
   at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
   at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
   at io.fabric8.kubernetes.client.utils.HttpClientUtils$2.intercept(HttpClientUtils.java:93)
   at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:92)
   at okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:67)
   at okhttp3.RealCall.getResponseWithInterceptorChain(RealCall.java:179)
   at okhttp3.RealCall.execute(RealCall.java:63)
   at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:239)
   at io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:234)
   at io.fabric8.kubernetes.client.dsl.base.BaseOperation.list(BaseOperation.java:575)
   ... 12 more
Caused by: java.net.SocketException: Socket closed
   at java.net.SocketInputStream.read(SocketInputStream.java:204)
   at java.net.SocketInputStream.read(SocketInputStream.java:141)
   at sun.security.ssl.InputRecord.readFully(InputRecord.java:465)
   at sun.security.ssl.InputRecord.read(InputRecord.java:503)
   at sun.security.ssl.SSLSocketImpl.readRecord(SSLSocketImpl.java:973)
   at sun.security.ssl.SSLSocketImpl.readDataRecord(SSLSocketImpl.java:930)
   at sun.security.ssl.AppInputStream.read(AppInputStream.java:105)
   at okio.Okio$2.read(Okio.java:138)
   at okio.AsyncTimeout$2.read(AsyncTimeout.java:236)
   ... 37 more
2017-08-27 22:08:01 WARN MetricsSystem: Stopping a MetricsSystem that is not running

It is obvious that the Kubernetes client hit a read timeout, but there is no way to set that parameter.

I guess we can try to add some parameters in the function createKubernetesClient, shown below:

/**
 * Builds a [[KubernetesClient]] that talks to the API server at `master`.
 *
 * Authentication settings are looked up in `sparkConf` under keys of the form
 * `kubernetesAuthConfPrefix.<suffix>`. The OAuth token file and the CA certificate
 * fall back to the supplied service-account files when the corresponding key is
 * not set.
 *
 * @param master URL of the Kubernetes API server, passed to `withMasterUrl`.
 * @param namespace namespace to set on the client configuration, if defined.
 * @param kubernetesAuthConfPrefix prefix prepended to every authentication key.
 * @param sparkConf configuration the authentication keys are read from.
 * @param maybeServiceAccountToken token file used when no token file is configured.
 * @param maybeServiceAccountCaCert CA certificate used when no CA cert file is configured.
 * @return a `DefaultKubernetesClient` whose HTTP client uses a dedicated dispatcher.
 */
def createKubernetesClient(
      master: String,
      namespace: Option[String],
      kubernetesAuthConfPrefix: String,
      sparkConf: SparkConf,
      maybeServiceAccountToken: Option[File],
      maybeServiceAccountCaCert: Option[File]): KubernetesClient = {
    val tokenFileKey = s"$kubernetesAuthConfPrefix.$OAUTH_TOKEN_FILE_CONF_SUFFIX"
    val tokenValueKey = s"$kubernetesAuthConfPrefix.$OAUTH_TOKEN_CONF_SUFFIX"

    // An explicitly configured token file takes precedence over the service-account one.
    val tokenFile = sparkConf.getOption(tokenFileKey) match {
      case Some(path) => Some(new File(path))
      case None => maybeServiceAccountToken
    }
    val tokenValue = sparkConf.getOption(tokenValueKey)
    // The token may come from a file or from an inline value; the message below is
    // what OptionRequirements.requireNandDefined is given for the case of both.
    OptionRequirements.requireNandDefined(
        tokenFile,
        tokenValue,
        s"Cannot specify OAuth token through both a file $tokenFileKey and a value $tokenValueKey.")

    // The service-account path is only resolved when no CA cert file is configured.
    val caCertPath = sparkConf.getOption(s"$kubernetesAuthConfPrefix.$CA_CERT_FILE_CONF_SUFFIX") match {
      case configured @ Some(_) => configured
      case None => maybeServiceAccountCaCert.map(cert => cert.getAbsolutePath)
    }
    val clientKeyPath =
      sparkConf.getOption(s"$kubernetesAuthConfPrefix.$CLIENT_KEY_FILE_CONF_SUFFIX")
    val clientCertPath =
      sparkConf.getOption(s"$kubernetesAuthConfPrefix.$CLIENT_CERT_FILE_CONF_SUFFIX")

    // Requests are dispatched on a daemon cached thread pool named "kubernetes-dispatcher".
    val dispatcherPool = ThreadUtils.newDaemonCachedThreadPool("kubernetes-dispatcher")
    val dispatcher = new Dispatcher(dispatcherPool)

    // Each optional setting is applied to the builder only when it is defined.
    val config = new ConfigBuilder()
        .withApiVersion("v1")
        .withMasterUrl(master)
        .withWebsocketPingInterval(0)
        .withOption(tokenValue) { (value, builder) => builder.withOauthToken(value) }
        .withOption(tokenFile) { (tokenSource, builder) =>
          builder.withOauthToken(Files.toString(tokenSource, Charsets.UTF_8))
        }
        .withOption(caCertPath) { (path, builder) => builder.withCaCertFile(path) }
        .withOption(clientKeyPath) { (path, builder) => builder.withClientKeyFile(path) }
        .withOption(clientCertPath) { (path, builder) => builder.withClientCertFile(path) }
        .withOption(namespace) { (name, builder) => builder.withNamespace(name) }
        .build()

    // Start from the client fabric8 derives from the config, then swap in our dispatcher.
    val httpClient = HttpClientUtils.createHttpClient(config)
      .newBuilder()
      .dispatcher(dispatcher)
      .build()
    new DefaultKubernetesClient(httpClient, config)
  }

I suggest adding KUBERNETES_REQUEST_TIMEOUT_SYSTEM_PROPERTY and KUBERNETES_CONNECTION_TIMEOUT_SYSTEM_PROPERTY for this problem, and also adding KUBERNETES_WATCH_RECONNECT_INTERVAL_SYSTEM_PROPERTY
and KUBERNETES_WATCH_RECONNECT_LIMIT_SYSTEM_PROPERTY for issue 428.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions