Commit aceb649

Merge pull request #4142 from ClickHouse/clickhouse-glue-connector: ClickHouse Glue Connector

2 parents ed806a5 + 8c0e857

File tree: 5 files changed, +180 −50 lines


docs/integrations/data-ingestion/aws-glue/index.md (126 additions, 48 deletions)
````diff
@@ -3,58 +3,125 @@ sidebar_label: 'Amazon Glue'
 sidebar_position: 1
 slug: /integrations/glue
 description: 'Integrate ClickHouse and Amazon Glue'
-keywords: ['clickhouse', 'amazon', 'aws', 'glue', 'migrating', 'data']
-title: 'Integrating Amazon Glue with ClickHouse'
+keywords: ['clickhouse', 'amazon', 'aws', 'glue', 'migrating', 'data', 'spark']
+title: 'Integrating Amazon Glue with ClickHouse and Spark'
 ---
 
+import Image from '@theme/IdealImage';
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
+import notebook_connections_config from '@site/static/images/integrations/data-ingestion/aws-glue/notebook-connections-config.png';
+import dependent_jars_path_option from '@site/static/images/integrations/data-ingestion/aws-glue/dependent_jars_path_option.png';
 
-# Integrating Amazon Glue with ClickHouse
+# Integrating Amazon Glue with ClickHouse and Spark
 
 [Amazon Glue](https://aws.amazon.com/glue/) is a fully managed, serverless data integration service provided by Amazon Web Services (AWS). It simplifies the process of discovering, preparing, and transforming data for analytics, machine learning, and application development.
 
-Although there is no Glue ClickHouse connector available yet, the official JDBC connector can be leveraged to connect and integrate with ClickHouse:
+## Installation {#installation}
+
+To integrate your Glue code with ClickHouse, you can use our official Spark connector in Glue in one of two ways:
+- Installing the ClickHouse Glue connector from the AWS Marketplace (recommended).
+- Manually adding the Spark connector's JARs to your Glue job.
 
 <Tabs>
-<TabItem value="Java" label="Java" default>
+<TabItem value="AWS Marketplace" label="AWS Marketplace" default>
+
+1. <h3 id="subscribe-to-the-connector">Subscribe to the Connector</h3>
+To access the connector in your account, subscribe to the ClickHouse AWS Glue Connector from AWS Marketplace.
+
+2. <h3 id="grant-required-permissions">Grant Required Permissions</h3>
+Ensure your Glue job's IAM role has the necessary permissions, as described in the minimum privileges [guide](https://docs.aws.amazon.com/glue/latest/dg/getting-started-min-privs-job.html#getting-started-min-privs-connectors).
+
+3. <h3 id="activate-the-connector">Activate the Connector & Create a Connection</h3>
+You can activate the connector and create a connection directly by clicking [this link](https://console.aws.amazon.com/gluestudio/home#/connector/add-connection?connectorName="ClickHouse%20AWS%20Glue%20Connector"&connectorType="Spark"&connectorUrl=https://709825985650.dkr.ecr.us-east-1.amazonaws.com/clickhouse/clickhouse-glue:0.1&connectorClassName="com.clickhouse.spark.ClickHouseCatalog"), which opens the Glue connection creation page with key fields pre-filled. Give the connection a name and press create (there is no need to provide the ClickHouse connection details at this stage).
+
+4. <h3 id="use-in-glue-job">Use in Glue Job</h3>
+In your Glue job, select the `Job details` tab and expand the `Advanced properties` section. Under the `Connections` section, select the connection you just created. The connector automatically injects the required JARs into the job runtime.
+
+<Image img={notebook_connections_config} size='md' alt='Glue Notebook connections config' />
+
+:::note
+The JARs used in the Glue connector are built for `Spark 3.2`, `Scala 2`, and `Python 3`. Make sure to select these versions when configuring your Glue job.
+:::
+
+</TabItem>
````
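For step 2 of the Marketplace flow, granting the role access can also be scripted. Below is a minimal sketch using boto3, assuming a role named `MyGlueJobRole`; Marketplace-hosted Glue connectors are delivered as container images in ECR, so the sketch grants ECR pull access, but the exact action list is an assumption — treat the minimum-privileges guide linked above as authoritative.

```python
import json
import boto3

iam = boto3.client("iam")

# Hypothetical inline policy: pull access to the connector image in ECR.
# Verify the required actions against the minimum-privileges guide.
policy = {
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Action": [
            "ecr:GetAuthorizationToken",
            "ecr:BatchGetImage",
            "ecr:GetDownloadUrlForLayer",
        ],
        "Resource": "*",
    }],
}

iam.put_role_policy(
    RoleName="MyGlueJobRole",  # assumption: your Glue job's IAM role
    PolicyName="clickhouse-glue-connector-ecr-access",
    PolicyDocument=json.dumps(policy),
)
```

The same hunk continues with the manual-installation tab: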
<TabItem value="Manual Installation" label="Manual Installation">
49+
To add the required jars manually, please follow the following:
50+
1. Upload the following jars to an S3 bucket - `clickhouse-jdbc-0.6.X-all.jar` and `clickhouse-spark-runtime-3.X_2.X-0.8.X.jar`.
51+
2. Make sure the Glue job has access to this bucket.
52+
3. Under the `Job details` tab, scroll down and expend the `Advanced properties` drop down, and fill the jars path in `Dependent JARs path`:
53+
54+
<Image img={dependent_jars_path_option} size='md' alt='Glue Notebook JAR path options' />
55+
56+
</TabItem>
57+
</Tabs>
58+
59+
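The manual route can likewise be scripted end to end. A sketch with boto3 follows, where the bucket name, role, script location, JAR file names, and Glue version are placeholders; `--extra-jars` is Glue's special job parameter behind the `Dependent JARs path` field.

```python
import boto3

s3 = boto3.client("s3")
glue = boto3.client("glue")

BUCKET = "my-glue-assets"  # assumption: a bucket the Glue job role can read
JARS = [
    "clickhouse-jdbc-0.6.X-all.jar",              # placeholder versions,
    "clickhouse-spark-runtime-3.X_2.X-0.8.X.jar",  # as in the docs above
]

# Step 1: upload the connector JARs to S3.
for jar in JARS:
    s3.upload_file(jar, BUCKET, f"jars/{jar}")

# Step 3: register the JARs with the job via --extra-jars.
glue.create_job(
    Name="clickhouse-example-job",
    Role="MyGlueJobRole",  # assumption: IAM role with access to BUCKET
    Command={
        "Name": "glueetl",
        "ScriptLocation": f"s3://{BUCKET}/scripts/job.py",
        "PythonVersion": "3",
    },
    GlueVersion="4.0",  # assumption: pick the version matching your JARs' Spark build
    DefaultArguments={
        "--extra-jars": ",".join(f"s3://{BUCKET}/jars/{jar}" for jar in JARS),
    },
)
```

The hunk then moves on to the examples section: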
````diff
+## Examples {#example}
+<Tabs>
+<TabItem value="Scala" label="Scala" default>
 
 ```scala
-import com.amazonaws.services.glue.util.Job
-import com.amazonaws.services.glue.util.GlueArgParser
 import com.amazonaws.services.glue.GlueContext
-import org.apache.spark.SparkContext
+import com.amazonaws.services.glue.util.GlueArgParser
+import com.amazonaws.services.glue.util.Job
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.DataFrame
+
 import scala.collection.JavaConverters._
-import com.amazonaws.services.glue.log.GlueLogger
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.functions._
 
-// Initialize Glue job
-object GlueJob {
+object ClickHouseGlueExample {
   def main(sysArgs: Array[String]) {
-    val sc: SparkContext = new SparkContext()
-    val glueContext: GlueContext = new GlueContext(sc)
-    val spark: SparkSession = glueContext.getSparkSession
-    val logger = new GlueLogger
-    import spark.implicits._
-    // @params: [JOB_NAME]
     val args = GlueArgParser.getResolvedOptions(sysArgs, Seq("JOB_NAME").toArray)
-    Job.init(args("JOB_NAME"), glueContext, args.asJava)
 
-    // JDBC connection details
-    val jdbcUrl = "jdbc:ch://{host}:{port}/{schema}"
-    val jdbcProperties = new java.util.Properties()
-    jdbcProperties.put("user", "default")
-    jdbcProperties.put("password", "*******")
-    jdbcProperties.put("driver", "com.clickhouse.jdbc.ClickHouseDriver")
-
-    // Load the table from ClickHouse
-    val df: DataFrame = spark.read.jdbc(jdbcUrl, "my_table", jdbcProperties)
-
-    // Show the Spark df, or use it for whatever you like
-    df.show()
-
-    // Commit the job
+    val sparkSession: SparkSession = SparkSession.builder
+      .config("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog")
+      .config("spark.sql.catalog.clickhouse.host", "<your-clickhouse-host>")
+      .config("spark.sql.catalog.clickhouse.protocol", "https")
+      .config("spark.sql.catalog.clickhouse.http_port", "<your-clickhouse-port>")
+      .config("spark.sql.catalog.clickhouse.user", "default")
+      .config("spark.sql.catalog.clickhouse.password", "<your-password>")
+      .config("spark.sql.catalog.clickhouse.database", "default")
+      // for ClickHouse Cloud
+      .config("spark.sql.catalog.clickhouse.option.ssl", "true")
+      .config("spark.sql.catalog.clickhouse.option.ssl_mode", "NONE")
+      .getOrCreate()
+
+    val glueContext = new GlueContext(sparkSession.sparkContext)
+    Job.init(args("JOB_NAME"), glueContext, args.asJava)
+    import sparkSession.implicits._
+
+    val url = "s3://{path_to_cell_tower_data}/cell_towers.csv.gz"
+
+    val schema = StructType(Seq(
+      StructField("radio", StringType, nullable = false),
+      StructField("mcc", IntegerType, nullable = false),
+      StructField("net", IntegerType, nullable = false),
+      StructField("area", IntegerType, nullable = false),
+      StructField("cell", LongType, nullable = false),
+      StructField("unit", IntegerType, nullable = false),
+      StructField("lon", DoubleType, nullable = false),
+      StructField("lat", DoubleType, nullable = false),
+      StructField("range", IntegerType, nullable = false),
+      StructField("samples", IntegerType, nullable = false),
+      StructField("changeable", IntegerType, nullable = false),
+      StructField("created", TimestampType, nullable = false),
+      StructField("updated", TimestampType, nullable = false),
+      StructField("averageSignal", IntegerType, nullable = false)
+    ))
+
+    val df = sparkSession.read
+      .option("header", "true")
+      .schema(schema)
+      .csv(url)
+
+    // Write to ClickHouse
+    df.writeTo("clickhouse.default.cell_towers").append()
+
+    // Read from ClickHouse
+    val dfRead = sparkSession.sql("select * from clickhouse.default.cell_towers")
     Job.commit()
   }
 }
````
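Because the connector registers ClickHouse as a Spark catalog (the `spark.sql.catalog.clickhouse` settings above), tables are addressed as `clickhouse.<database>.<table>` and can be queried with plain `spark.sql` alongside any other catalog in the session. The Python tab gets the matching treatment in the next hunks: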
````diff
@@ -70,6 +137,8 @@ from awsglue.utils import getResolvedOptions
 from pyspark.context import SparkContext
 from awsglue.context import GlueContext
 from awsglue.job import Job
+from pyspark.sql import Row
+
 
 ## @params: [JOB_NAME]
 args = getResolvedOptions(sys.argv, ['JOB_NAME'])
````
````diff
@@ -80,20 +149,29 @@ logger = glueContext.get_logger()
 spark = glueContext.spark_session
 job = Job(glueContext)
 job.init(args['JOB_NAME'], args)
-jdbc_url = "jdbc:ch://{host}:{port}/{schema}"
-query = "select * from my_table"
-# For cloud usage, please add ssl options
-df = (spark.read.format("jdbc")
-    .option("driver", 'com.clickhouse.jdbc.ClickHouseDriver')
-    .option("url", jdbc_url)
-    .option("user", 'default')
-    .option("password", '*******')
-    .option("query", query)
-    .load())
-
-logger.info("num of rows:")
-logger.info(str(df.count()))
-logger.info("Data sample:")
+
+spark.conf.set("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog")
+spark.conf.set("spark.sql.catalog.clickhouse.host", "<your-clickhouse-host>")
+spark.conf.set("spark.sql.catalog.clickhouse.protocol", "https")
+spark.conf.set("spark.sql.catalog.clickhouse.http_port", "<your-clickhouse-port>")
+spark.conf.set("spark.sql.catalog.clickhouse.user", "default")
+spark.conf.set("spark.sql.catalog.clickhouse.password", "<your-password>")
+spark.conf.set("spark.sql.catalog.clickhouse.database", "default")
+spark.conf.set("spark.clickhouse.write.format", "json")
+spark.conf.set("spark.clickhouse.read.format", "arrow")
+# for ClickHouse Cloud
+spark.conf.set("spark.sql.catalog.clickhouse.option.ssl", "true")
+spark.conf.set("spark.sql.catalog.clickhouse.option.ssl_mode", "NONE")
+
+# Create DataFrame
+data = [Row(id=11, name="John"), Row(id=12, name="Doe")]
+df = spark.createDataFrame(data)
+
+# Write DataFrame to ClickHouse
+df.writeTo("clickhouse.default.example_table").append()
+
+# Read DataFrame from ClickHouse
+df_read = spark.sql("select * from clickhouse.default.example_table")
-logger.info(str(df.take(10)))
+logger.info(str(df_read.take(10)))
 
 job.commit()
````
````diff
@@ -102,4 +180,4 @@ job.commit()
 </TabItem>
 </Tabs>
 
-For more details, please visit our [Spark & JDBC documentation](/integrations/apache-spark/spark-jdbc#read-data).
+For more details, please visit our [Spark documentation](/integrations/apache-spark).
````
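Both examples append to a table that must already exist on the ClickHouse side. As a one-time setup, the table can be created through the same catalog; the sketch below is an assumption based on the upstream Spark ClickHouse connector's documented `CREATE TABLE ... USING ClickHouse` DDL with `engine` and `order_by` table properties, using the `example_table` shape from the Python tab — verify the syntax against your connector version.

```python
# Hypothetical one-time setup: create the target table through the
# `clickhouse` catalog configured earlier.
spark.sql("""
    CREATE TABLE IF NOT EXISTS clickhouse.default.example_table (
        id   BIGINT NOT NULL,
        name STRING
    ) USING ClickHouse
    TBLPROPERTIES (
        engine = 'MergeTree()',
        order_by = 'id'
    )
""")
```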

docs/integrations/index.mdx (1 addition, 1 deletion)
````diff
@@ -205,7 +205,7 @@ We are actively compiling this list of ClickHouse integrations below, so it's no
 |Amazon Kinesis|<Kinesissvg style={{width: '3rem', height: '3rem'}} alt="Kenesis logo"/> |Data ingestion|Integration with Amazon Kinesis.|[Documentation](/integrations/clickpipes/kinesis/)|
 |Amazon MSK|<Amazonmsksvg style={{width: '3rem'}} alt="Amazon MSK logo"/> |Data ingestion|Integration with Amazon Managed Streaming for Apache Kafka (MSK).|[Documentation](/integrations/kafka/cloud/amazon-msk/)|
 |Amazon S3|<S3svg style={{width: '3rem', height: '3rem'}} alt="Amazon S3 logo"/>|Data ingestion|Import from, export to, and transform S3 data in flight with ClickHouse built-in S3 functions.|[Documentation](/integrations/data-ingestion/s3/index.md)|
-|Amazon Glue|<Image img={glue_logo} size="logo" alt="Amazon Glue logo"/>|Data ingestion|Query ClickHouse over JDBC|[Documentation](/integrations/glue)|
+|Amazon Glue|<Image img={glue_logo} size="logo" alt="Amazon Glue logo"/>|Data ingestion|Query ClickHouse over Spark using our official Glue connector|[Documentation](/integrations/glue)|
 |Apache Spark|<Sparksvg alt="Amazon Spark logo" style={{width: '3rem'}}/>|Data ingestion|Spark ClickHouse Connector is a high performance connector built on top of Spark DataSource V2.|[GitHub](https://github.com/housepower/spark-clickhouse-connector),<br/>[Documentation](/integrations/data-ingestion/apache-spark/index.md)|
 |Azure Event Hubs|<Azureeventhubssvg alt="Azure Events Hub logo" style={{width: '3rem'}}/>|Data ingestion|A data streaming platform that supports Apache Kafka's native protocol|[Website](https://azure.microsoft.com/en-gb/products/event-hubs)|
 |Azure Synapse|<Image img={azure_synapse_logo} size="logo" alt="Azure Synapse logo"/>|Data ingestion|A cloud-based analytics service for big data and data warehousing.|[Documentation](/integrations/azure-synapse)|
````

scripts/aspell-ignore/en/aspell-dict.txt (53 additions, 1 deletion)
````diff
@@ -3572,4 +3572,56 @@ zlib
 znode
 znodes
 zookeeperSessionUptime
-zstd
+zstd
+Okta
+specificities
+reproducibility
+CertManager
+Istio
+LogHouse
+Tailscale
+Thanos
+ReplacingReplicatedMergeTree
+ReplacingSharedMergeTree
+SharedMergeTree
+VersionedCollapsing
+subpath
+AICPA
+restartable
+sumArray
+sumForEach
+argMaxIf
+groupArrayResample
+downsampled
+uniqArrayIf
+minSimpleState
+sumArray
+avgMerge
+avgMergeState
+timeslot
+timeslots
+groupArrayDistinct
+avgMap
+avgState
+avgIf
+quantilesTiming
+quantilesTimingIf
+quantilesTimingArrayIf
+downvotes
+sumSimpleState
+upvotes
+uniqArray
+avgResample
+countResample
+avgMerge
+avgState
+argMinIf
+minSimpleState
+maxSimpleState
+TimescaleDB
+columnstore
+TiDB
+resync
+resynchronization
+Sackmann's
+JARs
````
Two binary image files added under static/images/integrations/data-ingestion/aws-glue/ — dependent_jars_path_option.png and notebook-connections-config.png (66.6 KB and 74.1 KB).
