Start readme, runsh and benchmark.sh

allenshen13 · allenshen13 · commit ceb8747d0128 · 2024-05-29T11:10:32.000-07:00
diff --git a/presto/README.md b/presto/README.md
@@ -0,0 +1,23 @@
+# PrestoDB
+
+Presto is a distributed SQL query engine for big data.
+- [GitHub](https://github.com/prestodb/presto)
+- [Homepage](https://prestodb.io)
+
+The benchmarks are based on Presto version `0.287`.
+
+We assume that a Presto cluster is already running. For more information, visit [Getting Started](https://prestodb.io/getting-started/).
+
+----------
+## Steps
+
+1. Download the Parquet file and upload it to an S3 bucket, e.g. `s3://your-bucket/clickbench-parquet/hits/hits.parquet`.
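+2. Create a new schema for the dataset and create the `hits` table in this new schema using the `create.sql` file (`run.sh` expects the `hive` catalog and a schema named `clickbench_parquet`). Add the following to the end of the file so that the table reads the Parquet file on S3: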
+```
+WITH (
+    format = 'PARQUET',
+    external_location = 's3a://your-bucket/clickbench-parquet/hits/'
+);
+```
+3. Connect to your Presto coordinator and run `run.sh`, which executes the queries through presto-cli.
+4. The Presto UI is one way to get detailed information about the queries, including their runtimes.
diff --git a/presto/benchmark.sh b/presto/benchmark.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+
+PRESTO_VERSION=0.287
+
+# Set the URL to download
+PRESTO_BIN=https://repo1.maven.org/maven2/com/facebook/presto/presto-server/${PRESTO_VERSION}/presto-server-${PRESTO_VERSION}.tar.gz
+
+# Update the base image OS and install wget and python
+sudo apt-get update
+sudo apt-get install -y wget python less
+
+# Download Presto and unpack it to /opt/presto
+wget --quiet ${PRESTO_BIN}
+mkdir -p /opt
+tar -xf presto-server-${PRESTO_VERSION}.tar.gz -C /opt
+rm presto-server-${PRESTO_VERSION}.tar.gz
+ln -s /opt/presto-server-${PRESTO_VERSION} /opt/presto
+
+#Load the data
+wget --no-verbose --continue 'https://datasets.clickhouse.com/hits_compatible/hits.parquet'
+
diff --git a/presto/run.sh b/presto/run.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+
+TRIES=3
+cat queries.sql | while read query; do
+    echo "{\"sql\":\"$query  option(timeoutMs=300000)\"}"| tr -d ';' > query.json
+    for i in $(seq 1 $TRIES); do
+        ./opt/presto-cli --server 127.0.0.1:8080 --schema "clickbench_parquet" --session offset_clause_enabled=true --catalog "hive" --execute "${query}"
+    done
+done