Skip to content
This repository was archived by the owner on Feb 20, 2023. It is now read-only.

Commit 7182657

Browse files
authored
Workload forecasting script (#1403)
1 parent 520e0da commit 7182657

File tree

12 files changed

+1045
-38
lines changed

12 files changed

+1045
-38
lines changed

Jenkinsfile

Lines changed: 79 additions & 36 deletions
Original file line numberDiff line numberDiff line change
// Self-driving CI: the former single 'Self-Driving End-to-End Test' stage is split into two
// independent parallel stages so workload forecasting and OU modeling can run concurrently.
// Both run in the same 'noisepage:focal' container with ptrace enabled and the shared ccache mount.
stage('Self-Driving') {
    parallel {
        stage('Workload Forecasting'){
            agent {
                docker {
                    image 'noisepage:focal'
                    args '--cap-add sys_ptrace -v /jenkins/ccache:/home/jenkins/.ccache'
                }
            }
            steps {
                sh 'echo $NODE_NAME'

                script{
                    utils = utils ?: load(utilsFileName)
                    // Forecasting only needs the release binary; self-driving test targets are not built here.
                    utils.noisePageBuild(buildType:utils.RELEASE_BUILD, isBuildTests:false)
                }

                // This scripts runs TPCC benchmark with query trace enabled. It also uses SET command to turn
                // on query trace.
                // --pattern_iter determines how many times a sequence of TPCC phases is run. Set to 3 so that
                // enough trace could be generated for training and testing.
                sh script :'''
                cd script/forecasting
                ./forecaster.py --gen_data --pattern_iter=3 --model_save_path=model.pickle --models=LSTM
                ''', label: 'Generate trace and perform training'

                sh script: 'sudo lsof -i -P -n | grep LISTEN || true', label: 'Check ports.'

                // Reload the pickled model and run inference against the trace produced above.
                sh script: '''
                cd script/forecasting
                ./forecaster.py --test_file=query_trace.csv --model_load_path=model.pickle --test_model=LSTM
                ''', label: 'Perform inference on the trained model'

                sh script: 'sudo lsof -i -P -n | grep LISTEN || true', label: 'Check ports.'
            }
            post {
                cleanup {
                    deleteDir()
                }
            }
        }
        stage('Modeling'){
            agent {
                docker {
                    image 'noisepage:focal'
                    args '--cap-add sys_ptrace -v /jenkins/ccache:/home/jenkins/.ccache'
                }
            }
            steps {
                sh 'echo $NODE_NAME'

                script{
                    utils = utils ?: load(utilsFileName)
                    // Modeling needs the self-driving test targets (ninja self_driving_test below).
                    utils.noisePageBuild(buildType:utils.RELEASE_BUILD, isBuildTests:false, isBuildSelfDrivingTests: true)
                }

                // The parameters to the mini_runners target are (arbitrarily picked to complete tests within a reasonable time / picked to exercise all OUs).
                // Specifically, the parameters chosen are:
                // - mini_runner_rows_limit=100, which sets the maximal number of rows/tuples processed to be 100 (small table)
                // - rerun=0, which skips rerun since we are not testing benchmark performance here
                // - warm_num=1, which also tests the warm up phase for the mini_runners.
                // With the current set of parameters, the input generation process will finish under 10min
                sh script :'''
                cd build/bin
                ../benchmark/mini_runners --mini_runner_rows_limit=100 --rerun=0 --warm_num=1
                ''', label: 'Mini-trainer input generation'

                sh script: 'sudo lsof -i -P -n | grep LISTEN || true', label: 'Check ports.'

                sh script: '''
                cd build
                export BUILD_ABS_PATH=`pwd`
                timeout 10m ninja self_driving_test
                ''', label: 'Running self-driving test'

                sh script: 'sudo lsof -i -P -n | grep LISTEN || true', label: 'Check ports.'
            }
            post {
                cleanup {
                    deleteDir()
                }
            }
        }
    }
}

script/forecasting/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
import sys
from pathlib import Path

# Make the shared testing utilities (e.g. util.constants used by data_loader.py) importable.
# NOTE(review): this resolves '../testing' relative to the CURRENT WORKING DIRECTORY, not this
# file's location — it only works when scripts are run with CWD == script/forecasting, as the
# Jenkins 'Workload Forecasting' stage does (it cd's there first). Confirm whether
# Path(__file__).parent was intended for location-independent imports.
sys.path.insert(0, str((Path.cwd() / '..' / 'testing').absolute()))

script/forecasting/cluster.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!/usr/bin/env python3
2+
"""
3+
This file contains cluster related codes for the forecasting query traces. QueryCluster represents query traces of
4+
multiple queries in the same cluster.
5+
6+
TODO: clustering implementation
7+
"""
8+
9+
from typing import Dict, List
10+
import numpy as np
11+
12+
13+
class QueryCluster:
    """
    Represents query traces from a single cluster. For queries in the same cluster, they will be aggregated
    into a single time-series to be used as training input for training. The time-series predicted by the model will
    then be converted to different query traces for a query cluster
    """

    def __init__(self, traces: Dict):
        """
        NOTE(ricky): I believe this per-cluster query representation will change once we have clustering component
        added. For now, it simply takes a map of time-series data for each query id.

        :param traces: Map of (id -> timeseries) for each query id
        """
        self._traces = traces
        self._aggregate()

    def _aggregate(self) -> None:
        """
        Aggregate time-series of multiple queries in the same cluster into one time-series.

        Stores the aggregated time-series at self._timeseries and the ratio map of queries in the
        same cluster at self._ratio_map.

        :raises ValueError: if the cluster is non-empty but its traces sum to zero — the per-query
            ratios would be undefined (previously this surfaced as an opaque ZeroDivisionError)
        """
        cnt_map = {}
        all_series = []
        for qid, timeseries in self._traces.items():
            cnt_map[qid] = sum(timeseries)
            all_series.append(timeseries)

        # Sum all timeseries element-wise.
        # NOTE: zip truncates to the shortest series; assumes all series share one bucketing — TODO confirm.
        self._timeseries = np.array([sum(x) for x in zip(*all_series)])

        total_cnt = sum(cnt_map.values())
        if all_series and total_cnt == 0:
            # Fail loudly instead of dividing by zero below.
            raise ValueError("Cannot compute query ratios: all traces in the cluster are zero")

        # Compute distribution of each query id in the cluster
        self._ratio_map = {qid: cnt / total_cnt for qid, cnt in cnt_map.items()}

    def get_timeseries(self) -> np.ndarray:
        """
        Get the aggregate time-series for this cluster
        :return: Time-series for the cluster
        """
        return self._timeseries

    def segregate(self, timeseries: List[float]) -> Dict:
        """
        From an aggregated time-series, segregate it into multiple time-series, one for each query in the cluster.
        :param timeseries: Aggregated time-series
        :return: Time-series for each query id, dict{query id: time-series}
        """
        # Each query receives its historical share of the aggregate volume.
        return {qid: [x * ratio for x in timeseries]
                for qid, ratio in self._ratio_map.items()}

script/forecasting/data_loader.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
#!/usr/bin/env python3
2+
"""
3+
This file contains data loading logic from the query trace file produced. Hardcoded CSV format needs to be synced with
4+
query trace producer.
5+
"""
6+
7+
from util.constants import LOG
8+
9+
from typing import Dict, Tuple
10+
import numpy as np
11+
from sklearn.preprocessing import MinMaxScaler
12+
import csv
13+
14+
15+
class DataLoader:
    """
    Represents a query trace file: loads the trace CSV and converts the raw per-query
    timestamps into fixed-interval count time-series, one per query id.
    """

    # Hardcoded query_id column index in the query_trace file
    QID_IDX = 0
    # Hardcoded timestamp column index in the query_trace file
    TS_IDX = 1

    def __init__(self,
                 interval_us: int,
                 query_trace_file: str,
                 ) -> None:
        """
        A Dataloader represents a query trace file. The format of the CSV is hardcoded as class attributes, e.g QID_IDX
        The loader transforms the timestamps in the original file into time-series for each query id.
        :param interval_us: Interval for the time-series
        :param query_trace_file: Query trace CSV file
        """
        self._query_trace_file = query_trace_file
        self._interval_us = interval_us

        data = self._load_data()
        self._to_timeseries(data)

    def _load_data(self) -> np.ndarray:
        """
        Load data from csv
        :return: Loaded 2D numpy array of [query_id, timestamp]
        :raises ValueError: if the trace file contains no rows
        """
        LOG.info(f"Loading data from {self._query_trace_file}")
        # Load data from the files.
        # NOTE: the ' timestamp' key (leading space) must match the producer's CSV header exactly.
        with open(self._query_trace_file, newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            data = np.array(
                [[int(r['query_id']), int(r[' timestamp'])] for r in reader])

        if len(data) == 0:
            raise ValueError("Empty trace file")

        return data

    def _to_timeseries(self, data: np.ndarray) -> None:
        """
        Convert the 2D array with query id and timestamps into a map of time-series for each query id
        :param data: Loaded 2D numpy array of [query_id, timestamp], sorted by timestamp
        :return: None
        :raises ValueError: if the trace spans <= 1 microsecond
        """
        # Query trace file is sorted by timestamps
        start_timestamp = data[0][self.TS_IDX]
        end_timestamp = data[-1][self.TS_IDX]

        if end_timestamp - start_timestamp <= 1:
            raise ValueError(
                "Empty data set with start timestamp >= end timestamp.")

        # Number of data points in the new time-series.
        # FIX: the last row lands in bucket (end - start) // interval, so we need that many
        # buckets PLUS one. The previous formula ((end - start - 1) // interval + 1) produced
        # one bucket too few whenever the span was an exact multiple of the interval, and the
        # final increment below then raised IndexError. For all other spans both formulas agree.
        num_buckets = (end_timestamp - start_timestamp) // self._interval_us + 1

        # Iterate through the timestamps, counting queries per bucket.
        self._ts_data = {}
        for row in data:
            t = row[self.TS_IDX]
            qid = row[self.QID_IDX]

            # Initialize a new query's time-series
            if qid not in self._ts_data:
                self._ts_data[qid] = np.zeros(num_buckets)

            # Bucket index
            bi = (t - start_timestamp) // self._interval_us
            self._ts_data[qid][bi] += 1

    def get_ts_data(self) -> Dict:
        """
        :return: Map of (query id -> per-interval count time-series) built by _to_timeseries
        """
        return self._ts_data

0 commit comments

Comments (0)