vortex-data
diff --git a/benchmarks-website/config.js b/benchmarks-website/config.js
Lines changed: 31 additions & 2 deletions
diff --git a/benchmarks-website/data-processor.js b/benchmarks-website/data-processor.js
Lines changed: 17 additions & 280 deletions
@@ -76,6 +76,11 @@ export const BENCHMARK_DESCRIPTIONS = {
     "ClickHouse's analytical benchmark suite testing real-world query patterns on web analytics data, run against NVMe storage",
   "TPC-DS (NVMe)":
     "TPC-DS benchmark queries executed on local NVMe storage, testing complex analytical query performance with a retail sales dataset",
+  "Statistical and Population Genetics":`A suite of Statistical and Population genetics queries executed on local NVMe storage.
+
+A custom benchmark for statistical and population genetics workloads using the gnomAD v3.1.2 release of the jointly called One Thousand Genomes (1kG) and Human Genome Diversity Project (HGDP) dataset (1kG+HGDP). Only a prefix of Chromosome 21 is used for benchmarking.
+
+Data source: <https://gnomad.broadinstitute.org/>.`,
 };
 
 // Category tags mapping
@@ -94,6 +99,7 @@ export const CATEGORY_TAGS = {
   "TPC-H (S3) (SF=1000)": ["Queries (S3)", "TPC-H (SF=1000)"],
   "TPC-DS (NVMe) (SF=1)": ["Queries (NVMe)", "TPC-DS (SF=1)"],
   "TPC-DS (NVMe) (SF=10)": ["Queries (NVMe)", "TPC-DS (SF=10)"],
+  "Statistical and Population Genetics": ["Queries (NVMe)", "StatPopGen"],
 };
 
 // Scale factor descriptions
@@ -106,8 +112,30 @@ export const SCALE_FACTOR_DESCRIPTIONS = {
 
 // Query name transformations
 export const QUERY_NAME_MAP = {
+  "TPCH Q1": "TPC-H Q1", // NOTE(review): the removed formatQueryName also rewrote /^TPCH\s/ to "TPC-H "; confirm shared.formatQueryName still needs these 22 entries
+  "TPCH Q2": "TPC-H Q2",
+  "TPCH Q3": "TPC-H Q3",
+  "TPCH Q4": "TPC-H Q4",
+  "TPCH Q5": "TPC-H Q5",
+  "TPCH Q6": "TPC-H Q6",
+  "TPCH Q7": "TPC-H Q7",
+  "TPCH Q8": "TPC-H Q8",
+  "TPCH Q9": "TPC-H Q9",
+  "TPCH Q10": "TPC-H Q10",
+  "TPCH Q11": "TPC-H Q11",
+  "TPCH Q12": "TPC-H Q12",
+  "TPCH Q13": "TPC-H Q13",
+  "TPCH Q14": "TPC-H Q14",
+  "TPCH Q15": "TPC-H Q15",
+  "TPCH Q16": "TPC-H Q16",
+  "TPCH Q17": "TPC-H Q17",
+  "TPCH Q18": "TPC-H Q18",
+  "TPCH Q19": "TPC-H Q19",
+  "TPCH Q20": "TPC-H Q20",
+  "TPCH Q21": "TPC-H Q21",
+  "TPCH Q22": "TPC-H Q22",
   "VORTEX:RAW SIZE": "VORTEX COMPRESSION RATIO",
   "VORTEX:PARQUET-ZSTD SIZE": "VORTEX:PARQUET-ZSTD SIZE RATIO",
 };
 
 // Engine labels
@@ -135,4 +163,5 @@ export const BENCHMARK_GROUPS = [
   "TPC-H (S3) (SF=1000)",
   "TPC-DS (NVMe) (SF=1)",
   "TPC-DS (NVMe) (SF=10)",
-];
+  "Statistical and Population Genetics"
+];
@@ -1,155 +1,27 @@
 "use strict";
 
-import { BENCHMARK_GROUPS, QUERY_NAME_MAP } from './config.js';
+import { shared } from './data-shared.js';
+import { BENCHMARK_GROUPS } from './config.js';
 
 // Data processing module
 export const dataProcessor = {
-  parseCommits(commitMetadata) {
-    const commits = [];
-    Object.values(commitMetadata)
-      .sort((a, b) => new Date(a.timestamp) - new Date(b.timestamp))
-      .forEach((commit, index) => {
-        commit.sortedIndex = index;
-        commits.push(commit);
-      });
-    return commits;
-  },
-
-  createMissingCommit(commitId) {
-    return {
-      author: { email: "[email protected]", name: "Dan King" },
-      committer: { email: "[email protected]", name: "GitHub" },
-      id: commitId,
-      message: "!! This commit is missing from commits.json !!",
-      timestamp: "1970-01-01T00:00:00Z",
-      tree_id: null,
-      url: `https://github.com/vortex-data/vortex/commit/${commitId}`,
-    };
-  },
+  parseCommits: shared.parseCommits,
 
-  determineGroupId(benchmark) {
-    const { name, dataset, storage } = benchmark;
+  createMissingCommit: shared.createMissingCommit,
 
-    if (dataset?.tpch) {
-      const scaleFactor = dataset.tpch.scale_factor;
-      const isNvme = storage === undefined || storage === "nvme";
-      return this.getTpchGroupId(scaleFactor, isNvme);
-    }
+  determineGroupId: shared.determineGroupId,
 
-    if (dataset?.tpcds) {
-      const scaleFactor = dataset.tpcds.scale_factor;
-      const isNvme = storage === undefined || storage === "nvme";
-      return this.getTpcdsGroupId(scaleFactor, isNvme);
-    }
-
-    if (dataset?.clickbench) return "Clickbench";
-    if (name.startsWith("random-access/")) return "Random Access";
-    if (name.includes("compress time/")) return "Compression";
-    if (name.startsWith("vortex size/")) return "Compression Size";
-    if (
-      name.startsWith("vortex:raw size/") ||
-      name.startsWith("vortex:parquet-zstd size/")
-    ) {
-      return "Compression Size";
-    }
-    if (name.startsWith("tpch_q")) {
-      const isNvme = storage === undefined || storage === "nvme";
-      return isNvme ? "TPC-H (NVMe) (SF=1)" : "TPC-H (S3) (SF=1)";
-    }
-    if (name.startsWith("tpcds_q")) {
-      const isNvme = storage === undefined || storage === "nvme";
-      return isNvme ? "TPC-DS (NVMe) (SF=1)" : "TPC-DS (S3) (SF=1)";
-    }
-    if (name.startsWith("clickbench")) return "Clickbench";
+  getTpchGroupId: shared.getTpchGroupId,
 
-    return null;
-  },
+  getTpcdsGroupId: shared.getTpcdsGroupId,
 
-  getTpchGroupId(scaleFactor, isNvme) {
-    const sf = Number(scaleFactor);
-    const storage = isNvme ? "NVMe" : "S3";
+  normalizeSeriesName: shared.normalizeSeriesName,
 
-    switch (sf) {
-      case 1:
-        return `TPC-H (${storage}) (SF=1)`;
-      case 10:
-        return `TPC-H (${storage}) (SF=10)`;
-      case 100:
-        return `TPC-H (${storage}) (SF=100)`;
-      case 1000:
-        return `TPC-H (${storage}) (SF=1000)`;
-      default:
-        console.warn("Unknown scale factor:", scaleFactor);
-        return null;
-    }
-  },
+  formatQueryName: shared.formatQueryName,
 
-  getTpcdsGroupId(scaleFactor, isNvme) {
-    const sf = Number(scaleFactor);
-    const storage = isNvme ? "NVMe" : "S3";
+  convertValue: shared.convertValue,
 
-    switch (sf) {
-      case 1:
-        return `TPC-DS (${storage}) (SF=1)`;
-      case 10:
-        return `TPC-DS (${storage}) (SF=10)`;
-      case 100:
-        return `TPC-DS (${storage}) (SF=100)`;
-      case 1000:
-        return `TPC-DS (${storage}) (SF=1000)`;
-      default:
-        console.warn("Unknown scale factor:", scaleFactor);
-        return null;
-    }
-  },
-
-  normalizeSeriesName(name, seriesName) {
-    let normalizedName = seriesName;
-    let normalizedQuery = name;
-
-    if (
-      seriesName.endsWith(" throughput") ||
-      seriesName.endsWith("throughput")
-    ) {
-      const suffix = seriesName.endsWith(" throughput")
-        ? " throughput"
-        : "throughput";
-      normalizedName = seriesName.slice(0, seriesName.length - suffix.length);
-      normalizedQuery = name.replace("time", "throughput");
-    }
-
-    return { name: normalizedQuery, seriesName: normalizedName };
-  },
-
-  formatQueryName(query) {
-    let prettyQ = query.replace(/_/g, " ").toUpperCase();
-    prettyQ = QUERY_NAME_MAP[prettyQ] || prettyQ;
-    prettyQ = prettyQ.replace(/^TPCH\s/, "TPC-H ");
-    prettyQ = prettyQ.replace(/^TPCDS\s/, "TPC-DS ");
-    return prettyQ;
-  },
-
-  convertValue(value, unit) {
-    const isNanos = unit === "ns/iter" || unit === "ns";
-    const isBytes = unit === "bytes";
-    const isThroughput = unit === "bytes/ns";
-
-    if (isNanos) return value / 1_000_000;
-    if (isBytes) return value / 1_048_576;
-    if (isThroughput) return (value * 1_000_000_000) / 1_048_576;
-    return value;
-  },
-
-  getUnit(unit) {
-    const isNanos = unit === "ns/iter" || unit === "ns";
-    const isBytes = unit === "bytes";
-    const isThroughput = unit === "bytes/ns";
-
-    if (isNanos) return "ms/iter";
-    if (isBytes) return "MiB";
-    if (isThroughput) return "MiB/s";
-    return unit;
-  },
+  getUnit: shared.getUnit,
 
   downloadAndGroupData(data, commitMetadata, seriesRenameFn) {
     const commits = this.parseCommits(commitMetadata);
@@ -190,148 +62,13 @@ export const dataProcessor = {
     }));
   },
 
-  initializeGroups() {
-    const groups = {};
-    BENCHMARK_GROUPS.forEach((name) => {
-      groups[name] = new Map();
-    });
-    return groups;
-  },
-
-  processBenchmark(
-    benchmark,
-    commitMetadata,
-    commits,
-    groups,
-    seriesRenameFn,
-    missingCommits,
-    uncategorizableNames
-  ) {
-    // Ensure commit metadata
-    if (!benchmark.commit) {
-      benchmark.commit = commitMetadata[benchmark.commit_id];
-      if (!benchmark.commit) {
-        missingCommits.add(benchmark.commit_id);
-        benchmark.commit = commitMetadata[benchmark.commit_id] =
-          this.createMissingCommit(benchmark.commit_id);
-      }
-    }
-
-    // Determine group
-    const groupId = this.determineGroupId(benchmark);
-    if (!groupId) {
-      uncategorizableNames.add(benchmark.name);
-      return;
-    }
-
-    const group = groups[groupId];
-    if (!group) {
-      console.warn("Cannot find group element in group:", groupId);
-      return;
-    }
-
-    // Process benchmark data
-    let [query, seriesName] = benchmark.name.split("/");
-    const normalized = this.normalizeSeriesName(query, seriesName);
-    query = normalized.name;
-    seriesName = normalized.seriesName;
-
-    // Apply series renaming
-    seriesName = this.applySeriesRenaming(
-      seriesName,
-      groupId,
-      seriesRenameFn
-    );
-
-    // Format query name
-    const prettyQ = this.formatQueryName(query);
-    if (prettyQ.includes("PARQUET-UNC")) return;
-
-    // Set units
-    let unit = benchmark.unit;
-    if (!unit && benchmark.name.startsWith("vortex size/")) {
-      unit = "bytes";
-    } else if (
-      !unit &&
-      (benchmark.name.startsWith("vortex:raw size/") ||
-        benchmark.name.startsWith("vortex:parquet-zstd size/"))
-    ) {
-      unit = "ratio";
-    }
-
-    // Calculate sort position
-    const sortPosition =
-      query.slice(0, 4) === "tpch" || query.slice(0, 5) === "tpcds"
-        ? parseInt(prettyQ.split(" ")[1].substring(1), 10)
-        : 0;
+  initializeGroups: shared.initializeGroups,
 
-    // Add to group
-    this.addToGroup(
-      group,
-      prettyQ,
-      seriesName,
-      benchmark,
-      unit,
-      sortPosition,
-      commits
-    );
-  },
+  processBenchmark: shared.processBenchmark,
 
-  applySeriesRenaming(seriesName, groupId, seriesRenameFn) {
-    if (!seriesRenameFn) return seriesName;
+  applySeriesRenaming: shared.applySeriesRenaming,
 
-    const renamer = seriesRenameFn.find(([name]) => name === groupId);
-    if (renamer?.[1]?.renamedDatasets) {
-      const renameDict = renamer[1].renamedDatasets;
-      return renameDict[seriesName] || seriesName;
-    }
-    return seriesName;
-  },
+  addToGroup: shared.addToGroup,
 
-  addToGroup(
-    group,
-    queryName,
-    seriesName,
-    benchmark,
-    unit,
-    sortPosition,
-    commits
-  ) {
-    let arr = group.get(queryName);
-    if (!arr) {
-      group.set(queryName, {
-        sort_position: sortPosition,
-        commits,
-        unit: this.getUnit(unit),
-        series: new Map(),
-      });
-      arr = group.get(queryName);
-    }
-
-    let series = arr.series.get(seriesName);
-    if (!series) {
-      arr.series.set(seriesName, new Array(commits.length).fill(null));
-      series = arr.series.get(seriesName);
-    }
-
-    series[benchmark.commit.sortedIndex] = {
-      range: "this was the range",
-      value: this.convertValue(benchmark.value, unit),
-    };
-  },
-
-  sortGroups(groups) {
-    const sortByPositionThenName = (a, b) => {
-      const positionCompare = a[1].sort_position - b[1].sort_position;
-      return positionCompare !== 0
-        ? positionCompare
-        : a[0].localeCompare(b[0]);
-    };
-
-    Object.entries(groups).forEach(([name, charts]) => {
-      groups[name] = new Map(
-        [...charts.entries()].sort(sortByPositionThenName)
-      );
-    });
-  },
-};
+  sortGroups: shared.sortGroups,
+};