Skip to content

Commit fd47271

Browse files
committed
Initial public release
Signed-off-by: Christian Weilbach <christian@weilbach.name>
0 parents  commit fd47271

File tree

119 files changed

+52560
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

119 files changed

+52560
-0
lines changed

.circleci/config.yml

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
# CircleCI pipeline: build and test on every push; deploy to Clojars from main.
version: 2.1

jobs:
  # Compiles the Java sources, runs the tests and builds the JAR.
  build:
    docker:
      - image: cimg/clojure:1.12-openjdk-21.0
    resource_class: large
    working_directory: ~/repo
    environment:
      # Folded scalar: the three flags end up on one space-separated line.
      JVM_OPTS: >-
        --add-modules=jdk.incubator.vector
        --enable-native-access=ALL-UNNAMED
        -Xmx4g
    steps:
      - checkout
      - restore_cache:
          keys:
            - v1-deps-{{ checksum "deps.edn" }}
            - v1-deps-
      - run:
          name: Compile Java sources
          # NOTE(review): the GitHub Actions workflows in this repository
          # compile a different file list (ColumnOpsChunked,
          # ColumnOpsChunkedSimd, ColumnOpsAnalytics; no PgWireServer).
          # Confirm which list is intended and keep them in sync.
          command: |
            mkdir -p target/classes
            javac --add-modules jdk.incubator.vector \
              -d target/classes \
              src-java/stratum/internal/ColumnOps.java \
              src-java/stratum/internal/ColumnOpsExt.java \
              src-java/stratum/internal/PgWireServer.java
      - run:
          name: Download dependencies
          # -P (prepare) resolves and downloads dependencies without running
          # anything. It must precede -M: every argument after -M is handed
          # to the main program, so a trailing -Stree never reaches the CLI.
          command: clj -P -M:release:test
      - save_cache:
          paths:
            - ~/.m2
            - ~/.gitlibs
          key: v1-deps-{{ checksum "deps.edn" }}
      - run:
          name: Run tests
          command: clj -M:release:test
      - run:
          name: Build JAR
          command: clj -T:build jar
      # Hand the built artifacts to the deploy job.
      - persist_to_workspace:
          root: target
          paths:
            - "*.jar"
            - classes

  # Publishes the artifacts produced by the build job.
  deploy:
    docker:
      - image: cimg/clojure:1.12-openjdk-21.0
    working_directory: ~/repo
    steps:
      - checkout
      - restore_cache:
          keys:
            - v1-deps-{{ checksum "deps.edn" }}
            # Fall back to the newest dependency cache, as the build job does.
            - v1-deps-
      - attach_workspace:
          at: target
      - run:
          name: Deploy to Clojars
          command: clj -T:build deploy

workflows:
  build-and-deploy:
    jobs:
      - build
      # Deploy only after a successful build, and only from main.
      - deploy:
          requires:
            - build
          filters:
            branches:
              only: main
          context:
            - clojars-deploy

.github/workflows/clay-notebooks.yml

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
# Renders the Clay notebooks into docs/ and commits the output back to main.
name: Render Clay Notebooks

on:
  push:
    branches: [main]
    paths:
      - 'notebooks/**'
      - 'src/**'
      - 'deps.edn'
      - '.github/workflows/clay-notebooks.yml'
  workflow_dispatch:  # Allow manual trigger

permissions:
  contents: write  # Required to push changes back

jobs:
  render-notebooks:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0  # Full history for proper git operations

      - name: Set up Java 21
        uses: actions/setup-java@v4
        with:
          distribution: 'temurin'
          java-version: '21'

      - name: Install Clojure CLI
        uses: DeLaGuardo/setup-clojure@12.5
        with:
          cli: 1.12.0.1488

      - name: Cache Maven dependencies
        uses: actions/cache@v4
        with:
          path: |
            ~/.m2/repository
            ~/.gitlibs
          key: ${{ runner.os }}-maven-${{ hashFiles('**/deps.edn') }}
          restore-keys: |
            ${{ runner.os }}-maven-

      - name: Compile Java sources
        run: |
          mkdir -p target/classes
          javac --add-modules jdk.incubator.vector \
            -d target/classes \
            src-java/stratum/internal/ColumnOps.java \
            src-java/stratum/internal/ColumnOpsExt.java \
            src-java/stratum/internal/ColumnOpsChunked.java \
            src-java/stratum/internal/ColumnOpsChunkedSimd.java \
            src-java/stratum/internal/ColumnOpsAnalytics.java

      - name: Download Clojure dependencies
        run: clj -P -M:dev

      - name: Render Clay notebooks
        env:
          JVM_OPTS: "--add-modules=jdk.incubator.vector --enable-native-access=ALL-UNNAMED"
        run: |
          clj -M:dev -e "
          (require '[scicloj.clay.v2.api :as clay])

          ;; Render main notebook
          (clay/make! {:source-path \"notebooks/stratum_intro.clj\"
                       :base-target-path \"docs\"
                       :show false})

          (println \"✓ Rendered notebooks to docs/\")
          (System/exit 0)
          "

      - name: Check for changes
        id: git-check
        # `git status --porcelain` reports untracked files as well as
        # modifications. A plain `git diff` only sees tracked files, so
        # notebooks rendered for the first time would never be committed.
        run: |
          if [ -n "$(git status --porcelain -- docs/)" ]; then
            echo "changed=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Commit rendered notebooks
        if: steps.git-check.outputs.changed == 'true'
        run: |
          git config --local user.email "github-actions[bot]@users.noreply.github.com"
          git config --local user.name "github-actions[bot]"
          git add docs/
          git commit -m "docs: render Clay notebooks [skip ci]"
          git push

      - name: Summary
        run: |
          echo "### Clay Notebook Rendering" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          if [ "${{ steps.git-check.outputs.changed }}" == "true" ]; then
            echo "✅ Notebooks rendered and committed to \`docs/\`" >> "$GITHUB_STEP_SUMMARY"
          else
            echo "ℹ️ No changes detected in rendered notebooks" >> "$GITHUB_STEP_SUMMARY"
          fi
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "View at: https://replikativ.github.io/stratum/stratum_intro.html" >> "$GITHUB_STEP_SUMMARY"

.github/workflows/dco.yml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
---
# Runs the DCO check action on every pull request that targets main.
name: DCO Check

on:
  pull_request:
    branches:
      - main

jobs:
  dco:
    runs-on: ubuntu-latest
    steps:
      # Fetch the full history rather than a shallow clone.
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Check DCO
        uses: christophebedard/dco-require-action@v1

.github/workflows/test.yml

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
---
# Compiles the Java sources and runs the :ci alias on pushes and pull
# requests that target main.
name: Tests

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Java 21
        uses: actions/setup-java@v4
        with:
          distribution: temurin
          # Quoted so the version stays a string.
          java-version: '21'

      - name: Install Clojure CLI
        uses: DeLaGuardo/setup-clojure@12.5
        with:
          cli: 1.12.0.1488

      # Cache key is derived from the hash of every deps.edn file.
      - name: Cache Maven dependencies
        uses: actions/cache@v4
        with:
          path: |
            ~/.m2/repository
            ~/.gitlibs
          key: ${{ runner.os }}-maven-${{ hashFiles('**/deps.edn') }}
          restore-keys: |
            ${{ runner.os }}-maven-

      # Compiled classes go to target/classes.
      - name: Compile Java sources
        run: |
          mkdir -p target/classes
          javac --add-modules jdk.incubator.vector \
            -d target/classes \
            src-java/stratum/internal/ColumnOps.java \
            src-java/stratum/internal/ColumnOpsExt.java \
            src-java/stratum/internal/ColumnOpsChunked.java \
            src-java/stratum/internal/ColumnOpsChunkedSimd.java \
            src-java/stratum/internal/ColumnOpsAnalytics.java

      - name: Download Clojure dependencies
        run: clj -P -M:ci

      - name: Run tests
        run: clj -M:ci

.gitignore

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
# Build output
target/
classes/

# Tooling caches and editor state
.cpcache/
.lsp/
.clj-kondo/
.nrepl-port
*.swp
*.swo

# JVM crash logs
hs_err_pid*.log

# Local data and scratch files
data/
.internal/
*.bundle

# Benchmark / result dumps
bench/results_*.txt
results*.txt
olap_null_fix.txt

CHANGELOG.md

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# Changelog
2+
3+
## v0.1.0
4+
5+
Initial public release.
6+
7+
### Query Engine
8+
- SIMD-accelerated filter + aggregate via Java Vector API (DoubleVector/LongVector)
9+
- Fused single-pass execution for filter + aggregate queries
10+
- Multi-sum SIMD: up to 4 SUM/AVG aggregations in one pass
11+
- Dense group-by with direct array indexing (up to 200K groups)
12+
- Radix-partitioned hash group-by (256 L2-sized tables for high cardinality)
13+
- Morsel-driven parallelism (64K work units, L2-cache-friendly)
14+
- Native VARIANCE/STDDEV/CORR with Welford/co-moment accumulators
15+
- COUNT DISTINCT with per-group hash tables
16+
- LIKE / ILIKE fast-path: `%literal%` -> contains, `prefix%` -> startsWith, `%suffix` -> endsWith
17+
- Zone map pruning: skip chunks based on min/max statistics
18+
- Hash JOIN (INNER, LEFT, RIGHT, FULL) with fused join+group-by and fused join+global-agg paths
19+
- Multi-column JOIN with Java composite key encoding
20+
- Window functions: ROW_NUMBER, RANK, DENSE_RANK, NTILE, LAG, LEAD, SUM, AVG, COUNT
21+
- Window HAVING pushdown: apply the filter right after window execution, so discarded rows are never materialized
22+
- Statistical aggregates: exact MEDIAN/PERCENTILE (QuickSelect), approximate quantile (t-digest)
23+
- Expression evaluation: arithmetic, date/time, string, COALESCE, NULLIF, CASE, GREATEST, LEAST
24+
- Columnar result format (`:result :columns`) for 15x faster high-cardinality output
25+
26+
### Data Types
27+
- `long[]` / `double[]` heap arrays (JVM GC managed)
28+
- `PersistentColumnIndex` -- chunked B-tree with per-chunk statistics
29+
- Dictionary-encoded `String[]` for group-by and LIKE operations
30+
- NULL handling via NaN (double) / Long.MIN_VALUE (long) sentinels
31+
- PostgreSQL-compliant NULL semantics: NaN-safe SIMD aggregation, CSV NULL import, NULL group keys
32+
33+
### Dataset Persistence
34+
- `StratumDataset` type with transient/persistent lifecycle
35+
- O(1) fork with structural sharing (copy-on-write per chunk)
36+
- `ds-sync!` / `ds-load` / `ds-gc!` for konserve-backed storage
37+
- Branch management with atomic HEAD updates
38+
- Temporal queries: as-of commit, branch, Datahike tx floor lookup
39+
- Commit metadata for cross-system coordination
40+
41+
### SQL Interface
42+
- PostgreSQL wire protocol v3 server (psql, DBeaver, JDBC, psycopg2)
43+
- JSqlParser-based SQL to query map translation
44+
- Full DML: INSERT, UPDATE, DELETE, UPSERT (INSERT ON CONFLICT DO UPDATE/NOTHING)
45+
- UPDATE FROM for joined updates with table-qualified column disambiguation
46+
- FILTER clause on aggregates: `SUM(x) FILTER (WHERE status = 1)`
47+
- CREATE TABLE / DROP TABLE for mutable table management
48+
- Table registration with automatic dictionary encoding
49+
- Ad-hoc file queries: `SELECT ... FROM read_csv('file.csv')`
50+
- Persistent file indexing with zone map pruning, mtime-invalidated cache
51+
- EXPLAIN support
52+
- `--host` / `--port` / `--data-dir` CLI flags
53+
54+
### Data Import
55+
- CSV import with auto type detection (long/double/string)
56+
- Parquet import via parquet-java (no Hadoop runtime required)
57+
- `from-maps` for converting Clojure map sequences
58+
59+
### Anomaly Detection
60+
- Isolation forest training, scoring, and prediction
61+
- Online rotation for concept drift adaptation
62+
- SQL functions: ANOMALY_SCORE, ANOMALY_PREDICT, ANOMALY_PROBA, ANOMALY_CONFIDENCE
63+
64+
### Integrations
65+
- Yggdrasil adapter (Snapshotable, Branchable, Graphable protocols)
66+
- tablecloth / tech.ml.dataset interop
67+
- Clay notebook rendering
68+
- Datahike tx-coordinated sync workflow
69+
70+
### Performance (6M rows, 8-core Intel Lunar Lake)
71+
- TPC-H Q6 filter+sum: 4.3ms (DuckDB: 7.1ms)
72+
- SSB Q1.1 filter+sum: 3.8ms (DuckDB: 7.0ms)
73+
- Filtered COUNT: 1.0ms (DuckDB: 2.9ms)
74+
- Group-by COUNT: 3.8ms (DuckDB: 5.7ms)
75+
- STDDEV group-by: 17.1ms (DuckDB: 28.4ms)
76+
- CORR group-by: 32.4ms (DuckDB: 33.5ms)
77+
- Exact median: 64.2ms (DuckDB: 193.4ms)

0 commit comments

Comments
 (0)