Skip to content

Commit 6970f71

Browse files
Merge pull request #2971 from scottteal/iceberg-v3-guide
Add Iceberg V3 Tables Comprehensive Guide
2 parents 25909c7 + c2fec2a commit 6970f71

33 files changed

+6829
-0
lines changed
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
-- Snowflake Iceberg V3 Comprehensive Guide
-- Script 02: Create Database, Schema, and Warehouse
-- ================================================

-- Warehouse/database/integration DDL below requires ACCOUNTADMIN.
USE ROLE ACCOUNTADMIN;

-- Single compute warehouse shared by every script in this guide.
-- AUTO_SUSPEND = 60 seconds keeps idle cost minimal; AUTO_RESUME means
-- users never have to wake it manually; INITIALLY_SUSPENDED avoids
-- paying for compute before the first query runs.
CREATE WAREHOUSE IF NOT EXISTS FLEET_ANALYTICS_WH
    WAREHOUSE_SIZE      = 'MEDIUM'
    AUTO_SUSPEND        = 60
    AUTO_RESUME         = TRUE
    INITIALLY_SUSPENDED = TRUE
    COMMENT = 'Warehouse for Fleet Analytics Iceberg V3 Guide';
-- Main database for the guide; all Iceberg tables created inside it will
-- default to the Iceberg V3 table specification (set just below).
CREATE DATABASE IF NOT EXISTS FLEET_ANALYTICS_DB
    COMMENT = 'Iceberg V3 Comprehensive Guide - Fleet Analytics';

-- Make Iceberg spec V3 the default for every Iceberg table in this database.
-- See: https://docs.snowflake.com/en/LIMITEDACCESS/iceberg/tables-iceberg-v3-specification-support
ALTER DATABASE FLEET_ANALYTICS_DB SET ICEBERG_VERSION_DEFAULT = 3;

-- Database-level default external volume for Iceberg tables.
-- Intentionally commented out here: the volume itself is created in
-- 01_external_volume.sql, so this statement runs after that script.
-- ALTER DATABASE FLEET_ANALYTICS_DB SET EXTERNAL_VOLUME = 'FLEET_ICEBERG_VOL';
-- Three-layer medallion-style schema layout: RAW -> CURATED -> ANALYTICS.

-- Landing layer: source Iceberg tables loaded as-is.
CREATE SCHEMA IF NOT EXISTS FLEET_ANALYTICS_DB.RAW
    COMMENT = 'Raw data layer - source Iceberg tables';

-- Transformation layer: cleaned / conformed Iceberg tables.
CREATE SCHEMA IF NOT EXISTS FLEET_ANALYTICS_DB.CURATED
    COMMENT = 'Curated data layer - transformed Iceberg tables';

-- Serving layer: aggregates for reporting and dashboards.
CREATE SCHEMA IF NOT EXISTS FLEET_ANALYTICS_DB.ANALYTICS
    COMMENT = 'Analytics layer - aggregated Iceberg tables';
-- Internal stage where maintenance-log JSON files are uploaded before
-- being COPYed into MAINTENANCE_LOGS.
CREATE STAGE IF NOT EXISTS FLEET_ANALYTICS_DB.RAW.LOGS_STAGE
    COMMENT = 'Internal stage for maintenance log JSON files';

-- File format for the log files. STRIP_OUTER_ARRAY = TRUE unwraps a
-- top-level JSON array so each element loads as its own row.
CREATE FILE FORMAT IF NOT EXISTS FLEET_ANALYTICS_DB.RAW.JSON_FORMAT
    TYPE = 'JSON'
    STRIP_OUTER_ARRAY = TRUE
    COMMENT = 'JSON file format for log ingestion';
-- ============================================
-- EXTERNAL ACCESS INTEGRATION for API calls
-- Required so Python code in notebooks can reach external APIs.
-- ============================================

-- Egress network rule scoped to exactly one host:port — the Open-Meteo
-- weather API over HTTPS. Nothing else is reachable through it.
CREATE OR REPLACE NETWORK RULE FLEET_ANALYTICS_DB.RAW.OPEN_METEO_NETWORK_RULE
    MODE = EGRESS
    TYPE = HOST_PORT
    VALUE_LIST = ('api.open-meteo.com:443');

-- Integration that grants code (e.g. Snowpark / notebooks) outbound access
-- via the rule above.
CREATE OR REPLACE EXTERNAL ACCESS INTEGRATION OPEN_METEO_ACCESS
    ALLOWED_NETWORK_RULES = (FLEET_ANALYTICS_DB.RAW.OPEN_METEO_NETWORK_RULE)
    ENABLED = TRUE
    COMMENT = 'External access for Open-Meteo weather API';
-- Smoke-check the objects created above before moving on to script 03.
USE DATABASE FLEET_ANALYTICS_DB;

SHOW SCHEMAS;
SHOW STAGES IN SCHEMA RAW;
SHOW WAREHOUSES LIKE 'FLEET_ANALYTICS_WH';

SELECT 'Database setup complete!' AS STATUS;
Lines changed: 283 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,283 @@
-- Snowflake Iceberg V3 Comprehensive Guide
-- Script 03: Create Iceberg V3 Tables
-- ====================================

-- Session context: role, database, schema, and warehouse created in
-- script 02 (02_database_setup.sql).
USE ROLE ACCOUNTADMIN;
USE DATABASE FLEET_ANALYTICS_DB;
USE SCHEMA RAW;
USE WAREHOUSE FLEET_ANALYTICS_WH;
-- ============================================
-- Table 1: VEHICLE_TELEMETRY_STREAM
-- Real-time streaming vehicle telemetry; raw payload kept in a VARIANT.
-- TIMESTAMP precision is (6) — microseconds — for Spark compatibility.
-- ============================================
CREATE OR REPLACE ICEBERG TABLE VEHICLE_TELEMETRY_STREAM (
    VEHICLE_ID      STRING           NOT NULL,
    EVENT_TIMESTAMP TIMESTAMP_NTZ(6) NOT NULL,
    TELEMETRY_DATA  VARIANT          NOT NULL,
    INGESTED_AT     TIMESTAMP_LTZ(6)
)
EXTERNAL_VOLUME = 'FLEET_ICEBERG_VOL'
CATALOG         = 'SNOWFLAKE'
BASE_LOCATION   = 'FLEET_ANALYTICS_DB/RAW/VEHICLE_TELEMETRY_STREAM'
COMMENT = 'Real-time vehicle telemetry events streamed via Snowpipe Streaming';
-- ============================================
-- Table 2: MAINTENANCE_LOGS
-- Batch-loaded JSON maintenance/diagnostic logs (from LOGS_STAGE).
-- TIMESTAMP precision is (6) — microseconds — for Spark compatibility.
-- ============================================
CREATE OR REPLACE ICEBERG TABLE MAINTENANCE_LOGS (
    LOG_ID        STRING           NOT NULL,
    VEHICLE_ID    STRING           NOT NULL,
    LOG_TIMESTAMP TIMESTAMP_NTZ(6) NOT NULL,
    LOG_DATA      VARIANT          NOT NULL,
    SOURCE_FILE   STRING,           -- originating staged file, for lineage
    INGESTED_AT   TIMESTAMP_LTZ(6)
)
EXTERNAL_VOLUME = 'FLEET_ICEBERG_VOL'
CATALOG         = 'SNOWFLAKE'
BASE_LOCATION   = 'FLEET_ANALYTICS_DB/RAW/MAINTENANCE_LOGS'
COMMENT = 'Maintenance and diagnostic logs loaded from JSON files';
-- ============================================
-- Table 3: SENSOR_READINGS
-- High-precision time-series sensor data, one row per reading.
-- TIMESTAMP precision is (6) — microseconds — for Spark compatibility.
-- ============================================
CREATE OR REPLACE ICEBERG TABLE SENSOR_READINGS (
    READING_ID           STRING           NOT NULL,
    VEHICLE_ID           STRING           NOT NULL,
    READING_TIMESTAMP    TIMESTAMP_NTZ(6) NOT NULL,
    ENGINE_TEMP_F        FLOAT,
    OIL_PRESSURE_PSI     FLOAT,
    BATTERY_VOLTAGE      FLOAT,
    FUEL_CONSUMPTION_GPH FLOAT,
    -- Tire pressure per corner: Front/Rear x Left/Right.
    TIRE_PRESSURE_FL     FLOAT,
    TIRE_PRESSURE_FR     FLOAT,
    TIRE_PRESSURE_RL     FLOAT,
    TIRE_PRESSURE_RR     FLOAT,
    ODOMETER_MILES       FLOAT,
    INGESTED_AT          TIMESTAMP_LTZ(6)
)
EXTERNAL_VOLUME = 'FLEET_ICEBERG_VOL'
CATALOG         = 'SNOWFLAKE'
BASE_LOCATION   = 'FLEET_ANALYTICS_DB/RAW/SENSOR_READINGS'
COMMENT = 'High-precision time-series sensor readings';
-- ============================================
-- Table 4: VEHICLE_LOCATIONS
-- Geospatial vehicle positions; lat/lon stored both as raw FLOATs and as
-- a GEOGRAPHY point for spatial functions.
-- TIMESTAMP precision is (6) — microseconds — for Spark compatibility.
-- ============================================
CREATE OR REPLACE ICEBERG TABLE VEHICLE_LOCATIONS (
    LOCATION_ID        STRING           NOT NULL,
    VEHICLE_ID         STRING           NOT NULL,
    LOCATION_TIMESTAMP TIMESTAMP_NTZ(6) NOT NULL,
    LATITUDE           FLOAT            NOT NULL,
    LONGITUDE          FLOAT            NOT NULL,
    LOCATION_POINT     GEOGRAPHY,
    ALTITUDE_FT        FLOAT,
    HEADING_DEGREES    FLOAT,
    SPEED_MPH          FLOAT,
    FLEET_REGION       STRING,
    INGESTED_AT        TIMESTAMP_LTZ(6)
)
EXTERNAL_VOLUME = 'FLEET_ICEBERG_VOL'
CATALOG         = 'SNOWFLAKE'
BASE_LOCATION   = 'FLEET_ANALYTICS_DB/RAW/VEHICLE_LOCATIONS'
COMMENT = 'Geospatial vehicle location data with GEOGRAPHY type';
-- ============================================
-- Table 5: VEHICLE_REGISTRY
-- Master data for vehicles and their assigned drivers. Driver columns
-- contain PII — handle per your governance/masking policies.
-- TIMESTAMP precision is (6) — microseconds — for Spark compatibility.
-- ============================================
CREATE OR REPLACE ICEBERG TABLE VEHICLE_REGISTRY (
    VEHICLE_ID        STRING NOT NULL,
    VIN               STRING,
    MAKE              STRING,
    MODEL             STRING,
    YEAR              INT,
    LICENSE_PLATE     STRING,
    DRIVER_ID         STRING,
    DRIVER_NAME       STRING,
    DRIVER_EMAIL      STRING,
    DRIVER_PHONE      STRING,
    FLEET_REGION      STRING,
    VEHICLE_STATUS    STRING DEFAULT 'ACTIVE',
    REGISTRATION_DATE DATE,
    LAST_SERVICE_DATE DATE,
    CREATED_AT        TIMESTAMP_LTZ(6),
    UPDATED_AT        TIMESTAMP_LTZ(6)
)
EXTERNAL_VOLUME = 'FLEET_ICEBERG_VOL'
CATALOG         = 'SNOWFLAKE'
BASE_LOCATION   = 'FLEET_ANALYTICS_DB/RAW/VEHICLE_REGISTRY'
COMMENT = 'Master data for vehicles and drivers (contains PII)';
-- ============================================
-- Table 6: API_WEATHER_DATA
-- Weather observations fetched from the Open-Meteo public API, with the
-- raw JSON response preserved in a VARIANT column.
-- TIMESTAMP precision is (6) — microseconds — for Spark compatibility.
-- ============================================
CREATE OR REPLACE ICEBERG TABLE API_WEATHER_DATA (
    CITY_NAME    STRING  NOT NULL,
    LATITUDE     FLOAT   NOT NULL,
    LONGITUDE    FLOAT   NOT NULL,
    WEATHER_DATA VARIANT NOT NULL,
    INGESTED_AT  TIMESTAMP_LTZ(6)
)
EXTERNAL_VOLUME = 'FLEET_ICEBERG_VOL'
CATALOG         = 'SNOWFLAKE'
BASE_LOCATION   = 'FLEET_ANALYTICS_DB/RAW/API_WEATHER_DATA'
COMMENT = 'Weather data fetched from Open-Meteo API';
-- ============================================
-- Load Sample Data into VEHICLE_REGISTRY
-- ============================================
-- FIX: the original used SEQ4() directly, but Snowflake documents that
-- SEQ4 may produce gaps. The IDs here MUST be a dense 0..99 sequence
-- (VH-0000..VH-0099) because SENSOR_READINGS and VEHICLE_LOCATIONS map
-- rows onto vehicles with MOD(seq, 100). ROW_NUMBER() guarantees density.
INSERT INTO VEHICLE_REGISTRY (
    VEHICLE_ID, VIN, MAKE, MODEL, YEAR, LICENSE_PLATE,
    DRIVER_ID, DRIVER_NAME, DRIVER_EMAIL, DRIVER_PHONE,
    FLEET_REGION, VEHICLE_STATUS, REGISTRATION_DATE, LAST_SERVICE_DATE
)
SELECT
    'VH-' || LPAD(seq::VARCHAR, 4, '0') AS VEHICLE_ID,
    UPPER(RANDSTR(17, RANDOM())) AS VIN,               -- synthetic 17-char VIN
    CASE MOD(seq, 5)
        WHEN 0 THEN 'Ford'
        WHEN 1 THEN 'Chevrolet'
        WHEN 2 THEN 'Toyota'
        WHEN 3 THEN 'Ram'
        ELSE 'Freightliner'
    END AS MAKE,
    -- Model index matches MAKE index, so make/model pairs are realistic.
    CASE MOD(seq, 5)
        WHEN 0 THEN 'Transit'
        WHEN 1 THEN 'Express'
        WHEN 2 THEN 'Tacoma'
        WHEN 3 THEN 'ProMaster'
        ELSE 'Cascadia'
    END AS MODEL,
    2020 + MOD(seq, 6) AS YEAR,                        -- model years 2020-2025
    UPPER(RANDSTR(3, RANDOM())) || '-' || LPAD(MOD(seq * 7, 9999)::VARCHAR, 4, '0') AS LICENSE_PLATE,
    'DRV-' || LPAD(seq::VARCHAR, 4, '0') AS DRIVER_ID,
    CASE MOD(seq, 10)
        WHEN 0 THEN 'John Smith'
        WHEN 1 THEN 'Sarah Johnson'
        WHEN 2 THEN 'Michael Brown'
        WHEN 3 THEN 'Emily Davis'
        WHEN 4 THEN 'David Wilson'
        WHEN 5 THEN 'Jessica Taylor'
        WHEN 6 THEN 'Christopher Lee'
        WHEN 7 THEN 'Amanda Martinez'
        WHEN 8 THEN 'Daniel Anderson'
        ELSE 'Jennifer Garcia'
    END AS DRIVER_NAME,
    -- Lateral column alias: DRIVER_NAME is reused to derive the email.
    LOWER(SPLIT_PART(DRIVER_NAME, ' ', 1)) || '.' || LOWER(SPLIT_PART(DRIVER_NAME, ' ', 2)) || '@fleetco.com' AS DRIVER_EMAIL,
    '+1-555-' || LPAD(MOD(seq * 13, 9999)::VARCHAR, 4, '0') AS DRIVER_PHONE,
    CASE MOD(seq, 5)
        WHEN 0 THEN 'Pacific Northwest'
        WHEN 1 THEN 'California'
        WHEN 2 THEN 'Mountain West'
        WHEN 3 THEN 'Midwest'
        ELSE 'Northeast'
    END AS FLEET_REGION,
    -- ~5% of the fleet is flagged as in maintenance.
    CASE WHEN MOD(seq, 20) = 0 THEN 'MAINTENANCE' ELSE 'ACTIVE' END AS VEHICLE_STATUS,
    DATEADD('day', -MOD(seq * 17, 1000), CURRENT_DATE()) AS REGISTRATION_DATE,
    DATEADD('day', -MOD(seq * 7, 90), CURRENT_DATE()) AS LAST_SERVICE_DATE
FROM (
    -- Gap-free 0-based counter; SEQ4() is only used for ordering.
    SELECT ROW_NUMBER() OVER (ORDER BY SEQ4()) - 1 AS seq
    FROM TABLE(GENERATOR(ROWCOUNT => 100))
);
-- ============================================
-- Load Sample Data into SENSOR_READINGS
-- ============================================
-- FIXES vs. original:
--  * SEQ4() replaced by ROW_NUMBER(): SEQ4 may produce gaps (per Snowflake
--    docs), which would skew MOD(seq, 100) off the 100 registry vehicles
--    and stretch the time spread unpredictably.
--  * RANDOM() / POWER(10, 18) replaced by UNIFORM(min, max, RANDOM()):
--    RANDOM() returns a signed 64-bit integer, so the old expression
--    ranged over roughly [-9.22, 9.22] — producing impossible readings
--    such as negative battery voltage or sub-zero engine temperatures.
--    UNIFORM with FLOAT bounds yields values inside the intended range.
INSERT INTO SENSOR_READINGS (
    READING_ID, VEHICLE_ID, READING_TIMESTAMP,
    ENGINE_TEMP_F, OIL_PRESSURE_PSI, BATTERY_VOLTAGE,
    FUEL_CONSUMPTION_GPH, TIRE_PRESSURE_FL, TIRE_PRESSURE_FR,
    TIRE_PRESSURE_RL, TIRE_PRESSURE_RR, ODOMETER_MILES
)
SELECT
    UUID_STRING() AS READING_ID,
    -- Map each reading onto one of the 100 registry vehicles.
    'VH-' || LPAD(MOD(seq, 100)::VARCHAR, 4, '0') AS VEHICLE_ID,
    -- 10,000 rows x 5 minutes spreads readings over ~34 days, overlapping
    -- the maintenance-log window.
    TIMESTAMPADD('minute', -seq * 5, CURRENT_TIMESTAMP())::TIMESTAMP_NTZ(6) AS READING_TIMESTAMP,
    UNIFORM(180::FLOAT, 230::FLOAT, RANDOM())  AS ENGINE_TEMP_F,        -- deg F
    UNIFORM(30::FLOAT, 60::FLOAT, RANDOM())    AS OIL_PRESSURE_PSI,
    UNIFORM(11.5::FLOAT, 14.5::FLOAT, RANDOM()) AS BATTERY_VOLTAGE,
    UNIFORM(2::FLOAT, 10::FLOAT, RANDOM())     AS FUEL_CONSUMPTION_GPH,  -- gal/hr
    UNIFORM(32::FLOAT, 38::FLOAT, RANDOM())    AS TIRE_PRESSURE_FL,      -- PSI
    UNIFORM(32::FLOAT, 38::FLOAT, RANDOM())    AS TIRE_PRESSURE_FR,
    UNIFORM(32::FLOAT, 38::FLOAT, RANDOM())    AS TIRE_PRESSURE_RL,
    UNIFORM(32::FLOAT, 38::FLOAT, RANDOM())    AS TIRE_PRESSURE_RR,
    -- Monotonically growing odometer with a little per-reading noise.
    10000 + seq * 50 + UNIFORM(0::FLOAT, 100::FLOAT, RANDOM()) AS ODOMETER_MILES
FROM (
    -- Gap-free 0-based counter; SEQ4() is only used for ordering.
    SELECT ROW_NUMBER() OVER (ORDER BY SEQ4()) - 1 AS seq
    FROM TABLE(GENERATOR(ROWCOUNT => 10000))
);
-- ============================================
-- Load Sample Data into VEHICLE_LOCATIONS
-- ============================================
-- Generates positions jittered around major US city centers.
-- FIXES vs. original:
--  * SEQ4() replaced by a gap-free ROW_NUMBER() counter (SEQ4 may leave
--    gaps per Snowflake docs, breaking the MOD(seq, 100) vehicle mapping
--    and MOD(seq, 20) city mapping).
--  * city_idx ordering tie-broken by base_lon: Phoenix and Atlanta share
--    base_lat = 33.4484, so ORDER BY base_lat alone was nondeterministic.
--  * The random lat/lon jitter is computed ONCE and reused, so
--    LOCATION_POINT now matches the LATITUDE/LONGITUDE columns. The
--    original called UNIFORM(...RANDOM()) independently for the columns
--    and for ST_POINT, yielding a GEOGRAPHY point at a different spot.
INSERT INTO VEHICLE_LOCATIONS (
    LOCATION_ID, VEHICLE_ID, LOCATION_TIMESTAMP,
    LATITUDE, LONGITUDE, LOCATION_POINT,
    ALTITUDE_FT, HEADING_DEGREES, SPEED_MPH, FLEET_REGION
)
WITH city_centers AS (
    -- City anchor points: (lat, lon, city, region).
    SELECT * FROM (VALUES
        (47.6062, -122.3321, 'Seattle', 'Pacific Northwest'),
        (45.5152, -122.6784, 'Portland', 'Pacific Northwest'),
        (34.0522, -118.2437, 'Los Angeles', 'California'),
        (37.7749, -122.4194, 'San Francisco', 'California'),
        (32.7157, -117.1611, 'San Diego', 'California'),
        (39.7392, -104.9903, 'Denver', 'Mountain West'),
        (40.7608, -111.8910, 'Salt Lake City', 'Mountain West'),
        (33.4484, -112.0740, 'Phoenix', 'Mountain West'),
        (41.8781, -87.6298, 'Chicago', 'Midwest'),
        (44.9778, -93.2650, 'Minneapolis', 'Midwest'),
        (39.0997, -94.5786, 'Kansas City', 'Midwest'),
        (40.7128, -74.0060, 'New York', 'Northeast'),
        (42.3601, -71.0589, 'Boston', 'Northeast'),
        (39.9526, -75.1652, 'Philadelphia', 'Northeast'),
        (38.9072, -77.0369, 'Washington DC', 'Northeast'),
        (29.7604, -95.3698, 'Houston', 'Texas'),
        (32.7767, -96.7970, 'Dallas', 'Texas'),
        (30.2672, -97.7431, 'Austin', 'Texas'),
        (33.4484, -84.3880, 'Atlanta', 'Southeast'),
        (25.7617, -80.1918, 'Miami', 'Southeast')
    ) AS t(base_lat, base_lon, city, region)
),
numbered_cities AS (
    -- Deterministic 0-based city index (tie-break on base_lon — see above).
    SELECT *, ROW_NUMBER() OVER (ORDER BY base_lat, base_lon) - 1 AS city_idx
    FROM city_centers
),
base_data AS (
    -- One row per location sample, 30 seconds apart, walking back in time.
    SELECT
        seq,
        'VH-' || LPAD(MOD(seq, 100)::VARCHAR, 4, '0') AS VEHICLE_ID,
        TIMESTAMPADD('second', -seq * 30, CURRENT_TIMESTAMP())::TIMESTAMP_NTZ(6) AS LOCATION_TIMESTAMP,
        MOD(seq, 20) AS city_selector
    FROM (
        -- Gap-free 0-based counter; SEQ4() is only used for ordering.
        SELECT ROW_NUMBER() OVER (ORDER BY SEQ4()) - 1 AS seq
        FROM TABLE(GENERATOR(ROWCOUNT => 5000))
    )
),
jittered AS (
    -- Random offset within ~30 miles (0.5 degrees) of the city center,
    -- computed once per row so the FLOAT columns and GEOGRAPHY point agree.
    SELECT
        b.VEHICLE_ID,
        b.LOCATION_TIMESTAMP,
        c.region,
        c.base_lat + (UNIFORM(-50, 50, RANDOM()) / 100.0) AS jitter_lat,
        c.base_lon + (UNIFORM(-50, 50, RANDOM()) / 100.0) AS jitter_lon
    FROM base_data b
    JOIN numbered_cities c ON b.city_selector = c.city_idx
)
SELECT
    UUID_STRING() AS LOCATION_ID,
    VEHICLE_ID,
    LOCATION_TIMESTAMP,
    jitter_lat AS LATITUDE,
    jitter_lon AS LONGITUDE,
    ST_POINT(jitter_lon, jitter_lat) AS LOCATION_POINT,  -- ST_POINT takes (lon, lat)
    UNIFORM(100, 5000, RANDOM())::FLOAT AS ALTITUDE_FT,
    UNIFORM(0, 359, RANDOM())::FLOAT AS HEADING_DEGREES,
    UNIFORM(0, 80, RANDOM())::FLOAT AS SPEED_MPH,
    region AS FLEET_REGION
FROM jittered;
-- Sanity checks: confirm the Iceberg tables exist and that the three
-- sample-data loads produced rows.
SHOW ICEBERG TABLES IN SCHEMA RAW;

SELECT 'VEHICLE_REGISTRY' AS TABLE_NAME, COUNT(*) AS ROW_COUNT
FROM VEHICLE_REGISTRY
UNION ALL
SELECT 'SENSOR_READINGS', COUNT(*)
FROM SENSOR_READINGS
UNION ALL
SELECT 'VEHICLE_LOCATIONS', COUNT(*)
FROM VEHICLE_LOCATIONS;

SELECT 'Iceberg tables created and sample data loaded!' AS STATUS;

0 commit comments

Comments
 (0)