Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
fe7e93a
feat: add devcontainer base infrastructure and Dockerfile
Jan 27, 2026
29e457d
feat: implement isolated workspace setup and promotion logic
Jan 27, 2026
fb5325e
feat: add devcontainer configurations and dbt profiles for DuckDB/Big…
Jan 27, 2026
9c7232e
feat: add taxi data ingestion and GCS sync scripts
Jan 27, 2026
6ffee05
docs: add Codespaces integration guides and badges
Jan 27, 2026
a26c77a
tooling: add deployment and verification scripts
Jan 27, 2026
03dff90
Add
Jan 28, 2026
50311ae
Merge branch 'main' into feat/devcontainer
Jan 28, 2026
6ecdb87
Merge branch 'main' into feat/devcontainer
Jan 29, 2026
f453b7e
chore: remove database files and update .gitignore
Jan 29, 2026
15de4f0
fix: BigQuery compatibility for fct_trips model
Jan 29, 2026
9276345
docs: add comprehensive BigQuery setup guide and helper scripts
Jan 29, 2026
b8248c4
perf: optimize codespace machine sizing for workload requirements
Jan 29, 2026
51bdf7e
docs: add comprehensive DuckDB setup guide for Module 4 homework
Jan 29, 2026
5c4d341
perf: switch DuckDB to download pre-built database for 60% faster sta…
Jan 29, 2026
3da59d8
perf: optimize DuckDB data loading with selective Parquet downloads
Jan 29, 2026
26329e5
fix: use CSV.gz files instead of Parquet for data download
Jan 29, 2026
77ebbd1
fix: download CSV files locally before loading to avoid rate limits
Jan 29, 2026
621efea
docs: update postCreate message with actual performance metrics
Jan 29, 2026
b4c9434
fix: update setup guide to use --target prod consistently
Jan 29, 2026
ba2abda
cleanup: remove unused BAKE_DATA build arg from devcontainer configs
Jan 29, 2026
78855a7
docs: enhance BigQuery setup guide with homework preparation sections
Jan 29, 2026
2fb4bca
refactor: simplify to single dev target, remove prod schema complexity
Feb 2, 2026
e8fc67a
fix: remove homework solution leaks from setup guides
Feb 2, 2026
379cb0b
fix: correct postStartCommand path after Dockerfile context change
Feb 2, 2026
2d55192
fix: use correct versioned tag for Python devcontainer base image
Feb 2, 2026
a2709bf
fix: copy shared scripts into each variant and fix all file paths
Feb 2, 2026
4dbb5c6
Merge branch 'main' into feat/devcontainer
Feb 2, 2026
0b16257
fix: remove homework solution leak from HOMEWORK.md
Feb 2, 2026
67ecba2
docs: simplify setup guides and remove homework guidance
Feb 2, 2026
e04bd7c
docs: update to 2026 cohort homework
Feb 2, 2026
7b7a181
Update
Feb 2, 2026
20830f5
fix: correct Python devcontainer image tag
Feb 2, 2026
6672e76
fix: use official Python image instead of MCR devcontainer
Feb 2, 2026
913c392
Merge branch 'main' into feat/devcontainer
Feb 2, 2026
940b458
docs: focus on DuckDB codespace only
Feb 2, 2026
b0dbebe
docs: remove BigQuery option from setup/codespaces.md
Feb 2, 2026
7b547cc
chore: remove BigQuery devcontainer
Feb 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions .devcontainer/duckdb/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
FROM python:3.11-slim-bookworm

# Install sudo and create the non-root `vscode` user for compatibility with
# devcontainers.
# - --no-install-recommends keeps optional packages out of the image.
# - The apt package lists are removed in the same layer that created them;
#   deleting them in a later layer would not shrink the image.
# - Passwordless sudo is granted through a drop-in under /etc/sudoers.d
#   (mode 0440) instead of appending to /etc/sudoers itself.
RUN apt-get update && \
    apt-get install -y --no-install-recommends sudo && \
    rm -rf /var/lib/apt/lists/* && \
    useradd -m -s /bin/bash vscode && \
    echo "vscode ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/vscode && \
    chmod 0440 /etc/sudoers.d/vscode

# Build an isolated Python environment and put its bin/ directory first on
# PATH, so later instructions and runtime shells pick up the venv's
# interpreter and installed tools.
ENV VIRTUAL_ENV=/opt/venv
RUN python3 -m venv "${VIRTUAL_ENV}"
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"

# Install DuckDB-specific packages into the virtual environment.
# --no-cache-dir keeps pip's download cache out of the image layer.
# NOTE(review): package versions are unpinned, so a rebuild may pick up newer
# releases; consider pinning them for reproducible images.
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir \
        dbt-duckdb \
        pandas \
        pyarrow \
        requests

# Install DuckDB CLI, common tools, and GitHub CLI.
#
# DUCKDB_VERSION can be overridden with --build-arg; the default matches the
# release that was previously hard-coded.
ARG DUCKDB_VERSION=1.1.3

# Notes on this layer:
# - git is installed explicitly because the setup scripts run `git init` and
#   `git commit`; less and openssh-client back git's pager and SSH remotes.
# - ca-certificates is listed explicitly so HTTPS downloads keep working with
#   --no-install-recommends.
# - The DuckDB asset is chosen from the image architecture so the build also
#   works on arm64 hosts instead of always fetching the amd64 binary.
# - curl runs with -f so an HTTP error fails the build instead of saving an
#   error page, and the GitHub CLI key is downloaded to a file before being
#   dearmored so a failed download cannot be hidden behind a pipe.
# - rm of yarn.list is a no-op when the file is absent (presumably a leftover
#   from an earlier base image).
RUN rm -f /etc/apt/sources.list.d/yarn.list && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        git \
        gnupg \
        less \
        openssh-client \
        unzip \
        wget && \
    # DuckDB CLI
    arch="$(dpkg --print-architecture)" && \
    case "$arch" in \
        amd64) duckdb_arch="amd64" ;; \
        arm64) duckdb_arch="aarch64" ;; \
        *) echo "Unsupported architecture for DuckDB CLI: $arch" >&2; exit 1 ;; \
    esac && \
    curl -fsSL "https://github.com/duckdb/duckdb/releases/download/v${DUCKDB_VERSION}/duckdb_cli-linux-${duckdb_arch}.zip" -o /tmp/duckdb.zip && \
    unzip /tmp/duckdb.zip -d /usr/local/bin && \
    rm /tmp/duckdb.zip && \
    chmod +x /usr/local/bin/duckdb && \
    # GitHub CLI
    mkdir -p -m 755 /etc/apt/keyrings && \
    curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg -o /tmp/githubcli-archive-keyring.gpg && \
    gpg --dearmor -o /etc/apt/keyrings/githubcli-archive-keyring.gpg /tmp/githubcli-archive-keyring.gpg && \
    rm /tmp/githubcli-archive-keyring.gpg && \
    chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg && \
    echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" > /etc/apt/sources.list.d/github-cli.list && \
    apt-get update && \
    apt-get install -y --no-install-recommends gh && \
    # Cleanup
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Per-user setup that does not depend on the build context. It runs before the
# COPY below so that editing a script does not invalidate this layer:
# - activate the virtual environment in every new interactive shell,
# - create the homework directory,
# - hand the homework directory and the virtual environment to `vscode`.
RUN echo "source $VIRTUAL_ENV/bin/activate" >> /home/vscode/.bashrc && \
    mkdir -p /home/vscode/homework && \
    chown -R vscode:vscode /home/vscode/homework /opt/venv

# Copy devcontainer config to image for reliable setup.
# --chown sets ownership during the copy, avoiding a follow-up `chown -R`
# that would store the same files a second time in another layer.
COPY --chown=vscode:vscode . /opt/devcontainer/

# Make the setup scripts executable.
RUN chmod +x /opt/devcontainer/scripts/*.sh
24 changes: 24 additions & 0 deletions .devcontainer/duckdb/dbt/profiles.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# dbt profile for the local DuckDB variant. Both outputs point at the same
# database file and differ only in the schema they build into.
taxi_rides_ny:
  target: dev
  outputs:
    # `dev` also serves as the shared definition (YAML anchor) for `prod`.
    dev: &duckdb_output
      type: duckdb
      path: "/home/vscode/homework/taxi_rides_ny.duckdb"
      schema: dev
      threads: 1
      extensions:
        - parquet
      settings:
        memory_limit: "8GB"
        preserve_insertion_order: false

    # Same settings as `dev`; only the schema is overridden.
    prod:
      <<: *duckdb_output
      schema: prod
57 changes: 57 additions & 0 deletions .devcontainer/duckdb/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
{
  "name": "AE Homework: Local (DuckDB)",
  "build": {
    "dockerfile": "Dockerfile",
    "context": "."
  },
  "remoteUser": "vscode",
  "workspaceMount": "source=${localWorkspaceFolder},target=/workspaces/data-engineering-zoomcamp,type=bind",
  "workspaceFolder": "/home/vscode/homework",
  "hostRequirements": {
    "cpus": 4,
    "memory": "16gb",
    "storage": "32gb"
  },
  "containerEnv": {
    "DBT_PROFILES_DIR": "/home/vscode/homework/profiles",
    "DBT_DUCKDB_PATH": "/home/vscode/homework/taxi_rides_ny.duckdb",
    "POETRY_VIRTUALENVS_CREATE": "false"
  },
  "postCreateCommand": "bash /opt/devcontainer/scripts/postCreate.sh",
  "features": {
    "ghcr.io/devcontainers/features/sshd:1": {
      "version": "latest"
    }
  },
  "customizations": {
    "vscode": {
      "extensions": [
        "innoverio.vscode-dbt-power-user",
        "ms-python.python",
        "ms-python.black-formatter",
        "tamasfe.even-better-toml"
      ],
      "settings": {
        "python.defaultInterpreterPath": "/opt/venv/bin/python",
        "dbt.dbtPythonPath": "/opt/venv/bin/python",
        "dbt.executablePath": "/opt/venv/bin/dbt",
        "dbt.enableRunResultDecorations": true,
        "files.associations": {
          "*.sql": "jinja-sql"
        },
        "[python]": {
          "editor.defaultFormatter": "ms-python.black-formatter",
          "editor.formatOnSave": true
        },
        "github.copilot.chat.codeGeneration.instructions": [
          {
            "text": "This dev container is optimized for the Data Engineering Zoomcamp. It includes dbt-core and dbt-duckdb pre-installed in a virtual environment at `/opt/venv`."
          },
          {
            "text": "The workspace is isolated in `/home/vscode/homework`. Always run dbt commands from the root of this workspace."
          }
        ]
      }
    }
  }
}
61 changes: 61 additions & 0 deletions .devcontainer/duckdb/scripts/common_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/env bash
set -euo pipefail

# This script handles shared setup tasks for all Data Engineering Zoomcamp Codespace variants.
# It promotes the project to the isolated root and configures VS Code.
#
# The script is safe to run more than once: the project files are copied and
# the Git repository is created only while the isolated workspace has no
# commit yet. On later runs the existing files and history are left untouched,
# so edits made in the workspace are never overwritten and `git commit` is
# never invoked with nothing to commit (which would abort under `set -e`).

DEST_DIR="/home/vscode/homework"
REPO_ROOT="/workspaces/data-engineering-zoomcamp"

# 1. Identify Repo Root (Robust Detection)
if [[ ! -d "$REPO_ROOT" ]]; then
    # Fall back to the first visible directory under /workspaces. `-print -quit`
    # stops after the first match without needing a pipe to `head`.
    REPO_ROOT=$(find /workspaces -mindepth 1 -maxdepth 1 -type d -not -name ".*" -not -name "lost+found" -print -quit 2>/dev/null || true)
fi

# Without this guard an empty result would turn the paths below into
# "/04-analytics-engineering" and produce a misleading error later on.
if [[ -z "$REPO_ROOT" ]]; then
    echo "Error: no repository checkout found under /workspaces!"
    exit 1
fi

echo "Detected repository root: $REPO_ROOT"

AE_DIR="$REPO_ROOT/04-analytics-engineering"
DBT_PROJECT_DIR="$AE_DIR/taxi_rides_ny"

mkdir -p "$DEST_DIR"

# A workspace counts as initialized once its own Git repository has a commit.
if [[ -d "$DEST_DIR/.git" ]] && git -C "$DEST_DIR" rev-parse --verify --quiet HEAD > /dev/null 2>&1; then
    echo "Workspace already initialized in $DEST_DIR; keeping existing files and Git history."
else
    # 2. Promote dbt project to root
    if [[ -d "$DBT_PROJECT_DIR" ]]; then
        echo "Promoting dbt project contents to root..."
        cp -RT "$DBT_PROJECT_DIR/" "$DEST_DIR/"
    else
        echo "Error: dbt project directory $DBT_PROJECT_DIR not found!"
        exit 1
    fi

    # 3. Copy essential shared files
    # HOMEWORK.md falls back to the 2023 cohort copy when the module has none.
    cp "$AE_DIR/HOMEWORK.md" "$DEST_DIR/" 2>/dev/null || cp "$REPO_ROOT/cohorts/2023/week_4_analytics_engineering/homework.md" "$DEST_DIR/HOMEWORK.md"
    mkdir -p "$DEST_DIR/setup"
    cp -RT "$AE_DIR/setup/" "$DEST_DIR/setup/"

    # 4. Initialize Git in the isolated workspace
    echo "Initializing Git repository in $DEST_DIR..."
    cd "$DEST_DIR"
    git init -b main
    git config user.email "student@dataengineering-zoomcamp.local"
    git config user.name "DE Zoomcamp Student"
    git add .
    git commit -m "Initial commit: Isolated AE Homework environment"
fi

# 5. Configure VS Code Settings from template
echo "Configuring workspace-level VS Code settings..."
mkdir -p "$DEST_DIR/.vscode"

# Determine variables based on environment
ADAPTER_NAME="dbt-duckdb"
EXTRA_COPILOT_INFO=""
if [[ -n "${IS_GCP_ENV:-}" ]]; then
    ADAPTER_NAME="dbt-bigquery"
    EXTRA_COPILOT_INFO="Use gcloud auth login or gcp_key.json for authentication."
fi

# Simple replacement using sed (instead of envsubst to keep dependencies minimal).
# The patterns match the literal placeholders ${ADAPTER_NAME} and
# ${EXTRA_COPILOT_INFO} in the template.
sed -e "s|\\\${ADAPTER_NAME}|$ADAPTER_NAME|g" \
    -e "s|\\\${EXTRA_COPILOT_INFO}|$EXTRA_COPILOT_INFO|g" \
    "/opt/devcontainer/scripts/settings.json.template" > "$DEST_DIR/.vscode/settings.json"

echo "Common setup complete."
134 changes: 134 additions & 0 deletions .devcontainer/duckdb/scripts/download_homework_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#!/usr/bin/env bash
set -euo pipefail

# Script to download only necessary taxi data for Module 4 homework
# Downloads CSV.gz files locally first, then loads into DuckDB to avoid GitHub rate limits

# Print one message prefixed with the current local time
# (YYYY-MM-DD HH:MM:SS) to stdout.
#   $1 - message text
log() {
    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}

# Target database file; the first script argument overrides the default.
DB_PATH="${1:-/home/vscode/homework/taxi_rides_ny.duckdb}"
BASE_URL="https://github.com/DataTalksClub/nyc-tlc-data/releases/download"

log "Starting selective data download for homework..."
log "Database path: $DB_PATH"

# Create temp directory for downloads
TEMP_DIR=$(mktemp -d)
# Single quotes defer expansion until the trap fires, and the inner double
# quotes keep the path a single word even if it contains spaces.
trap 'rm -rf "$TEMP_DIR"' EXIT

log "Temporary directory: $TEMP_DIR"

# Initialize DuckDB database (main schema exists by default)
# The database file itself is created by the first load; make sure the
# directory that will hold it exists.
mkdir -p "$(dirname "$DB_PATH")"
log "Initializing DuckDB database..."

# Helper function to download a file.
#   $1 - source URL
#   $2 - destination path
# Uses wget when available, otherwise curl. Returns 0 on success and 1 when
# the transfer fails or neither tool is installed. On failure any partially
# written destination file is removed so later glob-based loads cannot pick
# up a truncated or bogus file.
download_file() {
    local url="$1"
    local dest="$2"
    if command -v wget &> /dev/null; then
        wget -q -O "$dest" "$url" || {
            rm -f "$dest"
            log "Failed to download $url"
            return 1
        }
    elif command -v curl &> /dev/null; then
        # -f makes curl exit non-zero on HTTP errors (e.g. 404) instead of
        # saving the server's error page as if it were the data file;
        # --retry covers transient failures.
        curl -fsSL --retry 3 -o "$dest" "$url" || {
            rm -f "$dest"
            log "Failed to download $url"
            return 1
        }
    else
        log "Error: Neither wget nor curl found"
        return 1
    fi
}

# ============================================
# Green and Yellow Taxi: 2019-2020 (for Q5 YoY comparison and Q6)
# ============================================

# Download the 24 monthly files (2019-2020) of one taxi colour, load them
# into a single DuckDB table, log the row count, and delete the downloaded
# files to save space.
#   $1 - lowercase colour used in URLs, file names, and the table name
#   $2 - capitalised colour used in log messages
# The resulting row count is left in TAXI_ROW_COUNT.
ingest_taxi_color() {
    local slug="$1"
    local label="$2"
    local yr mo archive

    log "Downloading ${label} Taxi 2019-2020 data (24 files)..."
    log "This covers Question 5 (YoY growth) and Question 6 (April 2020 percentiles)"

    # Fetch every monthly archive into the temp directory
    for yr in 2019 2020; do
        for mo in {01..12}; do
            archive="${slug}_tripdata_${yr}-${mo}.csv.gz"
            log " Downloading $archive..."
            download_file "${BASE_URL}/${slug}/${archive}" "${TEMP_DIR}/${archive}"
        done
    done

    # Load all downloaded files for this colour in one statement
    log "Loading ${label} Taxi data into DuckDB..."
    duckdb "$DB_PATH" "CREATE OR REPLACE TABLE main.${slug}_tripdata AS SELECT * FROM read_csv('${TEMP_DIR}/${slug}_tripdata_*.csv.gz', auto_detect=true, compression='gzip', filename=true);"

    TAXI_ROW_COUNT=$(duckdb "$DB_PATH" "SELECT COUNT(*) FROM main.${slug}_tripdata;" -csv -noheader)
    log "${label} Taxi loaded: $TAXI_ROW_COUNT records"

    # Remove the archives before the next batch is downloaded
    rm -f "${TEMP_DIR}/${slug}_tripdata_"*.csv.gz
}

ingest_taxi_color green Green
GREEN_COUNT="$TAXI_ROW_COUNT"

ingest_taxi_color yellow Yellow
YELLOW_COUNT="$TAXI_ROW_COUNT"

# ============================================
# FHV: November 2019 only (for Q7)
# ============================================
# Only one monthly archive is needed, so name it once and reuse the paths.
fhv_archive="fhv_tripdata_2019-11.csv.gz"
fhv_local="${TEMP_DIR}/${fhv_archive}"

log "Downloading FHV November 2019 data..."
download_file "${BASE_URL}/fhv/${fhv_archive}" "$fhv_local"

log "Loading FHV data into DuckDB..."
fhv_load_sql="CREATE OR REPLACE TABLE main.fhv_tripdata AS SELECT * FROM read_csv('${fhv_local}', auto_detect=true, compression='gzip');"
duckdb "$DB_PATH" "$fhv_load_sql"

FHV_COUNT=$(duckdb "$DB_PATH" "SELECT COUNT(*) FROM main.fhv_tripdata;" -csv -noheader)
log "FHV loaded: $FHV_COUNT records"

# ============================================
# Summary
# ============================================
# The report is written as a here-document and fed line by line through
# log(), so every line (including the blank ones) carries a timestamp.
while IFS= read -r report_line; do
    log "$report_line"
done <<EOF

════════════════════════════════════════════════════════════
✅ Data download and loading complete!
════════════════════════════════════════════════════════════

Database: $DB_PATH
Size: $(du -h "$DB_PATH" | cut -f1)

Records loaded:
 - Green Taxi (2019-2020): $GREEN_COUNT
 - Yellow Taxi (2019-2020): $YELLOW_COUNT
 - FHV (Nov 2019): $FHV_COUNT
 - TOTAL: $((GREEN_COUNT + YELLOW_COUNT + FHV_COUNT))

Coverage:
 ✅ Question 5: Quarterly revenue YoY (2019-2020 ✅)
 ✅ Question 6: Fare percentiles (April 2020 ✅)
 ✅ Question 7: FHV travel time (November 2019 ✅)

Note: Downloads only necessary months for homework (49 CSV files)
instead of full dataset, with same homework answer accuracy.

You can now run: dbt build
════════════════════════════════════════════════════════════
EOF
55 changes: 55 additions & 0 deletions .devcontainer/duckdb/scripts/postCreate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env bash
set -euo pipefail

# Entry point for setting up the DuckDB homework workspace: promotes the
# project files, installs the dbt profile, loads the taxi data, and runs
# `dbt deps`. All output is mirrored to setup.log inside the workspace.

DEST_DIR="/home/vscode/homework"

# The log file lives inside the workspace, so the directory must exist before
# output is redirected into it.
mkdir -p "$DEST_DIR"

# Debug logging
exec > >(tee -a "$DEST_DIR/setup.log") 2>&1
echo "Starting AE Homework: Local (DuckDB) setup..."
date

# 1. Run common setup (Repo root detection, File promotion, VS Code settings)
bash /opt/devcontainer/scripts/common_setup.sh

# 2. Copy variant-specific profiles
echo "Copying DuckDB profiles..."
mkdir -p "$DEST_DIR/profiles"
cp "/opt/devcontainer/dbt/profiles.yml" "$DEST_DIR/profiles/"

# 3. Download selective CSV files for homework
DB_DEST="$DEST_DIR/taxi_rides_ny.duckdb"
# The data is loaded into a scratch file and renamed only after the loader
# succeeds. A download that fails part-way therefore never leaves a file at
# $DB_DEST, and the existence check below cannot mistake an incomplete
# database for a finished one on the next run.
DB_PARTIAL="$DB_DEST.partial"

if [[ ! -f "$DB_DEST" ]]; then
    echo "Loading taxi data for homework (selective download)..."
    echo "Downloading 49 CSV files (2019-2020 data, ~2.5 min)..."
    echo ""

    # Discard leftovers from an earlier interrupted attempt
    rm -f "$DB_PARTIAL" "$DB_PARTIAL.wal"

    # Run the selective download script
    bash /opt/devcontainer/scripts/download_homework_data.sh "$DB_PARTIAL"
    mv "$DB_PARTIAL" "$DB_DEST"
else
    echo "Database already exists, skipping download."
fi

# 4. Diagnostics and Deps
echo "dbt Version:"
dbt --version
cd "$DEST_DIR"
dbt deps

# 5. Copy setup guide and open it
GUIDE_DEST=""
if [[ -f "/opt/devcontainer/setup_guide.md" ]]; then
    echo "Copying setup guide..."
    GUIDE_DEST="$DEST_DIR/SETUP_GUIDE.md"
    cp "/opt/devcontainer/setup_guide.md" "$GUIDE_DEST"
    # Try to open the setup guide (will fail silently if not in VS Code)
    code "$GUIDE_DEST" --reuse-window 2>/dev/null || true
fi

echo "------------------------------------------------------------------"
echo "Environment Ready!"
echo "Workspace: $DEST_DIR"
echo ""
echo "Taxi data (Yellow/Green 2019-2020, FHV 2019) has been pre-loaded."
echo "You can run 'dbt build' or 'dbt show' immediately."
# Only point at the guide when it was actually copied above
if [[ -n "$GUIDE_DEST" ]]; then
    echo ""
    echo "📖 Setup guide: $GUIDE_DEST"
fi
echo "------------------------------------------------------------------"
Loading