#!/usr/bin/env bash
set -euo pipefail
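
# Bootstraps the English word prefix-trie timelapse pipeline: creates a
# virtualenv, downloads the Wiktionary dump and the Google Books 1-gram
# shards, builds the trie artifacts, then renders frames and encodes the
# timelapse. Safe to re-run: completed downloads and cached artifacts are
# reused when present.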

# Constants
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VENV_DIR="$REPO_ROOT/venv"
PYTHON="python3"
DATA_DIR="$REPO_ROOT/data"
WIKTIONARY_URL="https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2"
NGRAM_BASE="https://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-"
NGRAM_SHARDS=(a b c d e f g h i j k l m n o p q r s t u v w x y z)
ARTIFACT_CACHE="$REPO_ROOT/artifacts/cache.tar.gz"
ARTIFACT_META_DIR="$REPO_ROOT/artifacts/metadata"
SHARD_RECORD="$ARTIFACT_META_DIR/ngram_shards.txt"
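
# Print a tagged progress message.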
log() {
  printf '[setup] %s\n' "$1"
}
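
# Fail fast when Python 3 is unavailable.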
ensure_python() {
  if ! command -v "$PYTHON" >/dev/null 2>&1; then
    log "python3 not found; install Python 3 before running this script"
    exit 1
  fi
}
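
# Select curl or wget for downloads and record the choice in HTTP_CLIENT.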
ensure_http_clients() {
  if command -v curl >/dev/null 2>&1; then
    HTTP_CLIENT="curl"
  elif command -v wget >/dev/null 2>&1; then
    HTTP_CLIENT="wget"
  else
    log "neither curl nor wget is available; install one to download datasets"
    exit 1
  fi
}
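
# Create the virtualenv on first use, activate it, and upgrade pip.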
create_venv() {
  if [ ! -d "$VENV_DIR" ]; then
    log "creating virtual environment"
    "$PYTHON" -m venv "$VENV_DIR"
  fi
  # shellcheck disable=SC1091
  source "$VENV_DIR/bin/activate"
  log "upgrading pip"
  pip install --upgrade pip
}
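
# Install the project's Python dependencies into the active virtualenv.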
install_requirements() {
  log "installing Python dependencies"
  pip install -r "$REPO_ROOT/requirements.txt"
}
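
# Create every data, artifact, and output directory the pipeline writes to.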
ensure_dirs() {
  log "creating data and artifact directories"
  mkdir -p \
    "$REPO_ROOT/data/wiktionary" \
    "$REPO_ROOT/data/ngrams" \
    "$REPO_ROOT/artifacts/lemmas" \
    "$REPO_ROOT/artifacts/years" \
    "$REPO_ROOT/artifacts/trie" \
    "$REPO_ROOT/artifacts/layout" \
    "$REPO_ROOT/outputs/frames" \
    "$ARTIFACT_META_DIR"
}
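
# Unpack previously checkpointed artifacts so finished stages can be skipped.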
restore_artifact_cache() {
  if [ -f "$ARTIFACT_CACHE" ]; then
    log "restoring cached artifacts"
    tar -xzf "$ARTIFACT_CACHE" -C "$REPO_ROOT"
  fi
}
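
# Archive the artifacts directory, excluding the cache tarball itself so the
# old checkpoint is not archived into the new one.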
checkpoint_artifacts() {
  if [ -d "$REPO_ROOT/artifacts" ]; then
    log "saving artifact cache"
    tar --exclude="$(basename "$ARTIFACT_CACHE")" -czf "$ARTIFACT_CACHE" -C "$REPO_ROOT" artifacts
  fi
}
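
# Fetch the latest English Wiktionary dump unless it is already on disk.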
download_wiktionary() {
  local target="$DATA_DIR/wiktionary/enwiktionary-latest-pages-articles.xml.bz2"
  if [ -f "$target" ]; then
    log "wiktionary dump already present"
    return
  fi
  log "downloading wiktionary dump"
  mkdir -p "$(dirname "$target")"
  if [ "$HTTP_CLIENT" = "curl" ]; then
    # -f makes curl exit nonzero on HTTP errors instead of saving an error page
    curl -fL "$WIKTIONARY_URL" -o "$target"
  else
    wget -O "$target" "$WIKTIONARY_URL"
  fi
}
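
# Return success when the file starts with the gzip magic bytes (0x1f 0x8b);
# used to detect corrupt or incomplete shard downloads.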
is_gzip() {
  local file="$1"
  "$PYTHON" -c 'import sys
from pathlib import Path
path = Path(sys.argv[1])
try:
    with path.open("rb") as handle:
        head = handle.read(2)
    sys.exit(0 if head == b"\x1f\x8b" else 1)
except OSError:  # includes FileNotFoundError
    sys.exit(1)' "$file"
}
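
# Download all 26 Google Books 1-gram shards, validating each as gzip and
# clearing any stale shards left over from an older naming scheme.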
download_ngrams() {
  mkdir -p "$DATA_DIR/ngrams"
  # The glob stays literal when nothing matches; the -e test guards that case.
  for legacy in "$DATA_DIR"/ngrams/eng-all-1gram-*.gz; do
    if [ -e "$legacy" ]; then
      log "removing legacy shard $(basename "$legacy")"
      rm -f "$legacy"
    fi
  done
  for shard in "${NGRAM_SHARDS[@]}"; do
    local name="${NGRAM_BASE##*/}${shard}.gz"
    local target="$DATA_DIR/ngrams/${name}"
    local url="${NGRAM_BASE}${shard}.gz"
    if [ -f "$target" ]; then
      if is_gzip "$target"; then
        log "ngram shard ${name} already present"
        continue
      fi
      log "existing shard ${name} is invalid; re-downloading"
      rm -f "$target"
    fi
    log "downloading ngram shard ${name}"
    if [ "$HTTP_CLIENT" = "curl" ]; then
      curl -fL "$url" -o "$target"
    else
      wget -O "$target" "$url"
    fi
    if ! is_gzip "$target"; then
      log "downloaded shard ${name} is not a valid gzip; please check the URL"
      exit 1
    fi
  done
}
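
# Rebuild the lemma, first-year, and trie artifacts when the trie output is
# missing or the recorded shard set differs from the current one, then
# render frames and encode the video and gif.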
run_pipelines() {
  # shellcheck disable=SC1091
  source "$VENV_DIR/bin/activate"
  local expected_shards="${NGRAM_SHARDS[*]}"
  local rebuild=0
  if [ ! -f "$REPO_ROOT/artifacts/trie/prefix_counts.jsonl" ]; then
    rebuild=1
  elif [ ! -f "$SHARD_RECORD" ]; then
    rebuild=1
  else
    local recorded
    recorded=$(<"$SHARD_RECORD")
    if [ "$recorded" != "$expected_shards" ]; then
      rebuild=1
    fi
  fi
  if [ "$rebuild" -eq 1 ]; then
    log "extracting lemmas from wiktionary"
    python -m src.ingest.wiktionary_extract "$DATA_DIR/wiktionary/enwiktionary-latest-pages-articles.xml.bz2" "$REPO_ROOT/artifacts/lemmas/lemmas.tsv"
    log "computing first-year data"
    python -m src.ingest.ngram_first_year "$REPO_ROOT/artifacts/lemmas/lemmas.tsv" "$DATA_DIR/ngrams" "$REPO_ROOT/artifacts/years/first_years.tsv"
    log "building prefix trie"
    python -m src.build.build_prefix_trie "$REPO_ROOT/artifacts/years/first_years.tsv" "$REPO_ROOT/artifacts/trie/prefix_counts.jsonl"
    printf '%s\n' "$expected_shards" >"$SHARD_RECORD"
    checkpoint_artifacts
  else
    log "cached prefix counts match shard set; skipping ingest and build"
  fi
  log "rendering frames"
  python -m src.viz.render_frames "$REPO_ROOT/artifacts/trie/prefix_counts.jsonl" "$REPO_ROOT/outputs/frames"
  log "encoding video and gif"
  python -m src.viz.encode "$REPO_ROOT/outputs/frames" "$REPO_ROOT/outputs/english_trie_timelapse.mp4" "$REPO_ROOT/outputs/english_trie_timelapse.gif"
}
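
# Entry point: run every stage in dependency order.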
main() {
  ensure_python
  ensure_http_clients
  create_venv
  install_requirements
  ensure_dirs
  restore_artifact_cache
  download_wiktionary
  download_ngrams
  run_pipelines
  log "setup complete. activate with 'source venv/bin/activate'"
}

main "$@"