-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: bq.sh
More file actions
114 lines (101 loc) · 2.92 KB
/
bq.sh
File metadata and controls
114 lines (101 loc) · 2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env bash
set -euo pipefail

# Load configuration (VERSION, PROJECT_ID, GCS_BUCKET, BQ_DATASET) from .env.
if [ ! -f .env ]; then
  echo "Error: .env file not found. Copy .env.example to .env and fill in your values."
  exit 1
fi
# Explicit ./ prefix: `source` without a slash searches $PATH before the
# current directory, so a stray .env elsewhere on PATH could be sourced.
source ./.env

# Datasets that can be loaded into BigQuery.
DATASETS=(authors awards concepts domains fields funders institutions publishers sources subfields topics works)
# Per-file bad-record tolerance for `bq load`; overridable with -b.
MAX_BAD_RECORDS=0
# Print usage to stderr and exit non-zero. Also invoked on bad options /
# missing arguments. ${VERSION:-<version>} keeps this safe under `set -u`
# even when usage fires before VERSION has been validated (incomplete .env).
usage() {
  cat >&2 <<EOF
Usage: bash bq.sh [-b N] <dataset> [dataset ...]

Creates BigQuery tables from GCS data.
Tables are named <dataset>_<VERSION> (e.g. works_${VERSION:-<version>}).

Options:
  -b N  Max bad records to skip per file (default: 0)

Available datasets: ${DATASETS[*]}
Use 'all' to load everything.

Examples:
  bash bq.sh works
  bash bq.sh -b 100 works
  bash bq.sh all
EOF
  exit 1
}
# Parse options. Leading ':' puts getopts in silent mode, so a missing
# argument reports as ':' with the flag name in $OPTARG.
while getopts ":b:" opt; do
  case "$opt" in
    b) MAX_BAD_RECORDS="$OPTARG" ;;
    :) echo "Error: option -$OPTARG requires an argument" >&2; usage ;;
    *) usage ;;
  esac
done
shift $((OPTIND - 1))

# -b must be a non-negative integer; anything else would surface later as a
# confusing `bq load` failure.
if ! [[ "$MAX_BAD_RECORDS" =~ ^[0-9]+$ ]]; then
  echo "Error: -b expects a non-negative integer, got '$MAX_BAD_RECORDS'" >&2
  exit 1
fi

if [ $# -eq 0 ]; then
  usage
fi
# Every required .env value must be present and non-empty before we touch GCS.
# ${!required:-} is indirect expansion, kept `set -u`-safe by the :- default.
for required in VERSION PROJECT_ID GCS_BUCKET BQ_DATASET; do
  if [[ -z "${!required:-}" ]]; then
    echo "Error: $required is not set in .env"
    exit 1
  fi
done
# Resolve the dataset list: 'all' expands to every known dataset; otherwise
# each requested name must match an entry in DATASETS exactly.
if [[ "$1" == "all" ]]; then
  selected=("${DATASETS[@]}")
else
  selected=("$@")
  for ds in "${selected[@]}"; do
    found=""
    for known in "${DATASETS[@]}"; do
      if [[ "$ds" == "$known" ]]; then
        found=1
        break
      fi
    done
    if [[ -z "$found" ]]; then
      echo "Error: unknown dataset '$ds'"
      echo "Available datasets: ${DATASETS[*]}"
      exit 1
    fi
  done
fi
# Load each selected dataset into BigQuery from its GCS prefix.
for ds in "${selected[@]}"; do
  TABLE="${ds}_${VERSION}"
  GCS_PATH="gs://${GCS_BUCKET}/${VERSION}/${ds}/"

  # Find a schema file: try the exact version first, then fall back to the
  # most recent schema directory available (descending name sort).
  SCHEMA=""
  SCHEMA_DIRS=("schemas/${VERSION}")
  # Glob via compgen instead of parsing `ls` output (SC2045: breaks on
  # unusual directory names). `|| true` keeps set -e/pipefail quiet when
  # nothing matches; mapfile then just gets an empty list.
  mapfile -t found_dirs < <(compgen -G 'schemas/*/' | sort -r || true)
  if [ "${#found_dirs[@]}" -gt 0 ]; then
    for d in "${found_dirs[@]}"; do
      dir="${d%/}"
      if [ "$dir" != "schemas/${VERSION}" ]; then
        SCHEMA_DIRS+=("$dir")
      fi
    done
  fi

  # Accept either <ds>.schema.json or <ds>_schema.json in each candidate dir.
  for schema_dir in "${SCHEMA_DIRS[@]}"; do
    for f in "${schema_dir}/${ds}.schema.json" "${schema_dir}/${ds}_schema.json"; do
      if [ -f "$f" ]; then
        SCHEMA="$f"
        break 2
      fi
    done
  done

  if [ -z "$SCHEMA" ]; then
    # Diagnostic goes to stderr so stdout stays a clean progress log.
    echo "Warning: no schema found for ${ds}, skipping" >&2
    continue
  fi

  echo "Loading ${BQ_DATASET}.${TABLE} from ${GCS_PATH} (schema: ${SCHEMA}) ..."
  # --replace=true truncates any existing table; set -e aborts on failure.
  bq load \
    --source_format=NEWLINE_DELIMITED_JSON \
    --project_id="${PROJECT_ID}" \
    --replace=true \
    --max_bad_records="${MAX_BAD_RECORDS}" \
    "${BQ_DATASET}.${TABLE}" \
    "${GCS_PATH}*.gz" \
    "${SCHEMA}"
  echo "${ds} done."
done
echo "Done."