-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: bq.sh
More file actions
114 lines (101 loc) · 2.92 KB
/
bq.sh
File metadata and controls
114 lines (101 loc) · 2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env bash
set -euo pipefail

# Load configuration (VERSION, PROJECT_ID, GCS_BUCKET, BQ_DATASET) from .env.
if [ ! -f .env ]; then
  echo "Error: .env file not found. Copy .env.example to .env and fill in your values."
  exit 1
fi
# Explicit ./ prefix: `source` without a slash searches $PATH before the
# current directory, so a stray .env elsewhere on PATH could be sourced.
source ./.env

# Datasets that can be loaded into BigQuery.
DATASETS=(authors awards concepts domains fields funders institutions publishers sources subfields topics works)
# Per-file bad-record tolerance for `bq load`; overridable with -b.
MAX_BAD_RECORDS=0
# Print usage to stderr and exit non-zero. Also invoked on bad options /
# missing arguments. ${VERSION:-<version>} keeps this safe under `set -u`
# even when usage fires before VERSION has been validated (incomplete .env).
usage() {
  cat >&2 <<EOF
Usage: bash bq.sh [-b N] <dataset> [dataset ...]

Creates BigQuery tables from GCS data.
Tables are named <dataset>_<VERSION> (e.g. works_${VERSION:-<version>}).

Options:
  -b N  Max bad records to skip per file (default: 0)

Available datasets: ${DATASETS[*]}
Use 'all' to load everything.

Examples:
  bash bq.sh works
  bash bq.sh -b 100 works
  bash bq.sh all
EOF
  exit 1
}
# Parse options. Leading ':' puts getopts in silent mode, so a missing
# argument reports as ':' with the flag name in $OPTARG.
while getopts ":b:" opt; do
  case "$opt" in
    b) MAX_BAD_RECORDS="$OPTARG" ;;
    :) echo "Error: option -$OPTARG requires an argument" >&2; usage ;;
    *) usage ;;
  esac
done
shift $((OPTIND - 1))

# -b must be a non-negative integer; anything else would surface later as a
# confusing `bq load` failure.
if ! [[ "$MAX_BAD_RECORDS" =~ ^[0-9]+$ ]]; then
  echo "Error: -b expects a non-negative integer, got '$MAX_BAD_RECORDS'" >&2
  exit 1
fi

if [ $# -eq 0 ]; then
  usage
fi
# Every required .env value must be present and non-empty before we touch GCS.
# ${!required:-} is indirect expansion, kept `set -u`-safe by the :- default.
for required in VERSION PROJECT_ID GCS_BUCKET BQ_DATASET; do
  if [[ -z "${!required:-}" ]]; then
    echo "Error: $required is not set in .env"
    exit 1
  fi
done
# Resolve the dataset list: 'all' expands to every known dataset; otherwise
# each requested name must match an entry in DATASETS exactly.
if [[ "$1" == "all" ]]; then
  selected=("${DATASETS[@]}")
else
  selected=("$@")
  for ds in "${selected[@]}"; do
    found=""
    for known in "${DATASETS[@]}"; do
      if [[ "$ds" == "$known" ]]; then
        found=1
        break
      fi
    done
    if [[ -z "$found" ]]; then
      echo "Error: unknown dataset '$ds'"
      echo "Available datasets: ${DATASETS[*]}"
      exit 1
    fi
  done
fi
# Load each selected dataset into BigQuery from its GCS prefix.
for ds in "${selected[@]}"; do
  TABLE="${ds}_${VERSION}"
  GCS_PATH="gs://${GCS_BUCKET}/${VERSION}/${ds}/"

  # Find a schema file: try the exact version first, then fall back to the
  # most recent schema directory available (descending name sort).
  SCHEMA=""
  SCHEMA_DIRS=("schemas/${VERSION}")
  # Glob via compgen instead of parsing `ls` output (SC2045: breaks on
  # unusual directory names). `|| true` keeps set -e/pipefail quiet when
  # nothing matches; mapfile then just gets an empty list.
  mapfile -t found_dirs < <(compgen -G 'schemas/*/' | sort -r || true)
  if [ "${#found_dirs[@]}" -gt 0 ]; then
    for d in "${found_dirs[@]}"; do
      dir="${d%/}"
      if [ "$dir" != "schemas/${VERSION}" ]; then
        SCHEMA_DIRS+=("$dir")
      fi
    done
  fi

  # Accept either <ds>.schema.json or <ds>_schema.json in each candidate dir.
  for schema_dir in "${SCHEMA_DIRS[@]}"; do
    for f in "${schema_dir}/${ds}.schema.json" "${schema_dir}/${ds}_schema.json"; do
      if [ -f "$f" ]; then
        SCHEMA="$f"
        break 2
      fi
    done
  done

  if [ -z "$SCHEMA" ]; then
    # Diagnostic goes to stderr so stdout stays a clean progress log.
    echo "Warning: no schema found for ${ds}, skipping" >&2
    continue
  fi

  echo "Loading ${BQ_DATASET}.${TABLE} from ${GCS_PATH} (schema: ${SCHEMA}) ..."
  # --replace=true truncates any existing table; set -e aborts on failure.
  bq load \
    --source_format=NEWLINE_DELIMITED_JSON \
    --project_id="${PROJECT_ID}" \
    --replace=true \
    --max_bad_records="${MAX_BAD_RECORDS}" \
    "${BQ_DATASET}.${TABLE}" \
    "${GCS_PATH}*.gz" \
    "${SCHEMA}"
  echo "${ds} done."
done
echo "Done."