-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path schema.sh
More file actions
136 lines (117 loc) · 3.68 KB
/
schema.sh
File metadata and controls
136 lines (117 loc) · 3.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env bash
#
# schema.sh — generate BigQuery schemas from downloaded snapshot data using
# bigquery-schema-generator. Requires a .env file that defines VERSION.
set -euo pipefail

# Load configuration; all diagnostics go to stderr so stdout stays clean.
if [ ! -f .env ]; then
  echo "Error: .env file not found. Copy .env.example to .env and fill in your values." >&2
  exit 1
fi
# shellcheck disable=SC1091 — .env is user-provided and not in the repo
source .env

if [ -z "${VERSION:-}" ]; then
  echo "Error: VERSION is not set in .env" >&2
  exit 1
fi

# Known dataset names in the snapshot; SCHEMA_DIR is derived from VERSION.
readonly DATASETS=(authors awards concepts domains fields funders institutions publishers sources subfields topics works)
readonly SCHEMA_DIR="schemas/${VERSION}"
MAX_FILES=0 # 0 means no limit; overridden by -m
#######################################
# Print usage help for schema.sh and terminate with status 1.
# Globals:   SCHEMA_DIR (read), DATASETS (read)
# Outputs:   help text to stdout
# Returns:   never returns; exits 1
#######################################
usage() {
  cat <<EOF
Usage: bash schema.sh [-m max_files] <dataset> [dataset ...]

Generates BigQuery schema from downloaded data using bigquery-schema-generator.
Schemas are saved to ${SCHEMA_DIR}/.

Options:
 -m N Process at most N files per dataset (default: all files)

Available datasets: ${DATASETS[*]}
Use 'all' to generate all schemas.

Examples:
 bash schema.sh authors
 bash schema.sh -m 5 works
 bash schema.sh -m 10 all
EOF
  exit 1
}
# Parse command-line options. -m caps the number of files sampled per dataset.
while getopts ":m:" opt; do
  case $opt in
    m)
      # Validate here so a bad value fails immediately with a clear message
      # instead of blowing up later inside the embedded python int() call.
      if ! [[ "$OPTARG" =~ ^[0-9]+$ ]]; then
        echo "Error: -m requires a non-negative integer, got '$OPTARG'" >&2
        usage
      fi
      MAX_FILES="$OPTARG"
      ;;
    *) usage ;;
  esac
done
shift $((OPTIND - 1))

# At least one dataset argument is required.
if [ $# -eq 0 ]; then
  usage
fi

# The schema generator must be installed before we can do any work.
if ! command -v generate-schema &> /dev/null; then
  echo "Error: generate-schema not found. Run 'pixi run setup' first." >&2
  exit 1
fi
#######################################
# Return 0 iff the candidate exactly matches one of the known datasets.
# Globals:   DATASETS (read)
# Arguments: $1 - candidate dataset name
#######################################
is_known_dataset() {
  local candidate=$1 known
  for known in "${DATASETS[@]}"; do
    if [ "$known" = "$candidate" ]; then
      return 0
    fi
  done
  return 1
}

# Resolve dataset list: 'all' expands to every known dataset; anything else
# is taken verbatim and each name is validated against DATASETS.
if [ "$1" = "all" ]; then
  selected=("${DATASETS[@]}")
else
  selected=("$@")
  for ds in "${selected[@]}"; do
    if ! is_known_dataset "$ds"; then
      echo "Error: unknown dataset '$ds'"
      echo "Available datasets: ${DATASETS[*]}"
      exit 1
    fi
  done
fi
mkdir -p "$SCHEMA_DIR"

# Clean up the sampling temp file on any exit path — under `set -e` a
# failing python3 or generate-schema would otherwise leak it.
TMPFILE=""
cleanup() {
  if [ -n "${TMPFILE:-}" ]; then
    rm -f -- "$TMPFILE"
  fi
}
trap cleanup EXIT

for ds in "${selected[@]}"; do
  # Use converted data if available, otherwise fall back to the raw download.
  CONVERTED_DIR="data/converted/${VERSION}/${ds}"
  RAW_DIR="data/raw/${VERSION}/${ds}"
  if [ -d "$CONVERTED_DIR" ]; then
    DATA_DIR="$CONVERTED_DIR"
  elif [ -d "$RAW_DIR" ]; then
    DATA_DIR="$RAW_DIR"
  else
    echo "Warning: no data found for ${ds}, skipping" >&2
    continue
  fi

  SCHEMA_FILE="${SCHEMA_DIR}/${ds}.schema.json"
  echo "Generating schema for ${ds} ..."

  # Sample 1000 records from every .gz file to capture all field variations.
  # The program comes from a quoted here-doc so no shell expansion occurs.
  TMPFILE=$(mktemp)
  python3 - "$DATA_DIR" "$MAX_FILES" > "$TMPFILE" <<'PYEOF'
import gzip, glob, sys, os

raw_dir = sys.argv[1]
max_files = int(sys.argv[2])
files = sorted(glob.glob(os.path.join(raw_dir, '**', '*.gz'), recursive=True))
if not files:
    print(f'Warning: no .gz files found in {raw_dir}', file=sys.stderr)
    sys.exit(1)
total = len(files)
if max_files > 0:
    files = files[:max_files]
    print(f' Sampling 1000 records from {len(files)} of {total} files (limited by -m {max_files}) ...', file=sys.stderr)
else:
    print(f' Sampling 1000 records from each of {len(files)} files ...', file=sys.stderr)
for path in files:
    with gzip.open(path, 'rt') as f:
        for i, line in enumerate(f):
            if i >= 1000:
                break
            sys.stdout.write(line)
PYEOF

  # $(( )) trims the leading whitespace BSD wc emits.
  LINES=$(( $(wc -l < "$TMPFILE") ))
  echo " Sampled ${LINES} records total"

  # Build arguments as an array so the schema path survives word-splitting
  # even if it contains spaces (the old string-concat form did not).
  SCHEMA_FLAGS=(--keep_nulls --ignore_invalid_lines)
  if [ -f "$SCHEMA_FILE" ]; then
    echo " Updating existing schema incrementally"
    SCHEMA_FLAGS+=(--existing_schema_path "$SCHEMA_FILE")
  fi

  # Write to a temp name first so an existing schema is never clobbered
  # by a partial failure.
  generate-schema "${SCHEMA_FLAGS[@]}" < "$TMPFILE" > "${SCHEMA_FILE}.tmp"
  mv "${SCHEMA_FILE}.tmp" "$SCHEMA_FILE"
  rm -f -- "$TMPFILE"
  TMPFILE=""
  echo " Saved to ${SCHEMA_FILE}"
done

echo "Done."