Skip to content

Commit 92f82b2

Browse files
authored
Add CKAN Data Pipeline workflow configuration
This YAML file defines a GitHub Actions workflow for a CKAN Data Pipeline, including multiple scripts for processing data and generating reports.
1 parent cd036d4 commit 92f82b2

File tree

1 file changed

+385
-0
lines changed

1 file changed

+385
-0
lines changed
Lines changed: 385 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,385 @@
1+
name: CKAN Data Pipeline

# Manual trigger only: the workflow runs when started from the Actions tab
# (or the workflow_dispatch API) with the optional inputs below.
on:
  workflow_dispatch:
    inputs:
      skip_scripts:
        type: string
        required: false
        default: ''
        description: 'Comma-separated list of script numbers to skip (e.g., "2,4")'
      process_rows:
        type: string
        required: false
        default: ''
        description: 'Number of rows to process (leave empty for all)'

# Workflow-wide environment shared by every job and step.
env:
  # Quoted so the value is unambiguously a string for any YAML consumer.
  PYTHONUNBUFFERED: '1'
  PYTHONIOENCODING: utf-8
20+
21+
jobs:
22+
# Turns the free-form `skip_scripts` input into the `skip_list` job output,
# e.g. "2,4" -> "[2,4]" and an empty input -> "[]".
setup:
  runs-on: ubuntu-latest
  outputs:
    skip_list: ${{ steps.parse_skip.outputs.skip_list }}
  steps:
    - name: Parse skip list
      id: parse_skip
      env:
        # User input is handed over as an environment variable instead of
        # being expanded into the script text, so it can never be parsed as
        # shell code (script-injection hardening).
        SKIP_INPUT: ${{ github.event.inputs.skip_scripts }}
      run: |
        if [ -n "$SKIP_INPUT" ]; then
          echo "skip_list=[$SKIP_INPUT]" >> "$GITHUB_OUTPUT"
        else
          echo "skip_list=[]" >> "$GITHUB_OUTPUT"
        fi
36+
37+
# Main job: runs after `setup` so the parsed skip list is available to
# every step through `needs.setup.outputs.skip_list`.
pipeline:
  needs: setup
  runs-on: ubuntu-latest
  continue-on-error: false

  steps:
43+
# Fetch the repository; later steps expect a `sites-data-fetch/` directory
# in the workspace.
- name: Checkout code
  uses: actions/checkout@v4
45+
46+
# Install the interpreter used by all pipeline scripts and enable the
# action's built-in pip download cache.
# v5 replaces v4, which runs on the deprecated Node 16 action runtime; the
# inputs used here are unchanged between the two major versions.
- name: Set up Python
  uses: actions/setup-python@v5
  with:
    # Quoted so the version is never read as a float.
    python-version: '3.9'
    cache: 'pip'
51+
52+
# Install the Python requirements listed in sites-data-fetch/requirements.txt.
- name: Install dependencies
  working-directory: sites-data-fetch
  run: pip install -r requirements.txt
56+
57+
# Fail fast when the pipeline's starting file (0.csv) is missing; otherwise
# log its line count.
- name: Verify initial data file
  working-directory: sites-data-fetch
  run: |
    # Guard clause: stop the job here if there is nothing to process.
    [ -f "0.csv" ] || { echo "Error: 0.csv not found"; exit 1; }

    echo "Initial data file found: 0.csv"
    wc -l 0.csv
66+
67+
# Stage 1: runs 1-nameProcess.py, which is expected to produce 1.csv.
# If the stage is skipped or fails and 1.csv is absent, 0.csv is copied to
# 1.csv so later stages still have an input file.
- name: Run Script 1 - Name Processing
  id: script1
  continue-on-error: true
  env:
    # Derived from user input: passed via the environment rather than
    # expanded into the script text, so it is never parsed as shell code.
    SKIP_LIST: ${{ needs.setup.outputs.skip_list }}
  run: |
    cd sites-data-fetch

    # -w matches the stage number as a whole word, so "12" does not count
    # as a request to skip stage 1.
    if printf '%s\n' "$SKIP_LIST" | grep -qw "1"; then
      echo "Skipping script 1 as requested"
      if [ ! -f "1.csv" ]; then
        echo "Warning: 1.csv doesn't exist and script 1 was skipped"
        cp 0.csv 1.csv
      fi
      exit 0
    fi

    echo "Starting Script 1: Name Processing"

    # The command is the `if` condition on purpose: the default `bash -e`
    # shell aborts the step at the first failing command, which would make
    # a separate `$?` check (and the fallback below) unreachable.
    if python 1-nameProcess.py; then
      echo "Script 1 completed successfully"
      if [ -f "1.csv" ]; then
        echo "Output file created: 1.csv"
        wc -l 1.csv
      else
        echo "Warning: Expected output file 1.csv not found"
      fi
    else
      # Capture immediately; any later command overwrites $?.
      rc=$?
      echo "Script 1 failed with exit code $rc"
      # Create fallback file to allow pipeline to continue
      if [ ! -f "1.csv" ]; then
        cp 0.csv 1.csv
        echo "Created fallback 1.csv from 0.csv to continue pipeline"
      fi
      # Still mark the step as failed so the report shows the failure.
      exit 1
    fi
102+
103+
# Stage 2: runs 2-CKANActionAPI.py, which is expected to produce 2.csv.
# If the stage is skipped or fails and 2.csv is absent, 1.csv is copied to
# 2.csv so later stages still have an input file.
- name: Run Script 2 - CKAN Action API
  id: script2
  continue-on-error: true
  env:
    # Derived from user input: passed via the environment rather than
    # expanded into the script text, so it is never parsed as shell code.
    SKIP_LIST: ${{ needs.setup.outputs.skip_list }}
  run: |
    cd sites-data-fetch

    # -w matches the stage number as a whole word, so "12" does not count
    # as a request to skip stage 2.
    if printf '%s\n' "$SKIP_LIST" | grep -qw "2"; then
      echo "Skipping script 2 as requested"
      if [ ! -f "2.csv" ]; then
        echo "Warning: 2.csv doesn't exist and script 2 was skipped"
        cp 1.csv 2.csv
      fi
      exit 0
    fi

    echo "Starting Script 2: CKAN Action API"

    # The command is the `if` condition on purpose: the default `bash -e`
    # shell aborts the step at the first failing command, which would make
    # a separate `$?` check (and the fallback below) unreachable.
    if python 2-CKANActionAPI.py; then
      echo "Script 2 completed successfully"
      if [ -f "2.csv" ]; then
        echo "Output file created: 2.csv"
        wc -l 2.csv
      fi
    else
      # Capture immediately; any later command overwrites $?.
      rc=$?
      echo "Script 2 failed with exit code $rc"
      if [ ! -f "2.csv" ]; then
        cp 1.csv 2.csv
        echo "Created fallback 2.csv from 1.csv to continue pipeline"
      fi
      # Still mark the step as failed so the report shows the failure.
      exit 1
    fi
135+
136+
# Stage 3: runs 3-siteType.py, which is expected to produce 3.csv.
# If the stage is skipped or fails and 3.csv is absent, 2.csv is copied to
# 3.csv so later stages still have an input file.
- name: Run Script 3 - Site Type Detection
  id: script3
  continue-on-error: true
  env:
    # Derived from user input: passed via the environment rather than
    # expanded into the script text, so it is never parsed as shell code.
    SKIP_LIST: ${{ needs.setup.outputs.skip_list }}
  run: |
    cd sites-data-fetch

    # -w matches the stage number as a whole word, so "13" does not count
    # as a request to skip stage 3.
    if printf '%s\n' "$SKIP_LIST" | grep -qw "3"; then
      echo "Skipping script 3 as requested"
      if [ ! -f "3.csv" ]; then
        echo "Warning: 3.csv doesn't exist and script 3 was skipped"
        cp 2.csv 3.csv
      fi
      exit 0
    fi

    echo "Starting Script 3: Site Type Detection"

    # The command is the `if` condition on purpose: the default `bash -e`
    # shell aborts the step at the first failing command, which would make
    # a separate `$?` check (and the fallback below) unreachable.
    if python 3-siteType.py; then
      echo "Script 3 completed successfully"
      if [ -f "3.csv" ]; then
        echo "Output file created: 3.csv"
        wc -l 3.csv
      fi
    else
      # Capture immediately; any later command overwrites $?.
      rc=$?
      echo "Script 3 failed with exit code $rc"
      if [ ! -f "3.csv" ]; then
        cp 2.csv 3.csv
        echo "Created fallback 3.csv from 2.csv to continue pipeline"
      fi
      # Still mark the step as failed so the report shows the failure.
      exit 1
    fi
168+
169+
# Stage 4: runs 4-description.py, which is expected to produce 4.csv.
# If the stage is skipped or fails and 4.csv is absent, 3.csv is copied to
# 4.csv so later stages still have an input file.
- name: Run Script 4 - Description Extraction
  id: script4
  continue-on-error: true
  env:
    # Derived from user input: passed via the environment rather than
    # expanded into the script text, so it is never parsed as shell code.
    SKIP_LIST: ${{ needs.setup.outputs.skip_list }}
  run: |
    cd sites-data-fetch

    # -w matches the stage number as a whole word, so "14" does not count
    # as a request to skip stage 4.
    if printf '%s\n' "$SKIP_LIST" | grep -qw "4"; then
      echo "Skipping script 4 as requested"
      if [ ! -f "4.csv" ]; then
        echo "Warning: 4.csv doesn't exist and script 4 was skipped"
        cp 3.csv 4.csv
      fi
      exit 0
    fi

    echo "Starting Script 4: Description Extraction"

    # The command is the `if` condition on purpose: the default `bash -e`
    # shell aborts the step at the first failing command, which would make
    # a separate `$?` check (and the fallback below) unreachable.
    if python 4-description.py; then
      echo "Script 4 completed successfully"
      if [ -f "4.csv" ]; then
        echo "Output file created: 4.csv"
        wc -l 4.csv
      fi
    else
      # Capture immediately; any later command overwrites $?.
      rc=$?
      echo "Script 4 failed with exit code $rc"
      if [ ! -f "4.csv" ]; then
        cp 3.csv 4.csv
        echo "Created fallback 4.csv from 3.csv to continue pipeline"
      fi
      # Still mark the step as failed so the report shows the failure.
      exit 1
    fi
201+
202+
# Stage 5: runs 5-locationAnalyser.py, which is expected to produce 5.csv
# and requires the OPEN_ROUTER_KEY secret.
# If the stage is skipped or fails and 5.csv is absent, 4.csv is copied to
# 5.csv so later stages still have an input file.
- name: Run Script 5 - Location Analysis
  id: script5
  continue-on-error: true
  env:
    OPEN_ROUTER_KEY: ${{ secrets.OPEN_ROUTER_KEY }}
    # User-controlled values are passed via the environment rather than
    # expanded into the script text, so they are never parsed as shell code.
    SKIP_LIST: ${{ needs.setup.outputs.skip_list }}
    PROCESS_ROWS: ${{ github.event.inputs.process_rows }}
  run: |
    cd sites-data-fetch

    # -w matches the stage number as a whole word, so "15" does not count
    # as a request to skip stage 5.
    if printf '%s\n' "$SKIP_LIST" | grep -qw "5"; then
      echo "Skipping script 5 as requested"
      if [ ! -f "5.csv" ]; then
        echo "Warning: 5.csv doesn't exist and script 5 was skipped"
        cp 4.csv 5.csv
      fi
      exit 0
    fi

    if [ -z "$OPEN_ROUTER_KEY" ]; then
      echo "Error: OPEN_ROUTER_KEY secret not set, required for script 5"
      cp 4.csv 5.csv
      echo "Created fallback 5.csv from 4.csv due to missing API key"
      exit 1
    fi

    echo "Starting Script 5: Location Analysis"

    # The workflow itself does not limit rows. PROCESS_ROWS is inherited by
    # the Python process as an environment variable, but it only has an
    # effect if the script reads it.
    # TODO: confirm whether 5-locationAnalyser.py honours it.
    if [ -n "$PROCESS_ROWS" ]; then
      echo "Requested row limit: $PROCESS_ROWS (exposed to the script as PROCESS_ROWS; not enforced by this workflow)"
    fi

    # The command is the `if` condition on purpose: the default `bash -e`
    # shell aborts the step at the first failing command, which would make
    # a separate `$?` check (and the fallback below) unreachable.
    if python 5-locationAnalyser.py; then
      echo "Script 5 completed successfully"
      if [ -f "5.csv" ]; then
        echo "Output file created: 5.csv"
        wc -l 5.csv
      fi
    else
      # Capture immediately; any later command overwrites $?.
      rc=$?
      echo "Script 5 failed with exit code $rc"
      if [ ! -f "5.csv" ]; then
        cp 4.csv 5.csv
        echo "Created fallback 5.csv from 4.csv to continue pipeline"
      fi
      # Still mark the step as failed so the report shows the failure.
      exit 1
    fi
252+
253+
# Stage 6: runs 6-geocode.py, which is expected to produce 6.csv.
# If the stage is skipped or fails and 6.csv is absent, 5.csv is copied to
# 6.csv so later stages still have an input file.
- name: Run Script 6 - Geocoding
  id: script6
  continue-on-error: true
  env:
    # Derived from user input: passed via the environment rather than
    # expanded into the script text, so it is never parsed as shell code.
    SKIP_LIST: ${{ needs.setup.outputs.skip_list }}
  run: |
    cd sites-data-fetch

    # -w matches the stage number as a whole word, so "16" does not count
    # as a request to skip stage 6.
    if printf '%s\n' "$SKIP_LIST" | grep -qw "6"; then
      echo "Skipping script 6 as requested"
      if [ ! -f "6.csv" ]; then
        echo "Warning: 6.csv doesn't exist and script 6 was skipped"
        cp 5.csv 6.csv
      fi
      exit 0
    fi

    echo "Starting Script 6: Geocoding"

    # The command is the `if` condition on purpose: the default `bash -e`
    # shell aborts the step at the first failing command, which would make
    # a separate `$?` check (and the fallback below) unreachable.
    if python 6-geocode.py; then
      echo "Script 6 completed successfully"
      if [ -f "6.csv" ]; then
        echo "Output file created: 6.csv"
        wc -l 6.csv
      fi
    else
      # Capture immediately; any later command overwrites $?.
      rc=$?
      echo "Script 6 failed with exit code $rc"
      if [ ! -f "6.csv" ]; then
        cp 5.csv 6.csv
        echo "Created fallback 6.csv from 5.csv to continue pipeline"
      fi
      # Still mark the step as failed so the report shows the failure.
      exit 1
    fi
285+
286+
# Stage 7: runs 7-tstamp.py, which is expected to produce 7.csv (the file
# uploaded as the final dataset).
# If the stage is skipped or fails and 7.csv is absent, 6.csv is copied to
# 7.csv so a final file still exists.
- name: Run Script 7 - Timestamp
  id: script7
  continue-on-error: true
  env:
    # Derived from user input: passed via the environment rather than
    # expanded into the script text, so it is never parsed as shell code.
    SKIP_LIST: ${{ needs.setup.outputs.skip_list }}
  run: |
    cd sites-data-fetch

    # -w matches the stage number as a whole word, so "17" does not count
    # as a request to skip stage 7.
    if printf '%s\n' "$SKIP_LIST" | grep -qw "7"; then
      echo "Skipping script 7 as requested"
      if [ ! -f "7.csv" ]; then
        echo "Warning: 7.csv doesn't exist and script 7 was skipped"
        cp 6.csv 7.csv
      fi
      exit 0
    fi

    echo "Starting Script 7: Timestamp"

    # The command is the `if` condition on purpose: the default `bash -e`
    # shell aborts the step at the first failing command, which would make
    # a separate `$?` check (and the fallback below) unreachable.
    if python 7-tstamp.py; then
      echo "Script 7 completed successfully"
      if [ -f "7.csv" ]; then
        echo "Output file created: 7.csv"
        wc -l 7.csv
      fi
    else
      # Capture immediately; any later command overwrites $?.
      rc=$?
      echo "Script 7 failed with exit code $rc"
      if [ ! -f "7.csv" ]; then
        cp 6.csv 7.csv
        echo "Created fallback 7.csv from 6.csv to continue pipeline"
      fi
      # Still mark the step as failed so the report shows the failure.
      exit 1
    fi
318+
319+
# Writes a Markdown summary (per-script status table plus per-file row
# counts and sizes) to the job summary. Runs even when earlier steps failed.
- name: Generate Pipeline Report
  if: always()
  env:
    # Needed to tell "skipped on request" apart from a real success: a
    # script step that is skipped via the skip list exits 0, so its step
    # outcome is reported as "success".
    SKIP_LIST: ${{ needs.setup.outputs.skip_list }}
  run: |
    {
      echo "## CKAN Data Pipeline Report"
      echo ""
      echo "| Script | Status | Notes |"
      echo "|--------|--------|-------|"
    } >> "$GITHUB_STEP_SUMMARY"

    cd sites-data-fetch

    # Check each script outcome. `outcome` is the step result before
    # continue-on-error is applied, so real failures remain visible here.
    scripts=("1-nameProcess" "2-CKANActionAPI" "3-siteType" "4-description" "5-locationAnalyser" "6-geocode" "7-tstamp")
    outcomes=("${{ steps.script1.outcome }}" "${{ steps.script2.outcome }}" "${{ steps.script3.outcome }}" "${{ steps.script4.outcome }}" "${{ steps.script5.outcome }}" "${{ steps.script6.outcome }}" "${{ steps.script7.outcome }}")

    for i in "${!scripts[@]}"; do
      script_num=$((i + 1))
      script_name="${scripts[$i]}"
      outcome="${outcomes[$i]}"
      output_file="${script_num}.csv"

      # A requested skip must be checked first because it looks like a
      # success at the step level.
      if [ "$outcome" = "success" ] && printf '%s\n' "$SKIP_LIST" | grep -qw "$script_num"; then
        status="⏭️ Skipped"
      elif [ "$outcome" = "success" ]; then
        status="✅ Success"
      elif [ "$outcome" = "failure" ]; then
        status="❌ Failed"
      elif [ "$outcome" = "skipped" ]; then
        status="⏭️ Skipped"
      else
        status="❓ Unknown"
      fi

      if [ -f "$output_file" ]; then
        # Count data rows only: skip the first (header) line.
        row_count=$(tail -n +2 "$output_file" | wc -l)
        notes="Output: $output_file ($row_count rows)"
      else
        notes="No output file"
      fi

      echo "| $script_num - $script_name | $status | $notes |" >> "$GITHUB_STEP_SUMMARY"
    done

    {
      echo ""
      echo "### File Progression"
    } >> "$GITHUB_STEP_SUMMARY"

    for i in {0..7}; do
      if [ -f "${i}.csv" ]; then
        row_count=$(tail -n +2 "${i}.csv" | wc -l)
        file_size=$(ls -lh "${i}.csv" | awk '{print $5}')
        echo "- ${i}.csv: $row_count rows, $file_size" >> "$GITHUB_STEP_SUMMARY"
      fi
    done
368+
369+
# Publish every intermediate CSV and log file, even when earlier steps
# failed, so a broken run can still be inspected.
# v4 replaces v3, which is deprecated and no longer runs on github.com; the
# inputs used here are unchanged between the two major versions.
- name: Upload Pipeline Artifacts
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: ckan-pipeline-results
    path: |
      sites-data-fetch/*.csv
      sites-data-fetch/*.log
    retention-days: 30
378+
379+
# Publish the last stage's output (7.csv) as a separate, longer-lived
# artifact.
# v4 replaces v3, which is deprecated and no longer runs on github.com; the
# artifact name differs from the other upload step, as v4 requires unique
# names within a run.
- name: Upload Final Dataset
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: final-dataset
    path: sites-data-fetch/7.csv
    retention-days: 90

0 commit comments

Comments
 (0)