-
Notifications
You must be signed in to change notification settings - Fork 40
288 lines (240 loc) · 10.5 KB
/
update-and-process-tranco.yml
File metadata and controls
288 lines (240 loc) · 10.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
name: Update and Process Tranco CSV
on:
  schedule:
    # Monthly: minute 0, hour 0 (UTC), day-of-month 1
    - cron: '0 0 1 * *'
  # Also runnable on demand from the Actions UI / API
  workflow_dispatch:
# The job pushes a branch (contents) and opens a pull request (pull-requests)
permissions:
  contents: write
  pull-requests: write
jobs:
  update-and-process-tranco:
    runs-on: ubuntu-latest
    steps:
- name: Checkout repository
  # Action pinned to a full commit SHA; the trailing comment records the tag it corresponds to
  uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
  with:
    # Only the latest commit is fetched
    fetch-depth: 1
- name: Set up date variables
  id: date
  run: |
    # Publish two step outputs:
    #   today     -> YYYY-MM-DD
    #   timestamp -> YYYYMMDDHHMMSS
    {
      echo "today=$(date +'%Y-%m-%d')"
      echo "timestamp=$(date +'%Y%m%d%H%M%S')"
    } >> $GITHUB_OUTPUT
- name: Fetch Tranco list ID
  id: tranco-id
  run: |
    # Fetches the ID of the current Tranco top-1M list and publishes it as the
    # step output `id`. An attempt is retried when curl itself fails, when the
    # server answers with a non-200 status, or when the body is not a purely
    # alphanumeric ID. The step fails after MAX_RETRIES unsuccessful attempts.
    MAX_RETRIES=5
    RETRY_COUNT=0
    SUCCESS=false
    while [ "$RETRY_COUNT" -lt "$MAX_RETRIES" ] && [ "$SUCCESS" = "false" ]; do
      echo "Attempt $((RETRY_COUNT + 1)) of $MAX_RETRIES: Fetching Tranco list ID..."
      # -o writes the response body to tranco_response.txt; -w prints only the
      # HTTP status code, which is what the command substitution captures.
      # `|| true` is required: the default Actions shell runs bash with -e, so a
      # non-zero curl exit (timeout, DNS error, ...) in this assignment would
      # abort the whole step and skip the retry logic below. Whatever curl
      # printed (typically 000 when no response was received) is still assigned
      # and simply fails the 200 check.
      HTTP_STATUS=$(curl -s -o tranco_response.txt -w "%{http_code}" \
        --retry 3 --retry-delay 5 --retry-max-time 120 \
        --connect-timeout 10 --max-time 60 \
        https://tranco-list.eu/top-1m-id) || true
      echo "HTTP Status Code: $HTTP_STATUS"
      if [ "$HTTP_STATUS" = "200" ]; then
        TRANCO_ID=$(cat tranco_response.txt)
        echo "Raw response: '$TRANCO_ID'"
        # The pattern needs at least one character, so an empty response is
        # rejected here as well.
        if [[ "$TRANCO_ID" =~ ^[A-Za-z0-9]+$ ]]; then
          echo "id=$TRANCO_ID" >> "$GITHUB_OUTPUT"
          echo "Successfully fetched Tranco list ID: $TRANCO_ID"
          SUCCESS=true
        else
          echo "Received invalid Tranco ID: '$TRANCO_ID' despite HTTP 200"
        fi
      else
        echo "Request failed with HTTP status code: $HTTP_STATUS"
      fi
      # Shared retry bookkeeping for every failure mode above
      if [ "$SUCCESS" = "false" ]; then
        RETRY_COUNT=$((RETRY_COUNT + 1))
        # No point sleeping after the final attempt
        if [ "$RETRY_COUNT" -lt "$MAX_RETRIES" ]; then
          echo "Retrying in 10 seconds..."
          sleep 10
        fi
      fi
    done
    if [ "$SUCCESS" = "false" ]; then
      echo "Failed to fetch Tranco list ID after $MAX_RETRIES attempts"
      exit 1
    fi
- name: Download Tranco list
  id: download
  run: |
    # Downloads the zip archive for the list ID produced by the `tranco-id`
    # step into tranco.zip. An attempt is retried when curl itself fails, when
    # the final status is not 200, or when the saved file is empty. The step
    # fails after MAX_RETRIES unsuccessful attempts.
    MAX_RETRIES=5
    RETRY_COUNT=0
    SUCCESS=false
    while [ "$RETRY_COUNT" -lt "$MAX_RETRIES" ] && [ "$SUCCESS" = "false" ]; do
      echo "Attempt $((RETRY_COUNT + 1)) of $MAX_RETRIES: Downloading Tranco list ${{ steps.tranco-id.outputs.id }}..."
      # -L follows redirects; -o saves the body to tranco.zip; -w prints only
      # the HTTP status code, which is what the command substitution captures.
      # `|| true` is required: the default Actions shell runs bash with -e, so a
      # non-zero curl exit in this assignment would abort the whole step and
      # skip the retry logic below. Whatever curl printed is still assigned and
      # simply fails the 200 check.
      HTTP_STATUS=$(curl -s -L -o tranco.zip -w "%{http_code}" \
        --retry 3 --retry-delay 10 --retry-max-time 300 \
        --connect-timeout 15 --max-time 300 \
        "https://tranco-list.eu/download_daily/${{ steps.tranco-id.outputs.id }}") || true
      echo "HTTP Status Code: $HTTP_STATUS"
      if [ "$HTTP_STATUS" = "200" ]; then
        # -s: the file must exist and be non-empty
        if [ -s tranco.zip ]; then
          echo "Successfully downloaded Tranco list ${{ steps.tranco-id.outputs.id }}"
          SUCCESS=true
        else
          echo "Downloaded file is empty despite HTTP 200"
        fi
      else
        echo "Download failed with HTTP status code: $HTTP_STATUS"
      fi
      # Shared retry bookkeeping for every failure mode above
      if [ "$SUCCESS" = "false" ]; then
        RETRY_COUNT=$((RETRY_COUNT + 1))
        # No point sleeping after the final attempt
        if [ "$RETRY_COUNT" -lt "$MAX_RETRIES" ]; then
          echo "Retrying in 15 seconds..."
          sleep 15
        fi
      fi
    done
    if [ "$SUCCESS" = "false" ]; then
      echo "Failed to download Tranco list after $MAX_RETRIES attempts"
      exit 1
    fi
- name: Extract Tranco list
  id: extract
  run: |
    # Unpack tranco.zip (overwriting existing files) and make sure it
    # actually contained top-1m.csv.
    if ! unzip -o tranco.zip; then
      echo "Failed to extract zip file"
      exit 1
    fi
    if [ ! -f "top-1m.csv" ]; then
      echo "Expected file 'top-1m.csv' not found in the zip archive"
      # List the working directory to show what was extracted instead
      ls -la
      exit 1
    fi
    echo "Successfully extracted Tranco list"
- name: Validate Tranco list
  id: validate_tranco_list
  run: |
    # Sanity-check the extracted CSV before it replaces tranco.csv:
    #   * exactly 1,000,000 newline-terminated lines
    #   * first row begins with "1,"
    #   * last row begins with "1000000,"
    total_lines=$(wc -l < top-1m.csv)
    head_row=$(head -n 1 top-1m.csv)
    tail_row=$(tail -n 1 top-1m.csv)
    if ! [ "$total_lines" -eq 1000000 ] || ! [[ "$head_row" =~ ^1, ]] || ! [[ "$tail_row" =~ ^1000000, ]]; then
      echo "File validation failed:"
      echo "- Line count: $total_lines (expected 1,000,000)"
      echo "- First line: $head_row (should start with '1,')"
      echo "- Last line: $tail_row (should start with '1000000,')"
      exit 1
    fi
    echo "File validation passed:"
    echo "- Exactly 1,000,000 lines"
    echo "- First line: $head_row"
    echo "- Last line: $tail_row"
    # Rename to the filename the later steps read and commit
    mv top-1m.csv tranco.csv
- name: Set configuration for top files
  id: set_config_top
  run: |
    # JSON array of derived files; each entry carries a "count" and a "filename"
    CONFIG='[{"count": 10000, "filename": "tranco_top_10k.csv"}, {"count": 50000, "filename": "tranco_top_50k.csv"}]'
    # Written to GITHUB_ENV so later steps in this job can read $CONFIG
    printf 'CONFIG=%s\n' "$CONFIG" >> $GITHUB_ENV
    printf 'Using configuration: %s\n' "$CONFIG"
- name: Validate manifest.json
  id: validate_manifest
  run: |
    # Fails the step unless every output filename listed in $CONFIG appears in
    # manifest.json as a `"file": "<filename>"` entry.
    if [ ! -f "manifest.json" ]; then
      echo "Error: manifest.json file not found"
      exit 1
    fi
    # Collect the filenames up front. jq is the last command of this pipeline,
    # so under the default Actions shell (bash -e) a jq parse error fails the
    # step instead of silently validating nothing.
    FILENAMES=$(echo "$CONFIG" | jq -r '.[].filename')
    VALIDATION_FAILED=false
    # The here-string keeps the loop in the current shell, so the flag set
    # inside it is still visible after `done` (no temp file required).
    while read -r filename; do
      # An empty configuration yields a single empty line; skip it
      [ -n "$filename" ] || continue
      # -F matches the text literally; as a regex, the "." in the filename
      # would match any character.
      if grep -qF "\"file\": \"$filename\"" manifest.json; then
        echo "✓ $filename is defined in manifest.json"
      else
        echo "Error: $filename is not defined in manifest.json"
        VALIDATION_FAILED=true
      fi
    done <<< "$FILENAMES"
    # Report all missing files before failing, rather than stopping at the first
    if [ "$VALIDATION_FAILED" = "true" ]; then
      echo "One or more output files are not defined in manifest.json. Please update manifest.json first."
      exit 1
    fi
- name: Process Tranco CSV
  id: process
  run: |
    # For every {count, filename} entry in $CONFIG, writes the first `count`
    # lines of tranco.csv to `filename`. Entries missing either key are
    # skipped with a message.
    echo "Processing Tranco CSV with configuration: $CONFIG"
    if [ ! -f "tranco.csv" ]; then
      echo "Error: tranco.csv file not found"
      exit 1
    fi
    # One compact JSON object per line, one loop iteration per entry
    echo "$CONFIG" | jq -c '.[]' | while read -r config; do
      # `// empty` makes jq print nothing for a missing or null key. Plain
      # `jq -r` prints the literal string "null", which the -z guard below
      # would never catch.
      count=$(echo "$config" | jq -r '.count // empty')
      filename=$(echo "$config" | jq -r '.filename // empty')
      if [ -z "$count" ] || [ -z "$filename" ]; then
        echo "Skipping invalid configuration: $config"
        continue
      fi
      # Keep only the first $count lines of the full list
      head -n "$count" tranco.csv > "$filename"
      lines=$(wc -l < "$filename")
      echo "Successfully created $filename with $lines rows"
    done
- name: Configure Git
  run: |
    # Repository-local identity (does not touch the global git config)
    git config --local user.name "Tranco Process Bot"
    git config --local user.email "hello@sublimesecurity.com"
- name: Create and push branch
  id: create-branch
  run: |
    # Branch name is built from the date step's `today` and `timestamp` outputs
    BRANCH_NAME="tranco_update-${{ steps.date.outputs.today }}-${{ steps.date.outputs.timestamp }}"
    # Published as the step output `branch_name`
    echo "branch_name=$BRANCH_NAME" >> $GITHUB_OUTPUT
    git checkout -b "$BRANCH_NAME"
    # Stage the full list first, then every derived file named in $CONFIG
    git add tranco.csv
    echo $CONFIG | jq -r '.[].filename' | while read -r generated_file; do
      git add "$generated_file"
    done
    git commit -m "Update Tranco list for ${{ steps.date.outputs.today }} (ID: ${{ steps.tranco-id.outputs.id }})"
    git push origin "$BRANCH_NAME"
- name: Create Pull Request
  env:
    # Token the gh CLI authenticates with
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    # Each `run` step is a separate shell process, so the BRANCH_NAME variable
    # assigned in the create-branch step is not set here. Read it back from
    # that step's `branch_name` output instead.
    BRANCH_NAME: ${{ steps.create-branch.outputs.branch_name }}
  run: |
    # Opens a pull request from the freshly pushed branch into master
    gh pr create \
      --title "Update Tranco list and derived files - ${{ steps.date.outputs.today }}" \
      --body "This PR updates the Tranco top 1 million domains list and all derived files.
    - Date: ${{ steps.date.outputs.today }}
    - Tranco List ID: ${{ steps.tranco-id.outputs.id }}
    - List URL: https://tranco-list.eu/list/${{ steps.tranco-id.outputs.id }}
    - Automated update via GitHub Actions" \
      --head "$BRANCH_NAME" \
      --base "master"