-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: run_crawler.sh
More file actions
executable file
·286 lines (252 loc) · 8.4 KB
/
run_crawler.sh
File metadata and controls
executable file
·286 lines (252 loc) · 8.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
#!/bin/bash
# PCrawler - Professional Web Crawler with Phase Selection
# This script forces users to select a phase before running the crawler
# Abort on the first failing command (NOTE: commands in if/&&/|| conditions are exempt)
set -e
# Colors for output (ANSI escape sequences; interpreted by `echo -e` / `printf %b`)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color (resets terminal attributes)
# Function to print colored output
# Print an informational message with a blue [INFO] tag to stdout.
print_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
# Print a success message with a green [SUCCESS] tag to stdout.
print_success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
# Print a warning message with a yellow [WARNING] tag to stdout.
print_warning() {
  printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}
# Print an error message with a red [ERROR] tag to stdout.
print_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
}
# Function to show help
# Print full usage information: options, phase descriptions, and examples.
# Uses an unquoted here-doc so $0 expands exactly as the echo chain did.
show_help() {
  cat <<EOF
PCrawler - Professional Web Crawler with Phase Selection

Usage: $0 [OPTIONS]

Options:
 --phase PHASE Start from specific phase (1,2,3,4,5,6,auto)
 --force-restart Force restart from Phase 1
 --config CONFIG Config name (default: 1900comvn)
 --scale N Scale workers to N instances (default: 1)
 --logs Show logs from running containers
 --help Show this help message

Phases:
 1 - Crawl links for all industries
 2 - Crawl detail pages from links
 3 - Extract company details from detail HTML
 4 - Crawl contact pages from company details
 5 - Extract emails from contact HTML
 6 - Export final CSV
 auto - Auto-detect starting phase (recommended)

Examples:
 $0 --phase auto # Auto-detect and start from appropriate phase
 $0 --phase 2 # Start from Phase 2 (detail crawling)
 $0 --phase 1 --force-restart # Force restart from Phase 1
 $0 --phase auto --config myconfig # Use custom config with auto phase
 $0 --phase 3 --scale 5 # Start Phase 3 with 5 workers
 $0 --phase 6 # Export only
 $0 --logs # Show logs from running containers
EOF
}
# Function to validate phase
# Validate a user-supplied phase selector.
# Arguments: $1 - phase ("1".."6" or "auto")
# Outputs:   error messages via print_error when invalid
# Returns:   0 if valid, 1 otherwise
validate_phase() {
  local phase=$1
  case "$phase" in
    1|2|3|4|5|6|auto)
      return 0
      ;;
    *)
      print_error "Invalid phase: $phase"
      # Fixed: the message previously omitted phase 6 even though the
      # case pattern above accepts it
      print_error "Valid phases: 1, 2, 3, 4, 5, 6, auto"
      return 1
      ;;
  esac
}
# Function to run crawler with phase selection
# Run the crawler container for the selected phase while tailing container logs.
# Arguments:
#   $1 - phase ("1".."6" or "auto")
#   $2 - "true" to force restart from Phase 1
#   $3 - config name
#   $4 - worker count (informational only here; scaling is done by the caller)
# Outputs: status messages and the crawler/container logs
run_crawler() {
  local phase=$1
  local force_restart=$2
  local config=$3
  local scale=$4

  print_info "Starting PCrawler with phase selection..."
  print_info "Phase: $phase"
  print_info "Config: $config"
  print_info "Workers: $scale"
  if [ "$force_restart" = "true" ]; then
    print_warning "Force restart enabled - will start from Phase 1"
  fi

  # Build command using docker compose run with real-time output
  local cmd=""
  if [ "$force_restart" = "true" ]; then
    cmd="docker compose run --rm --no-deps -T crawler_app python -m app.main crawl --phase 1 --force-restart --config $config"
  else
    if [ "$phase" = "6" ]; then
      # main.py CLI does not accept phase=6; call run() directly
      cmd="docker compose run --rm --no-deps -T crawler_app python -c 'import asyncio; from app.main import run; asyncio.run(run(\"$config\", start_phase=6))'"
    else
      cmd="docker compose run --rm --no-deps -T crawler_app python -m app.main crawl --phase $phase --config $config"
    fi
  fi

  print_info "Executing: $cmd"
  echo ""
  print_info "Starting crawler with real-time logs..."
  echo ""

  # Run the crawler in the background so we can tail container logs alongside it
  print_info "Running crawler command in background..."
  eval "$cmd" &
  local crawler_pid=$!

  print_info "Showing real-time logs from running containers..."
  docker compose logs -f &
  local logs_pid=$!

  # Wait for the crawler and capture its status WITHOUT tripping `set -e`.
  # Fixed: a bare `wait $crawler_pid` aborted the whole script on failure,
  # skipping the status report and leaking the background log follower.
  local crawler_exit_code=0
  wait "$crawler_pid" || crawler_exit_code=$?

  # Stop the log follower; it may have already exited, so ignore errors
  kill "$logs_pid" 2>/dev/null || true
  wait "$logs_pid" 2>/dev/null || true

  # Show final status (success path now uses print_success for consistency
  # with the other helpers)
  if [ "$crawler_exit_code" -eq 0 ]; then
    print_success "Crawler completed successfully!"
  else
    print_error "Crawler failed with exit code: $crawler_exit_code"
  fi

  # Show recent logs
  echo ""
  print_info "Recent logs:"
  docker compose logs --tail=50
}
# Function to show current status
# Report whether the docker containers are up and summarize the data directory
# (checkpoint file count plus presence of the final CSV export).
show_status() {
  print_info "Checking current crawler status..."

  # Container state: any compose service showing "Up" counts as running
  if docker compose ps | grep -q "Up"; then
    print_success "Docker containers are running"
  else
    print_warning "Docker containers are not running"
  fi

  # Data directory summary
  if [ -d "data" ]; then
    local checkpoint_count
    checkpoint_count=$(find data -name "checkpoint_*.json" 2>/dev/null | wc -l)
    local csv_exists=""
    if [ -f "data/company_contacts.csv" ]; then
      csv_exists=" (CSV exists)"
    fi
    print_info "Data directory: $checkpoint_count checkpoint files$csv_exists"
  else
    print_warning "Data directory not found"
  fi
}
# Main script logic
# Parse CLI arguments (or prompt interactively when no --phase is given),
# validate the selection, optionally scale workers, then launch the crawler.
# Options: --phase PHASE | --force-restart | --config NAME | --scale N | --logs | --help
main() {
  local phase=""
  local force_restart="false"
  local config="1900comvn"
  local scale="1"
  local show_help_flag="false"
  local show_logs_flag="false"

  # Parse command line arguments.
  # Fixed: value-taking options now verify their argument exists; previously a
  # trailing `--phase` made `shift 2` fail with a cryptic error under `set -e`.
  while [[ $# -gt 0 ]]; do
    case $1 in
      --phase)
        if [[ $# -lt 2 ]]; then
          print_error "Option --phase requires an argument"
          exit 1
        fi
        phase="$2"
        shift 2
        ;;
      --force-restart)
        force_restart="true"
        shift
        ;;
      --config)
        if [[ $# -lt 2 ]]; then
          print_error "Option --config requires an argument"
          exit 1
        fi
        config="$2"
        shift 2
        ;;
      --scale)
        if [[ $# -lt 2 ]]; then
          print_error "Option --scale requires an argument"
          exit 1
        fi
        scale="$2"
        shift 2
        ;;
      --logs)
        show_logs_flag="true"
        shift
        ;;
      --help|-h)
        show_help_flag="true"
        shift
        ;;
      *)
        print_error "Unknown option: $1"
        show_help
        exit 1
        ;;
    esac
  done

  # Show help if requested
  if [ "$show_help_flag" = "true" ]; then
    show_help
    exit 0
  fi

  # Show logs if requested
  if [ "$show_logs_flag" = "true" ]; then
    print_info "Showing logs from running containers..."
    docker compose logs -f
    exit 0
  fi

  # If no phase specified, prompt user interactively
  if [ -z "$phase" ]; then
    echo "PCrawler - Professional Web Crawler with Phase Selection"
    echo ""
    show_status
    echo ""
    echo "Please select a phase to start from:"
    echo " 1) Phase 1 - Crawl links for all industries"
    echo " 2) Phase 2 - Crawl detail pages from links"
    echo " 3) Phase 3 - Extract company details"
    echo " 4) Phase 4 - Crawl contact pages"
    echo " 5) Phase 5 - Extract emails"
    echo " 6) Phase 6 - Export final CSV"
    echo " a) Auto-detect starting phase (recommended)"
    echo " f) Force restart from Phase 1"
    echo " h) Show help"
    echo " q) Quit"
    echo ""
    read -p "Enter your choice (1-6, a, f, h, q): " choice
    case $choice in
      1) phase="1" ;;
      2) phase="2" ;;
      3) phase="3" ;;
      4) phase="4" ;;
      5) phase="5" ;;
      6) phase="6" ;;
      a|A) phase="auto" ;;
      f|F) phase="1"; force_restart="true" ;;
      h|H) show_help; exit 0 ;;
      q|Q) print_info "Exiting..."; exit 0 ;;
      *) print_error "Invalid choice: $choice"; exit 1 ;;
    esac

    # Ask for scale.
    # Fixed: validate the input is a positive integer before the -gt test;
    # a non-numeric entry used to print "integer expression expected" noise.
    echo ""
    read -p "Enter number of workers (default: 1): " scale_input
    if [[ "$scale_input" =~ ^[0-9]+$ ]] && [ "$scale_input" -gt 0 ]; then
      scale="$scale_input"
    fi
  fi

  # Validate phase
  if ! validate_phase "$phase"; then
    exit 1
  fi

  # Scale workers if needed (quoted to keep the value a single word)
  if [ "$scale" != "1" ]; then
    print_info "Scaling workers to $scale instances..."
    docker compose up -d --scale worker="$scale"
    print_info "Waiting 5 seconds for workers to start..."
    sleep 5
  fi

  # Run crawler
  run_crawler "$phase" "$force_restart" "$config" "$scale"
}
# Entry point: forward all CLI arguments to main
main "$@"