@@ -22,6 +22,64 @@ kill_port() {
2222 fi
2323}
2424
# Check for the github_meta.duckdb dataset and download it (with a progress
# bar) from Hugging Face if it is missing or truncated.
# Globals:   none (writes public/data/github_meta.duckdb)
# Arguments: none
# Outputs:   status messages to stdout; download progress to stderr
# Returns:   0 if the dataset is present/downloaded, 1 on failure
check_and_download_dataset() {
  echo "🔍 Checking for github_meta.duckdb dataset..."

  local data_file="public/data/github_meta.duckdb"
  local url="https://huggingface.co/datasets/deepgit/github_meta/resolve/main/github_meta.duckdb"
  local file_size

  # Check if file exists and has reasonable size (>1MB)
  if [ -f "$data_file" ]; then
    # stat -f%z is BSD/macOS, stat -c%s is GNU coreutils; default to 0 so the
    # numeric comparison below never sees an empty string if both forms fail.
    file_size=$(stat -f%z "$data_file" 2>/dev/null || stat -c%s "$data_file" 2>/dev/null) || file_size=0
    if [ "${file_size:-0}" -gt 1048576 ]; then # Greater than 1MB
      echo "✅ Dataset found (${file_size} bytes)"
      return 0
    else
      echo "⚠️ Dataset exists but is too small (${file_size} bytes), will re-download"
    fi
  else
    echo "❌ Dataset not found"
  fi

  echo "📥 Downloading dataset from Hugging Face..."

  # Create data directory if it doesn't exist
  mkdir -p public/data

  # Download with progress bar; prefer wget, fall back to curl.
  if command -v wget >/dev/null 2>&1; then
    echo "Using wget for download..."
    if wget --progress=bar:force:noscroll -O "$data_file" "$url"; then
      echo "✅ Successfully downloaded dataset"
      return 0
    fi
    echo "❌ Failed to download dataset with wget"
  elif command -v curl >/dev/null 2>&1; then
    echo "Using curl for download..."
    # --fail makes curl exit non-zero on HTTP errors (404/5xx) instead of
    # silently saving the server's error page as the dataset file.
    if curl --fail -L --progress-bar -o "$data_file" "$url"; then
      echo "✅ Successfully downloaded dataset"
      return 0
    fi
    echo "❌ Failed to download dataset with curl"
  else
    echo "❌ Neither wget nor curl found. Please install one of them."
    return 1
  fi

  # A failed download can leave a partial file behind; remove it so the
  # size check above cannot mistake it for a valid dataset on the next run.
  rm -f -- "$data_file"
  return 1
}
76+
# Ensure the dataset is in place before starting any services.
check_and_download_dataset || {
  echo "❌ Dataset download failed. Exiting."
  exit 1
}
82+
2583# Kill any existing process on port 5002
2684kill_port 5002
2785
0 commit comments