-
Notifications
You must be signed in to change notification settings - Fork 14
Open
Description
Hi, overcast07
I borrowed a part of your code and combined it with lynx to make a script. The script crawls a website in a loop, collects all of its URLs, and queries the Wayback Machine's availability API. Only when the Wayback Machine has not already archived a link does the script submit it for backup. This makes my backups much faster, and maybe this particular mode could be added to your scripts ~
#!/usr/bin/bash
# Spider a website using lynx to build a sitemap and archive uncached
# pages via the Wayback Machine's Save Page Now endpoint.
#
# Usage: script [-a auth] [-f folder] [-o pattern] [-x exclude_pattern] URL
#   -a  S3-style API key passed as "Authorization: LOW <auth>"
#   -f  data folder to create/use for the state files
#   -o  regex a URL must match to be archived (empty = match all)
#   -x  regex that excludes matching URLs from archiving
auth=''
custom_dir=''
pattern=''
exclude_pattern=''
# The usage text advertises -x, so parse it too (the original declared
# exclude_pattern but never accepted the flag, making it dead).
while getopts 'a:f:o:x:' flag; do
  case "${flag}" in
    a) auth="$OPTARG" ;;
    f) custom_dir="$OPTARG" ;;
    o) pattern="$OPTARG" ;;
    x) exclude_pattern="$OPTARG" ;;
    *) echo "Usage: $0 [-a auth] [-f file] [-o pattern] [-x exclude_pattern] URL"
       exit 1 ;;
  esac
done
shift "$((OPTIND-1))"
# If a custom data folder was requested (-f), create it if necessary and
# make it the working directory for all state files.
if [[ -n "$custom_dir" ]]; then
  dir="$custom_dir"
  if [[ ! -d "$dir" ]]; then
    mkdir "$dir" || { echo "The folder $dir could not be created"; exit 1; }
    echo "Created data folder $dir"
  else
    echo "Using the existing data folder $dir"
  fi
  # Abort if cd fails (e.g. no permission) instead of silently writing
  # state files into the wrong directory.
  cd "$dir" || { echo "Could not enter $dir"; exit 1; }
fi
# Seed the crawl queue with the starting URL and reset the state files.
printf '%s\n' "$1" > link.txt
: > cached_urls.txt
: > history.txt
# Remove stale marker files from a previous run. The original used
# if/elif, which left lock.txt behind whenever both files existed.
rm -f daily_limit.txt lock.txt
# Initial back-off (seconds) between repeated fetches of the same page;
# doubled every time the same page is re-read.
export wait_time=1
# Main crawl loop: consume URLs from link.txt (which grows as new links
# are discovered), extract links from each page with lynx, and submit
# every URL the Wayback Machine has not archived yet to Save Page Now.
while read -r line; do
  # Pause for an hour when a previous request hit the daily capture limit.
  if [[ -f daily_limit.txt ]]; then
    echo "daily limit reached"
    sleep 3600
  fi
  # List every link on the page; keep http(s) URLs and filter out assets,
  # comments/feeds, social-media links and other noise.
  url_list=$(lynx -accept_all_cookies -dump -listonly -nonumbers "$line" | grep -E '^https?://.*' | grep -v -E '(#|error|\"|\.jpg|\.png|\.gif|comment|feed|\?share\=|[Ff]acebook|twitter|[Gg]oogle|public-api)|(^https://wordpress\.com)' | sort | uniq)
  echo "$line Crawled"
  for url in $url_list; do
    # Fixed-string match so regex metacharacters in URLs don't misfire.
    if ! grep -qF -- "$url" history.txt; then
      # An empty -o pattern matches every URL; a non-empty one filters.
      if ! [[ "$url" =~ $pattern ]]; then
        continue
      fi
      # Honor the advertised -x exclusion pattern when one was supplied.
      if [[ -n "$exclude_pattern" && "$url" =~ $exclude_pattern ]]; then
        continue
      fi
      echo "$url" >> history.txt
      echo "$url" >> link.txt
      # Percent-encode the URL for the availability API query string.
      line_encoded=$(python3 -c "import urllib.parse, sys; print(urllib.parse.quote(sys.argv[1]))" "$url")
      if ! curl -s "http://archive.org/wayback/available?url=$line_encoded" | grep -q "available"; then
        echo "$url not cached by Internet Archive yet, will be cached by us"
        while :; do
          request=$(curl -s -m 60 -X POST --data-urlencode "url=$url" -H "Accept: application/json" -H "Authorization: LOW ${auth}" "https://web.archive.org/save/")
          # Error pages carry a human-readable reason right after <h2>.
          message=$(echo "$request" | grep -E -A 1 "<h2>" | tail -1 | sed -Ee 's|</?p>||g')
          if [[ -z "$message" ]]; then
            if [[ "$request" =~ "429 Too Many Requests" ]] || [[ "$request" == "" ]]; then
              echo "$request"
              # Rate limited: back off behind a shared lock file. The
              # original tested "lock$f.txt" with $f undefined, so the
              # check never actually looked at lock.txt.
              if [[ ! -f lock.txt ]]; then
                touch lock.txt
                sleep 20
                rm lock.txt
              fi
            elif [[ "$request" =~ "400 Bad Request" ]]; then
              echo "$request"
              echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job failed] $url"
              echo "$(date -u '+%Y-%m-%d %H:%M:%S') $url" >> invalid.log
              echo "$request" >> invalid.log
            else
              # No error markers in the response: job accepted.
              sleep 2
              break
            fi
          else
            echo "$message"
            # Transient messages (session limit / cannot start / host
            # paused) fall through to the lock-file retry loop below;
            # anything else is terminal for this URL.
            if ! [[ "$message" =~ "You have already reached the limit of active sessions" || "$message" =~ "Cannot start capture" || "$message" =~ "Crawling this host is paused" ]]; then
              if [[ "$message" =~ "You cannot make more than "[1-9][0-9,]*" captures per day" ]]; then
                touch daily_limit.txt
                break
              else
                echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job failed] $url"
                echo "$(date -u '+%Y-%m-%d %H:%M:%S') $url" >> invalid.log
                echo "$message" >> invalid.log
                break
              fi
            fi
            if [[ ! -f lock.txt ]]; then
              touch lock.txt
              while [[ -f lock.txt ]]; do
                # Retry the request until either the job is submitted or a
                # different error is received
                sleep 2
                request=$(curl -s -m 60 -X POST --data-urlencode "url=$url" -H "Accept: application/json" -H "Authorization: LOW ${auth}" "https://web.archive.org/save/")
                echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Request failed] $url"
                message=$(echo "$request" | grep -E -A 1 "<h2>" | tail -1 | sed -Ee 's|</?p>||g')
                if [[ -z "$message" ]]; then
                  if [[ "$request" =~ "429 Too Many Requests" ]] || [[ "$request" == "" ]]; then
                    echo "$request"
                    sleep 20
                  else
                    # Submission went through; release the lock.
                    sleep 5
                    rm lock.txt
                    break
                  fi
                else
                  echo "$message"
                  if [[ "$message" =~ "You have already reached the limit of active sessions" || "$message" =~ "Cannot start capture" || "$message" =~ "Crawling this host is paused" ]]; then
                    : # still transient — keep retrying
                  elif [[ "$message" =~ "You cannot make more than "[1-9][0-9,]*" captures per day" ]]; then
                    rm lock.txt
                    touch daily_limit.txt
                    break
                  else
                    rm lock.txt
                    echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job failed] $url"
                    echo "$(date -u '+%Y-%m-%d %H:%M:%S') $url" >> invalid.log
                    echo "$message" >> invalid.log
                    break
                  fi
                fi
              done
              break
            fi
          fi
        done
        # NOTE(review): $message may still hold text from the last response
        # (or a previous URL); recorded as-is, matching the original intent.
        echo "$url $message" >> cached_urls.txt
      fi
      sleep 0.5
    fi
  done
  # Back off between pages; double the delay when the same page repeats.
  if [ "$before" != "$line" ]; then
    sleep 3
  else
    sleep "$wait_time"
    wait_time=$((wait_time*2))
    echo "wait for $wait_time"
  fi
  before=$line
done < link.txt
# Deduplicate the visited-URL history (sort -u == sort | uniq for whole lines).
sort -u history.txt > temp.txt
mv temp.txt history.txt
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels