|
| 1 | +#!/bin/sh |
| 2 | +# FIND FAKE GOOGLEBOTS AND BINGBOTS FROM APACHE SERVER LOG FILES |
| 3 | +# Created by: Mitchell Krog ([email protected]) |
| 4 | +# Copyright: Mitchell Krog - https://github.com/mitchellkrogza |
| 5 | +# Repo Url: https://github.com/mitchellkrogza/apache-ultimate-bad-bot-blocker |
| 6 | + |
| 7 | +############################################################################## |
| 8 | +# ___ __ # |
| 9 | +# / _ | ___ ___ _____/ / ___ # |
| 10 | +# / __ |/ _ \/ _ `/ __/ _ \/ -_) # |
| 11 | +# /_/ |_/ .__/\_,_/\__/_//_/\__/ # |
| 12 | +# __/_/ __ ___ __ ___ __ __ # |
| 13 | +# / _ )___ ____/ / / _ )___ / /_ / _ )/ /__ ____/ /_____ ____ # |
| 14 | +# / _ / _ `/ _ / / _ / _ \/ __/ / _ / / _ \/ __/ '_/ -_) __/ # |
| 15 | +# /____/\_,_/\_,_/ /____/\___/\__/ /____/_/\___/\__/_/\_\\__/_/ # |
| 16 | +# # |
| 17 | +############################################################################## |
| 18 | + |
| 19 | +# ------------------------------------------------------------------------------ |
| 20 | +# MIT License |
| 21 | +# ------------------------------------------------------------------------------ |
| 22 | +# Copyright (c) 2017 Mitchell Krog - [email protected] |
| 23 | +# https://github.com/mitchellkrogza |
| 24 | +# ------------------------------------------------------------------------------ |
| 25 | +# Permission is hereby granted, free of charge, to any person obtaining a copy |
| 26 | +# of this software and associated documentation files (the "Software"), to deal |
| 27 | +# in the Software without restriction, including without limitation the rights |
| 28 | +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 29 | +# copies of the Software, and to permit persons to whom the Software is |
| 30 | +# furnished to do so, subject to the following conditions: |
| 31 | +# ------------------------------------------------------------------------------ |
| 32 | +# The above copyright notice and this permission notice shall be included in all |
| 33 | +# copies or substantial portions of the Software. |
| 34 | +# ------------------------------------------------------------------------------ |
| 35 | +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 36 | +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 37 | +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 38 | +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 39 | +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 40 | +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 41 | +# SOFTWARE. |
| 42 | +# ------------------------------------------------------------------------------ |
| 43 | + |
| 44 | +# --------------- |
| 45 | +# WHAT THIS DOES? |
| 46 | +# --------------------------------------------------------------------------------------------------------- |
| 47 | +# It extracts every single log line from all log files which claim to be Googlebot / bingbot |
| 48 | +# This includes all valid Google and Bing bots too. |
| 49 | +# These are extracted from your logs into new temporary log files. |
| 50 | +# These files are then processed with some magic to find only the fake bots which are then emailed to you. |
| 51 | +# After the script has run and emailed you, all temporary files are cleaned up and your original log files |
| 52 | +# are not touched or modified in any way whatsoever. |
| 53 | +# |
| 54 | +# It is lightning fast !!! |
| 55 | +# 33.946s from start to finish for a full month's worth of log files from 40 web sites. |
| 56 | +# |
| 57 | +# THIS SCRIPT WILL PROCESS ALL CURRENT LOG FILES IN YOUR APACHE LOG FILE LOCATION |
| 58 | +# This means it ONLY processes this month's "current" log files, which are named xxxxxxxx-access.log |
| 59 | +# It will NOT process rolled-over log files, i.e. xxxxxxxx-access.log.1 and xxxxxxx-access.log.2 |
| 60 | +# It is pointless looking for Fake Bots in older logs anyway, as these guys change IPs frequently. |
| 61 | +# |
| 62 | +# This script does NOT touch or modify ANY of your real log files. |
| 63 | +# --------------------------------------------------------------------------------------------------------- |
| 64 | + |
| 65 | +# ---------------------- |
| 66 | +# REQUIREMENTS AND NOTES |
| 67 | +# ---------------------- |
| 68 | +# - mutt (sending emails) - sudo apt install mutt |
| 69 | +# - awk |
| 70 | +# - nawk |
| 71 | +# - sed |
| 72 | +# - dig |
| 73 | +# - USES: ANY existing Apache log format that starts with '%h' |
| 74 | + |
| 75 | +# ---------------------- |
| 76 | +# INSTALLATION AND USAGE |
| 77 | +# ---------------------- |
| 78 | + |
| 79 | +# -------------------------------------------------------- |
| 80 | +# 1. STOP Mutt from storing all sent emails |
| 81 | +# otherwise it creates an ever growing file called "sent" |
| 82 | +# -------------------------------------------------------- |
| 83 | +# |
| 84 | +# sudo nano /etc/Muttrc |
| 85 | +# |
| 86 | +# --------------------------------------- |
| 87 | +# 2. PASTE this at the bottom of the file |
| 88 | +# --------------------------------------- |
| 89 | +# |
| 90 | +# set copy = no |
| 91 | +# set folder = "" |
| 92 | +# |
| 93 | +# ---------------------------------------------------------- |
| 94 | +# 3. SAVE this script in your HOME folder as findfakebots.sh |
| 95 | +# ---------------------------------------------------------- |
| 96 | +# |
| 97 | +# ------------------------------ |
| 98 | +# 4. MAKE this script executable |
| 99 | +# ------------------------------ |
| 100 | +# |
| 101 | +# sudo chmod +x findfakebots.sh |
| 102 | +# |
| 103 | +# ------------------------------------- |
| 104 | +# 5. EDIT the USER SETTINGS block below |
| 105 | +# ------------------------------------- |
| 106 | +# |
| 107 | +# --------------------------- |
| 108 | +# 6. RUN the script with sudo |
| 109 | +# --------------------------- |
| 110 | +# |
| 111 | +# cd ${HOME} |
| 112 | +# sudo ./findfakebots.sh |
| 113 | +# |
| 114 | +# RUN FROM CRON as you like, make sure you have allowed your user to run sudo from CRON through visudo !!! |
| 115 | +# You should only need to run this perhaps once a week. |
| 116 | +# |
| 117 | +# -------------------------- |
| 118 | +# 7. REPORTING YOUR FINDINGS |
| 119 | +# -------------------------- |
| 120 | +# ---------------------------------------------------------------------------------------------------------------------------------------------------- |
| 121 | +# The email you receive contains a list of the IPs detected and, below that, a list of the same IPs with their reverse DNS names. |
| 122 | +# Before you report them in this repo as issues, you need to first get the whois details of each one and log ONLY ONE IP per issue. |
| 123 | +# |
| 124 | +# See example: https://github.com/mitchellkrogza/nginx-ultimate-bad-bot-blocker/issues/293 |
| 125 | +# |
| 126 | +# Your issue MUST include: |
| 127 | +# - the whois output from https://www.ultratools.com/tools/ipWhoisLookupResult |
| 128 | +# - An excerpt from your log file |
| 129 | +# - DO NOT log issues for any IPs that resolve with 'dynamic' or 'adsl' in the reverse lookup; it is pointless blocking dynamic addresses. |
| 130 | +# ---------------------------------------------------------------------------------------------------------------------------------------------------- |
| 131 | + |
| 132 | +# ------------- |
| 133 | +# USER SETTINGS |
| 134 | +# ------------- |
| 135 | + |
| 136 | +recipient="" # < PUT your own email address between the "" (the reports are mailed to it) |
| 137 | +apachelogslocation="/var/log/apache2" # < Directory that holds your Apache access logs |
| 138 | + |
| 139 | +# ----------------- |
| 140 | +# END USER SETTINGS |
| 141 | +# ----------------- |
| 142 | + |
| 143 | +# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! |
| 144 | +# ------------------------------------ |
| 145 | +# DONT MODIFY ANYTHING BELOW THIS LINE |
| 146 | +# ------------------------------------ |
| 147 | +# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! |
| 148 | + |
| 149 | +# --------- |
| 150 | +# VARIABLES |
| 151 | +# --------- |
| 152 | + |
| 153 | +datenow="$(date '+%Y-%m-%d')" # run date (same output as %F), printed at the top of each email |
| 154 | +timenow="$(date '+%H:%M:%S')" # run time (same output as %T), printed next to the date |
| 155 | + |
| 156 | +# ----------------- |
| 157 | +# TEMP FILES WE USE |
| 158 | +# ----------------- |
| 159 | + |
| 160 | +googlelog="${apachelogslocation}/googlebots.log" # log lines whose user agent claims to be Googlebot |
| 161 | +googlefile="$HOME/googlebots.list" # unique client addresses taken from that log |
| 162 | +googleemailfile="$HOME/fakegooglebots.txt" # report that is attached to the email |
| 163 | +googletestfile="$HOME/googlebots.tested" # every address with its reverse DNS result |
| 164 | +googlefakefile="$HOME/googlebots.fake" # tested lines that do not contain "google" |
| 165 | + |
| 166 | +binglog="${apachelogslocation}/bingbots.log" # log lines whose user agent claims to be bingbot |
| 167 | +bingfile="$HOME/bingbots.list" # unique client addresses taken from that log |
| 168 | +bingemailfile="$HOME/fakebingbots.txt" # report that is attached to the email |
| 169 | +bingtestfile="$HOME/bingbots.tested" # every address with its reverse DNS result |
| 170 | +bingfakefile="$HOME/bingbots.fake" # tested lines that do not contain "msn" |
| 171 | + |
| 172 | +tempfile="$HOME/file.tmp" # scratch file, moved into place once a step has finished |
| 173 | + |
| 174 | +# ----------------------------- |
| 175 | +# PROCESS ALL CURRENT LOG FILES |
| 176 | +# ----------------------------- |
| 177 | + |
| 178 | +cd "${apachelogslocation}" || { printf 'Cannot change to log directory: %s\n' "${apachelogslocation}" >&2; exit 1; } |
| 179 | + |
| 180 | +# FIND ALL GOOGLEBOTS AND WRITE THEM TO A NEW LOG FILE |
| 181 | +# find runs grep itself on each current log (name ending in access.log), so unusual file names are handled safely. |
| 182 | +# grep -F matches the text literally; ">" starts the temp log empty instead of appending to one left by an aborted run. |
| 183 | +find . -type f -name '*access.log' -exec grep -F 'compatible; Googlebot/' {} \; > "${googlelog}" |
| 184 | + |
| 185 | +# FIND ALL BINGBOTS AND WRITE THEM TO A NEW LOG FILE |
| 186 | +# Rolled-over logs (access.log.1, access.log.2 ...) do not end in "access.log", so the pattern never selects them. |
| 187 | +# Same approach as for Googlebot: one literal grep per current log file, output collected in a fresh temp log. |
| 188 | +find . -type f -name '*access.log' -exec grep -F 'compatible; bingbot/' {} \; > "${binglog}" |
| 189 | + |
| 190 | +# ----------------------------- |
| 191 | +# FIND AND TEST FAKE GOOGLEBOTS |
| 192 | +# ----------------------------- |
| 193 | + |
| 194 | +# Collect the client address (first field, %h) of every Googlebot log line, sorted and de-duplicated |
| 195 | +awk '{print $1}' "${googlelog}" | sort -u > "${googlefile}" |
| 196 | +# Reverse-resolve each address with dig; when several PTR names come back they are joined onto one line. |
| 197 | +# read -r and the quoting stop backslashes, spaces or glob characters in the data from being |
| 198 | +# interpreted by the shell (reverse DNS names are set by whoever controls the IP address). |
| 199 | +while IFS= read -r ip |
| 200 | +do |
| 201 | +  printf '%s - %s\n' "${ip}" "$(dig -x "${ip}" +short | paste -s -d ' ' -)" |
| 202 | +done < "${googlefile}" > "${googletestfile}" |
| 203 | +# Print all Reverse DNS Results NOT Containing "google" ie. Possible FAKE BOTS |
| 204 | +awk '!/google/' "${googletestfile}" > "${googlefakefile}" |
| 205 | +# Start with an empty email file so that it always exists when the temp files are deleted later |
| 206 | +: > "${googleemailfile}" |
| 207 | +# Only build and send the email when suspects were found. The test is made on the list of fakes, |
| 208 | +# BEFORE any heading text is written; testing the email file afterwards would never find it empty. |
| 209 | +if [ -s "${googlefakefile}" ] |
| 210 | +then |
| 211 | +  { |
| 212 | +    # Message title, date and time, followed by the unique list of IP addresses |
| 213 | +    printf 'Possible Fake GoogleBots Detected\n%s - %s\n---------------------------------\n\n---------------------------------\nIP ADDRESSES FOUND\n---------------------------------\n' "${datenow}" "${timenow}" |
| 214 | +    awk '{print $1}' "${googlefakefile}" | sort -u |
| 215 | +    # Second section: each address with the reverse DNS name(s) we looked up. Blanking the "-" |
| 216 | +    # field (rather than splitting the line on "-") keeps host names that contain hyphens intact. |
| 217 | +    printf '\n-----------------------------------\nIP ADDRESSES WITH REVERSE DNS NAMES\n-----------------------------------\n\n' |
| 218 | +    awk '{ $2 = ""; print }' "${googlefakefile}" |
| 219 | +  } > "${googleemailfile}" |
| 220 | +  # Email Me the Files |
| 221 | +  echo "Fake GoogleBots" | mutt -s "Fake GoogleBots" -a "${googleemailfile}" -- "${recipient}" |
| 222 | +fi |
| 223 | + |
| 224 | +# --------------------------------- |
| 225 | +# END FIND AND TEST FAKE GOOGLEBOTS |
| 226 | +# --------------------------------- |
| 227 | + |
| 228 | +# --------------------------- |
| 229 | +# FIND AND TEST FAKE BINGBOTS |
| 230 | +# --------------------------- |
| 231 | + |
| 232 | +# Collect the client address (first field, %h) of every bingbot log line, sorted and de-duplicated |
| 233 | +awk '{print $1}' "${binglog}" | sort -u > "${bingfile}" |
| 234 | +# Reverse-resolve each address with dig; when several PTR names come back they are joined onto one line. |
| 235 | +# read -r and the quoting stop backslashes, spaces or glob characters in the data from being |
| 236 | +# interpreted by the shell (reverse DNS names are set by whoever controls the IP address). |
| 237 | +while IFS= read -r ip |
| 238 | +do |
| 239 | +  printf '%s - %s\n' "${ip}" "$(dig -x "${ip}" +short | paste -s -d ' ' -)" |
| 240 | +done < "${bingfile}" > "${bingtestfile}" |
| 241 | +# Print all Reverse DNS Results NOT Containing "msn" ie. Possible FAKE BOTS |
| 242 | +awk '!/msn/' "${bingtestfile}" > "${bingfakefile}" |
| 243 | +# Start with an empty email file so that it always exists when the temp files are deleted later |
| 244 | +: > "${bingemailfile}" |
| 245 | +# Only build and send the email when suspects were found. The test is made on the list of fakes, |
| 246 | +# BEFORE any heading text is written; testing the email file afterwards would never find it empty. |
| 247 | +if [ -s "${bingfakefile}" ] |
| 248 | +then |
| 249 | +  { |
| 250 | +    # Message title, date and time, followed by the unique list of IP addresses |
| 251 | +    printf 'Possible Fake BingBots Detected\n%s - %s\n---------------------------------\n\n---------------------------------\nIP ADDRESSES FOUND\n---------------------------------\n' "${datenow}" "${timenow}" |
| 252 | +    awk '{print $1}' "${bingfakefile}" | sort -u |
| 253 | +    # Second section: each address with the reverse DNS name(s) we looked up. Blanking the "-" |
| 254 | +    # field (rather than splitting the line on "-") keeps host names that contain hyphens intact. |
| 255 | +    printf '\n-----------------------------------\nIP ADDRESSES WITH REVERSE DNS NAMES\n-----------------------------------\n\n' |
| 256 | +    awk '{ $2 = ""; print }' "${bingfakefile}" |
| 257 | +  } > "${bingemailfile}" |
| 258 | +  # Email Me the Files |
| 259 | +  echo "Fake BingBots" | mutt -s "Fake Bing Bots" -a "${bingemailfile}" -- "${recipient}" |
| 260 | +fi |
| 261 | + |
| 262 | +# ------------------------------- |
| 263 | +# END FIND AND TEST FAKE BINGBOTS |
| 264 | +# ------------------------------- |
| 265 | + |
| 266 | +# --------------------------------------------------- |
| 267 | +# DELETE ALL TEMP FILES |
| 268 | +# --------------------------------------------------- |
| 269 | +# This does NOT delete any of your real server logs |
| 270 | +# only the temp log files created by this script |
| 271 | +# --------------------------------------------------- |
| 272 | + |
| 273 | +cleanupstatus=0 # set to 1 below if any temp file could not be removed |
| 274 | +sudo rm -f -- "${googlelog}" || cleanupstatus=1 |
| 275 | +sudo rm -f -- "${binglog}" || cleanupstatus=1 |
| 276 | +sudo rm -f -- "${googlefile}" || cleanupstatus=1 |
| 277 | +sudo rm -f -- "${googleemailfile}" || cleanupstatus=1 |
| 278 | +sudo rm -f -- "${googletestfile}" || cleanupstatus=1 |
| 279 | +sudo rm -f -- "${googlefakefile}" || cleanupstatus=1 |
| 280 | +sudo rm -f -- "${bingfile}" || cleanupstatus=1 |
| 281 | +sudo rm -f -- "${bingemailfile}" || cleanupstatus=1 |
| 282 | +sudo rm -f -- "${bingtestfile}" || cleanupstatus=1 |
| 283 | +sudo rm -f -- "${bingfakefile}" || cleanupstatus=1 |
| 284 | +# ---------------------- |
| 285 | +# EXIT WITH ERROR NUMBER |
| 286 | +# ---------------------- |
| 287 | +# 0 = every temp file is gone (rm -f treats an already missing file as success), 1 = a removal failed |
| 288 | +exit "${cleanupstatus}" |
| 289 | + |
| 290 | +# ------------------------------------------------------------------------------ |
| 291 | +# MIT License |
| 292 | +# ------------------------------------------------------------------------------ |
| 293 | +# Copyright (c) 2017 Mitchell Krog - [email protected] |
| 294 | +# https://github.com/mitchellkrogza |
| 295 | +# ------------------------------------------------------------------------------ |
| 296 | +# Permission is hereby granted, free of charge, to any person obtaining a copy |
| 297 | +# of this software and associated documentation files (the "Software"), to deal |
| 298 | +# in the Software without restriction, including without limitation the rights |
| 299 | +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 300 | +# copies of the Software, and to permit persons to whom the Software is |
| 301 | +# furnished to do so, subject to the following conditions: |
| 302 | +# ------------------------------------------------------------------------------ |
| 303 | +# The above copyright notice and this permission notice shall be included in all |
| 304 | +# copies or substantial portions of the Software. |
| 305 | +# ------------------------------------------------------------------------------ |
| 306 | +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 307 | +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 308 | +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 309 | +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 310 | +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 311 | +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 312 | +# SOFTWARE. |
| 313 | +# ------------------------------------------------------------------------------ |
| 314 | + |
| 315 | + |
0 commit comments