Skip to content

https support #11

@gwen001

Description

@gwen001

Hi,

I had trouble using Parsero with HTTPS, so I made some small changes. It's the first time I've written Python, so I apologize if the code sucks...

Here is the diff:

diff --git a/parsero.py b/parsero.py
old mode 100644
new mode 100755
index 4ee24ef..a5dabea
--- a/parsero.py
+++ b/parsero.py
@@ -34,6 +34,7 @@ Author:
 
 class bcolors:
     OKGREEN = '\033[92m'
+    REDIR = '\033[37m'
     FAIL = '\033[91m'
     ENDC = '\033[0m'
     YELLOW = '\033[33m'
@@ -54,6 +55,7 @@ if sys.version_info < (3, 0, 0):
 import urllib.request
 import argparse
 import time
+import http.client
 
 try:
     import urllib3
@@ -76,15 +78,27 @@ def logo():
     print(bcolors.YELLOW + hello + bcolors.ENDC)
     now = time.strftime("%c")
 
-def conn_check(url, only200):
-    global pathlist
+def conn_check(url, only200, https):
+    global pathlist, http
     pathlist = []
     salida = 1
+
+    if https == True:
+        protocol = "https"
+        conn = http.client.HTTPSConnection(url)
+    else:
+        protocol = "http"
+        conn = http.client.HTTPConnection(url)
+    
     try:
-        for line in urllib.request.urlopen("http://" + url + "/robots.txt"):
-            lineStr = str(line, encoding='utf8')
+        conn.request("GET", "/robots.txt")
+        res = conn.getresponse()
+        data = str(res.read(), encoding='utf8')
+        datas = data.split('\n')
+        for line in datas:
+            lineStr = line
             path = lineStr.split(': /')
-            if "Disallow" == path[0]:
+            if ("Disallow" == path[0]) or ("Noindex" == path[0]):
                 pathlist.append(path[1].replace("\n", "").replace("\r", ""))
                 pathlist = list(set(pathlist))
             try:
@@ -99,21 +113,24 @@ def conn_check(url, only200):
         print("\n" + bcolors.FAIL + "Please, type a valid URL. This URL can't be resolved." + bcolors.ENDC)
         print("\n" + bcolors.FAIL + "e.g: python3 parsero.py -u www.behindthefirewalls.com -o -sb" + bcolors.ENDC + "\n")
         salida = 0
-
+    
     http = urllib3.PoolManager()
     count = 0
     count_ok = 0
-
+    
     for p in pathlist:
-        disurl = "http://" + url + '/' + p
-        r1 = http.request('GET', disurl, redirect=False, retries=5)
+        disurl = protocol+"://"+url+'/'+p
+        r1 = http.request('GET', disurl, redirect = False, retries = 5)
+        count = count + 1
         if r1.status == 200:
             print(bcolors.OKGREEN + disurl + ' ' + str(r1.status) + ' ' + str(r1.reason) + bcolors.ENDC)
             count_ok = count_ok + 1
         elif only200 == False:
-            print(bcolors.FAIL + disurl + ' ' + str(r1.status) + ' ' + str(r1.reason) + bcolors.ENDC)
-        count = count + 1
-
+            if r1.status >= 300 and r1.status < 400:
+                print(bcolors.REDIR + disurl + ' ' + str(r1.status) + ' ' + str(r1.reason) + bcolors.ENDC)
+            else:
+                print(bcolors.FAIL + disurl + ' ' + str(r1.status) + ' ' + str(r1.reason) + bcolors.ENDC)
+    
     count_int = int(count)
     count_ok_int = int(count_ok)
 
@@ -127,21 +144,25 @@ def conn_check(url, only200):
         else:
             print('\n' + bcolors.FAIL + '[+] %i links have been analyzed but any them are available...' % count_int + bcolors.ENDC)
 
-def search_bing(url, searchbing, only200):
+def search_bing(url, searchbing, only200, https):
+    if https == True:
+        protocol = "https"
+    else:
+        protocol = "http"
     try:
         print("\nSearching the Disallows entries in Bing...\n")
         from bs4 import BeautifulSoup
 
         count = 0
         for p in pathlist:
-            disurl = "http://" + url + '/' + p
+            disurl = protocol+"://" + url + '/' + p
             opener = urllib.request.build_opener()
             opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:26.0) Gecko/20100101 Firefox/26.0')]
             url2 = "http://www.bing.com/search?q=site:" + disurl
             print(url2)
 
             page = opener.open(url2)
-            soup = BeautifulSoup(page)
+            soup = BeautifulSoup(page, 'lxml')
 
             http = urllib3.PoolManager()
             for cite in soup.findAll('cite'):
@@ -152,7 +173,10 @@ def search_bing(url, searchbing, only200):
                         if r2.status == 200:
                             print(bcolors.OKGREEN + ' - ' + cite.text + ' ' + str(r2.status) + ' ' + str(r2.reason) + bcolors.ENDC)
                         elif only200 == False:
-                            print(bcolors.FAIL + ' - ' + cite.text + ' ' + str(r2.status) + ' ' + str(r2.reason) + bcolors.ENDC)
+                            if r2.status >= 300 and r2.status < 400:
+                                print(bcolors.REDIR + ' - ' + cite.text + ' ' + str(r2.status) + ' ' + str(r2.reason) + bcolors.ENDC)
+                            else:
+                                print(bcolors.FAIL + ' - ' + cite.text + ' ' + str(r2.status) + ' ' + str(r2.reason) + bcolors.ENDC)
                 except UnicodeEncodeError:
                     pass
 
@@ -170,6 +194,7 @@ def main():
     parse = argparse.ArgumentParser()
     parse.add_argument('-u', action='store', dest='url', help='Type the URL which will be analyzed')
     parse.add_argument('-o', action='store_true', dest='only200', help='Show only the "HTTP 200" status code')
+    parse.add_argument('-s', action='store_true', dest='https', help='Enable https')
     parse.add_argument('-sb', action='store_true', dest='searchbing', help='Search in Bing indexed Disallows')
     parse.add_argument('-f', action='store', dest='file', help='Scan a list of domains from a list')
 
@@ -204,12 +229,13 @@ def main():
         if url.find("http://") == 0:
             url = url.replace("http://", "")
         start_time = time.time()
+        https = args.https
         only200 = args.only200
         searchbing = args.searchbing
         date(url)
-        conn_check(url, only200)
+        conn_check(url, only200, https)
         if searchbing == True:
-            search_bing(url, searchbing, only200)
+            search_bing(url, searchbing, only200, https)
         print("\nFinished in %0.2f seconds.\n" % (time.time() - start_time))
 
 if __name__ == "__main__":

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions