diff --git a/code3/pagerank/spider.py b/code3/pagerank/spider.py index 822e9d3c..9eb4200f 100644 --- a/code3/pagerank/spider.py +++ b/code3/pagerank/spider.py @@ -77,6 +77,8 @@ document = urlopen(url, context=ctx) html = document.read() + soup = BeautifulSoup(html, "html.parser") + if document.getcode() != 200 : print("Error on page: ",document.getcode()) cur.execute('UPDATE Pages SET error=? WHERE url=?', (document.getcode(), url) ) @@ -86,10 +88,9 @@ cur.execute('DELETE FROM Pages WHERE url=?', ( url, ) ) conn.commit() continue - + print('('+str(len(html))+')', end=' ') - soup = BeautifulSoup(html, "html.parser") except KeyboardInterrupt: print('') print('Program interrupted by user...')