85 changes: 36 additions & 49 deletions scraper.py
@@ -4,12 +4,12 @@
# pulls company information from site to save time that would be spent manually typing out the info
# Gavin Inglis
# January 2019
# Updated September 2023 BJI

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

import zipfile
import time
import datetime
import gspread
@@ -18,86 +18,72 @@
import re
import getpass

# Get latest chromedriver zip file for mac, extract into same folder
try:
version = requests.get('https://chromedriver.storage.googleapis.com/LATEST_RELEASE').text
url = 'https://chromedriver.storage.googleapis.com/{0}/{1}'.format(version, 'chromedriver_mac64.zip')
r = requests.get(url, allow_redirects=True)
open('chromedriver.zip', 'wb').write(r.content)
with zipfile.ZipFile("chromedriver.zip", "r") as zip_ref:
zip_ref.extractall()
except:
pass

'''Globals'''

GOOGLE_URL = 'http://www.google.com/search'

# scope of access for api
scope = ['https://spreadsheets.google.com/feeds',
'https://www.googleapis.com/auth/drive']

# credentials file generated by the Google developer console when setting up the Sheets API
credentials = ServiceAccountCredentials.from_json_keyfile_name('PATH TO YOUR CREDENTIALS', scope)
credentials = ServiceAccountCredentials.from_json_keyfile_name('PATH_TO_YOUR_CREDS', scope)
gc = gspread.authorize(credentials)
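# Hedged note: the service account's client_email in the credentials JSON must be
# given edit access to the target spreadsheet, otherwise open_by_url() below will
# typically fail with a permission or "not found" error.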

# login url for site
url = 'https://www.magicformulainvesting.com/Account/LogOn'

# declare driver as chrome headless instance
service = Service()
options = webdriver.ChromeOptions()
options.add_argument('headless')

# declare driver as chrome headless instance
driver = webdriver.Chrome(executable_path="./chromedriver", options=options)
driver = webdriver.Chrome(service=service, options=options)
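# Hedged note: with Selenium 4.6+, Selenium Manager locates or downloads a matching
# chromedriver automatically when Service() is created without a path, which is why
# the manual chromedriver download block is no longer needed. Newer Chrome releases
# may also prefer options.add_argument('--headless=new') for headless mode.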

'''Functions'''
def scrapeSite():

print("Scraping stock info...") # update for terminal
print('Scraping stock info...')

# find all table rows, then pull the company name and ticker out of each row's cells
trs=driver.find_elements_by_xpath('//table[@class="divheight screeningdata"]/tbody/tr')
trs = driver.find_elements(By.XPATH,'//table[@class="divheight screeningdata"]/tbody/tr')

names = []
tikrs = []

for tr in trs:
td = tr.find_elements_by_xpath(".//td")
td = tr.find_elements(By.XPATH,".//td")

company_name=td[0].get_attribute("innerHTML")
company_tikr=td[1].get_attribute("innerHTML")
company_name = td[0].get_attribute("innerHTML")
company_tikr = td[1].get_attribute("innerHTML")

names.append(company_name)
tikrs.append(company_tikr)

return names, tikrs
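# Note on the loop above: get_attribute("innerHTML") returns the cell's raw markup;
# td[0].text (the rendered text) may be a safer choice if the cells ever contain
# nested tags or entities.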

def writeSheet(names, tikrs):

print("Writing to sheet...") # update to terminal
print('Writing to sheet...')

# access sheet by url
wks = gc.open_by_url("YOUR URL HERE").get_worksheet(1) # worksheet number

#wks.append_row([' '], table_range='A1') # append a blank line before tickers as requested by OC

date=datetime.datetime.today().strftime('%Y-%m-%d') # current date
wks.append_row([date], table_range='A1') # append the date, starts in first column
wks = gc.open_by_url("YOUR URL HERE").get_worksheet(1) # worksheet index 1 (zero-based) is the Research sheet

date = datetime.datetime.today().strftime('%Y-%m-%d') # current date
# wks.append_row([date], table_range='A1') # append the date starting in first column
wks.append_row([date])
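# Hedged note: without a table_range, gspread's append_row() adds the row after the
# last row of data the Sheets API detects, so the date lands beneath any rows from a
# previous run.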

for i in range(len(names)):
price = '=GOOGLEFINANCE("' + tikrs[i] + '","price")'

query = names[i]

url = getUrl(query)

wks.append_row([names[i],tikrs[i], price, url], table_range='A1', value_input_option="USER_ENTERED") # start in first column
wks.append_row([names[i],tikrs[i], price, url], value_input_option="USER_ENTERED")
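# value_input_option="USER_ENTERED" makes Sheets parse the =GOOGLEFINANCE(...) string
# as a live formula rather than storing it as literal text.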

def getUrl(companyName):
url = GOOGLE_URL + '?q=' + companyName
url = GOOGLE_URL + '?q=' + companyName
result = requests.get(url)
# fancy regex courtesy of pbui
urls = re.findall('/url\?q=([^&]*)', result.text)
urls = re.findall('/url\?q=([^&]*)', result.text)
return urls[0]
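# Hedged notes: urls[0] raises IndexError when Google returns no '/url?q=' links
# (e.g. a consent or CAPTCHA page served to scripted requests); a guard such as
# "return urls[0] if urls else ''" avoids the crash. Using a raw string,
# r'/url\?q=([^&]*)', also avoids the invalid-escape-sequence warning newer Python
# versions emit for '\?'.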

'''Main Execution'''
@@ -106,30 +92,31 @@ def getUrl(companyName):
driver.get(url)

# find the input elements for logging in
username=driver.find_element_by_name("Email")
password=driver.find_element_by_name("Password")
username=driver.find_element(By.NAME,"Email")
password=driver.find_element(By.NAME,"Password")

# enter email and password; getpass hides the password so it is not echoed to the terminal
your_email=raw_input("Please enter your email for magicformulainvesting.com: ")
your_password=getpass.getpass("Please enter your password for magicformulainvesting.com: ")
# Replaced raw_input() with input() for Python 3
your_email = input("Please enter your email for magicformulainvesting.com: ")

# Have to run scraper.py from a terminal so getpass will work :)
your_password = getpass.getpass("Please enter your password for magicformulainvesting.com: ")

# selenium sends the login info to magicformulainvesting.com
username.send_keys(your_email)
password.send_keys(your_password)

# enter email and password (for hard coding only)
# username.send_keys("EMAIL")
# password.send_keys("PASSWORD")

# click login button
button=driver.find_element_by_name("login")
button = driver.find_element(By.NAME,"login")
button.click()

time.sleep(1) # seconds
time.sleep(1) # seconds

# use xpathing to find the radio button element for 50 stocks and click it
radio = driver.find_element_by_xpath('//input[@value="false" and contains(@name,"Select30")]')
# use xpath to find the radio button element for 50 stocks and click it
radio = driver.find_element(By.XPATH,'//*[@id="Select30" and @value="false"]')
radio.click()

button2=driver.find_element_by_name("stocks")
button2 = driver.find_element(By.NAME,"stocks")
button2.click()

time.sleep(.5)
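# WebDriverWait is imported above but never used; a minimal sketch of replacing the
# fixed sleeps with an explicit wait, assuming the screening table keeps the
# "divheight screeningdata" class referenced in scrapeSite():
#
#   from selenium.webdriver.support import expected_conditions as EC
#   WebDriverWait(driver, 10).until(EC.presence_of_element_located(
#       (By.XPATH, '//table[@class="divheight screeningdata"]/tbody/tr')))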