-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathscraper.py
More file actions
180 lines (160 loc) · 5.87 KB
/
scraper.py
File metadata and controls
180 lines (160 loc) · 5.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
"""
Microsoft Outlook Web Access scraper
Retrieves full, raw e-mails from Microsoft Outlook Web Access by
screen scraping. Can do the following:
* Log into a Microsoft Outlook Web Access account with a given username
and password.
* Retrieve all e-mail IDs from the first page of your Inbox (or any other
folder), optionally limited to unread messages.
* Retrieve the full, raw source of the e-mail with a given ID.
* Delete an e-mail with a given ID (technically, move it to the "Deleted
Items" folder).
The main class you use is OutlookWebScraper. See the docstrings in the code
and the "sample usage" section below.
This module does no caching. Each time you retrieve something, it does a fresh
HTTP request. It does cache your session, though, so that you only have to log
in once.
"""
# Documentation / sample usage:
#
# # Raises the InvalidLogin exception for an invalid username/password.
# >>> s = OutlookWebScraper('https://webmaildomain.com', 'username', 'invalid password')
# >>> s.login()
# Traceback (most recent call last):
# ...
# scraper.InvalidLogin
#
# >>> s = OutlookWebScraper('https://webmaildomain.com', 'username', 'correct password')
# >>> s.login()
#
# # Display IDs of messages in the inbox.
# >>> s.inbox()
# ['/Inbox/Hey%20there.EML', '/Inbox/test-3.EML']
#
# # Display IDs of messages in the "sent items" folder.
# >>> s.get_folder('sent items')
# ['/Sent%20Items/test-2.EML']
#
# # Display the raw source of a particular message.
# >>> print s.get_message('/Inbox/Hey%20there.EML')
# [...]
#
# # Delete a message.
# >>> s.delete_message('/Inbox/Hey%20there.EML')
# Forked from:
# Adrian Holovaty's weboutlook project
#
# By:
# Raja Kapur <raja.kapur@gmail.com>
#
# https://github.com/aonic/owa2gmail
# weboutlook: http://code.google.com/p/weboutlook/
# Copyright (C) 2006 Adrian Holovaty <holovaty@gmail.com>
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
# Place, Suite 330, Boston, MA 02111-1307 USA
import re, socket, urllib, urlparse
from Cookie import SimpleCookie
# Module metadata.
__version__ = '0.2'
__author__ = 'Raja Kapur <raja.kapur@gmail.com>'
# Import-time side effect: set a 15-second default timeout for sockets
# created after this point, so a stalled HTTP request fails instead of
# blocking indefinitely. The setting is global to the process, not just
# to this module.
socket.setdefaulttimeout(15)
class InvalidLogin(Exception):
    """Raised when Outlook Web Access rejects the username/password."""
class RetrievalError(Exception):
    """Raised when an expected element is missing from a fetched page."""
class CookieScraper(object):
    """Scraper that keeps track of getting and setting cookies.

    Cookies set by the server are accumulated in ``self._cookies`` and sent
    back on every later request made through ``get_page``.
    """

    def __init__(self):
        self._cookies = SimpleCookie()

    def _cookie_header(self):
        "Returns the stored cookies formatted as one Cookie header value."
        return self._cookies.output(attrs=[], header='', sep=';').strip()

    def _store_cookies(self, message):
        """
        Loads the Set-Cookie header of the given response headers, if there
        is one, into the stored cookies. The header is looked up under the
        lower-case key 'set-cookie' in ``message.dict``.
        """
        if 'set-cookie' in message.dict:
            self._cookies.load(message.dict['set-cookie'])

    def get_page(self, url, post_data=None, headers=()):
        """
        Helper method that gets the given URL, handling the sending and storing
        of cookies. Returns the requested page as a string.

        ``post_data``, if given, is passed to the opener as the request body.
        ``headers`` is an iterable of (name, value) pairs added to the
        request.

        A 302 response is followed by hand so that cookies set on the
        redirect itself are kept; the follow-up request carries neither
        ``post_data`` nor the extra ``headers``. Any other IOError from the
        opener is re-raised unchanged.
        """
        # NOTE: the request timeout comes from socket.setdefaulttimeout().
        # A "socket.timeout(300)" call used to sit here, but it only built an
        # unused exception object and never changed any timeout.
        opener = urllib.URLopener()
        opener.addheader('Cookie', self._cookie_header())
        for name, value in headers:
            opener.addheader(name, value)
        try:
            response = opener.open(url, post_data)
        except IOError as e:
            # HTTP errors arrive as IOError('http error', code, msg, headers).
            # Other failures carry fewer arguments, so check the length
            # before indexing to avoid masking them with an IndexError.
            if len(e.args) < 4 or e.args[1] != 302:
                raise
            redirect_headers = e.args[3]
            # Keep cookies set on the redirect response before following it.
            self._store_cookies(redirect_headers)
            location = redirect_headers.getheader('location')
            if not location:
                # Nothing to follow; surface the original error.
                raise
            # The Location header may be relative; resolve it against the
            # URL that was just requested.
            return self.get_page(urlparse.urljoin(url, location))
        try:
            self._store_cookies(response.headers)
            return response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()
class OutlookWebScraper(CookieScraper):
    """
    Screen scraper for a single Outlook Web Access account.

    ``domain`` is the base URL of the OWA server, e.g.
    'https://webmaildomain.com'. Logging in happens lazily: every public
    method logs in first if that has not been done yet.
    """

    def __init__(self, domain, username, password):
        super(OutlookWebScraper, self).__init__()
        self.domain = domain
        self.username, self.password = username, password
        self.is_logged_in = False
        # Taken from the <base href> tag of the page returned by login().
        self.base_href = None

    def _ensure_logged_in(self):
        "Logs in unless a previous login already succeeded."
        if not self.is_logged_in:
            self.login()

    def login(self):
        """
        Logs in with the stored username and password and records the base
        URL that all later requests are built on.

        Raises InvalidLogin if the server reports that the logon failed, and
        RetrievalError if the returned page contains no <base href> tag.
        """
        url = urlparse.urljoin(self.domain, 'exchweb/bin/auth/owaauth.dll')
        html = self.get_page(url, urllib.urlencode({
            'destination': urlparse.urljoin(self.domain, 'exchange'),
            'flags': '0',
            'username': self.username,
            'password': self.password,
            'SubmitCreds': 'Log On',
            'forcedownlevel': '0',
            'trusted': '4',
        }))
        if 'You could not be logged on to Outlook Web Access' in html:
            raise InvalidLogin()
        m = re.search(r'(?i)<BASE href="([^"]*)">', html)
        if not m:
            raise RetrievalError(
                "Couldn't find <base href> on page after logging in.")
        self.base_href = m.group(1)
        self.is_logged_in = True

    def inbox(self, unread=False):
        """
        Returns the message IDs for the messages on the first page of the
        Inbox. If ``unread`` is true, only the "Unread Messages" view is
        requested; otherwise the default folder view is used.
        """
        return self.get_folder('Inbox', unread)

    def get_folder(self, folder_name, unread=False):
        """
        Returns the message IDs for the messages on the first page of the
        folder with the given name. If ``unread`` is true, only the "Unread
        Messages" view is requested; otherwise the default folder view is
        used. The folder name is reportedly case insensitive (that is decided
        by the server, not by this code).
        """
        self._ensure_logged_in()
        url = self.base_href + urllib.quote(folder_name) + '/?Cmd=contents'
        if unread:
            url = url + "&View=Unread%20Messages"
        html = self.get_page(url)
        # Each message on the page is listed as a form field named MsgID.
        return re.findall(r'(?i)NAME=MsgID value="([^"]*)"', html)

    def get_message(self, msgid):
        "Returns the raw e-mail for the given message ID."
        self._ensure_logged_in()
        # Sending the "Translate=f" HTTP header tells Outlook to include
        # full e-mail headers. Figuring that out took way too long.
        return self.get_page(self.base_href + msgid + '?Cmd=body',
                             headers=[('Translate', 'f')])

    def delete_message(self, msgid):
        """
        Deletes the e-mail with the given message ID by posting a delete
        command to the message URL. Returns the page the server sends back.
        """
        self._ensure_logged_in()
        return self.get_page(self.base_href + msgid, urllib.urlencode({
            'MsgId': msgid,
            'Cmd': 'delete',
            'ReadForm': '1',
        }))