diff --git a/COPYING b/COPYING deleted file mode 100644 index 6f01560..0000000 --- a/COPYING +++ /dev/null @@ -1,13 +0,0 @@ -Copyright 2008 Google Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. diff --git a/Makefile b/Makefile deleted file mode 100644 index 56ce3f7..0000000 --- a/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -spec: pubsubhubbub-core-0.3.html - -pubsubhubbub-core-0.3.html: pubsubhubbub-core-0.3.xml - xml2rfc pubsubhubbub-core-0.3.xml pubsubhubbub-core-0.3.html - cat pubsubhubbub-core-0.3.html | head -n -1 > out.html - cat analytics.txt >> out.html - mv out.html pubsubhubbub-core-0.3.html - diff --git a/README.md b/README.md new file mode 100644 index 0000000..aed3622 --- /dev/null +++ b/README.md @@ -0,0 +1,25 @@ +PubSubHubbub +============ + +__IMPORTANT NOTE__: The PubSubHubbub protocol has now been adopted by the W3C as a Candidate Recommendation. It has also been renamed [WebSub](https://github.com/w3c/websub) for clarity and concision. Please consider upgrading all older PubSubHubbub implementations to WebSub. + + +**PubSubHubbub** is an open protocol for distributed publish/subscribe communication on the Internet. It generalizes the concept of webhooks +and allows data producers and data consumers to work in a decoupled way. + +PubSubHubbub provides a way to subscribe to, unsubscribe from, and receive updates from a resource, whether it's an RSS or Atom feed or any other web-accessible document (e.g., JSON). + +The current version of the spec is 0.4. Please [read it here](http://pubsubhubbub.github.io/PubSubHubbub/pubsubhubbub-core-0.4.html). + +Open hubs are provided by: +* [Superfeedr](http://pubsubhubbub.superfeedr.com/) +* [Google](http://pubsubhubbub.appspot.com/) +* Aaron Parecki: [p3k](https://switchboard.p3k.io/) +* Christian Weiske: [phubb](http://phubb.cweiske.de/) + +Several other publishing platforms, like WordPress, include their own hubs.
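To make the subscribe/unsubscribe mechanics above concrete, here is a minimal, hypothetical subscription request a subscriber could send to one of the open hubs listed above. The `requests` library and the topic/callback URLs are assumptions and placeholders; the `hub.*` form fields are the ones defined by the 0.4 spec.

```python
# Hypothetical subscription request (a sketch, not code from this repository).
import requests

HUB = "https://pubsubhubbub.appspot.com/"             # any open hub
TOPIC = "https://example.com/feed.xml"                # feed to watch (placeholder)
CALLBACK = "https://subscriber.example.com/callback"  # publicly reachable endpoint (placeholder)

resp = requests.post(HUB, data={
    "hub.mode": "subscribe",      # or "unsubscribe"
    "hub.topic": TOPIC,
    "hub.callback": CALLBACK,
})

# Per the 0.4 spec, 202 Accepted means the hub will verify the request
# asynchronously by calling back to CALLBACK before the subscription is active.
print(resp.status_code)
```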
+ +If you're looking for tutorials on how to get started with PubSubHubbub, check the links below: +* [How to PubSubHubbub](http://blog.superfeedr.com/howto-pubsubhubbub/) by Superfeedr +* [How to publish and consume PubSubHubbub](http://indiewebcamp.com/How_to_publish_and_consume_PubSubHubbub) on indiewebcamp.com + diff --git a/analytics.txt b/analytics.txt deleted file mode 100644 index e6941a1..0000000 --- a/analytics.txt +++ /dev/null @@ -1,10 +0,0 @@ - - - diff --git a/appengine_hub_preso_gadget.xml b/appengine_hub_preso_gadget.xml deleted file mode 100644 index 2552b3d..0000000 --- a/appengine_hub_preso_gadget.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/bookmarklet/Makefile b/bookmarklet/Makefile deleted file mode 100644 index 5e29c28..0000000 --- a/bookmarklet/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -bookmarklet: bookmarklet.min.js bookmarklet_jsonp.min.js bookmarklet_config.html - -%.min.js: %.js - cat $^ | python jsmin.py | tail -n 1 > $@ - -bookmarklet_config.html: bookmarklet_config.template bookmarklet.min.js - python generate_config.py > $@ diff --git a/bookmarklet/bookmarklet.html b/bookmarklet/bookmarklet.html deleted file mode 100644 index 2868d87..0000000 --- a/bookmarklet/bookmarklet.html +++ /dev/null @@ -1,115 +0,0 @@ - - - - - - - -
- -
- Found this feed: -
-
- -
- Publishing new events to this hub: -
-
- -
- - -
- -
- Configure this bookmarklet - | - About -
- -
- - - \ No newline at end of file diff --git a/bookmarklet/bookmarklet.js b/bookmarklet/bookmarklet.js deleted file mode 100644 index 7269764..0000000 --- a/bookmarklet/bookmarklet.js +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright 2009 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -(function() { - if (document.body && !document.xmlVersion) { - var jsonp = document.createElement('script'); - jsonp.type = 'text/javascript'; - // Remove the domain portion of the URL below for local testing. - jsonp.src = 'http://pubsubhubbub.appspot.com/bookmarklet_jsonp.min.js?rand=' + Math.floor(Math.random() * 1000); - var head = document.getElementsByTagName('head')[0].appendChild(jsonp); - }; -})() diff --git a/bookmarklet/bookmarklet.min.js b/bookmarklet/bookmarklet.min.js deleted file mode 100644 index 8084297..0000000 --- a/bookmarklet/bookmarklet.min.js +++ /dev/null @@ -1 +0,0 @@ -(function(){if(document.body&&!document.xmlVersion){var jsonp=document.createElement('script');jsonp.type='text/javascript';jsonp.src='http://pubsubhubbub.appspot.com/bookmarklet_jsonp.min.js?rand='+Math.floor(Math.random()*1000);var head=document.getElementsByTagName('head')[0].appendChild(jsonp);};})() \ No newline at end of file diff --git a/bookmarklet/bookmarklet_config.html b/bookmarklet/bookmarklet_config.html deleted file mode 100644 index dfdb6f1..0000000 --- a/bookmarklet/bookmarklet_config.html +++ /dev/null @@ -1,46 +0,0 @@ - - - PubSubHubbub - Configure bookmarklet - - - - - - -
- - -
- -
- Drag into your Bookmarks → - Publish to Hub -
- - - - diff --git a/bookmarklet/bookmarklet_config.template b/bookmarklet/bookmarklet_config.template deleted file mode 100644 index 76a7b50..0000000 --- a/bookmarklet/bookmarklet_config.template +++ /dev/null @@ -1,45 +0,0 @@ - - - PubSubHubbub - Configure bookmarklet - - - - - - -
- - -
- -
- Drag into your Bookmarks → - Publish to Hub -
- - - diff --git a/bookmarklet/bookmarklet_gadget.xml b/bookmarklet/bookmarklet_gadget.xml deleted file mode 100644 index 2a5f22d..0000000 --- a/bookmarklet/bookmarklet_gadget.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/bookmarklet/bookmarklet_jsonp.js b/bookmarklet/bookmarklet_jsonp.js deleted file mode 100644 index e0d2211..0000000 --- a/bookmarklet/bookmarklet_jsonp.js +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2009 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the 'License'); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an 'AS IS' BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -var _____pshb_BookmarkletRun = function() { - - var post_id = 'pshb-bookmarklet-iframe'; - var post = null; - var close = null; - - if (document.getElementById(post_id) == null) { - post = document.createElement('iframe'); - post.id = post_id; - post.width = '250'; - post.height = '120'; - var s = post.style; - s.position = 'absolute'; - s.top = '10px'; - s.right = '10px'; - s.padding = '0'; - s.margin = '0'; - s.border = '5px solid #9c0'; - s.zIndex = '1000000'; - - close = document.createElement('a'); - close.href = 'javascript:window._____pshb_closeMe();'; - close.innerHTML = '×'; - s = close.style; - s.cursor = 'default'; - s.fontWeight = 'bold'; - s.fontSize = '12px'; - s.position = 'absolute'; - s.top = '15px'; - s.right = '15px'; - s.margin = '0'; - s.borderStyle = 'dotted'; - s.borderColor = '#aaa;'; - s.borderWidth = '0 0 1px 1px'; - s.padding = '0 3px 0 3px'; - s.display = 'block'; - s.textDecoration = 'none'; - s.color = '#000'; - s.zIndex = '1000001'; - }; - - // Thanks Prototype. - var canonicalize = function(s) { - if (s == '' || s == null) { - return ''; - }; - var temp = document.createElement('div'); - temp.innerHTML = s.toLowerCase(); - var result = temp.childNodes[0].nodeValue; - temp.removeChild(temp.firstChild); // garbage collection - return result; - }; - - window._____pshb_closeMe = function() { - document.body.removeChild(close); - document.body.removeChild(post); - }; - - var findAtomFeed = function() { - var links = document.getElementsByTagName('link'); - for (var i = 0; i < links.length; ++i) { - var item = links[i]; - if (item.type != undefined && - item.href != undefined && - item.rel != undefined && - canonicalize(item.type).indexOf('application/atom') == 0 && - canonicalize(item.rel).indexOf('alternate') == 0 && - item.href.length > 0) { - return item.href; - }; - }; - return null; - }; - - window._____pshb_sawLoad = false; - window._____pshb_autoClose = true; - - // TODO: Figure out a better way to detect event delivery completion. - // XHR can see 204 responses but doing this cross-domain seems impossible. - // Proxying the publish POST through a server because that will hide the IP - // address of the requestor; this leaves publishing open to a DoS attack, - // which we want to avoid. - window._____pshb_handleLoad = function() { - if (!window._____pshb_sawLoad) { - window._____pshb_sawLoad = true; - } else { - // This means the iframe has loaded another page, which could not possibly - // be a 204 response (since browsers do nothing on that response). 
So here - // we assume the post failed and do not automatically close the window. - window._____pshb_autoClose = false; - }; - }; - - var autoClose = function() { - if (window._____pshb_autoClose) { - window._____pshb_closeMe(); - }; - }; - - - if (post != null) { - var feed = findAtomFeed(); - var hub = _____pshb_getHub(); - post.onload = _____pshb_handleLoad; - // Remove the domain portion of the URL below for local testing. - post.src = 'http://pubsubhubbub.appspot.com/bookmarklet.html' + '?feed=' + feed + '&hub=' + hub; - document.body.appendChild(post); - document.body.appendChild(close); - setTimeout(autoClose, 2000); - }; -}; - -_____pshb_BookmarkletRun(); diff --git a/bookmarklet/bookmarklet_jsonp.min.js b/bookmarklet/bookmarklet_jsonp.min.js deleted file mode 100644 index 7d83689..0000000 --- a/bookmarklet/bookmarklet_jsonp.min.js +++ /dev/null @@ -1 +0,0 @@ -var _____pshb_BookmarkletRun=function(){var post_id='pshb-bookmarklet-iframe';var post=null;var close=null;if(document.getElementById(post_id)==null){post=document.createElement('iframe');post.id=post_id;post.width='250';post.height='120';var s=post.style;s.position='absolute';s.top='10px';s.right='10px';s.padding='0';s.margin='0';s.border='5px solid #9c0';s.zIndex='1000000';close=document.createElement('a');close.href='javascript:window._____pshb_closeMe();';close.innerHTML='×';s=close.style;s.cursor='default';s.fontWeight='bold';s.fontSize='12px';s.position='absolute';s.top='15px';s.right='15px';s.margin='0';s.borderStyle='dotted';s.borderColor='#aaa;';s.borderWidth='0 0 1px 1px';s.padding='0 3px 0 3px';s.display='block';s.textDecoration='none';s.color='#000';s.zIndex='1000001';};var canonicalize=function(s){if(s==''||s==null){return'';};var temp=document.createElement('div');temp.innerHTML=s.toLowerCase();var result=temp.childNodes[0].nodeValue;temp.removeChild(temp.firstChild);return result;};window._____pshb_closeMe=function(){document.body.removeChild(close);document.body.removeChild(post);};var findAtomFeed=function(){var links=document.getElementsByTagName('link');for(var i=0;i0){return item.href;};};return null;};window._____pshb_sawLoad=false;window._____pshb_autoClose=true;window._____pshb_handleLoad=function(){if(!window._____pshb_sawLoad){window._____pshb_sawLoad=true;}else{window._____pshb_autoClose=false;};};var autoClose=function(){if(window._____pshb_autoClose){window._____pshb_closeMe();};};if(post!=null){var feed=findAtomFeed();var hub=_____pshb_getHub();post.onload=_____pshb_handleLoad;post.src='http://pubsubhubbub.appspot.com/bookmarklet.html'+'?feed='+feed+'&hub='+hub;document.body.appendChild(post);document.body.appendChild(close);setTimeout(autoClose,2000);};};_____pshb_BookmarkletRun(); \ No newline at end of file diff --git a/bookmarklet/generate_config.py b/bookmarklet/generate_config.py deleted file mode 100755 index c9163a2..0000000 --- a/bookmarklet/generate_config.py +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env python - -js = open('bookmarklet.min.js').read() -config = open('bookmarklet_config.template').read() -print config % {'bookmarklet_min_js': js} diff --git a/bookmarklet/jsmin.py b/bookmarklet/jsmin.py deleted file mode 100644 index 2339f7d..0000000 --- a/bookmarklet/jsmin.py +++ /dev/null @@ -1,218 +0,0 @@ -#!/usr/bin/python - -# This code is original from jsmin by Douglas Crockford, it was translated to -# Python by Baruch Even. The original code had the following copyright and -# license. 
-# -# /* jsmin.c -# 2007-05-22 -# -# Copyright (c) 2002 Douglas Crockford (www.crockford.com) -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of -# this software and associated documentation files (the "Software"), to deal in -# the Software without restriction, including without limitation the rights to -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -# of the Software, and to permit persons to whom the Software is furnished to do -# so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# The Software shall be used for Good, not Evil. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# */ - -from StringIO import StringIO - -def jsmin(js): - ins = StringIO(js) - outs = StringIO() - JavascriptMinify().minify(ins, outs) - str = outs.getvalue() - if len(str) > 0 and str[0] == '\n': - str = str[1:] - return str - -def isAlphanum(c): - """return true if the character is a letter, digit, underscore, - dollar sign, or non-ASCII character. - """ - return ((c >= 'a' and c <= 'z') or (c >= '0' and c <= '9') or - (c >= 'A' and c <= 'Z') or c == '_' or c == '$' or c == '\\' or (c is not None and ord(c) > 126)); - -class UnterminatedComment(Exception): - pass - -class UnterminatedStringLiteral(Exception): - pass - -class UnterminatedRegularExpression(Exception): - pass - -class JavascriptMinify(object): - - def _outA(self): - self.outstream.write(self.theA) - def _outB(self): - self.outstream.write(self.theB) - - def _get(self): - """return the next character from stdin. Watch out for lookahead. If - the character is a control character, translate it to a space or - linefeed. - """ - c = self.theLookahead - self.theLookahead = None - if c == None: - c = self.instream.read(1) - if c >= ' ' or c == '\n': - return c - if c == '': # EOF - return '\000' - if c == '\r': - return '\n' - return ' ' - - def _peek(self): - self.theLookahead = self._get() - return self.theLookahead - - def _next(self): - """get the next character, excluding comments. peek() is used to see - if an unescaped '/' is followed by a '/' or '*'. - """ - c = self._get() - if c == '/' and self.theA != '\\': - p = self._peek() - if p == '/': - c = self._get() - while c > '\n': - c = self._get() - return c - if p == '*': - c = self._get() - while 1: - c = self._get() - if c == '*': - if self._peek() == '/': - self._get() - return ' ' - if c == '\000': - raise UnterminatedComment() - - return c - - def _action(self, action): - """do something! What you do is determined by the argument: - 1 Output A. Copy B to A. Get the next B. - 2 Copy B to A. Get the next B. (Delete A). - 3 Get the next B. (Delete B). - action treats a string as a single character. Wow! - action recognizes a regular expression if it is preceded by ( or , or =. 
- """ - if action <= 1: - self._outA() - - if action <= 2: - self.theA = self.theB - if self.theA == "'" or self.theA == '"': - while 1: - self._outA() - self.theA = self._get() - if self.theA == self.theB: - break - if self.theA <= '\n': - raise UnterminatedStringLiteral() - if self.theA == '\\': - self._outA() - self.theA = self._get() - - - if action <= 3: - self.theB = self._next() - if self.theB == '/' and (self.theA == '(' or self.theA == ',' or - self.theA == '=' or self.theA == ':' or - self.theA == '[' or self.theA == '?' or - self.theA == '!' or self.theA == '&' or - self.theA == '|' or self.theA == ';' or - self.theA == '{' or self.theA == '}' or - self.theA == '\n'): - self._outA() - self._outB() - while 1: - self.theA = self._get() - if self.theA == '/': - break - elif self.theA == '\\': - self._outA() - self.theA = self._get() - elif self.theA <= '\n': - raise UnterminatedRegularExpression() - self._outA() - self.theB = self._next() - - - def _jsmin(self): - """Copy the input to the output, deleting the characters which are - insignificant to JavaScript. Comments will be removed. Tabs will be - replaced with spaces. Carriage returns will be replaced with linefeeds. - Most spaces and linefeeds will be removed. - """ - self.theA = '\n' - self._action(3) - - while self.theA != '\000': - if self.theA == ' ': - if isAlphanum(self.theB): - self._action(1) - else: - self._action(2) - elif self.theA == '\n': - if self.theB in ['{', '[', '(', '+', '-']: - self._action(1) - elif self.theB == ' ': - self._action(3) - else: - if isAlphanum(self.theB): - self._action(1) - else: - self._action(2) - else: - if self.theB == ' ': - if isAlphanum(self.theA): - self._action(1) - else: - self._action(3) - elif self.theB == '\n': - if self.theA in ['}', ']', ')', '+', '-', '"', '\'']: - self._action(1) - else: - if isAlphanum(self.theA): - self._action(1) - else: - self._action(3) - else: - self._action(1) - - def minify(self, instream, outstream): - self.instream = instream - self.outstream = outstream - self.theA = '\n' - self.theB = None - self.theLookahead = None - - self._jsmin() - self.instream.close() - -if __name__ == '__main__': - import sys - jsm = JavascriptMinify() - jsm.minify(sys.stdin, sys.stdout) diff --git a/design-doc.txt b/design-doc.txt deleted file mode 100644 index 74f9a3a..0000000 --- a/design-doc.txt +++ /dev/null @@ -1,593 +0,0 @@ -Note: -===== - -This document is old. Instead, see: - -http://pubsubhubbub.googlecode.com/svn/trunk/pubsubhubbub-core-0.1.html - - - - -============================================================================ ->> Overview -============================================================================ - -An open, simple web-scale pubsub protocol, along with an open source -reference implentation targetting Google App Engine. Notably, -however, nothing in the protocol is centralized, or Google- or App -Engine-specific. Anybody can play. - -As opposed to more developed (and more complex) pubsub specs like -XEP-0060, this spec's base profile (the barrier-to-entry to speak it) -is dead simple. The fancy bits required for high-volume publishers -and subscribers are optional. The base profile is HTTP-based, as -opposed to XMPP (see more on this below). - -To dramatically simplify the spec in several places where we had to -choose between supporting A or B, we took it upon ourselves to say -"only A", rather than making it an implementation decision. 
- -We offer this spec in hopes that it fills a need or at least advances -the state of the discussion in the pubsub space. Polling sucks. We -think a decentralized pubsub layer is a fundamental, missing layer in -the Internet architecture today and its existence, more than just -enabling the obvious lower latency feed readers, would enable many -cool applications, most of which we can't even imagine. But we're -looking forward to decentralized social networking. - - - -============================================================================ ->> Terminology -============================================================================ - -Topic: an Atom feed URL. The unit to which one can subscribe to - changes. RSS isn't supported for simplicity. Further, the spec - currently only addresses public (unauthenticated) Atom feed URLs. - -Pubsub Hub ("the hub"): the server (URL) which implements this protocol. - We're currently implementing this and running at server at - http://pubsubhubbub.appspot.com/ that's at least for now open for anybody - to use, as either a publisher or subscriber. Any hub is free to - implement its own policies on who can use it. - -Publisher: an owner of a topic. Notifies the pubsub hub when the topic - (Atom feed) has been updated. Just notifies that it _has_ been updated, - but not how. As in almost all pubsub systems, the publisher is unaware - of the subscribers, if any. - -Subscriber: an entity (person or program) that wants to be notified of - changed on a topic. Must be directly network-accessible, not being - a NAT. PubSubHubbub is a server-to-server protocol. If you're being - NAT, you're a client, out-of-scope for this protocol. (Browser channels, - long-polling a server would be more appropriate for you.) - -Subscription: a tuple (Topic URL, Subscriber). For network-accessible - subscribers, the subscription's unique key is actually the tuple - (Topic URL, Subscriber Callback URL). For NAT'd subscribers, - the unique key for a subscription is (Topic URL, SubscriberToken). - In both cases, subscriptions may (at the hub's decision) have expiration - times akin to DHCP leases and then must be renewed. - -Event: an event that's visible to multiple topics. For each event - that happens (e.g. "Brad posted to the Linux Community."), multiple - topics could be affected (e.g. "Brad posted." and "Linux community - has new post"). Publisher events update topics, and the hub looks - up all subscriptions for all affected topics, sending out - notifications to subscribers. - -Notification: a delta on a topic, computed by the hub and sent to all - subscribers. (TBD: format of this delta. likely: an Atom feed - itself with just the new or changed stuff, and gravestones for - removed items?) The notification can be the result of a publisher - telling the hub of an update, or the hub proactively polling a topic - feed, perhaps for a subscriber subscribing to a topic that's not - pubsub-aware. Note also that a notification to a subscriber can be - a payload consisting of updates for multiple topics. Publishers MAY - choose to send multi-topic notifications as an optimization for - heavy subscribers, but subscribers MUST understand them. - - - - - -============================================================================ ->> Notes: -============================================================================ - -* There is no relationship or hierarchy between topics. 
In the future - such an Atom extension could exist, but that's entirely out of this - spec, both now and then. Non-goal. If a publisher wants to offer - a hierarchy, they need to offer 'n' Atom feeds. - -* For HTTP callback subscribers, the add-subscription part of the - protocol requires that the hub verifies (via a pingback: "did you - really mean that?") before actually adding the subscription. This - is to prevent people from DoS'ing each other by subscribing victims - to many and/or high-volume publishers. - -* In same way openid was bootstrappable with a simple tag, should - be similar for publishers to delegate their pubsubhub with a simple - link tag. Example: - - -* Multi-protocol would be nice, but simple would probably win... HTTP - only at first. XMPP later. XMPP has a few advantages, but really - only authentication. A good HTTP implementation can do long polling, - pingbacks, etc. - -* Loops. Perhaps Atom child element (repeated) of all the Atom Entry - IDs that entry used to be or came from or is. Neat to see the - HTTP-like TRACE. (perhaps extension to Atom, not part of this spec) - (** Looked it up, and implies this, but it - only works if all of the feeds in the trace correctly supply a 'via' - tag. Then it's on the client to iteratively follow the trace). - - - -============================================================================ ->> High-level protocol flow: -============================================================================ - - - -* Publishers POST a ping to their hub(s) URLs when their topic(s) - change. - -* Subscribers POST to one or more of the advertised hubs for a topic they're interested in. Alternatively, some hubs may offer auto-polling capability, to let {their,any} subscribers subscribe to topics which don't advertise a hub. - -* The hub caches minimal metadata (id, data, entry digest) about each topic's previous state. When the hub refetches a topic feed (on its own initiative or as a result of a publisher's ping) and finds a delta, it enqueues a notification to all registered subscribers. Subscribers can be notified of topic deltas in a variety of ways: - - - - - - - In the base profile, subscribers must be directly network accessible (not behind a NAT), - running a listening webserver, and can receive an HTTP callback to notify them their topic - changed. To avoid authentication issues with HTTP, this - callback doesn't include any payload but rather just a note for - the subscriber to check the hub for the topic URL (which - presumably they trust, if they subscribed to it in the first - place). In the future, this HTTP callback could include a - signed (OAuth?) payload, avoiding the need for the extra HTTP - request in the other direction. In any high transaction - scenario, though, it's hoped that all parties (hub, publisher, - subscriber) would make proper use of HTTP Keep-Alive - connections, negating the ugliest part of the multiple HTTP - requests (new TCP connections: 3-way handshake, slow start, - ephemeral port exhaustion, etc). - - - Also in the base profile, but slightly lower priority for us - implementation-wise, is support for NAT'd subscribers unable - to run a publicly accessible listening webserver. Instead, - these subscribers need to connect to the hub to retrieve their - enqueued notifications. A smart hub implementation here would - support HTTP long-polling (aka "comet") so the client doesn't - need to make HTTP requests often to get low-latency updates. 
- (TODO/FUTURE: define recommendations for this long-polling behavior - on both client and server: ideally server just does it, hanging - after the GET, but then what's the recommendation for the client's - HTTP client timeout value, which might not be under their control? - Ignore that and document it? Separate URL for long polling? - Then subscriber caches hub's long-polling ability? Server includes - X- header to signal that it did or wants to do long polling?) - - - Fancier implementations may choose to use HTTP long polling - ("comet") or XMPP. We're punting on this for now in the - interest of getting something basic working for the common case. - - - -============================================================================ ->> Atom details -============================================================================ - -Notification and source formats will be Atom. More detail follows this example. - - - # ... source, title, etc ... - - - - 2008-08-11T02:15:01Z - - # Example of a full entry. - - Heathcliff - - http://publisher.com/happycat25.xml - 2008-08-11T02:15:01Z - - What a happy cat. Full content goes here. - - - - # Example of an entity that isn't full/is truncated. This is implied - # by the lack of a element and a element instead. - - Heathcliff - - http://publisher.com/happycat25.xml - 2008-08-11T02:15:01Z - - What a happy cat! - - - - # Meta-data only; implied by the lack of and elements. - - Garfield - - http://publisher.com/happycat25.xml - 2008-08-11T02:15:01Z - - - # Context entry that's meta-data only and not new. Implied because the - # update time on this entry is before the //atom:feed/updated time. - - Nermal - - http://publisher.com/happycat25.xml - 2008-07-10T12:28:13Z - - - - -Publisher makes the decision as to include full body, truncated body, -or meta data of most recent event(s). One of: - - URL + metadata - URL + metadata + truncated - URL + metadata + full - -The trade-off between including all content in outgoing notifications -or having the thundering herd (by clients who fetch the -//atom:feed/entry/link in response to a notification) is up to the -publisher. - -Entries of most recent 10 events (for recipient to know whether or not -they'd missed any recent items... like TCP SACK) will be provided as -context. This is implied by the difference between the -//atom:feed/updated field and the //atom:feed/entry/updated -fields. The //atom:feed/updated field will be set to the time of the -*oldest* in the list that is new. All items with - times before then are context; all with times equal to or -after are new. This also lets subscribers know how long it has been -from when the notification was first sent by the publisher to when -they actually received it from the hub. - -The //atom:feed/link[@rel="self"] element will indicate the original -URL for the entire event stream with no truncation (if available). - -The //atom:feed/link[@rel="hub.delegate"] element indicates the URL -that the hub should use for retrieving new notifications from a -publisher. The publisher can make this delegate URL contain a -meta-data only or truncated view of the feed. If a hub.delegate is not -provided, then the 'self' URL is used as both the source of -notifications and the source for the topic URL feed. - -Topic URLs must be unique, but multiple topics may use the same -hub.delegate. In this situation, the delegate URL may serve a -MIME multipart response, each part of which will contain a -separate Atom document for an individual topic. 
The hub -must understand this delegation. Once it has fetched the topic URL -once to see this delegation is present, it will use the delegation url -to pull the feed. This allows the publisher to be more efficient at -publishing across many topics at once with a single fetch from the -hub. - -TODO: How do you indicate to the hub that you no longer want to have a -delegate URL? - -Requirement is that topic URLs and delegate URLs can never overlap! - -More info on atom:link tag meanings here: - http://intertwingly.net/wiki/pie/LinkTagMeaning - -============================================================================ ->> Subscribing -============================================================================ - -There are multiple ways to subscribe, depending on the type and -needs of the subscriber. Roughly, the types are as follows: - - 1. Internet-accessible subscriber using HTTP callback - (new subscriptions need to be verified to prevent using - the hub to DoS others) - 1.1. verification synchronously - 1.2. verification asynchronously ("deferred") - 2. NAT'd subscribers or those without an HTTP server - (no verification necessary) - -Flow for subscription, using the following example URLs: - -http://subr.com/notify-callback.php -http://pubr.com/happycats.xml -http://hub.com/hubpoint - - 1. Subr does POST to /hubpoint with payload: - - & hub.mode=subscribe - & hub.callback = http://subr.com/notify-callback.php - & hub.topic = http://pubr.com/happycats.xml - (may be repeated for large subscriptions) - & hub.verify = async,sync - & hub.verify_token = [opaque] - - The hub.verify is an optional comma-separated list of the - subscribers ordered preferences and capabiliies, - verification-wise. One of: - - sync -- Subr only supports synchronous verification. - async -- Subr only supports async verification. - WARNING: it's not required that servers support - async, so this type of subscription may fail. - sync,async -- Subr prefers sync to async. - async,sync -- Subr prefers async to sync. - - The optional hub.verify_token is opaque to the hub and is simply - echoed back to the subscriber in the verification request. - Subscribers can put whatever they want in it: database primary - keys, encrypted data, etc... anything that makes processing the - hub.mode=subverify request easier. - - 2. Hub sends new request "oh do you want this topic?" to - /notify-callback.php with x-requester-ip: 1.2.3.4 (so DoSing - clients can be detected). - - POST /notify-callback.php - Host: subr.com - - hub.mode=subverify & - hub.topic=whatever - - NOTE: Maybe this should be a GET to the callback URL instead of a POST, since - it represents a steady state for the subscriber? We should probably be rigid - about the 204 here, if possible; otherwise it's really hard to differentiate - between a callback success and just pointing at a random good page on the - web that will return a 200 no matter what you throw at it. - - 3. Subr says, "yes, I really do want this topic": - - HTTP/1.1 204 No Content - - 4. Hub responds to Subr with "okay". Either 204 if the - subscription(s) were verified and created, or 202 if the - subscriptions were enqueued to be verified later. - -TODO: Somewhere in here we should require the subscriber to re-confirm their subscription after a certain amount of time. We need to convey to them what the expiration period of their subscription will be. 
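As a rough sketch of what steps 2 and 3 look like from the subscriber's side, a minimal callback endpoint might be written like this. It assumes the draft handshake described above (the hub POSTs hub.mode=subverify with the topic, and the subscriber answers 204 No Content if it really requested the subscription); the topic list, port, and use of wsgiref are placeholders, not requirements of the spec.

```python
# Sketch of a subscriber callback for the draft verification handshake above.
# The hub POSTs hub.mode=subverify & hub.topic; we answer 204 if we asked for it.
from wsgiref.simple_server import make_server

try:
    from urlparse import parse_qs          # Python 2
except ImportError:
    from urllib.parse import parse_qs      # Python 3

EXPECTED_TOPICS = set(['http://pubr.com/happycats.xml'])  # topics we subscribed to


def callback_app(environ, start_response):
    length = int(environ.get('CONTENT_LENGTH') or 0)
    params = parse_qs(environ['wsgi.input'].read(length).decode('utf-8'))
    mode = params.get('hub.mode', [''])[0]
    topic = params.get('hub.topic', [''])[0]

    if mode == 'subverify' and topic in EXPECTED_TOPICS:
        # "Yes, I really do want this topic."
        start_response('204 No Content', [])
        return [b'']
    start_response('404 Not Found', [('Content-Type', 'text/plain')])
    return [b'unknown subscription']


if __name__ == '__main__':
    make_server('', 8080, callback_app).serve_forever()
```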
- -If verification is being done asynchronously, steps 2 and 3 above are -skipped and Hub's 2xx response in step 4 is really just saying, -"Potential subscription enqueued for later verification." - -Publisher must provide synchronous capability at a minimum. - -Sub | Situation | Results - -SA - fetch succeed --> 204 (no content) -SA - fetch fail or server prefers async, async logged --> 202 (accepted): best effort. min 1 retry in the future only. -SA - fetch fail, async not supported --> 501 (not implemented) -AS - async supported --> 202 accepted. best effort. -AS - async not supported (or not preferred) + fetch success --> 204 no content. success! -AS - async not supported (or not preferred) + fetch failure --> 5xx -S - fetch succeed --> 204 (no content; success!) -S - fetch failed --> 5xx -A - server supports --> 202 (accepted) best effort later -A - server doesn't support --> 501 (not implemented) - -TODO: 5xx on fetch failure isn't clear enough. Maybe we should use 409 ("Conflict") to indicate when a synchronous subscription request tries to confirm and fails. Then it's clearly the requestor's fault and not a server error. - -In the case of temporary server error, the server should return 503. - -============================================================================- ->> Subscribe Protocol -============================================================================ - -POST -http://publisher.com/subpoint? - callback=http://subscriber.com/callback.php - topic=http://publisher.com/foo.xml - async={AS, SA, A, S} - mode=unsubscribe (optional: default is 'subscribe') - - Error cases: - * If callback is invalid: TODO - * If topic isn't handled by this pubsubhub: TODO - - Probably if it's an unknown topic, issue a 404 - * Async option is bogus (400 bad request) - -TODO: What about support for multi-part data for the subscriber? For -very simple subscribers, we probably don't even want to do multipart -form-data, because it's more complex to parse? Or is it a minimum -requirement that the post body will always be multipart? - -============================================================================ ->> Publishing -============================================================================ - -Overview: - - A publisher pings the hub with the URL(s) which have been updated - and the hub schedules them to be fetched & diffed. Because it's - just a ping to wake up the hub, no authentication from the publisher - is required. - -Protocol: - -POST -http://pubsubhubbub.com/hubpoint? - hub.mode=publish & - hub.url=http://publisher.com/topic1.xml & - hub.url=http://publisher.com/topic2.xml & - ... - - The 'url' field can be repeated for any combination of topic URL or - delegate URLs. The hub should deal properly with duplicate URLs. - - Error cases: - * Topic(s) known/accepted. -> 204 No content. - * Topic(s) unknown/unaccepted -> 4xx Bad Request / Forbidden. - -This will enqueue a feed-fetch for sometime in the future, followed by -pushing the new notifications of potential deltas to all subscribers. -The hub may decide to combine this publish notification with any -earlier publish notification that have not yet been pushed to -subscribers (this could happen if events are coming in faster than the -hub will allow). - -The hub's GET request of the Atom topic URL may include a Google -Reader Feed-fetcher style thing where there is a statistics header on -the request for the feed every time we pull it. Then the publisher -always knows how many subscribers are on the hub. 
Example: - - GET /foo.xml HTTP/1.1 - Host: publisher.com - X-Hub-Subscribers: 120 - - -============================================================================ ->> Receive Events -============================================================================ - -POST -http://subscriber.com/callback.php - -Post body will be the Atom notification feed described above. The hub will -keep track of the last N known elements for the topic, and send -updates only for the newest elements (along with N entries for -context). - -The subscriber will know the topic URL by looking at the -//atom:feed/link[@rel="self"] value? Or maybe we'll make it rel="source" for -the notifications? - -The subscriber should return 200 or 204 on successful acceptance of -the event. 4xx and 5xx responses will be considered errors (and -delivery will be attempted again later). TODO: What should we do with -3xx responses? - -=========================================================================== ->> Meeting notes from 2008-09-16: -=========================================================================== - -Priorities: - - ignore for now NAT'ed token polling (requires https anyway) - - ignore for now XMPP (requires XMPP anyway) - - ignore for now huge subscribers: - - multi-topic notifications - - long-lived connections, - - one HTTP in-flight at a time, - - ignore for now huge publishers: - - publishing tons of updated URLs at a time (e.g. Blogger) - - ignore for now (until v2) all authentication issues: - - no pushing payloads to subscribers. send them notification - to poll us instead. perhaps with token. - - ignore for now private Atom URLs/topics. public topics for now. - OAuth or something later. - -Keep atomid of all feed entries we've seen on an Atom URL in the past. -(or just the immediate past one perhaps? or 'n' days of them?). keep -(atomid, date, digest) - -Lexicon: - topicid: an Atom URL - topicdeltaid: a diff of two Atom URLs (t1 and t2). - -POST /pubber/?topic_url=http://lolcats/lolcatz.xml - SELECT subberid FROM subbers WHERE topicid=? LIMIT 1 - ("does anybody give a shit?") - If no, - return "Thanks bye! 200 OK!" (optionally tell google - crawlers, based on publisher's preference. TODO: put this - in spec somehow. perhaps reuse the term "noindex"?) - If yes, - enqueue a poll-this-url-later record. one insert. bounded - latency. return 200 OK - -Cron: -GET /do-some-work/fetch-updated-feeds-and-find-deltas - pull feed, - compute digests. find ids, dates. compute deltas from our copy of that thing's previous value. - INSERT INTO topicdeltapayloads - SET topicdeltaid="yyyymmhhddmss.mmss:topicid", - payload=..., topicid=.... - INSERT INTO topics_what_are_new_but_people_need_to_be_notified - SET topicdeltaid=?, subid-where-i-left-off="" - - -GET /do-some-work/send-notifications - SELECT topiciddeltaid, subid-where-i-left-off FROM topics_what_are_new_but_people_need_to_be_notified LIMIT 500 - RANDOMIZE LIST - Foreach topicid: - try-to-get-lock { - SELECT the topicdeltaid payload - SELECT subscribers WHERE topicid = ? AND subid > subid-where-i-left-off - BATCH urlfetch POST to them all, - scatter-gather errors. - - For those that fail from the 100-some batch, create - to-do-later (notification) records. increase subid if the - selected count == the previous limit, - else DELETE FROM - topics_where_people_need_to_be_notified WHERE - topiciddeltaid = ? - - } // end lock - -XMPP: - -* in the future, if/when App Engine supports it. but it's a special - thingy. HTTP is base and required. 
XMPP support for pubbers and - subbers is optional. - -Polling mode for subscribers: - -* a) callbacks won't always work (subscribers behind NATs, etc) -* b) callbacks won't always fit all subscriber's model (not easy for them) -* so must have poll mode. -* in the future: can be long-poll, when App Engine supports it. maybe. -* needs auth -* 1MB payload on responses, so server needs ability to paginate and set "but there's more!" flag w/ continuation token. - -The hub notifies all subbers: POST /callback/url/ "yo, something's new for you. don't trust me. fetch: http://pubsubhubbub.appspost.com/poll-for-new-shit/?subid=234&token=23482903482340923849023840923i4" - -Large subscribers: (may be v2) - -* one in-flight HTTP POST to subscribers at a time. use memcacheg 10 second or so lock. -* if another POST is attempted while another is already in flight, enqueue/append the payloadid to a new - table, contentious_or_big_subscriber. still mark that (topicdeltaid, subid) pair as done for the purposes of - /do-some-work/sent-notifications -* new do-some-work: - /do-some-work/sent-notifications-to-big-peeps -* optional property on subscriptions for big subscribers to say, "Yo, it's okay to mix my subscriptions together - in one HTTP payload post." in which case it's atom-stream.xml style (updates.sixapart.com) and the payloads are mixed: - - -Misc notes: ----------------- -* can subscribe to anything, regardless of whether or not there are any publishers. -* server's choice whether or not to actually poll proactively for changes vs. getting notified. - - -Discovery: --------------- -in Atom.xml: - - (repeated. client should pick one) - -in /index.html - - - -then bookmarklet to ping the publish URL. - -=========================================================================== - end meeting notes from 2009-09-16 -=========================================================================== - -=== Open issues... === - -Is there an existing standard for aggregators to specify how many readers they're requesting on behalf of? - - diff --git a/hub/all_stats.html b/hub/all_stats.html deleted file mode 100644 index d7e3403..0000000 --- a/hub/all_stats.html +++ /dev/null @@ -1,65 +0,0 @@ - - - Hub - Statistics - - - - - -
- -
- -

Table of contents

- - -

Fetch stats

-

Per-URL error rate

-{% for result in fetch_url_error %} - {% include "stats_table.html" %} -{% endfor %} - -

Per-URL latency

-{% for result in fetch_url_latency %} - {% include "stats_table.html" %} -{% endfor %} - -

Per-domain error rate

-{% for result in fetch_domain_error %} - {% include "stats_table.html" %} -{% endfor %} - -

Per-domain latency

-{% for result in fetch_domain_latency %} - {% include "stats_table.html" %} -{% endfor %} - - -

Delivery stats

-

Per-URL error rate

-{% for result in delivery_url_error %} - {% include "stats_table.html" %} -{% endfor %} - -

Per-URL latency

-{% for result in delivery_url_latency %} - {% include "stats_table.html" %} -{% endfor %} - -

Per-domain error rate

-{% for result in delivery_domain_error %} - {% include "stats_table.html" %} -{% endfor %} - -

Per-domain latency

-{% for result in delivery_domain_latency %} - {% include "stats_table.html" %} -{% endfor %} - - - - diff --git a/hub/app.yaml b/hub/app.yaml deleted file mode 100644 index fd655af..0000000 --- a/hub/app.yaml +++ /dev/null @@ -1,62 +0,0 @@ -application: pubsubhubbub -version: 1 -runtime: python -api_version: 1 - -skip_files: -- ^(.*/)?app\.yaml -- ^(.*/)?app\.yml -- ^(.*/)?index\.yaml -- ^(.*/)?index\.yml -- ^(.*/)?#.*# -- ^(.*/)?.*~ -- ^(.*/)?.*\.py[co].* -- ^(.*/)?.*/RCS/.* -- ^(.*/)?\..* -- ^(.*/)?(main_test|remote_shell|testutil|urlfetch_test_stub|feed_diff_test)\.py -- ^(.*/)?feed_diff_testdata - -handlers: -- url: /base\.css - static_files: base.css - upload: base\.css - secure: optional - -- url: /favicon\.ico - static_files: favicon.ico - upload: favicon\.ico - secure: optional - -# Admin tools -- url: /remote_api - script: $PYTHON_LIB/google/appengine/ext/remote_api/handler.py - login: admin - secure: optional - -- url: /admin(/.*)? - script: $PYTHON_LIB/google/appengine/ext/admin - login: admin - -- url: /stats - script: main.py - login: admin - -- url: /mapreduce(/.*)? - script: mapreduce/main.py - login: admin - -# Optional bookmarklet creation gadget. -- url: /bookmarklet(_jsonp\.min\.js|\.min\.js|\.html|_config\.html|_gadget\.xml) - static_files: bookmarklet/bookmarklet\1 - upload: bookmarklet/.+\.(min\.js|html|xml) - secure: optional - -# Always require the subscriber details form to be securely accessed. -- url: /subscription-details - script: main.py - secure: always - -# Everything else -- url: .* - script: main.py - secure: optional diff --git a/hub/async_apiproxy.py b/hub/async_apiproxy.py deleted file mode 100644 index 478dae4..0000000 --- a/hub/async_apiproxy.py +++ /dev/null @@ -1,133 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2008 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""APIProxy-like object that enables asynchronous API calls.""" - -import collections -import logging -import sys - -from google.appengine.api import apiproxy_stub_map -from google.appengine.runtime import apiproxy -from google.appengine.runtime import apiproxy_errors -from google3.apphosting.runtime import _apphosting_runtime___python__apiproxy - - -class DevAppServerRPC(apiproxy.RPC): - """RPC-like object for use in the dev_appserver environment.""" - - def MakeCall(self): - pass - - def Wait(self): - pass - - def CheckSuccess(self): - apiproxy_stub_map.MakeSyncCall(self.package, self.call, - self.request, self.response) - self.callback() - - -if hasattr(_apphosting_runtime___python__apiproxy, 'MakeCall'): - AsyncRPC = apiproxy.RPC - logging.debug('Using apiproxy.RPC') -else: - logging.debug('Using DevAppServerRPC') - AsyncRPC = DevAppServerRPC - - -class AsyncAPIProxy(object): - """Proxy for asynchronous API calls.""" - - def __init__(self): - # TODO: Randomize this queue in the dev_appserver to simulate a real - # asynchronous queue and better catch any funny race-conditions or - # unclear event ordering dependencies. 
- self.enqueued = collections.deque() - self.complete = collections.deque() - - def start_call(self, package, call, pbrequest, pbresponse, user_callback, - deadline=None): - """user_callback is a callback that takes (response, exception)""" - if not callable(user_callback): - raise TypeError('%r not callable' % user_callback) - - # Do not actually supply the callback to the async call function because - # when it runs it could interfere with global state (like Datastore - # transactions). The callback will be run from the wait_one() function. - done_callback = lambda: user_callback(pbresponse, None) - rpc = AsyncRPC(package, call, pbrequest, pbresponse, - lambda: self.end_call(done_callback), - deadline=deadline) - setattr(rpc, 'user_callback', user_callback) - setattr(rpc, 'pbresponse', pbresponse) - - self.enqueued.append(rpc) - show_request = '...' - if rpc.package == 'urlfetch': - show_request = pbrequest.url() - logging.debug('Making call for RPC(%s, %s, %s, ..)', - rpc.package, rpc.call, show_request) - rpc.MakeCall() - - def end_call(self, rpc): - """An outstanding RPC has completed, enqueue its callback for execution.""" - self.complete.append(rpc) - - def rpcs_outstanding(self): - """Returns the number of asynchronous RPCs pending in this proxy.""" - return len(self.enqueued) - - def _wait_one(self): - """Wait for a single RPC to finish.""" - if not self.enqueued: - return - rpc = self.enqueued.popleft() - logging.debug('Waiting for RPC(%s, %s, .., ..)', rpc.package, rpc.call) - rpc.Wait() - try: - rpc.CheckSuccess() - except (apiproxy_errors.Error, apiproxy_errors.ApplicationError), e: - rpc.user_callback(None, e) - - def _run_callbacks(self): - """Runs a single RPC's success callback. - - Callbacks are run in a loop like this from the wait() callstack to avoid - race conditions from the APIProxy. Any API call can cause asynchronous - callbacks to fire before the main thread goes to sleep, which means a - user callback could run *before* a Commit() call finishes; this causes - really bad situations when the user callback also does some API calls. To - handle this properly, all callbacks will just go onto the completion - queue, and then run at the top of the stack here after at least one RPC - waiting period has finished. - """ - while True: - try: - callback = self.complete.popleft() - except IndexError: - return - else: - callback() - - def wait(self): - """Wait for RPCs to finish. Returns True if any were processed.""" - while self.enqueued or self.complete: - # Run the callbacks before even waiting, because a response could have - # come back during any outbound API call. 
- self._run_callbacks() - self._wait_one() diff --git a/hub/base.css b/hub/base.css deleted file mode 100644 index 9813cd9..0000000 --- a/hub/base.css +++ /dev/null @@ -1,31 +0,0 @@ -body, input, select, table { - font-family: "Trebuchet MS", "Helvetica", sans-serif; - font-size: 14px; -} -label { display: block; } -input[type="text"] { - width: 400px; -} -hr { margin-top: 1em; } -em { font-size: 0.8em; } -h3 { margin-bottom: 0.2em } -.stats-table > table { - width: 100%; - border-color: #000; - border-width: 0 0 1px 1px; - border-style: solid; - border-spacing: 0; - border-collapse: collapse; -} -.stats-table > table td, -.stats-table > table th { - border-color: #000; - border-width: 1px 1px 0 0; - border-style: solid; - margin: 0; - padding: 3px; - background-color: #E0ECF8; -} -.stats-table > .summary > span { - margin-right: 0.5em; -} \ No newline at end of file diff --git a/hub/bookmarklet b/hub/bookmarklet deleted file mode 120000 index bd18264..0000000 --- a/hub/bookmarklet +++ /dev/null @@ -1 +0,0 @@ -../bookmarklet \ No newline at end of file diff --git a/hub/cron.yaml b/hub/cron.yaml deleted file mode 100644 index d573068..0000000 --- a/hub/cron.yaml +++ /dev/null @@ -1,12 +0,0 @@ -cron: -- description: Bootstrap polling - url: /work/poll_bootstrap - schedule: every 5 minutes - -- description: Subscription cleanup - url: /work/subscription_cleanup - schedule: every 1 minutes - -- description: Subscription reconfirmation - url: /work/reconfirm_subscriptions - schedule: every 3 hours diff --git a/hub/dos.py b/hub/dos.py deleted file mode 100644 index f9f526d..0000000 --- a/hub/dos.py +++ /dev/null @@ -1,1055 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2009 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Decorators and utilities for attack protection and statistics gathering.""" - -import gc -import logging -import os -import random -import re -import struct -import time - -from google.appengine.api import memcache - - -# Set to true in tests to disable DoS protection. -DISABLE_FOR_TESTING = False - - -class ConfigError(Exception): - """Something is wrong with a configured DoS limit, sampler, or scorer.""" - -################################################################################ - - -_DEFAULT_MESSAGE = ( - 'Too many requests for "%(key)s"; ' - 'current rate is %(rate).3f/s, ' - 'limit is %(limit).3f/s') - - -def limit(param=None, - header='REMOTE_ADDR', - count=None, - period=None, - error_code=503, - retry_after=120, - message=_DEFAULT_MESSAGE, - param_whitelist=None, - header_whitelist=None): - """Limits a webapp.RequestHandler method to a specific rate. - - Either 'param', 'header', or both 'param' and 'header' must be specified. If - values cannot be found for either of these, the request will be allowed to - go through. Unlike the limiting constraints, if whitelists are supplied, then - *any* match of either whitelist will cause the dos limit to be skipped. - - Args: - func: The RequestHandler method to decorate with the rate limit. 
- param: A request parameter to use for rate limiting. If None, no parameters - will be used. - header: Header to use for rate limiting. If None, no header value will be - used. This header name must be in the CGI environment variable format, - e.g., HTTP_X_FORWARDED_FOR. - count: Maximum number of executions of this function to allow. - period: Period over which the 'count' executions should be allowed, - specified in seconds. Must be less than a month in length. - error_code: Error code to return when the rate limit has been exceeded. - Defaults to 503. - retry_after: Number of seconds to return for the 'Retry-After' header when - an error is served. If None, no header will be returned. - message: Error message to serve in the body of an error response. May have - formatting parameters 'key', 'rate', and 'limit'. - param_whitelist: If not None, a set of values of 'param' that are allowed - to pass the dos limit without throttling. - header_whitelist: If not None, a set of values of 'header' that are allowed - to pass the dos limit without throttling. - - Returns: - The decorated method. - - Raises: - ConfigError at decoration time if any rate limit parameters are invalid. - """ - if not (param or header): - raise ConfigError('Must specify "param" and/or "header" keywords') - if count is None or count < 0: - raise ConfigError('Must specify count >= 0') - if period is None or period < 1: - raise ConfigError('Must specify period >= 1') - - limit = float(count) / period - required_parts = 2 # two becuase path and method name are always in the key - if param: - required_parts += 1 - if header: - required_parts += 1 - if param_whitelist is None: - param_whitelist = frozenset([]) - if header_whitelist is None: - header_whitelist = frozenset([]) - - def wrapper(func): - if func.func_name not in ('post', 'get') and param: - raise ConfigError('May only specify param limit for GET and POST') - def decorated(myself, *args, **kwargs): - method = myself.request.method - parts = [method, myself.request.path] - whitelisted = False - - if DISABLE_FOR_TESTING: - return func(myself, *args, **kwargs) - - if param: - value = myself.request.get(param) - if value: - parts.append('%s=%s' % (param, value)) - if value in param_whitelist: - whitelisted = True - if header: - value = os.environ.get(header) - if value: - parts.append('%s=%s' % (header, value)) - if value in header_whitelist: - whitelisted = True - - key = ' '.join(parts) - result = None - if len(parts) != required_parts: - logging.critical('Incomplete rate-limit key = "%s" for param = "%s", ' - 'header = "%s" on "%s" where count = %s, period = %s, ' - 'limit = %.3f/sec', key, param, header, method, - count, period, limit) - else: - result = memcache.incr(key) - if result is None: - # Rate limit not yet in memcache. - result = 1 - if not memcache.add(key, result, time=period): - # Possible race for who adds to the cache first. - result = memcache.incr(key) - if result is None: - # Memcache definitely down. 
- skip_enforcement = True - logging.error('Memcache failed for rate limit on "%s" by "%s" ' - 'where count = %s, period = %s, limit = %.3f/s', - method, key, count, period, limit) - - if not whitelisted and result > count: - rate = float(result) / period - if (result - count) == 1: - log_level = logging.error - else: - log_level = logging.debug - log_level('Hit rate limit on "%s" by "%s" where ' - 'count = %s, period = %s, rate = %.3f/s, limit = %.3f/s', - method, key, count, period, rate, limit) - myself.response.set_status(error_code) - myself.response.headers['Content-Type'] = 'text/plain' - if retry_after is not None: - myself.response.headers['Retry-After'] = str(retry_after) - values = {'key': key, 'rate': rate, 'limit': limit} - myself.response.out.write(message % values) - else: - return func(myself, *args, **kwargs) - - decorated.func_name = func.func_name # Fun with hacking the Python stack! - return decorated - - return wrapper - -################################################################################ - -# TODO: Determine if URL/domain caching is necessary due to regex performance. - -# TODO: Add ane exception list of domains that should use the full domain, -# not just the last suffix, when going through the get_url_domain. This is -# needed for domains like 'appspot.com' that are shared across totally -# different developers. - -# Matches four groups: -# 1) an IP, 2) a domain prefix, 3) a domain suffix, 4) other (eg, localhost) -URL_DOMAIN_RE = re.compile( - r'https?://(?:' - r'([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)|' # IP address - r'(?:((?:[a-zA-Z0-9-]+\.)*)([a-zA-Z0-9-]+\.[a-zA-Z0-9-]+))|' # Domain - r'([^/]+)' # Anyting else - r')(?:/.*)?') # The rest of the URL - -# Domains where only the suffix is important. -DOMAIN_SUFFIX_EXCEPTIONS = frozenset([ - 'blogspot.com', - 'livejournal.com', -]) - -# Maximum size of the cache of URLs to domains. -DOMAIN_CACHE_SIZE = 100 - -# Simple local cache used for per-request URL to domain mappings. -_DOMAIN_CACHE = {} - - -def get_url_domain(url): - """Returns the domain for a URL or 'bad_url if it's not a valid URL.""" - result = _DOMAIN_CACHE.get(url) - if result is not None: - return result - if len(_DOMAIN_CACHE) >= DOMAIN_CACHE_SIZE: - _DOMAIN_CACHE.clear() - - match = URL_DOMAIN_RE.match(url) - if match: - groups = list(match.groups()) - if groups[1] and groups[2] and groups[2] not in DOMAIN_SUFFIX_EXCEPTIONS: - groups[2] = groups[1] + groups[2] - groups[1] = None - groups = filter(bool, groups) - else: - groups = [] - result = (groups + ['bad_url'])[0] - - _DOMAIN_CACHE[url] = result - return result - -################################################################################ - -def offset_or_add(offsets, - period, - prefix='', - offset_multi=memcache.offset_multi, - add_multi=memcache.add_multi): - """Offsets values in memcache or re-adds them if not present. - - This method is required when you want to offset memcache values *and* set - their expiration time to a distinct time in the future. - - Args: - offsets: Dictionary mapping keys to offset integers. - period: Time in seconds before these keys should expire. - prefix: Any memcache prefix to use for these keys. - offset_multi: Used for testing. - add_multi: Used for testing. - - Returns: - Dictionary mapping input keys to updated values. None will be returned - for keys that could not be updated. - """ - results = offset_multi(offsets, key_prefix=prefix) - - # Add any non-existent items. 
- adds = {} - for key, value in results.iteritems(): - if value is None: - adds[key] = offsets[key] - if adds: - failed = set(add_multi(adds, time=period, key_prefix=prefix)) - for key, value in adds.iteritems(): - if key not in failed: - results[key] = value - else: - failed = set() - - # If the add fails, then someone else won the race, so increment. - second_offsets = dict((k, offsets[k]) for k in failed) - if second_offsets: - second_results = offset_multi(second_offsets, key_prefix=prefix) - else: - second_results = {} - - failed_keys = set(k for k, v in second_results.iteritems() if v is None) - if failed_keys: - logging.warning('Failed memcache offset_or_add for prefix=%r, keys=%r', - prefix, failed_keys) - - results.update(second_results) - return results - -################################################################################ - -# TODO: Support more than 1MB of sample data by using multiple memcache calls -# to retrieve all of the sample values. - -# TODO: Support configs that *always* sample particular URLs or domains. -# Could have a console insert a list of values into memcache which is fetched -# and cached in each runtime, and then those domains would always match. - -# TODO: Allow for synchronized reservoirs that reset at synchronized intervals. -# This will let us view overlapping windows of samples across a time period -# instead of having to reset every N seconds. - -class ReservoirConfig(object): - """Configuration for a reservoir sampler.""" - - def __init__(self, - name, - period=None, - samples=None, - by_domain=False, - by_url=False, - rate=1, - max_value_length=75, - tolerance=10, - title=None, - key_name='Key', - value_units=''): - """Initializer. - - Args: - name: Programmatic name to use for this reservoir in memcache. - period: Time period for this reservoir in seconds. - samples: Total number of samples to use in the reservoir. - by_domain: True if URL domains should be used as the sampling key. - by_url: True if the whole URL should be used as the sampling key. - rate: Sampling rate (between 0 and 1) to reduce the latency overhead of - applying this sampler. - max_value_length: Length to truncate a sampling key to before storing - it in memcache. - tolerance: Number of seconds to allow for samples to stay valid for - after a reservoir reset has happened. - title: Nice-looking title of this config; will use the name if - not supplied. - key_name: The noun of the key (e.g., 'domain' or 'url'). - value_units: The noun of the value units (e.g., 'milliseconds'). 
- """ - if not name: - raise ConfigError('Must specify a name') - - try: - period = int(period) - except ValueError, e: - raise ConfigError('Invalid period: %s' % e) - if period <= 0: - raise ConfigError('period must be positive') - - try: - samples = int(samples) - except ValueError, e: - raise ConfigError('Invalid samples: %s' % e) - if samples <= 0: - raise ConfigError('samples must be positive') - - if not (by_domain ^ by_url): - raise ConfigError('Must specify by_domain or by_url') - - try: - rate = float(rate) - except ValueError, e: - raise ConfigError('Invalid rate: %s' % e) - if not (0 <= rate <= 1): - raise ConfigError('rate must be between 0 and 1') - - try: - tolerance = int(tolerance) - except ValueError, e: - raise ConfigError('Invalid tolerance: %s' % e) - if tolerance < 0: - raise ConfigError('tolerance must be non-negative') - - self.name = name - self.title = title or name - self.key_name = key_name - self.value_units = value_units - self.samples = samples - self.rate = rate - self.period = period - self.inverse_rate = 1.0 / self.rate - self.by_url = by_url - self.by_domain = by_domain - self.max_value_length = max_value_length - self.tolerance = tolerance - - if by_url: - self.kind = 'by_url' - else: - self.kind = 'by_domain' - self.counter_key = '%s:%s:counter' % (self.name, self.kind) - self.start_key = '%s:%s:start_time' % (self.name, self.kind) - self._position_key_template = '%s:%s:%%d' % (self.name, self.kind) - - def position_key(self, index): - """Generates the position key for the sample slot with the given index. - - Args: - index: Numerical index of the sample position who's key to retrieve. - - Returns: - Memcache key to use. - """ - return self._position_key_template % index - - def is_expired(self, last_time, current_time): - """Checks if this config is expired. - - Args: - last_time: UNIX timestamp when this config's period started. - current_time: UNIX timestamp of the current time. - - Returns: - True if the config period has expired. - """ - return (current_time - last_time) > self.period - - def adjust_value(self, key): - """Adjust the value for a sampling key. - - Args: - key: The sampling key to adjust. - - Returns: - The adjusted key. - """ - if self.by_url: - value = key - else: - value = get_url_domain(key) - if len(value) > self.max_value_length: - value = value[:self.max_value_length] - if isinstance(value, unicode): - value = unicode(value).encode('utf-8') - return value - - def should_sample(self, key, coin_flip): - """Checks if the key should be sampled. - - Args: - key: The sampling key to check. - coin_flip: Random value between 0 and 1. - - Return: - True if the sample should be taken, False otherwise. - """ - return coin_flip < self.rate - - def compute_frequency(self, count, found, total, elapsed): - """Computes the frequency of a sample. - - Args: - count: Total number of samples of this key. - found: Total number of samples present for all keys. - total: The total number of sampling events so far, regardless of - whether or not the sample was saved. - elapsed: Seconds elapsed during the current sampling period. - - Returns: - The frequency, in events per second, of this key in the time period, - or None if no samples have been taken yet. 
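-      For example, with rate=0.5, count=3, found=10, total=40 and
-      elapsed=20, the estimate is (1 / 0.5) * (3 / 10) * (40 / 20) = 1.2
-      events per second.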
- """ - if not total or not found: - return None - return self.inverse_rate * (1.0 * count / found) * (1.0 * total / elapsed) - - -class Reporter(object): - """Contains a batch of keys and values for potential sampling.""" - - def __init__(self): - """Initializer.""" - # Keep a list of input keys in order so we can iterate through the - # dictionary in order during testing. This costs little and vastly - # simplifies testing. - self.keys = [] - # Maps key -> {config -> value} - self.param_dict = {} - # Maps config -> [key, ...] - self.config_dict = {} - - def set(self, key, config, value=1): - """Sets a key/value for a specific config. - - Each config/key combination may only have a single value. Subsequent - calls to this method with the same key/config will overwrite the - previous value. - - Args: - key: The sampling key to add. - config: The ReservoirConfig object to set the value for. - value: The value to set for this config. - """ - value_dict = self.param_dict.get(key) - if value_dict is None: - self.param_dict[key] = value_dict = {} - value_dict[config] = value - self.keys.append(key) - - present_list = self.config_dict.get(config) - if present_list is None: - self.config_dict[config] = present_list = [] - present_list.append(key) - - def get(self, key, config): - """Gets the value for a key/config. - - Args: - key: The sampling key to retrieve the value for. - config: The ReservoirConfig object to get the value for. - - Returns: - The value for the key/config or None if it's not present. - """ - return self.param_dict.get(key, {}).get(config) - - def remove(self, key, config): - """Removes a key/value for a specific config. - - If the key is not present for the config, this method does nothing. - - Args: - key: The sampling key to remove. - config: The ReservoirConfig object to remove the key for. - """ - try: - del self.param_dict[key][config] - self.config_dict[config].remove(key) - except KeyError: - pass - - def all_keys(self): - """Returns all the sampling keys present across all configs. - - Each key will be present at least once, but some keys may be present - more than once if they were inserted repeatedly. The keys are in - insertion order. This simplifies testing of this class. - """ - return self.keys - - def get_keys(self, config): - """Retrieves the keys present for a specific ReservoirConfig. - - Args: - config: The ReservoirConfig object to get the keys for. - - Returns: - The list of keys present for this config, with no duplicates. - """ - return self.config_dict.get(config, []) - - -class SampleResult(object): - """Contains the current results of a sampler for a given config.""" - - def __init__(self, config, total_samples, time_elapsed): - """Initializer. - - Args: - config: The ReservoirConfig these results are for. - total_samples: The total number of sampling events that have occurred. - This is *not* the number of unique samples present in the table. - time_elapsed: Time in seconds that have elapsed in the current period. - """ - self.config = config - self.total_samples = total_samples - self.time_elapsed = time_elapsed - self.unique_samples = 0 - self.title = config.title - self.key_name = config.key_name - self.value_units = config.value_units - - # Maps key -> [(when, value), ...] - self.sample_dict = {} - - def add(self, key, when, value): - """Adds a new sample to these results. - - Args: - key: The sampling key. - when: When the sample was made, as a UNIX timestamp. - value: The value that was sampled. 
- """ - samples = self.sample_dict.get(key) - if samples is None: - self.sample_dict[key] = samples = [] - samples.append((when, value)) - self.unique_samples += 1 - - def overall_rate(self): - """Gets the overall rate of events. - - Returns: - Total events per second. - """ - return 1.0 * self.total_samples / self.time_elapsed - - def get_min(self, key): - """Gets the min value seen for a key. - - Args: - key: The sampling key. - - Returns: - The minimum value or None if this key does not exist. - """ - samples = self.sample_dict.get(key) - if samples is None: - return None - return min(samples, key=lambda x: x[1])[1] - - def get_max(self, key): - """Gets the max value seen for a key. - - Args: - key: The sampling key. - - Returns: - The maximum value or None if this key does not exist. - """ - samples = self.sample_dict.get(key) - if samples is None: - return None - return max(samples, key=lambda x: x[1])[1] - - def get_frequency(self, key): - """Gets the frequency of events for this key during the sampling period. - - Args: - key: The sampling key. - - Returns: - The frequency as events per second or None if this key does not exist. - """ - samples = self.sample_dict.get(key) - if samples is None: - return None - return self.config.compute_frequency( - len(samples), - self.unique_samples, - self.total_samples, - self.time_elapsed) - - def get_average(self, key): - """Gets the weighted average of this key's sampled values. - - Args: - key: The sampling key. - - Returns: - The weighted average or None if this key does not exist. - """ - samples = self.sample_dict.get(key) - if not samples: - return None - total = 0.0 - for sample in samples: - total += sample[1] - return total / len(samples) - - def get_count(self, key): - """Gets the count of unique samples for a key. - - Args: - key: The sampling key. - - Returns: - The number of items. Will be zero if the key does not exist. - """ - return len(self.sample_dict.get(key, [])) - - def get_samples(self, key): - """Gets the unique sample data for a key. - - Args: - key: The sampling key. - - Returns: - List of tuple (when, value) where: - when: The UNIX timestamp for the sample. - value: The sample value. - """ - return self.sample_dict.get(key, []) - - def set_single_sample(self, key): - """Sets that this result is for a single key. - - Args: - key: The sampling key. - """ - self.total_samples = self.get_count(key) - - def sample_objects(self): - """Gets the contents of this result object for use in template rendering. - - Returns: - Generator of model objects. - """ - for key in self.sample_dict: - yield { - 'key': key, - 'count': self.get_count(key), - 'frequency': self.get_frequency(key), - 'min': self.get_min(key), - 'max': self.get_max(key), - 'average': self.get_average(key), - } - - -class MultiSampler(object): - """Sampler that saves key/value pairs for multiple reservoirs in parallel. - - The basic algorithm is: - - 1. Get the reservoir start timestamp. - 2. If more than period seconds have elapsed, set the timestamp to now, set - the reservoir's event counter to zero (average case this is skipped). - 3. Increment the event counter by the number of new samples. - 4. Set memcache values to incoming samples following the reservoir - algorithm, potentially only sampling a subset. - - The benefit of this approach is it can be applied to many reservoirs in - parallel without incurring additional API calls. 
The only limit is the 1MB - limit on App Engine API calls, which puts a cap on the amount of samples - that can be made simultaneously. - - Samples are stored in keys like: 'sampler_name:0', 'sampler_name:1' - - Values stored for samples look like: 'key_sample:NNNN:WWWW' where the 'N's - represent the sample value as a big-endian-encoded 4-byte string, and the - 'W's are a UNIX timestamp as a big-endian-encoded 4-byte string. The - timestamp is used to ignore samples that are not from the current period. - - There can be a race for resetting the timestamp for a sampler right after - the period starts, but it always favors the caller who inserted last (all - earlier data will be overwritten). This results in some missing data for - short-period samplers, but it's okay. - """ - - def __init__(self, configs, gettime=time.time): - """Initializer. - - Args: - configs: Iterable of ReservoirConfig objects. - gettime: Used for testing. - """ - self.configs = list(configs) - self.gettime = gettime - - def sample(self, - reporter, - getrandom=random.random, - randrange=random.randrange): - """Samples a set of reported key/values. - - Args: - reporter: Reporter instance containing key/values to sample. - getrandom: Used for testing. - randrange: Used for testing. - """ - # Update period start times if they're expired or non-existent. - now = int(self.gettime()) - start_times = memcache.get_multi([c.start_key for c in self.configs]) - config_sets = {} - for config in self.configs: - start = start_times.get(config.start_key) - if start is None or config.is_expired(start, now): - config_sets[config.start_key] = now - config_sets[config.counter_key] = 0 - if config_sets: - memcache.set_multi(config_sets) - - # Flip coin for sample rate of all Keys on all configs. - for key in reporter.all_keys(): - coin_flip = getrandom() - for config in self.configs: - if not config.should_sample(key, coin_flip): - reporter.remove(key, config) - - # Increment counters for affected configs. - counter_offsets = {} - for config in self.configs: - matching = reporter.get_keys(config) - if matching: - counter_offsets[config.counter_key] = len(matching) - if not counter_offsets: - return - counter_results = memcache.offset_multi(counter_offsets, initial_value=0) - - # Apply the reservoir algorithm. - value_sets = {} - now_encoded = struct.pack('!l', now) - for config in self.configs: - matching = list(reporter.get_keys(config)) - counter = counter_results.get(config.counter_key) - if counter is None: - # Incrementing the config failed, so give up on these Key samples. - continue - counter = int(counter) # Deal with wonky serialization types. - for (value_index, sample_number) in zip( - xrange(len(matching)), xrange(counter - len(matching), counter)): - insert_index = None - if sample_number < config.samples: - insert_index = sample_number - else: - random_index = randrange(sample_number) - if random_index < config.samples: - insert_index = random_index - if insert_index is not None: - key = matching[value_index] - value_key = config.position_key(insert_index) - value = reporter.get(key, config) - if value is not None: - # Value may be none if this key was removed from the samples - # list due to not passing the coin flip. - value_encoded = struct.pack('!l', value) - sample = '%s:%s:%s' % ( - config.adjust_value(key), now_encoded, value_encoded) - value_sets[value_key] = sample - memcache.set_multi(value_sets) - - def get(self, config, single_key=None): - """Gets statistics for a particular config and/or key. 
- - This will only retrieve samples for the current time period. Samples - from previous time periods will be ignored. - - Args: - config: The ReservoirConfig to retrieve stats for. - single_key: If None, then global stats for the config will be retrieved. - When a key value (a string), then only stats for that particular key - will be returned to the caller. - - Returns: - SampleResult object containing the result data. - """ - # Make sure the key is converted into the format expected by the config. - if single_key is not None: - single_key = config.adjust_value(single_key) - - keys = [config.start_key, config.counter_key] - for i in xrange(config.samples): - keys.append(config.position_key(i)) - sample_data = memcache.get_multi(keys) - - # Deal with wonky serialization types. - counter = int(sample_data.get(config.counter_key, 0)) - start_time = sample_data.get(config.start_key) - now = self.gettime() - if start_time is None: - # If the start time isn't there, then just assume it started exactly - # the period ago. This should only happen if the start time gets - # evicted for some weird reason. - start_time = now - config.period - elapsed = now - start_time - - # Find all samples that fall within the reset time validity window. - results = SampleResult(config, counter, elapsed) - for i in xrange(config.samples): - combined_value = sample_data.get(config.position_key(i)) - if combined_value is None: - continue - key, when_encoded, value_encoded = ( - combined_value.rsplit(':', 2) + ['', '', ''])[:3] - if single_key is not None and single_key != key: - continue - - if len(when_encoded) != 4: - continue - when = struct.unpack('!l', when_encoded)[0] - if len(value_encoded) != 4: - continue - value = struct.unpack('!l', value_encoded)[0] - - if ((start_time - config.tolerance) - < when < - (start_time + config.period + config.tolerance)): - results.add(key, when, value) - - # For a single sample we need to set the counter to the number of unique - # samples so we don't leak the overall QPS being pushed for this event. - if single_key is not None: - results.set_single_sample(single_key) - - return results - - def get_chain(self, *configs, **kwargs): - """Gets statistics for a set of configs, optionally for a single key. - - For retrieving multiple configs sequentially in a way that ensures that - the memory usage of the previous result is garbage collected before the - next one is returned. - - Args: - *configs: Set of configs to retrieve. - **kwargs: Keyword arguments to pass to the 'get' method of this class. - - Returns: - Generator that yields each SampleResult object for each config. - """ - for config in configs: - result = self.get(config, **kwargs) - yield result - del result - # NOTE: This kinda sucks, but the result sets are really large so - # we need to make sure the garbage collector is doing its job so we - # don't run bloat memory over the course of a single stats request. - gc.collect() - -################################################################################ - -class UrlScorer(object): - """Classifies incoming URLs by domain as passing a filter or being blocked. - - Used to enforce per-callback and per-feed limits on slowness, failures, - and misbehavior. - """ - - def __init__(self, - period=None, - min_requests=None, - max_failure_percentage=None, - prefix=None): - """Initializer. - - Args: - period: Over what time period, in seconds, the scorer should track - statistics for URLs. The shorter the more reliable the enforcement - due to cache eviction. 
Must be a positive number, forced to integer. - min_requests: Minimum number of requests to receive (per second, - independent of the enforcement 'period' parameter) before rate limiting - takes effect. Must be non-negative. Floating point is fine. - max_failure_percentage: Maximum percentage of failures (as a moving - rate) to allow before a domain is blocked. Value can be a number - between 0 and 1 and float. - prefix: Memcache key prefix to use for this scorer configuration. - Must not be empty. - - Raises: - ConfigError if any of the parameters above are invalid. - """ - try: - period = int(period) - except ValueError, e: - raise ConfigError('Invalid period: %s' % e) - if period <= 0: - raise ConfigError('period must be non-zero') - - try: - min_requests = float(min_requests) - except ValueError, e: - raise ConfigError('Invalid min_requests: %s' % e) - if min_requests < 0: - raise ConfigError('min_requests must be non-negative') - - try: - max_failure_percentage = float(max_failure_percentage) - except ValueError, e: - raise ConfigError('invalid max_failure_percentage: %s' % e) - if not (0 <= max_failure_percentage <= 1): - raise ConfigError('max_failure_percentage must be between 0 and 1') - - if not isinstance(prefix, basestring) or not prefix: - raise ConfigError('prefix must be a non-empty string') - - self.period = period - self.min_requests = int(self.period * min_requests) - self.max_failure_percentage = max_failure_percentage - self.prefix = 'scoring:%s:' % prefix - - def filter(self, urls): - """Checks if each URL can proceed based on a successful score. - - Args: - urls: Iterable of URLs to check. Each input URL will have a corresponding - result returned in the same order they were passed in. - - Returns: - List of tuple (allowed, failure_percentage) where: - allowed: True if the URL passed the filter, False otherwise. - failure_percentage: Percent of failures seen during the current - scoring period. Number between 0 and 1. - """ - domain_list = [get_url_domain(u) for u in urls] - keys = ['success:' + d for d in domain_list] - keys.extend('failure:' + d for d in domain_list) - values = memcache.get_multi(keys, key_prefix=self.prefix) - - result = [] - for domain in domain_list: - success = values.get('success:' + domain, 0) - failure = values.get('failure:' + domain, 0) - requests = success + failure - - if requests > 0: - failure_percentage = (1.0 * failure) / requests - else: - failure_percentage = 0 - - allow = bool( - DISABLE_FOR_TESTING or - requests < self.min_requests or - failure_percentage < self.max_failure_percentage) - result.append((allow, failure_percentage)) - - return result - - def report(self, success, failure): - """Reports the status of interactions with a set of URLs. - - Args: - success: Iterable of URLs that had successful interactions. - failure: Iterable of URLs that had failed interactions. - """ - if success is None: - success = [] - if failure is None: - failure = [] - - offsets = {} - # Always set successful and failed even if the other one doesn't exist yet - # so we're reasonably sure that their expiration times are synchronized. 
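-    # A success adds 1 to 'success:<domain>' and 0 to 'failure:<domain>';
-    # a failure does the reverse. Offsetting both keys every time keeps the
-    # pair created and expiring together.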
- for url in success: - domain = get_url_domain(url) - success_key = 'success:' + domain - fail_key = 'failure:' + domain - offsets[success_key] = offsets.get(success_key, 0) + 1 - offsets[fail_key] = offsets.get(fail_key, 0) - for url in failure: - domain = get_url_domain(url) - success_key = 'success:' + domain - fail_key = 'failure:' + domain - offsets[success_key] = offsets.get(success_key, 0) - offsets[fail_key] = offsets.get(fail_key, 0) + 1 - - offset_or_add(offsets, self.period, self.prefix) - - def get_scores(self, urls): - """Retrieves the scores for a set of URLs. - - Args: - urls: Iterable of URLs to retrieve the scores for. Each input will have - a corresponding entry in the returned value in the same order. - - Returns: - List of tuple (success, failure) where: - success: Number of successful requests. - failure: Number of failed requests. - """ - domain_list = [get_url_domain(u) for u in urls] - keys = ['success:' + d for d in domain_list] - keys.extend('failure:' + d for d in domain_list) - values = memcache.get_multi(keys, key_prefix=self.prefix) - return [(values.get('success:' + d, 0), values.get('failure:' + d, 0)) - for d in domain_list] - - def blackhole(self, urls): - """Blackholes a set of URLs by domain for the rest of the current period. - - Args: - urls: Iterable of URLs to blackhole. - """ - values = dict(('failure:' + get_url_domain(u), self.min_requests) - for u in urls) - memcache.set_multi(values, key_prefix=self.prefix) diff --git a/hub/dos_test.py b/hub/dos_test.py deleted file mode 100755 index 69ab449..0000000 --- a/hub/dos_test.py +++ /dev/null @@ -1,1717 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2009 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Tests for the dos module.""" - -import cProfile -import gc -import logging -logging.basicConfig(format='%(levelname)-8s %(filename)s] %(message)s') -import os -import random -import sys -import unittest - -import testutil -testutil.fix_path() - -from google.appengine.api import memcache -from google.appengine.ext import webapp - -import dos - -################################################################################ - -class LimitTestBase(testutil.HandlerTestBase): - """Base class for limit function tests.""" - - def setUp(self): - """Sets up the test harness.""" - testutil.HandlerTestBase.setUp(self) - self.old_environ = os.environ.copy() - os.environ['PATH_INFO'] = '/foobar_path' - - def tearDown(self): - """Tears down the test hardness.""" - testutil.HandlerTestBase.tearDown(self) - os.environ.clear() - os.environ.update(self.old_environ) - - -class HeaderHandler(webapp.RequestHandler): - - # Rate limit by headers - @dos.limit(count=3, period=10) - def get(self): - self.response.out.write('get success') - - # Rate limit by custom header - @dos.limit(header='HTTP_FANCY_HEADER', count=3, period=10) - def post(self): - self.response.out.write('post success') - - -class HeaderTest(LimitTestBase): - """Tests for limiting by only headers.""" - - handler_class = HeaderHandler - - def testDefaultHeader(self): - """Tests limits on a default header.""" - os.environ['REMOTE_ADDR'] = '10.1.1.3' - for i in xrange(3): - self.handle('get') - self.assertEquals(200, self.response_code()) - self.assertEquals('get success', self.response_body()) - self.handle('get') - self.assertEquals(503, self.response_code()) - - # Different header value will not be limited. - os.environ['REMOTE_ADDR'] = '10.1.1.4' - self.handle('get') - self.assertEquals(200, self.response_code()) - - def testCustomHeader(self): - """Tests limits on a default header.""" - header = 'HTTP_FANCY_HEADER' - os.environ[header] = 'my cool header value' - for i in xrange(3): - self.handle('post') - self.assertEquals(200, self.response_code()) - self.assertEquals('post success', self.response_body()) - self.handle('post') - self.assertEquals(503, self.response_code()) - - # Different header value will not be limited. - os.environ['HTTP_FANCY_HEADER'] = 'something else' - self.handle('post') - self.assertEquals(200, self.response_code()) - - def testHeaderMissing(self): - """Tests when rate-limiting on a header that's missing.""" - # Should not allow more than three requests here, but - # since there is no limit key, we let them all through. - for i in xrange(4): - self.handle('get') - self.assertEquals(200, self.response_code()) - self.assertEquals('get success', self.response_body()) - - -class ParamHandler(webapp.RequestHandler): - - # Limit by parameter - @dos.limit(param='foo', header=None, count=3, period=10) - def post(self): - self.response.out.write('post success') - - -class ParamTest(LimitTestBase): - """Tests for limiting by only parameters.""" - - handler_class = ParamHandler - - def testParam(self): - """Tests limits on a parameter.""" - for i in xrange(3): - self.handle('post', ('foo', 'meep')) - self.assertEquals(200, self.response_code()) - self.assertEquals('post success', self.response_body()) - self.handle('post', ('foo', 'meep')) - self.assertEquals(503, self.response_code()) - - # Different parameter value will not be limited. 
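-    # The parameter value is part of the rate-limit key, so a new value
-    # starts a fresh counter.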
- self.handle('post', ('foo', 'wooh')) - self.assertEquals(200, self.response_code()) - - def testParamMissing(self): - """Tests when rate-limiting on a parameter that's missing.""" - # Should not allow more than three requests here, but - # since there is no limit key, we let them all through. - for i in xrange(4): - self.handle('post') - self.assertEquals(200, self.response_code()) - self.assertEquals('post success', self.response_body()) - - -class ParamAndHeaderHandler(webapp.RequestHandler): - - # Limit by headers and params - @dos.limit(param='foo', count=3, period=10) - def post(self): - self.response.out.write('post success') - - -class ParamAndHeaderTest(LimitTestBase): - """Tests for limiting by parameters and headers.""" - - handler_class = ParamAndHeaderHandler - - def testHeaderAndParam(self): - """Tests when a header and parameter are limited.""" - os.environ['REMOTE_ADDR'] = '10.1.1.3' - for i in xrange(3): - self.handle('post', ('foo', 'meep')) - self.assertEquals(200, self.response_code()) - self.assertEquals('post success', self.response_body()) - self.handle('post', ('foo', 'meep')) - self.assertEquals(503, self.response_code()) - - # Different header *or* parmaeter values will not be limited. - self.handle('post', ('foo', 'stuff')) - self.assertEquals(200, self.response_code()) - - os.environ['REMOTE_ADDR'] = '10.1.1.4' - self.handle('post', ('foo', 'meep')) - self.assertEquals(200, self.response_code()) - - def testHeaderMissing(self): - """Tests when the header should be there too but isn't.""" - # Should not allow more than three requests here, but - # since there is no limit key, we let them all through. - for i in xrange(4): - self.handle('post', ('foo', 'meep')) - self.assertEquals(200, self.response_code()) - self.assertEquals('post success', self.response_body()) - - def testParamMissing(self): - """Tests when the parameter should be there too but isn't.""" - # Should not allow more than three requests here, but - # since there is no limit key, we let them all through. - os.environ['REMOTE_ADDR'] = '10.1.1.4' - for i in xrange(4): - self.handle('post') - self.assertEquals(200, self.response_code()) - self.assertEquals('post success', self.response_body()) - - def testBothMissing(self): - """Tests when the header and parameter are missing.""" - # Should not allow more than three requests here, but - # since there is no limit key, we let them all through. - for i in xrange(4): - self.handle('post') - self.assertEquals(200, self.response_code()) - self.assertEquals('post success', self.response_body()) - - -class MethodsAndUrlsHandler(webapp.RequestHandler): - - @dos.limit(count=3, period=10) - def get(self): - self.response.out.write('get success') - - @dos.limit(count=3, period=10) - def post(self): - self.response.out.write('post success') - - -class MethodsAndUrlsTest(LimitTestBase): - """Tests for limiting across various methods and URLs.""" - - handler_class = MethodsAndUrlsHandler - - def testMethods(self): - """Tests that methods are limited separately.""" - os.environ['REMOTE_ADDR'] = '10.1.1.3' - for i in xrange(3): - self.handle('post') - self.assertEquals(200, self.response_code()) - self.assertEquals('post success', self.response_body()) - self.handle('post') - self.assertEquals(503, self.response_code()) - - # Different method still works. 
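-    # The HTTP method is part of the rate-limit key, so GET is counted
-    # separately from POST.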
-    self.handle('get')
-    self.assertEquals(200, self.response_code())
-
-  def testUrls(self):
-    """Tests that limiting for the same verb on different URLs works."""
-    os.environ['REMOTE_ADDR'] = '10.1.1.3'
-    for i in xrange(3):
-      self.handle('post')
-      self.assertEquals(200, self.response_code())
-      self.assertEquals('post success', self.response_body())
-    self.handle('post')
-    self.assertEquals(503, self.response_code())
-
-    # Different path still works.
-    os.environ['PATH_INFO'] = '/other_path'
-    self.handle('post')
-    self.assertEquals(200, self.response_code())
-
-
-class ErrorParamsHandler(webapp.RequestHandler):
-
-  # Alternate error code
-  @dos.limit(count=0, period=1, error_code=409, retry_after=99)
-  def get(self):
-    self.response.out.write('get success')
-
-  # No retry-after time
-  @dos.limit(count=0, period=1, retry_after=None)
-  def post(self):
-    self.response.out.write('post success')
-
-  # Defaults
-  @dos.limit(count=0, period=1)
-  def put(self):
-    self.response.out.write('put success')
-
-
-class ErrorParamsTest(LimitTestBase):
-  """Tests the error and retry parameters."""
-
-  handler_class = ErrorParamsHandler
-
-  def testDefaultRetryAmount(self):
-    """Tests that the default retry amount is returned."""
-    os.environ['REMOTE_ADDR'] = '10.1.1.3'
-    self.handle('put')
-    self.assertEquals(503, self.response_code())
-    self.assertEquals('120', self.response_headers().get('Retry-After'))
-
-  def testCustomErrorCode(self):
-    """Tests when a custom error code and retry time are specified."""
-    os.environ['REMOTE_ADDR'] = '10.1.1.3'
-    self.handle('get')
-    self.assertEquals(409, self.response_code())
-    self.assertEquals('99', self.response_headers().get('Retry-After'))
-
-  def testNoRetryTime(self):
-    """Tests when no retry time should be returned."""
-    os.environ['REMOTE_ADDR'] = '10.1.1.3'
-    self.handle('post')
-    self.assertEquals(503, self.response_code())
-    self.assertEquals(None, self.response_headers().get('Retry-After'))
-
-
-class ConfigErrorTest(unittest.TestCase):
-  """Tests various limit configuration errors."""
-
-  def testNoKeyError(self):
-    """Tests when there is no limiting key to derive."""
-    self.assertRaises(
-        dos.ConfigError,
-        dos.limit,
-        header=None,
-        param=None)
-
-  def testNegativeCount(self):
-    """Tests when the count is less than zero."""
-    self.assertRaises(
-        dos.ConfigError,
-        dos.limit,
-        count=-1,
-        period=1)
-
-  def testZeroPeriod(self):
-    """Tests when the period is zero."""
-    self.assertRaises(
-        dos.ConfigError,
-        dos.limit,
-        count=1,
-        period=0)
-
-  def testParamForDifferentVerb(self):
-    """Tests trying to rate limit by param on an unsupported HTTP verb."""
-    def put():
-      pass
-    wrapper = dos.limit(param='okay', count=1, period=1)
-    self.assertRaises(dos.ConfigError, wrapper, put)
-
-
-class MemcacheDetailsHandler(webapp.RequestHandler):
-
-  @dos.limit(count=0, period=5.234)
-  def post(self):
-    self.response.out.write('post success')
-
-
-class MemcacheDetailTest(LimitTestBase):
-  """Tests for various memcache details and failures."""
-
-  handler_class = MemcacheDetailsHandler
-
-  def setUp(self):
-    """Sets up the test harness."""
-    LimitTestBase.setUp(self)
-    os.environ['REMOTE_ADDR'] = '10.1.1.4'
-    self.expected_key = 'POST /foobar_path REMOTE_ADDR=10.1.1.4'
-    self.expected_incr = []
-    self.expected_add = []
-    self.old_incr = dos.memcache.incr
-    self.old_add = dos.memcache.add
-
-    def incr(key):
-      self.assertEquals(self.expected_key, key)
-      return self.expected_incr.pop(0)
-    dos.memcache.incr = incr
-
-    def add(key, value, time=None):
-      self.assertEquals(self.expected_key, key)
-
self.assertEquals(1, value) - self.assertEquals(5.234, time) - return self.expected_add.pop(0) - dos.memcache.add = add - - def tearDown(self): - """Tears down the test harness.""" - LimitTestBase.tearDown(self) - self.assertEquals(0, len(self.expected_incr)) - self.assertEquals(0, len(self.expected_add)) - dos.memcache.incr = self.old_incr - dos.memcache.add = self.old_add - - def testIncrFailure(self): - """Tests when the initial increment fails.""" - self.expected_incr.append(None) - self.expected_add.append(True) - self.handle('post') - self.assertEquals(503, self.response_code()) - - def testIncrAndAddFailure(self): - """Tests when the initial increment and the following add fail.""" - self.expected_incr.append(None) - self.expected_add.append(False) - self.expected_incr.append(14) - self.handle('post') - self.assertEquals(503, self.response_code()) - - def testCompleteFailure(self): - """Tests when all memcache calls fail.""" - self.expected_incr.append(None) - self.expected_add.append(False) - self.expected_incr.append(None) - self.handle('post') - self.assertEquals(200, self.response_code()) - self.assertEquals('post success', self.response_body()) - - -class WhiteListHandler(webapp.RequestHandler): - - @dos.limit(count=0, period=1, - header_whitelist=set(['10.1.1.5', '10.1.1.6'])) - def get(self): - self.response.out.write('get success') - - @dos.limit(param='foobar', header=None, count=0, period=1, - param_whitelist=set(['meep', 'stuff'])) - def post(self): - self.response.out.write('post success') - - -class WhiteListTest(LimitTestBase): - """Tests for white-listing.""" - - handler_class = WhiteListHandler - - def testHeaderWhitelist(self): - """Tests white-lists for headers.""" - os.environ['REMOTE_ADDR'] = '10.1.1.3' - self.handle('get') - self.assertEquals(503, self.response_code()) - - for addr in ('10.1.1.5', '10.1.1.6'): - os.environ['REMOTE_ADDR'] = addr - self.handle('get') - self.assertEquals(200, self.response_code()) - self.assertEquals('get success', self.response_body()) - - def testParameterWhitelist(self): - """Tests white-lists for parameters.""" - self.handle('post', ('foobar', 'zebra')) - self.assertEquals(503, self.response_code()) - - for value in ('meep', 'stuff'): - self.handle('post', ('foobar', value)) - self.assertEquals(200, self.response_code()) - self.assertEquals('post success', self.response_body()) - - -class LayeredHandler(webapp.RequestHandler): - - @dos.limit(count=3, period=1) - @dos.limit(header=None, param='stuff', count=0, period=1) - def get(self): - self.response.out.write('get success') - - -class LayeringTest(LimitTestBase): - """Tests that dos limits can be layered.""" - - handler_class = LayeredHandler - - def testLayering(self): - """Tests basic layering.""" - os.environ['REMOTE_ADDR'] = '10.1.1.3' - - # First request works normally, limiting by IP. - self.handle('get') - self.assertEquals(200, self.response_code()) - self.assertEquals('get success', self.response_body()) - - # Next request uses param and is blocked. - self.handle('get', ('stuff', 'meep')) - self.assertEquals(503, self.response_code()) - - # Next request without param is allowed, following one is blocked. 
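-    # Every attempt, including the blocked one above, counted against the
-    # outer header-based limit of 3, so only one more request is allowed.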
- self.handle('get') - self.assertEquals(200, self.response_code()) - self.assertEquals('get success', self.response_body()) - self.handle('get') - self.assertEquals(503, self.response_code()) - -################################################################################ - -class GetUrlDomainTest(unittest.TestCase): - """Tests for the get_url_domain function.""" - - def testDomain(self): - """Tests good domain names.""" - # No subdomain - self.assertEquals( - 'example.com', - dos.get_url_domain('http://example.com/foo/bar?meep=stuff#asdf')) - # One subdomain - self.assertEquals( - 'www.example.com', - dos.get_url_domain('http://www.example.com/foo/bar?meep=stuff#asdf')) - # Many subdomains - self.assertEquals( - '1.2.3.many.sub.example.com', - dos.get_url_domain('http://1.2.3.many.sub.example.com/')) - # Domain with no trailing path - self.assertEquals( - 'www.example.com', - dos.get_url_domain('http://www.example.com')) - - def testDomainExceptions(self): - """Tests that some URLs may use more than the domain suffix.""" - self.assertEquals( - 'blogspot.com', - dos.get_url_domain('http://example.blogspot.com/this-is?some=test')) - - def testIP(self): - """Tests IP addresses.""" - self.assertEquals( - '192.168.1.1', - dos.get_url_domain('http://192.168.1.1/foo/bar?meep=stuff#asdf')) - # No trailing path - self.assertEquals( - '192.168.1.1', - dos.get_url_domain('http://192.168.1.1')) - - def testOther(self): - """Tests anything that's not IP- or domain-like.""" - self.assertEquals( - 'localhost', - dos.get_url_domain('http://localhost/foo/bar?meep=stuff#asdf')) - # No trailing path - self.assertEquals( - 'localhost', - dos.get_url_domain('http://localhost')) - - def testBadUrls(self): - """Tests URLs that are bad.""" - self.assertEquals('bad_url', - dos.get_url_domain('this is bad')) - self.assertEquals('bad_url', - dos.get_url_domain('example.com/foo/bar?meep=stuff#asdf')) - self.assertEquals('bad_url', - dos.get_url_domain('example.com')) - self.assertEquals('bad_url', - dos.get_url_domain('//example.com')) - self.assertEquals('bad_url', - dos.get_url_domain('/myfeed.atom')) - self.assertEquals('bad_url', - dos.get_url_domain('192.168.0.1/foobar')) - self.assertEquals('bad_url', - dos.get_url_domain('192.168.0.1')) - - def testCaching(self): - """Tests that cache eviction works properly.""" - dos._DOMAIN_CACHE.clear() - old_size = dos.DOMAIN_CACHE_SIZE - try: - dos.DOMAIN_CACHE_SIZE = 2 - dos._DOMAIN_CACHE['http://a.example.com/stuff'] = 'a.example.com' - dos._DOMAIN_CACHE['http://b.example.com/stuff'] = 'b.example.com' - dos._DOMAIN_CACHE['http://c.example.com/stuff'] = 'c.example.com' - self.assertEquals(3, len(dos._DOMAIN_CACHE)) - - # Old cache entries are hit: - self.assertEquals('c.example.com', - dos.get_url_domain('http://c.example.com/stuff')) - self.assertEquals(3, len(dos._DOMAIN_CACHE)) - - # New cache entries clear the contents. 
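-      # get_url_domain clears the whole cache once DOMAIN_CACHE_SIZE entries
-      # are stored, rather than evicting individual entries.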
- self.assertEquals('d.example.com', - dos.get_url_domain('http://d.example.com/stuff')) - self.assertEquals(1, len(dos._DOMAIN_CACHE)) - finally: - dos.DOMAIN_CACHE_SIZE = old_size - -################################################################################ - -class OffsetOrAddTest(unittest.TestCase): - """Tests for the offset_or_add function.""" - - def setUp(self): - """Sets up the test harness.""" - self.offsets = None - self.offset_multi = lambda *a, **k: self.offsets.next()(*a, **k) - self.adds = None - self.add_multi = lambda *a, **k: self.adds.next()(*a, **k) - - def testAlreadyExist(self): - """Tests when the keys already exist and can just be added to.""" - def offset_multi(): - yield lambda *a, **k: {'one': 2, 'three': 4} - self.offsets = offset_multi() - - self.assertEquals( - {'one': 2, 'three': 4}, - dos.offset_or_add({'blue': 15, 'red': 10}, 5, - offset_multi=self.offset_multi, - add_multi=self.add_multi)) - - def testKeysAdded(self): - """Tests when some keys need to be re-added.""" - def offset_multi(): - yield lambda *a, **k: {'one': None, 'three': 4, 'five': None} - self.offsets = offset_multi() - - def add_multi(): - def run(adds, **kwargs): - self.assertEquals({'one': 5, 'five': 10}, adds) - return [] - yield run - self.adds = add_multi() - - self.assertEquals( - {'one': 5, 'three': 4, 'five': 10}, - dos.offset_or_add({'one': 5, 'three': 0, 'five': 10}, 5, - offset_multi=self.offset_multi, - add_multi=self.add_multi)) - - def testAddsRace(self): - """Tests when re-adding keys is a race that is lost.""" - def offset_multi(): - yield lambda *a, **k: {'one': None, 'three': 4, 'five': None} - yield lambda *a, **k: {'one': 5, 'five': 10} - self.offsets = offset_multi() - - def add_multi(): - def run(adds, **kwargs): - self.assertEquals({'one': 5, 'five': 10}, adds) - return ['one', 'five'] - yield run - self.adds = add_multi() - - self.assertEquals( - {'one': 5, 'three': 4, 'five': 10}, - dos.offset_or_add({'one': 5, 'three': 0, 'five': 10}, 5, - offset_multi=self.offset_multi, - add_multi=self.add_multi)) - - def testOffsetsFailAfterRace(self): - """Tests when the last offset call fails.""" - def offset_multi(): - yield lambda *a, **k: {'one': None, 'three': 4, 'five': None} - yield lambda *a, **k: {'one': None, 'five': None} - self.offsets = offset_multi() - - def add_multi(): - def run(adds, **kwargs): - self.assertEquals({'one': 5, 'five': 10}, adds) - return ['one', 'five'] - yield run - self.adds = add_multi() - - self.assertEquals( - {'one': None, 'three': 4, 'five': None}, - dos.offset_or_add({'one': 5, 'three': 0, 'five': 10}, 5, - offset_multi=self.offset_multi, - add_multi=self.add_multi)) - -################################################################################ - -class SamplerTest(unittest.TestCase): - """Tests for the MultiSampler class.""" - - def setUp(self): - """Sets up the test harness.""" - testutil.setup_for_testing() - self.domainA = 'mydomain.com' - self.domainB = 'example.com' - self.domainC = 'other.com' - self.domainD = 'meep.com' - self.url1 = 'http://mydomain.com/stuff/meep' - self.url2 = 'http://example.com/some-path?a=b' - self.url3 = 'http://example.com' - self.url4 = 'http://other.com/relative' - self.url5 = 'http://meep.com/another-one' - self.all_urls = [self.url1, self.url2, self.url3, self.url4, self.url5] - - self.randrange_results = [] - self.fake_randrange = lambda value: self.randrange_results.pop(0) - - self.random_results = [] - self.fake_random = lambda: self.random_results.pop(0) - - self.gettime_results = [] - 
self.fake_gettime = lambda: self.gettime_results.pop(0) - - def verify_sample(self, - results, - key, - expected_count, - expected_frequency, - expected_average=1, - expected_min=1, - expected_max=1): - """Verifies a sample key is present in the results. - - Args: - results: SampleResult object. - key: String key of the sample to test. - expected_count: How many samples should be present in the results. - expected_frequency: The frequency of this single key. - expected_average: Expected average value across samples of this key. - expected_min: Expected minimum value across samples of this key. - expected_max: Expected maximum value across samples of this key. - - Raises: - AssertionError if any of the expectations are not met. - """ - self.assertEquals(expected_count, results.get_count(key)) - self.assertTrue( - -0.001 < (expected_frequency - results.get_frequency(key)) < 0.001, - 'Difference %f - %f = %f' % ( - expected_frequency, results.get_frequency(key), - expected_frequency - results.get_frequency(key))) - self.assertTrue( - -0.001 < (expected_average - results.get_average(key)) < 0.001, - 'Difference %f - %f %f' % ( - expected_average, results.get_average(key), - expected_average - results.get_average(key))) - self.assertEquals(expected_min, results.get_min(key)) - self.assertEquals(expected_max, results.get_max(key)) - - def verify_no_sample(self, results, key): - """Verifies a sample key is not present in the results. - - Args: - results: SampleResult object. - key: String key of the sample to test. - - Raises: - AssertionError if the key is present. - """ - self.assertEquals(0, len(results.get_samples(key))) - - def testSingleAlways(self): - """Tests single-config sampling when the sampling rate is 100%.""" - config = dos.ReservoirConfig( - 'always', - period=300, - rate=1, - samples=10000, - by_domain=True) - sampler = dos.MultiSampler([config], gettime=self.fake_gettime) - - reporter = dos.Reporter() - reporter.set(self.url1, config) - reporter.set(self.url2, config) - reporter.set(self.url3, config) - reporter.set(self.url4, config) - reporter.set(self.url5, config) - self.gettime_results.extend([0, 10]) - sampler.sample(reporter) - results = sampler.get(config) - self.assertEquals(5, results.total_samples) - self.assertEquals(5, results.unique_samples) - self.verify_sample(results, self.domainA, 1, 0.1) - self.verify_sample(results, self.domainB, 2, 0.2) - self.verify_sample(results, self.domainC, 1, 0.1) - self.verify_sample(results, self.domainD, 1, 0.1) - - self.gettime_results.extend([0, 10]) - sampler.sample(reporter) - results = sampler.get(config) - self.assertEquals(10, results.total_samples) - self.assertEquals(10, results.unique_samples) - self.verify_sample(results, self.domainA, 2, 0.2) - self.verify_sample(results, self.domainB, 4, 0.4) - self.verify_sample(results, self.domainC, 2, 0.2) - self.verify_sample(results, self.domainD, 2, 0.2) - - reporter = dos.Reporter() - reporter.set(self.url1, config) - self.gettime_results.extend([0, 10]) - sampler.sample(reporter) - results = sampler.get(config) - self.assertEquals(11, results.total_samples) - self.assertEquals(11, results.unique_samples) - self.verify_sample(results, self.domainA, 3, 0.3) - self.verify_sample(results, self.domainB, 4, 0.4) - self.verify_sample(results, self.domainC, 2, 0.2) - self.verify_sample(results, self.domainD, 2, 0.2) - - def testSingleOverwrite(self): - """Tests when the number of slots is lower than the sample count.""" - config = dos.ReservoirConfig( - 'always', - period=300, - 
rate=1, - samples=2, - by_domain=True) - sampler = dos.MultiSampler([config], gettime=self.fake_gettime) - - # Writes samples index 0 and 1, then overwrites index 1 again with - # a URL in the same domain. - reporter = dos.Reporter() - reporter.set(self.url1, config) - reporter.set(self.url2, config) - reporter.set(self.url3, config) - self.gettime_results.extend([0, 1]) - self.randrange_results.extend([1]) - sampler.sample(reporter, randrange=self.fake_randrange) - results = sampler.get(config) - self.assertEquals(3, results.total_samples) - self.assertEquals(2, results.unique_samples) - self.verify_sample(results, self.domainA, 1, 1.5) - self.verify_sample(results, self.domainB, 1, 1.5) - - # Overwrites the sample at index 0, skewing all results towards the - # domain from index 1. - reporter = dos.Reporter() - reporter.set(self.url3, config) - self.gettime_results.extend([0, 1]) - self.randrange_results.extend([0]) - sampler.sample(reporter, randrange=self.fake_randrange) - results = sampler.get(config) - self.assertEquals(4, results.total_samples) - self.assertEquals(2, results.unique_samples) - self.verify_sample(results, self.domainB, 2, 4.0) - self.verify_no_sample(results, self.domainA) - - # Now a sample outside the range won't replace anything. - self.gettime_results.extend([0, 1]) - self.randrange_results.extend([3]) - sampler.sample(reporter, randrange=self.fake_randrange) - results = sampler.get(config) - self.assertEquals(5, results.total_samples) - self.assertEquals(2, results.unique_samples) - self.verify_sample(results, self.domainB, 2, 5.0) - self.verify_no_sample(results, self.domainA) - - def testSingleSampleRate(self): - """Tests when the sampling rate is less than 1.""" - config = dos.ReservoirConfig( - 'always', - period=300, - rate=0.2, - samples=10000, - by_domain=True) - sampler = dos.MultiSampler([config], gettime=self.fake_gettime) - - reporter = dos.Reporter() - reporter.set(self.url1, config) - reporter.set(self.url2, config) - reporter.set(self.url3, config) - reporter.set(self.url4, config) - reporter.set(self.url5, config) - self.gettime_results.extend([0, 10]) - self.random_results.extend([0.25, 0.199, 0.1, 0, 0.201]) - sampler.sample(reporter, getrandom=self.fake_random) - results = sampler.get(config) - self.assertEquals(3, results.total_samples) - self.assertEquals(3, results.unique_samples) - self.verify_no_sample(results, self.domainA) - self.verify_no_sample(results, self.domainD) - self.verify_sample(results, self.domainB, 2, - (1.0/0.2) * (2.0/3.0) * (3.0/10.0)) - self.verify_sample(results, self.domainC, 1, - (1.0/0.2) * (1.0/3.0) * (3.0/10.0)) - - def testSingleDoubleSampleRemoved(self): - """Tests when the same sample key is set twice and one is skipped. - - Setting the value twice should just overwite the previous value for a key, - but we store the keys in full order (with dupes) for simpler tests. This - ensures that incorrectly using the sampler with multiple sets won't barf. 
- """ - config = dos.ReservoirConfig( - 'always', - period=300, - rate=0.2, - samples=4, - by_domain=True) - sampler = dos.MultiSampler([config], gettime=self.fake_gettime) - - reporter = dos.Reporter() - reporter.set(self.url1, config) - reporter.set(self.url1, config) - reporter.set(self.url2, config) - reporter.set(self.url3, config) - reporter.set(self.url4, config) - reporter.set(self.url5, config) - self.gettime_results.extend([0, 10]) - self.randrange_results.extend([0]) - self.random_results.extend([0.25, 0.199, 0.1, 0, 0.3, 0.3]) - sampler.sample(reporter, getrandom=self.fake_random) - results = sampler.get(config) - self.assertEquals(3, results.total_samples) - self.assertEquals(2, results.unique_samples) - self.verify_no_sample(results, self.domainA) - self.verify_no_sample(results, self.domainC) - self.verify_no_sample(results, self.domainD) - self.verify_sample(results, self.domainB, 2, - (1.0/0.2) * (2.0/2.0) * (3.0/10.0)) - - def testSingleSampleRateReplacement(self): - """Tests when the sample rate is < 1 and slots are overwritten.""" - config = dos.ReservoirConfig( - 'always', - period=300, - rate=0.2, - samples=2, - by_domain=True) - sampler = dos.MultiSampler([config], gettime=self.fake_gettime) - - reporter = dos.Reporter() - reporter.set(self.url1, config) - reporter.set(self.url2, config) - reporter.set(self.url3, config) - reporter.set(self.url4, config) - self.gettime_results.extend([0, 10]) - self.randrange_results.extend([1]) - self.random_results.extend([0.25, 0.199, 0.1, 0]) - sampler.sample(reporter, getrandom=self.fake_random) - results = sampler.get(config) - self.assertEquals(3, results.total_samples) - self.assertEquals(2, results.unique_samples) - self.verify_no_sample(results, self.domainA) - self.verify_no_sample(results, self.domainD) - self.verify_sample(results, self.domainB, 1, - (1.0/0.2) * (1.0/2.0) * (3.0/10.0)) - self.verify_sample(results, self.domainC, 1, - (1.0/0.2) * (1.0/2.0) * (3.0/10.0)) - - def testSingleSampleValues(self): - """Tests various samples with expected values.""" - config = dos.ReservoirConfig( - 'always', - period=300, - rate=0.2, - samples=4, - by_domain=True) - sampler = dos.MultiSampler([config], gettime=self.fake_gettime) - - reporter = dos.Reporter() - reporter.set(self.url1, config, 5) - reporter.set(self.url1, config, 20) # in - reporter.set(self.url2, config, 10) # in - reporter.set(self.url2 + '&more=true', config, 25) # in - reporter.set(self.url3, config, 20) # in - reporter.set(self.url4, config, 40) # in - reporter.set(self.url5, config, 60) - self.gettime_results.extend([0, 10]) - self.randrange_results.extend([0]) - self.random_results.extend([0.25, 0.199, 0.1, 0, 0, 0.1, 0.3]) - sampler.sample(reporter, - randrange=self.fake_randrange, - getrandom=self.fake_random) - results = sampler.get(config) - self.assertEquals(5, results.total_samples) - self.assertEquals(4, results.unique_samples) - self.verify_no_sample(results, self.domainA) - self.verify_no_sample(results, self.domainD) - self.verify_sample(results, self.domainB, 3, - (1.0/0.2) * (3.0/4.0) * (5.0/10.0), - expected_average=18.333, - expected_min=10, - expected_max=25) - self.verify_sample(results, self.domainC, 1, - (1.0/0.2) * (1.0/4.0) * (5.0/10.0), - expected_average=40, - expected_min=40, - expected_max=40) - - def testResetTimestamp(self): - """Tests resetting the timestamp after the period elapses.""" - config = dos.ReservoirConfig( - 'always', - period=10, - samples=10000, - by_domain=True) - sampler = dos.MultiSampler([config], 
gettime=self.fake_gettime) - - reporter = dos.Reporter() - reporter.set(self.url1, config) - self.gettime_results.extend([0, 5]) - sampler.sample(reporter) - results = sampler.get(config) - self.assertEquals(1, results.total_samples) - self.assertEquals(1, results.unique_samples) - self.verify_sample(results, self.domainA, 1, 1.0 / 5) - self.verify_no_sample(results, self.domainB) - self.verify_no_sample(results, self.domainC) - self.verify_no_sample(results, self.domainD) - - reporter = dos.Reporter() - reporter.set(self.url2, config) - self.gettime_results.extend([15, 16]) - sampler.sample(reporter) - results = sampler.get(config) - self.assertEquals(1, results.total_samples) - self.assertEquals(1, results.unique_samples) - self.verify_sample(results, self.domainB, 1, 1.0) - self.verify_no_sample(results, self.domainA) - self.verify_no_sample(results, self.domainC) - self.verify_no_sample(results, self.domainD) - - def testSingleUnicodeKey(self): - """Tests when a sampling key is unicode. - - Keys must be UTF-8 encoded because the memcache API will do this for us - (and break) if we don't. - """ - config = dos.ReservoirConfig( - 'always', - period=300, - samples=10000, - by_url=True) - sampler = dos.MultiSampler([config], gettime=self.fake_gettime) - - reporter = dos.Reporter() - key = u'this-breaks-stuff\u30d6\u30ed\u30b0\u8846' - key_utf8 = key.encode('utf-8') - reporter.set(key, config) - self.gettime_results.extend([0, 10]) - sampler.sample(reporter) - results = sampler.get(config) - self.assertEquals(1, results.total_samples) - self.assertEquals(1, results.unique_samples) - self.verify_sample(results, key_utf8, 1, 0.1) - - def testMultiple(self): - """Tests multiple configs being applied together.""" - config1 = dos.ReservoirConfig( - 'first', - period=300, - samples=10000, - by_domain=True) - config2 = dos.ReservoirConfig( - 'second', - period=300, - samples=10000, - by_domain=True) - sampler = dos.MultiSampler([config1, config2], gettime=self.fake_gettime) - - reporter = dos.Reporter() - reporter.set(self.url1, config1) - reporter.set(self.url2, config1) - reporter.set(self.url3, config1) - reporter.set(self.url4, config1) - reporter.set(self.url5, config1) - reporter.set(self.url1, config2, 5) - reporter.set(self.url2, config2, 5) - reporter.set(self.url3, config2, 5) - reporter.set(self.url4, config2, 5) - reporter.set(self.url5, config2, 5) - self.gettime_results.extend([0, 10, 10]) - sampler.sample(reporter) - - results1 = sampler.get(config1) - self.assertEquals(5, results1.total_samples) - self.assertEquals(5, results1.unique_samples) - self.verify_sample(results1, self.domainA, 1, 0.1) - self.verify_sample(results1, self.domainB, 2, 0.2) - self.verify_sample(results1, self.domainC, 1, 0.1) - self.verify_sample(results1, self.domainD, 1, 0.1) - - results2 = sampler.get(config2) - self.assertEquals(5, results2.total_samples) - self.assertEquals(5, results2.unique_samples) - self.verify_sample(results2, self.domainA, 1, 0.1, - expected_max=5, - expected_min=5, - expected_average=5) - self.verify_sample(results2, self.domainB, 2, 0.2, - expected_max=5, - expected_min=5, - expected_average=5) - self.verify_sample(results2, self.domainC, 1, 0.1, - expected_max=5, - expected_min=5, - expected_average=5) - self.verify_sample(results2, self.domainD, 1, 0.1, - expected_max=5, - expected_min=5, - expected_average=5) - - def testGetSingleKey(self): - """Tests getting the stats for a single key.""" - config = dos.ReservoirConfig( - 'single-sample', - period=300, - rate=1, - 
samples=10000, - by_domain=True) - sampler = dos.MultiSampler([config], gettime=self.fake_gettime) - - reporter = dos.Reporter() - reporter.set(self.url1, config) - reporter.set(self.url2, config) - reporter.set(self.url3, config) - reporter.set(self.url3 + '&okay=1', config) - reporter.set(self.url3 + '&okay=2', config) - reporter.set(self.url3 + '&okay=3', config) - reporter.set(self.url3 + '&okay=4', config) - reporter.set(self.url4, config) - reporter.set(self.url5, config) - self.gettime_results.extend([0, 10, 10]) - sampler.sample(reporter) - results = sampler.get(config) - self.assertEquals(9, results.total_samples) - self.assertEquals(9, results.unique_samples) - self.verify_sample(results, self.domainA, 1, 0.1) - self.verify_sample(results, self.domainB, 6, 0.6) - self.verify_sample(results, self.domainC, 1, 0.1) - self.verify_sample(results, self.domainD, 1, 0.1) - - results = sampler.get(config, self.url2) - self.assertEquals(6, results.total_samples) - self.assertEquals(6, results.unique_samples) - self.verify_sample(results, self.domainB, 6, 0.6) - self.verify_no_sample(results, self.domainA) - self.verify_no_sample(results, self.domainC) - self.verify_no_sample(results, self.domainD) - - def testCountLost(self): - """Tests when the count variable disappears between samples.""" - config = dos.ReservoirConfig( - 'lost_count', - period=300, - rate=1, - samples=10000, - by_domain=True) - sampler = dos.MultiSampler([config], gettime=self.fake_gettime) - - reporter = dos.Reporter() - reporter.set(self.url1, config) - reporter.set(self.url2, config) - self.gettime_results.extend([0, 10]) - sampler.sample(reporter) - results = sampler.get(config) - self.assertEquals(2, results.total_samples) - self.assertEquals(2, results.unique_samples) - self.verify_no_sample(results, self.domainC) - self.verify_no_sample(results, self.domainD) - self.verify_sample(results, self.domainA, 1, 0.1) - self.verify_sample(results, self.domainB, 1, 0.1) - - memcache.delete('lost_count:by_domain:counter') - reporter = dos.Reporter() - reporter.set(self.url4, config) - self.gettime_results.extend([0, 10]) - sampler.sample(reporter) - results = sampler.get(config) - self.assertEquals(1, results.total_samples) - - # Two samples found because we're still in the same period tolerance. - # Sample at index 0 will be overwritten with the new entry, meaning - # domain A is gone. 
- self.assertEquals(2, results.unique_samples) - self.verify_no_sample(results, self.domainA) - self.verify_no_sample(results, self.domainD) - self.verify_sample(results, self.domainB, 1, 0.05) - self.verify_sample(results, self.domainC, 1, 0.05) - - def testStampLost(self): - """Tests when the start timestamp is lost between samples.""" - config = dos.ReservoirConfig( - 'lost_stamp', - period=300, - rate=1, - samples=10000, - by_domain=True) - sampler = dos.MultiSampler([config], gettime=self.fake_gettime) - - reporter = dos.Reporter() - reporter.set(self.url1, config) - reporter.set(self.url2, config) - self.gettime_results.extend([0, 10]) - sampler.sample(reporter) - results = sampler.get(config) - self.assertEquals(2, results.total_samples) - self.assertEquals(2, results.unique_samples) - self.verify_no_sample(results, self.domainC) - self.verify_no_sample(results, self.domainD) - self.verify_sample(results, self.domainA, 1, 0.1) - self.verify_sample(results, self.domainB, 1, 0.1) - - memcache.delete('lost_stamp:by_domain:start_time') - reporter = dos.Reporter() - reporter.set(self.url4, config) - self.gettime_results.extend([0, 10]) - sampler.sample(reporter) - results = sampler.get(config) - self.assertEquals(1, results.total_samples) - - # Just like losing the count, old samples found because we're still in the - # same period tolerance. Sample at index 0 will be overwritten with the new - # entry, meaning domain A is gone. - self.assertEquals(2, results.unique_samples) - self.verify_no_sample(results, self.domainA) - self.verify_no_sample(results, self.domainD) - self.verify_sample(results, self.domainB, 1, 0.05) - self.verify_sample(results, self.domainC, 1, 0.05) - - def testSamplesLost(self): - """Tests when some unique samples were evicted.""" - config = dos.ReservoirConfig( - 'lost_sample', - period=300, - rate=1, - samples=10000, - by_domain=True) - sampler = dos.MultiSampler([config], gettime=self.fake_gettime) - - reporter = dos.Reporter() - reporter.set(self.url1, config) - reporter.set(self.url2, config) - reporter.set(self.url3, config) - reporter.set(self.url4, config) - reporter.set(self.url5, config) - self.gettime_results.extend([0, 10]) - sampler.sample(reporter) - - memcache.delete_multi([ - 'lost_sample:by_domain:0', - 'lost_sample:by_domain:1', - 'lost_sample:by_domain:2', - ]) - - results = sampler.get(config) - self.assertEquals(5, results.total_samples) - self.assertEquals(2, results.unique_samples) - self.verify_no_sample(results, self.domainA) - self.verify_no_sample(results, self.domainB) - self.verify_sample(results, self.domainC, 1, 0.25) - self.verify_sample(results, self.domainD, 1, 0.25) - - def testBeforePeriod(self): - """Tests when the samples retrieved are too old.""" - config = dos.ReservoirConfig( - 'old_samples', - period=10, - rate=1, - samples=10000, - by_domain=True) - sampler = dos.MultiSampler([config], gettime=self.fake_gettime) - - reporter = dos.Reporter() - reporter.set(self.url1, config) - reporter.set(self.url2, config) - reporter.set(self.url3, config) - reporter.set(self.url4, config) - reporter.set(self.url5, config) - self.gettime_results.extend([20, 40]) - sampler.sample(reporter) - - memcache.set('old_samples:by_domain:start_time', 0) - results = sampler.get(config) - self.assertEquals(5, results.total_samples) - self.assertEquals(0, results.unique_samples) - self.verify_no_sample(results, self.domainA) - self.verify_no_sample(results, self.domainB) - self.verify_no_sample(results, self.domainC) - self.verify_no_sample(results, 
self.domainD) - - def testBadSamples(self): - """Tests when getting samples with memcache values that are bad.""" - config = dos.ReservoirConfig( - 'bad_samples', - period=10, - rate=1, - samples=10000, - by_domain=True) - sampler = dos.MultiSampler([config], gettime=self.fake_gettime) - - reporter = dos.Reporter() - reporter.set(self.url1, config) - reporter.set(self.url2, config) - reporter.set(self.url3, config) - reporter.set(self.url4, config) - reporter.set(self.url5, config) - self.gettime_results.extend([0, 10]) - sampler.sample(reporter) - - # Totaly bad - memcache.set('bad_samples:by_domain:0', 'garbage') - # Bad value. - memcache.set('bad_samples:by_domain:1', - '%s:\0\0\0\1:' % self.domainB) - # Bad when. - memcache.set('bad_samples:by_domain:2', - '%s::\0\0\0\1' % self.domainB) - - results = sampler.get(config) - self.assertEquals(5, results.total_samples) - self.assertEquals(2, results.unique_samples) - self.verify_no_sample(results, self.domainA) - self.verify_no_sample(results, self.domainB) - self.verify_sample(results, self.domainC, 1, 0.25) - self.verify_sample(results, self.domainD, 1, 0.25) - - def testGetChain(self): - """Tests getting results from multiple configs in a single call.""" - config1 = dos.ReservoirConfig( - 'first', - period=300, - rate=1, - samples=10000, - by_domain=True) - config2 = dos.ReservoirConfig( - 'second', - period=300, - rate=1, - samples=10000, - by_url=True) - sampler = dos.MultiSampler([config1, config2], gettime=self.fake_gettime) - - reporter = dos.Reporter() - reporter.set(self.url1, config1) - reporter.set(self.url2, config1) - reporter.set(self.url3, config1) - reporter.set(self.url4, config1) - reporter.set(self.url5, config1) - reporter.set(self.url1, config2) - reporter.set(self.url2, config2) - reporter.set(self.url3, config2) - reporter.set(self.url4, config2) - reporter.set(self.url5, config2) - self.gettime_results.extend([0, 10, 10, 10, 10]) - sampler.sample(reporter) - result_iter = sampler.get_chain(config1, config2) - - # Results for config1 - results = result_iter.next() - self.assertEquals(5, results.total_samples) - self.assertEquals(5, results.unique_samples) - self.verify_sample(results, self.domainA, 1, 0.1) - self.verify_sample(results, self.domainB, 2, 0.2) - self.verify_sample(results, self.domainC, 1, 0.1) - self.verify_sample(results, self.domainD, 1, 0.1) - - # Results for config2 - results = result_iter.next() - self.assertEquals(5, results.total_samples) - self.assertEquals(5, results.unique_samples) - self.verify_sample(results, self.url1, 1, 0.1) - self.verify_sample(results, self.url2, 1, 0.1) - self.verify_sample(results, self.url3, 1, 0.1) - self.verify_sample(results, self.url4, 1, 0.1) - self.verify_sample(results, self.url5, 1, 0.1) - - # Single key test - result_iter = sampler.get_chain( - config1, config2, - single_key=self.url2) - - # Results for config1 - results = result_iter.next() - self.assertEquals(2, results.total_samples) - self.assertEquals(2, results.unique_samples) - self.verify_sample(results, self.domainB, 2, 0.2) - - # Results for config2 - results = result_iter.next() - self.assertEquals(1, results.total_samples) - self.assertEquals(1, results.unique_samples) - self.verify_sample(results, self.url2, 1, 0.1) - - - def testConfig(self): - """Tests config validation.""" - # Bad name. - self.assertRaises( - dos.ConfigError, - dos.ReservoirConfig, - '', - period=10, - samples=10, - by_domain=True) - - # Bad period. 
- self.assertRaises( - dos.ConfigError, - dos.ReservoirConfig, - 'my name', - period=0, - samples=10, - by_domain=True) - self.assertRaises( - dos.ConfigError, - dos.ReservoirConfig, - 'my name', - period=-1, - samples=10, - by_domain=True) - self.assertRaises( - dos.ConfigError, - dos.ReservoirConfig, - 'my name', - period='bad', - samples=10, - by_domain=True) - - # Bad samples. - self.assertRaises( - dos.ConfigError, - dos.ReservoirConfig, - 'my name', - period=10, - samples=0, - by_domain=True) - self.assertRaises( - dos.ConfigError, - dos.ReservoirConfig, - 'my name', - period=10, - samples=-1, - by_domain=True) - self.assertRaises( - dos.ConfigError, - dos.ReservoirConfig, - 'my name', - period=10, - samples='bad', - by_domain=True) - - # Bad domain/url combo. - self.assertRaises( - dos.ConfigError, - dos.ReservoirConfig, - 'my name', - period=10, - samples=10, - by_domain=True, - by_url=True) - self.assertRaises( - dos.ConfigError, - dos.ReservoirConfig, - 'my name', - period=10, - samples=10, - by_domain=False, - by_url=False) - - # Bad rate. - self.assertRaises( - dos.ConfigError, - dos.ReservoirConfig, - 'my name', - period=10, - samples=10, - rate=-1, - by_domain=True) - self.assertRaises( - dos.ConfigError, - dos.ReservoirConfig, - 'my name', - period=10, - samples=10, - rate=1.1, - by_domain=True) - self.assertRaises( - dos.ConfigError, - dos.ReservoirConfig, - 'my name', - period=10, - samples=10, - rate='bad', - by_domain=True) - - # Bad tolerance. - self.assertRaises( - dos.ConfigError, - dos.ReservoirConfig, - 'my name', - period=10, - samples=10, - tolerance=-1, - by_domain=True) - self.assertRaises( - dos.ConfigError, - dos.ReservoirConfig, - 'my name', - period=10, - samples=10, - tolerance='bad', - by_domain=True) - - def testSampleProfile(self): - """Profiles the sample method with lots of data.""" - print 'Tracked objects start',len(gc.get_objects()) - config = dos.ReservoirConfig( - 'testing', - period=10, - rate=1, - samples=10000, - by_domain=True) - sampler = dos.MultiSampler([config]) - reporter = dos.Reporter() - fake_urls = ['http://example-%s.com/meep' % i - for i in xrange(100)] - for i in xrange(100000): - reporter.set(random.choice(fake_urls), config, random.randint(0, 10000)) - del fake_urls - gc.collect() - dos._DOMAIN_CACHE.clear() - - gc.disable() - gc.set_debug(gc.DEBUG_STATS | gc.DEBUG_LEAK) - try: - # Swap the two following lines to profile memory vs. CPU - sampler.sample(reporter) - #cProfile.runctx('sampler.sample(reporter)', globals(), locals()) - memcache.flush_all() # Clear the string references - print 'Tracked objects before collection', len(gc.get_objects()) - dos._DOMAIN_CACHE.clear() - del reporter - del sampler - finally: - print 'Unreachable', gc.collect() - print 'Tracked objects after collection', len(gc.get_objects()) - gc.set_debug(0) - gc.enable() - - def testGetProfile(self): - """Profiles the get method when there's lots of data.""" - print 'Tracked objects start',len(gc.get_objects()) - config = dos.ReservoirConfig( - 'testing', - period=10, - rate=1, - samples=10000, - by_domain=True) - sampler = dos.MultiSampler([config]) - reporter = dos.Reporter() - fake_urls = ['http://example-%s.com/meep' % i - for i in xrange(100)] - for i in xrange(100000): - reporter.set(random.choice(fake_urls), config, random.randint(0, 10000)) - del fake_urls - dos._DOMAIN_CACHE.clear() - gc.collect() - sampler.sample(reporter) - - gc.disable() - gc.set_debug(gc.DEBUG_STATS | gc.DEBUG_LEAK) - try: - # Swap the two following lines to profile memory vs. 
CPU - result = sampler.get(config) - #cProfile.runctx('result = sampler.get(config)', globals(), locals()) - memcache.flush_all() # Clear the string references - print 'Tracked objects before collection', len(gc.get_objects()) - try: - del locals()['result'] - del result - except: - pass - dos._DOMAIN_CACHE.clear() - del reporter - del sampler - finally: - print 'Unreachable', gc.collect() - print 'Tracked objects after collection', len(gc.get_objects()) - gc.set_debug(0) - gc.enable() - -################################################################################ - -class UrlScorerTest(unittest.TestCase): - """Tests for the UrlScorer class.""" - - def setUp(self): - """Sets up the test harness.""" - testutil.setup_for_testing() - self.domain1 = 'mydomain.com' - self.domain2 = 'example.com' - self.domain3 = 'other.com' - self.url1 = 'http://mydomain.com/stuff/meep' - self.url2 = 'http://example.com/some-path?a=b' - self.url3 = 'http://example.com' - self.url4 = 'http://other.com/relative' - self.scorer = dos.UrlScorer( - period=60, - min_requests=1, - max_failure_percentage=0.2, - prefix='test') - - def testConfig(self): - """Tests that the config parameters are sanitized.""" - # Bad periods - self.assertRaises(dos.ConfigError, - dos.UrlScorer, - period=0, - min_requests=1, - max_failure_percentage=0.2, - prefix='test:') - self.assertRaises(dos.ConfigError, - dos.UrlScorer, - period=-1, - min_requests=1, - max_failure_percentage=0.2, - prefix='test:') - self.assertRaises(dos.ConfigError, - dos.UrlScorer, - period='not an int', - min_requests=1, - max_failure_percentage=0.2, - prefix='test:') - - # Bad min_requests - self.assertRaises(dos.ConfigError, - dos.UrlScorer, - period=1, - min_requests='bad', - max_failure_percentage=0.2, - prefix='test:') - self.assertRaises(dos.ConfigError, - dos.UrlScorer, - period=1, - min_requests=-1, - max_failure_percentage=0.2, - prefix='test:') - - # Bad max_failure_percentage - self.assertRaises(dos.ConfigError, - dos.UrlScorer, - period=1, - min_requests=1, - max_failure_percentage='not a float', - prefix='test:') - self.assertRaises(dos.ConfigError, - dos.UrlScorer, - period=1, - min_requests=1, - max_failure_percentage=2, - prefix='test:') - self.assertRaises(dos.ConfigError, - dos.UrlScorer, - period=1, - min_requests=1, - max_failure_percentage=-1, - prefix='test:') - - # Bad prefix - self.assertRaises(dos.ConfigError, - dos.UrlScorer, - period=1, - min_requests=1, - max_failure_percentage=0.2, - prefix='') - self.assertRaises(dos.ConfigError, - dos.UrlScorer, - period=1, - min_requests=1, - max_failure_percentage=0.2, - prefix=123) - - def testReport(self): - """Tests reporting domain status.""" - self.scorer.report( - [self.url1, self.url2], [self.url3, self.url4]) - self.assertEquals(1, memcache.get('scoring:test:success:' + self.domain1)) - self.assertEquals(0, memcache.get('scoring:test:failure:' + self.domain1)) - self.assertEquals(1, memcache.get('scoring:test:success:' + self.domain2)) - self.assertEquals(1, memcache.get('scoring:test:failure:' + self.domain2)) - self.assertEquals(0, memcache.get('scoring:test:success:' + self.domain3)) - self.assertEquals(1, memcache.get('scoring:test:failure:' + self.domain3)) - - self.scorer.report( - [self.url1, self.url2, self.url3, self.url4], []) - self.assertEquals(2, memcache.get('scoring:test:success:' + self.domain1)) - self.assertEquals(0, memcache.get('scoring:test:failure:' + self.domain1)) - self.assertEquals(3, memcache.get('scoring:test:success:' + self.domain2)) - self.assertEquals(1, 
memcache.get('scoring:test:failure:' + self.domain2)) - self.assertEquals(1, memcache.get('scoring:test:success:' + self.domain3)) - self.assertEquals(1, memcache.get('scoring:test:failure:' + self.domain3)) - - self.scorer.report( - [], [self.url1, self.url2, self.url3, self.url4]) - self.assertEquals(2, memcache.get('scoring:test:success:' + self.domain1)) - self.assertEquals(1, memcache.get('scoring:test:failure:' + self.domain1)) - self.assertEquals(3, memcache.get('scoring:test:success:' + self.domain2)) - self.assertEquals(3, memcache.get('scoring:test:failure:' + self.domain2)) - self.assertEquals(1, memcache.get('scoring:test:success:' + self.domain3)) - self.assertEquals(2, memcache.get('scoring:test:failure:' + self.domain3)) - - def testBelowMinRequests(self): - """Tests when there are enough failures but not enough total requests.""" - memcache.set('scoring:test:success:' + self.domain1, 0) - memcache.set('scoring:test:failure:' + self.domain1, 10) - self.assertEquals( - [(True, 1), (True, 0)], - self.scorer.filter([self.url1, self.url2])) - - def testFailurePrecentageTooLow(self): - """Tests when there are enough requests but too few failures.""" - memcache.set('scoring:test:success:' + self.domain1, 100) - memcache.set('scoring:test:failure:' + self.domain1, 1) - self.assertEquals( - [(True, 1/101.0), (True, 0)], - self.scorer.filter([self.url1, self.url2])) - - def testNotAllowed(self): - """Tests when a result is blocked due to overage.""" - memcache.set('scoring:test:success:' + self.domain1, 100) - memcache.set('scoring:test:failure:' + self.domain1, 30) - self.assertEquals( - [(False, 30/130.0), (True, 0)], - self.scorer.filter([self.url1, self.url2])) - - def testGetScores(self): - """Tests getting the scores of URLs.""" - memcache.set('scoring:test:success:' + self.domain1, 2) - memcache.set('scoring:test:failure:' + self.domain1, 1) - memcache.set('scoring:test:success:' + self.domain2, 3) - memcache.set('scoring:test:failure:' + self.domain2, 3) - memcache.set('scoring:test:success:' + self.domain3, 1) - memcache.set('scoring:test:failure:' + self.domain3, 2) - self.assertEquals( - [(2, 1), (3, 3), (1, 2)], - self.scorer.get_scores([self.url1, self.url2, self.url4])) - - def testBlackhole(self): - """Tests blackholing a URL.""" - self.assertEquals( - [(True, 0), (True, 0)], - self.scorer.filter([self.url1, self.url2])) - self.scorer.blackhole([self.url1, self.url2]) - self.assertEquals( - [(False, 1.0), (False, 1.0)], - self.scorer.filter([self.url1, self.url2])) - -################################################################################ - -if __name__ == '__main__': - unittest.main() diff --git a/hub/event_details.html b/hub/event_details.html deleted file mode 100644 index 9264a81..0000000 --- a/hub/event_details.html +++ /dev/null @@ -1,105 +0,0 @@ - - - Hub - Subscription Details - {{topic_url|escape}} - - - - - -
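The request-rate expectations asserted throughout the `MultiSampler` tests above all encode the same arithmetic: scale the reservoir contents back up by the sampling rate, take the fraction of occupied slots belonging to the key, and multiply by the period's reported total divided by the elapsed time. The sketch below only restates that calculation; the `expected_rate` helper is illustrative and is not part of `dos.py`.

```python
def expected_rate(sample_rate, key_hits, unique_samples, total_samples, elapsed):
    """Request-rate estimate that the assertions above spell out inline.

    sample_rate    -- fraction of reports kept by the sampling coin flip (config rate)
    key_hits       -- reservoir slots holding this particular domain or URL
    unique_samples -- occupied reservoir slots overall
    total_samples  -- reports that passed the coin flip during the period
    elapsed        -- seconds elapsed since the period started
    """
    return (1.0 / sample_rate) * (float(key_hits) / unique_samples) * (
        float(total_samples) / elapsed)

# Reproduces the domainB expectation from testSingleSampleRate:
# (1.0/0.2) * (2.0/2.0) * (3.0/10.0), i.e. 1.5 requests per second.
assert expected_rate(0.2, 2, 2, 3, 10) == (1.0 / 0.2) * (2.0 / 2.0) * (3.0 / 10.0)
```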
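Similarly, the `UrlScorerTest` cases above exercise per-domain success and failure counters stored in memcache under `scoring:<prefix>:success:<domain>` and `scoring:<prefix>:failure:<domain>`, with `filter()` returning an `(allowed, failure_percentage)` pair per URL. The sketch below restates only the ratio-versus-threshold arithmetic those assertions encode; the real `dos.UrlScorer` additionally declines to block a domain that has not yet seen enough traffic (see `testBelowMinRequests`), a guard this sketch does not attempt to reproduce.

```python
def failure_ratio(successes, failures):
    """Fraction of a domain's recent requests that failed; 0.0 with no data."""
    total = successes + failures
    if total == 0:
        return 0.0
    return float(failures) / total

# Numbers from testNotAllowed: 100 successes, 30 failures, 0.2 ceiling.
ratio = failure_ratio(100, 30)   # 30/130, roughly 0.23
blocked = ratio > 0.2            # True, so filter() would return (False, ratio)
```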

Subscription Details - {{topic_url|escape}}

- -{% if error %} -
{{error|escape}}
-{% else %} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Callback URL:{{callback_url|escape}}
Created time (UTC):{{created_time|date:"Y-m-d\TH:i:s\Z"}}
Last modified time (UTC):{{last_modified|date:"Y-m-d\TH:i:s\Z"}}
Expiration time (UTC):{{expiration_time|date:"Y-m-d\TH:i:s\Z"}}
State:{{subscription_state}}
Confirmation failures:{{confirm_failures}}
Delivery to domain: - {% if delivery_blocked %} - BLOCKED - {% else %} - OK - {% endif %} -
Delivery short-term:{{delivery_errors|floatformat:"-2"}}% errors
- -

Error rate statistics

-{% for result in delivery_url_error %} - {% include "stats_table.html" %} -{% endfor %} -{% for result in delivery_domain_error %} - {% include "stats_table.html" %} -{% endfor %} - -

Latency statistics

-{% for result in delivery_url_latency %} - {% include "stats_table.html" %} -{% endfor %} -{% for result in delivery_domain_latency %} - {% include "stats_table.html" %} -{% endfor %} - -{% for event in failed_events %} -
-

Failure {{forloop.counter}}

-

- - - - - - - - - - - - - - - - - -
Next/Last attempt time (UTC):{{event.last_modified|date:"Y-m-d\TH:i:s\Z"}}
Totally failed:{{event.totally_failed}}
Retry attempts:{{event.retry_attempts}}
Content type:{{event.content_type|escape}}
- -

Payload, possibly truncated:

-
-{{event.payload_trunc|escape}}
-
-

-{% endfor %} - -{% endif %} - - - - diff --git a/hub/favicon.ico b/hub/favicon.ico deleted file mode 100644 index c42ce24..0000000 Binary files a/hub/favicon.ico and /dev/null differ diff --git a/hub/feed_diff.py b/hub/feed_diff.py deleted file mode 100755 index b9c7f97..0000000 --- a/hub/feed_diff.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2008 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -__author__ = 'bslatkin@gmail.com (Brett Slatkin)' - -"""Atom/RSS feed parser that quickly extracts entry/item elements.""" - -import cStringIO -import logging -import xml.sax -import xml.sax.handler -import xml.sax.saxutils - - -# Set to true to see stack level messages and other debugging information. -DEBUG = False - - -class Error(Exception): - """Exception for errors in this module.""" - - -class TrivialEntityResolver(xml.sax.handler.EntityResolver): - """Pass-through entity resolver.""" - - def resolveEntity(self, publicId, systemId): - return cStringIO.StringIO() - - -class FeedContentHandler(xml.sax.handler.ContentHandler): - """Sax content handler for quickly parsing Atom and RSS feeds.""" - - def __init__(self, parser): - """Initializer. - - Args: - parser: Instance of the xml.sax parser being used with this handler. - """ - self.enclosing_tag = "" - self.parser = parser - self.header_footer = "" - self.entries_map = {} - - # Internal state - self.stack_level = 0 - self.output_stack = [] - self.current_level = None - self.last_id = '' - self.last_link = '' - self.last_title = '' - self.last_description = '' - - # Helper methods - def emit(self, data): - if type(data) is list: - self.current_level.extend(data) - else: - self.current_level.append(data) - - def push(self): - self.current_level = [] - self.output_stack.append(self.current_level) - - def pop(self): - old_level = self.output_stack.pop() - if len(self.output_stack) > 0: - self.current_level = self.output_stack[-1] - else: - self.current_level = None - return old_level - - # SAX methods - def startElement(self, name, attrs): - self.stack_level += 1 - event = (self.stack_level, name) - if DEBUG: logging.debug('Start stack level %r', event) - if self.stack_level == 1: - # Save the outermost tag for later. - self.enclosing_tag = name.lower() - - self.push() - self.emit(['<', name]) - for key, value in attrs.items(): - self.emit([' ', key, '=', xml.sax.saxutils.quoteattr(value)]) - # Do not emit a '>' here because this tag may need to be immediately - # closed with a '/> ending. - - self.push() - - def endElement(self, name): - event = (self.stack_level, name) - if DEBUG: logging.debug('End stack level %r', event) - - content = self.pop() - if content: - self.emit('>') - self.emit(content) - self.emit(['']) - else: - # No content means this element should be immediately closed. - self.emit('/>') - - self.handleEvent(event, content) - self.stack_level -= 1 - - def characters(self, content): - # The SAX parser will try to escape XML entities (like &) and other - # fun stuff. 
But this is not what we actually want. We want the original - # content to be reproduced exactly as we received it, so we can pass it - # along to others. The reason is simple: reformatting the XML by unescaping - # certain data may cause the resulting XML to no longer validate. - self.emit(xml.sax.saxutils.escape(content)) - - -def strip_whitespace(enclosing_tag, all_parts): - """Strips the whitespace from a SAX parser list for a feed. - - Args: - enclosing_tag: The enclosing tag of the feed. - all_parts: List of SAX parser elements. - - Returns: - header_footer for those parts with trailing whitespace removed. - """ - if 'feed' in enclosing_tag: - first_part = ''.join(all_parts[:-3]).strip('\n\r\t ') - return '%s\n' % (first_part, enclosing_tag) - else: - first_part = ''.join(all_parts[:-3]).strip('\n\r\t ') - channel_part = first_part.rfind('') - if channel_part == -1: - raise Error('Could not find after trimming whitespace') - stripped = first_part[:channel_part].strip('\n\r\t ') - return '%s\n\n' % (stripped, enclosing_tag) - - -class AtomFeedHandler(FeedContentHandler): - """Sax content handler for Atom feeds.""" - - def handleEvent(self, event, content): - depth, tag = event[0], event[1].lower() - if depth == 1: - if tag != 'feed' and not tag.endswith(':feed'): - raise Error('Enclosing tag is not . Found: %r' % tag) - else: - self.header_footer = strip_whitespace(event[1], self.pop()) - elif depth == 2 and (tag == 'entry' or tag.endswith(':entry')): - self.entries_map[self.last_id] = ''.join(self.pop()) - elif depth == 3 and (tag == 'id' or tag.endswith(':id')): - self.last_id = ''.join(content).strip() - self.emit(self.pop()) - else: - self.emit(self.pop()) - - -class RssFeedHandler(FeedContentHandler): - """Sax content handler for RSS feeds.""" - - def handleEvent(self, event, content): - depth, tag = event[0], event[1].lower() - if depth == 1: - if (tag != 'rss' and not tag.endswith(':rss') - and tag != 'rdf' and not tag.endswith(':rdf')): - raise Error('Enclosing tag is not or . ' - 'Found: %r' % tag) - else: - self.header_footer = strip_whitespace(event[1], self.pop()) - elif (tag == 'item' or tag.endswith(':item')) and ( - depth == 3 or (depth == 2 and 'rdf' in self.enclosing_tag)): - item_id = (self.last_id or self.last_link or - self.last_title or self.last_description) - self.entries_map[item_id] = ''.join(self.pop()) - self.last_id, self.last_link, self.last_title, self.last_description = ( - '', '', '', '') - elif (tag == 'guid' or tag.endswith(':guid')) and ( - depth == 4 or (depth == 3 and 'rdf' in self.enclosing_tag)): - self.last_id = ''.join(content).strip() - self.emit(self.pop()) - elif (tag == 'link' or tag.endswith(':link')) and ( - depth == 4 or (depth == 3 and 'rdf' in self.enclosing_tag)): - self.last_link = ''.join(content).strip() - self.emit(self.pop()) - elif (tag == 'title' or tag.endswith(':title')) and ( - depth == 4 or (depth == 3 and 'rdf' in self.enclosing_tag)): - self.last_title = ''.join(content).strip() - self.emit(self.pop()) - elif (tag == 'description' or tag.endswith(':description')) and ( - depth == 4 or (depth == 3 and 'rdf' in self.enclosing_tag)): - self.last_description = ''.join(content).strip() - self.emit(self.pop()) - else: - self.emit(self.pop()) - - -def filter(data, format): - """Filter a feed through the parser. - - Args: - data: String containing the data of the XML feed to parse. - format: String naming the format of the data. Should be 'rss' or 'atom'. 
- - Returns: - Tuple (header_footer, entries_map) where: - header_footer: String containing everything else in the feed document - that is specifically *not* an or . - entries_map: Dictionary mapping entry_id to the entry's XML data. - - Raises: - xml.sax.SAXException on parse errors. feed_diff.Error if the diff could not - be derived due to bad content (e.g., a good XML doc that is not Atom or RSS) - or any of the feed entries are missing required fields. - """ - data_stream = cStringIO.StringIO(data) - parser = xml.sax.make_parser() - - if format == 'atom': - handler = AtomFeedHandler(parser) - elif format == 'rss': - handler = RssFeedHandler(parser) - else: - raise Error('Invalid feed format "%s"' % format) - - parser.setContentHandler(handler) - parser.setEntityResolver(TrivialEntityResolver()) - # NOTE: Would like to enable these options, but expat (which is all App Engine - # gives us) cannot report the QName of namespace prefixes. Thus, we have to - # work around this to preserve the document's original namespacing. - # parser.setFeature(xml.sax.handler.feature_namespaces, 1) - # parser.setFeature(xml.sax.handler.feature_namespace_prefixes, 1) - try: - parser.parse(data_stream) - except IOError, e: - raise Error('Encountered IOError while parsing: %s' % e) - - for entry_id, content in handler.entries_map.iteritems(): - if format == 'atom' and not entry_id: - raise Error(' element missing : %s' % content) - elif format == 'rss' and not entry_id: - raise Error(' element missing or : %s' % content) - - return handler.header_footer, handler.entries_map - - -__all__ = ['filter', 'DEBUG', 'Error'] diff --git a/hub/feed_diff_test.py b/hub/feed_diff_test.py deleted file mode 100755 index 7a20130..0000000 --- a/hub/feed_diff_test.py +++ /dev/null @@ -1,331 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2008 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
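Since `feed_diff.py` above is deleted wholesale, a short usage sketch may help map the tests that follow back to its API. The feed string and identifiers below are invented for illustration; only `feed_diff.filter` and `feed_diff.Error` come from the deleted module.

```python
import feed_diff

ATOM_DOC = """<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title>Example feed</title>
  <entry>
    <id>tag:example.com,2013:first-post</id>
    <title>First post</title>
  </entry>
</feed>
"""

# header_footer is the document with every <entry> stripped out; entries_map
# maps each entry's <id> text to that entry's XML.
header_footer, entries_map = feed_diff.filter(ATOM_DOC, 'atom')
print entries_map.keys()   # [u'tag:example.com,2013:first-post']

# Handing this same document to filter(ATOM_DOC, 'rss') would raise
# feed_diff.Error, because the enclosing tag must then be <rss> or <rdf>.
```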
-# - -__author__ = 'bslatkin@gmail.com (Brett Slatkin)' - -"""Tests for the feed_diff module.""" - -import logging -import os -import unittest - -import feed_diff - - -class TestBase(unittest.TestCase): - - format = None - feed_open = None - feed_close = None - entry_open = None - entry_close = None - - def setUp(self): - self.testdata = os.path.join(os.path.dirname(__file__), - 'feed_diff_testdata') - - def verify_entries(self, expected_list, entries): - found_entries = sorted(entries.items()) - self.assertEqual(len(expected_list), len(found_entries)) - for index, (expected_key, found) in enumerate( - zip(expected_list, found_entries)): - found_key, found_content = found - self.assertEqual(expected_key, found_key, - "Fail on index %d: Expected %r, found %r" % ( - index, expected_key, found_key)) - self.assertTrue(found_content.startswith(self.entry_open)) - self.assertTrue(found_content.endswith(self.entry_close)) - - def load_feed(self, path): - data = open(os.path.join(self.testdata, path)).read() - header_footer, entries = feed_diff.filter(data, self.format) - self.assertTrue(header_footer.startswith(self.feed_open)) - self.assertTrue(header_footer.endswith(self.feed_close)) - return header_footer, entries - - -class AtomFeedDiffTest(TestBase): - - format = 'atom' - feed_open = '\n')) - # Verify preservation of '/>' closings. - self.assertTrue('' in header_footer) - - def testEntityEscaping(self): - """Tests when certain external entities show up in the feed. - - Example: '&nbsp' will be converted to ' ' by the parser, but then - the new output entity won't be resolved. - """ - header_footer, entries = self.load_feed('entity_escaping.xml') - self.assertTrue(''' not in header_footer) - entity_id, content = entries.items()[0] - self.assertTrue('&nbsp;' in content) - - def testAttributeEscaping(self): - """Tests when certain external entities show up in an XML attribute. - - Example: gd:foo=""blah"" will be converted to - gd:foo=""blah"" by the parser, which is not valid XML when reconstructing - the result. 
- """ - header_footer, entries = self.load_feed('attribute_escaping.xml') - self.assertTrue('foo:myattribute=""\'foobar\'""' in header_footer) - - def testInvalidFeed(self): - """Tests when the feed is not a valid Atom document.""" - data = open(os.path.join(self.testdata, 'bad_atom_feed.xml')).read() - try: - feed_diff.filter(data, 'atom') - except feed_diff.Error, e: - self.assertTrue('Enclosing tag is not ' in str(e)) - else: - self.fail() - - def testNoXmlHeader(self): - """Tests that feeds with no XML header are accepted.""" - data = open(os.path.join(self.testdata, 'no_xml_header.xml')).read() - header_footer, entries = feed_diff.filter(data, 'atom') - self.assertEquals(1, len(entries)) - - def testMissingId(self): - """Tests when an Atom entry is missing its ID field.""" - data = open(os.path.join(self.testdata, 'missing_entry_id.xml')).read() - try: - feed_diff.filter(data, 'atom') - except feed_diff.Error, e: - self.assertTrue(' element missing ' in str(e)) - else: - self.fail() - - def testFailsOnRss(self): - """Tests that parsing an RSS feed as Atom will fail.""" - data = open(os.path.join(self.testdata, 'rss2sample.xml')).read() - try: - feed_diff.filter(data, 'atom') - except feed_diff.Error, e: - self.assertTrue('Enclosing tag is not ' in str(e)) - else: - self.fail() - - def testCData(self): - """Tests a feed that has a CData section.""" - data = open(os.path.join(self.testdata, 'cdata_test.xml')).read() - header_footer, entries = feed_diff.filter(data, 'atom') - expected_list = [ - u'tag:blog.livedoor.jp,2010:coupon_123.1635380' - ] - self.verify_entries(expected_list, entries) - self.assertTrue( - ('livedoor Blog') - in header_footer) - entry_data = entries['tag:blog.livedoor.jp,2010:coupon_123.1635380'] - # Here the CData section is rewritten. - self.assertTrue('</FONT>' in entry_data) - - -class AtomNamespacedFeedDiffTest(TestBase): - - format = 'atom' - feed_open = '')) - expected_list = [ - u'http://example.com/feeds/delta/124', - u'http://example.com/feeds/delta/125' - ] - self.verify_entries(expected_list, entries) - - -class RssFeedDiffTest(TestBase): - - format = 'rss' - feed_open = '\n\n')) - # Verify preservation of '/>' closings. - self.assertTrue('' in header_footer) - - def testParsingRss091(self): - """Tests parsing RSS 0.91.""" - header_footer, entries = self.load_feed('sampleRss091.xml') - expected_list = [ - u'http://writetheweb.com/read.php?item=19', - u'http://writetheweb.com/read.php?item=20', - u'http://writetheweb.com/read.php?item=21', - u'http://writetheweb.com/read.php?item=22', - u'http://writetheweb.com/read.php?item=23', - u'http://writetheweb.com/read.php?item=24', - ] - self.verify_entries(expected_list, entries) - - def testParsingRss092(self): - """Tests parsing RSS 0.92 with enclosures and only descriptions.""" - header_footer, entries = self.load_feed('sampleRss092.xml') - expected_list = [ - u'<a href="http://arts.ucsc.edu/GDead/AGDL/other1.html">The Other One</a>, live instrumental, One From The Vault. Very rhythmic very spacy, you can listen to it many times, and enjoy something new every time.', - u'<a href="http://www.cs.cmu.edu/~mleone/gdead/dead-lyrics/Franklin\'s_Tower.txt">Franklin\'s Tower</a>, a live version from One From The Vault.', - u'<a href="http://www.scripting.com/mp3s/youWinAgain.mp3">The news is out</a>, all over town..<p>\nYou\'ve been seen, out runnin round. <p>\nThe lyrics are <a href="http://www.cs.cmu.edu/~mleone/gdead/dead-lyrics/You_Win_Again.txt">here</a>, short and sweet. 
<p>\n<i>You win again!</i>', - u"It's been a few days since I added a song to the Grateful Dead channel. Now that there are all these new Radio users, many of whom are tuned into this channel (it's #16 on the hotlist of upstreaming Radio users, there's no way of knowing how many non-upstreaming users are subscribing, have to do something about this..). Anyway, tonight's song is a live version of Weather Report Suite from Dick's Picks Volume 7. It's wistful music. Of course a beautiful song, oft-quoted here on Scripting News. <i>A little change, the wind and rain.</i>", - u'Kevin Drennan started a <a href="http://deadend.editthispage.com/">Grateful Dead Weblog</a>. Hey it\'s cool, he even has a <a href="http://deadend.editthispage.com/directory/61">directory</a>. <i>A Frontier 7 feature.</i>', - u'Moshe Weitzman says Shakedown Street is what I\'m lookin for for tonight. I\'m listening right now. It\'s one of my favorites. "Don\'t tell me this town ain\'t got no heart." Too bright. I like the jazziness of Weather Report Suite. Dreamy and soft. How about The Other One? "Spanish lady come to me.."', - u'The HTML rendering almost <a href="http://validator.w3.org/check/referer">validates</a>. Close. Hey I wonder if anyone has ever published a style guide for ALT attributes on images? What are you supposed to say in the ALT attribute? I sure don\'t know. If you\'re blind send me an email if u cn rd ths.', - u'This is a test of a change I just made. Still diggin..', - ] - self.verify_entries(expected_list, entries) - - def testOnlyLink(self): - """Tests when an RSS item only has a link element.""" - header_footer, entries = self.load_feed('rss2_only_link.xml') - expected_list = [ - u'http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp', - u'http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp', - u'http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp', - ] - self.verify_entries(expected_list, entries) - - def testOnlyTitle(self): - """Tests when an RSS item only has a title element.""" - header_footer, entries = self.load_feed('rss2_only_title.xml') - expected_list = [ - u"Astronauts' Dirty Laundry", - u'Star City', - u'The Engine That Does More', - ] - self.verify_entries(expected_list, entries) - - def testFailsOnAtom(self): - """Tests that parsing an Atom feed as RSS will fail.""" - data = open(os.path.join(self.testdata, 'parsing.xml')).read() - try: - feed_diff.filter(data, 'rss') - except feed_diff.Error, e: - self.assertTrue('Enclosing tag is not ' in str(e)) - else: - self.fail() - - -class RssRdfFeedDiffTest(TestBase): - - format = 'rss' - feed_open = '')) - self.verify_entries(expected_list, entries) - - -class FilterTest(TestBase): - - format = 'atom' - - def testEntities(self): - """Tests that external entities cause parsing to fail.""" - try: - self.load_feed('xhtml_entities.xml') - self.fail('Should have raised an exception') - except feed_diff.Error, e: - # TODO(bslatkin): Fix this datafile in head. - # This ensures that hte failure is because of bad entities, not a - # missing test data file. 
- self.assertFalse('IOError' in str(e)) - - -if __name__ == '__main__': - ## feed_diff.DEBUG = True - ## logging.getLogger().setLevel(logging.DEBUG) - unittest.main() diff --git a/hub/feed_diff_testdata/atom_namespace.xml b/hub/feed_diff_testdata/atom_namespace.xml deleted file mode 100644 index b9e40df..0000000 --- a/hub/feed_diff_testdata/atom_namespace.xml +++ /dev/null @@ -1,60 +0,0 @@ - - - http://example.com/feeds/delta - - - foo@example.com - - http://example.com/feeds/delta/125 - - 2010-03-16T09:13:48.224281Z - paho.org/HealthC_ID/1115006 - 2010-03-12T12:12:12Z - - - - - Foo Hospital - - - - - 45.256 -71.92 - - - 2010-03-12T12:12:12Z - - - - - - bar@example.com - - http://example.com/feeds/delta/124 - - 2010-03-16T09:13:20.073577Z - paho.org/HealthC_ID/1115006 - 2010-03-01T01:02:03Z - - - - - Foo Hospital - - - - - true - - false - - 2010-03-15T23:43:52.183786Z - - - - diff --git a/hub/feed_diff_testdata/atom_no_id.xml b/hub/feed_diff_testdata/atom_no_id.xml deleted file mode 100644 index 6e6d4c3..0000000 --- a/hub/feed_diff_testdata/atom_no_id.xml +++ /dev/null @@ -1,29 +0,0 @@ - - - -dive into mark -everything old is new again -2008-08-23T04:49:22Z - - - - - -Mark -http://diveintomark.org/ - -<![CDATA[The ampersands of Linux]]> - -tag:diveintomark.org,2008-08-14:/archives/20080814215936 -2008-08-14T23:08:54Z -2008-08-14T21:59:36Z -Please try to contain your excitement. -<p>Taking an idea from <a href="http://www.simplebits.com/notebook/2008/08/14/ampersands.html">Use the Best Available Ampersand</a> and a list of pre-installed fonts from the <a href="http://www.apaddedcell.com/web-fonts">Complete Guide to Pre-Installed Fonts in Linux, Mac, and Windows</a>, I present &#8220;The Ampersands of Linux&#8221;:</p> - -<p><img src="http://wearehugh.com/public/2008/08/ampersands-of-linux3.png" alt="[ampersands in 28 fonts]" height="900" width="600"></p> - -<p>(<a href="http://wearehugh.com/public/2008/08/ampersands-of-linux.html"><abbr>HTML</abbr></a>)</p> - -<p>Please try to contain your excitement.</p> - - diff --git a/hub/feed_diff_testdata/attribute_escaping.xml b/hub/feed_diff_testdata/attribute_escaping.xml deleted file mode 100644 index ea52571..0000000 --- a/hub/feed_diff_testdata/attribute_escaping.xml +++ /dev/null @@ -1,19 +0,0 @@ - - - Foo Bar has a blog - - 2008-10-01T06:57:56Z - - Foo Bar - - http://www.foo.com/ - - - My feed - http://www.foo.com/1 - 2008-10-01T06:57:56Z - - some detail here &nbsp; - - - diff --git a/hub/feed_diff_testdata/bad_atom_feed.xml b/hub/feed_diff_testdata/bad_atom_feed.xml deleted file mode 100644 index aff0906..0000000 --- a/hub/feed_diff_testdata/bad_atom_feed.xml +++ /dev/null @@ -1,19 +0,0 @@ - - - Foo Bar's blog - - 2008-10-01T06:57:56Z - - Foo Bar - - http://www.foo.com/ - - - My feed - http://www.foo.com/1 - 2008-10-01T06:57:56Z - - some detail here &nbsp; - - - diff --git a/hub/feed_diff_testdata/bad_feed.xml b/hub/feed_diff_testdata/bad_feed.xml deleted file mode 100644 index 6b77920..0000000 --- a/hub/feed_diff_testdata/bad_feed.xml +++ /dev/null @@ -1 +0,0 @@ - diff --git a/hub/feed_diff_testdata/cdata_test.xml b/hub/feed_diff_testdata/cdata_test.xml deleted file mode 100644 index 48ab6a5..0000000 --- a/hub/feed_diff_testdata/cdata_test.xml +++ /dev/null @@ -1,47 +0,0 @@ - - -お得なクーポン情報 - - - - -2010-11-16T07:09:01Z - -tag:blog.livedoor.jp,2010:coupon_123 - -coupon_123 - - -livedoor Blog -Copyright (c) 2010, coupon_123 - -りんね【マッサージ・鍼灸院】のクーポン情報!⇒⇒クーポンをお持ちの方アロマコース10%オフ - -2010-11-15T22:08:57Z -2010-11-16T07:08:57+09:00 
-tag:blog.livedoor.jp,2010:coupon_123.1635380 - - ★お得情報一覧はこちら - -東京都豊島区東池袋1丁目46-13ホリグチビル1F 営業:11:00~21:00&nbsp; 定休:毎週日曜日定休 - -クーポンをお持ちの方アロマコース10%オフ - -http://www.toshimaku-town.com/coupon/ts026964 - - - -
- ★お得情報一覧はこちら
-
-東京都豊島区東池袋1丁目46-13ホリグチビル1F 営業:11:00~21:00  定休:毎週日曜日定休
-
-クーポンをお持ちの方アロマコース10%オフ
-
-http://www.toshimaku-town.com/coupon/ts026964
]]> -
- -coupon_123 - -
-
diff --git a/hub/feed_diff_testdata/entity_escaping.xml b/hub/feed_diff_testdata/entity_escaping.xml deleted file mode 100644 index d939948..0000000 --- a/hub/feed_diff_testdata/entity_escaping.xml +++ /dev/null @@ -1,19 +0,0 @@ - - - Foo Bar's blog - - 2008-10-01T06:57:56Z - - Foo Bar - - http://www.foo.com/ - - - My feed - http://www.foo.com/1 - 2008-10-01T06:57:56Z - - some detail here &nbsp; - - - diff --git a/hub/feed_diff_testdata/missing_entry_id.xml b/hub/feed_diff_testdata/missing_entry_id.xml deleted file mode 100644 index b6cb516..0000000 --- a/hub/feed_diff_testdata/missing_entry_id.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - Foo Bar's blog - - 2008-10-01T06:57:56Z - - Foo Bar - - http://www.foo.com/ - - - My feed - 2008-10-01T06:57:56Z - - some detail here &nbsp; - - - diff --git a/hub/feed_diff_testdata/no_xml_header.xml b/hub/feed_diff_testdata/no_xml_header.xml deleted file mode 100644 index f0a4d27..0000000 --- a/hub/feed_diff_testdata/no_xml_header.xml +++ /dev/null @@ -1,18 +0,0 @@ - - Foo Bar's blog - - 2008-10-01T06:57:56Z - - Foo Bar - - http://www.foo.com/ - - - My feed - http://www.foo.com/1 - 2008-10-01T06:57:56Z - - some detail here &nbsp; - - - diff --git a/hub/feed_diff_testdata/parsing.xml b/hub/feed_diff_testdata/parsing.xml deleted file mode 100644 index fd941b7..0000000 --- a/hub/feed_diff_testdata/parsing.xml +++ /dev/null @@ -1,348 +0,0 @@ - - - -dive into mark -everything old is new again -tag:diveintomark.org,2001-07-29:/ -2008-08-23T04:49:22Z - - -Subscribe with My Yahoo!Subscribe with NewsGatorSubscribe with My AOLSubscribe with RojoSubscribe with BloglinesSubscribe with NetvibesSubscribe with GoogleSubscribe with PageflakesMu. <h1>Mu.</h1> Mu. - -Mark -http://diveintomark.org/ - -<![CDATA[The ampersands of Linux]]> - -tag:diveintomark.org,2008-08-14:/archives/20080814215936 -2008-08-14T23:08:54Z -2008-08-14T21:59:36Z -Please try to contain your excitement. -<p>Taking an idea from <a href="http://www.simplebits.com/notebook/2008/08/14/ampersands.html">Use the Best Available Ampersand</a> and a list of pre-installed fonts from the <a href="http://www.apaddedcell.com/web-fonts">Complete Guide to Pre-Installed Fonts in Linux, Mac, and Windows</a>, I present &#8220;The Ampersands of Linux&#8221;:</p> - -<p><img src="http://wearehugh.com/public/2008/08/ampersands-of-linux3.png" alt="[ampersands in 28 fonts]" height="900" width="600"></p> - -<p>(<a href="http://wearehugh.com/public/2008/08/ampersands-of-linux.html"><abbr>HTML</abbr></a>)</p> - -<p>Please try to contain your excitement.</p> - - - -Mark -http://diveintomark.org/ - -<![CDATA[Hello darkness my old friend]]> - -tag:diveintomark.org,2008-08-12:/archives/20080812160843 -2008-08-12T16:08:43Z -2008-08-12T16:08:43Z -Nobody still gives a shit about freedom 0. -<div class="punch" style="width:300px"> -<img src="http://wearehugh.com/public/2008/08/crow.jpg" alt="[crow and moon]" title="" width="300" height="300"> -<p><a href="http://flickr.com/photos/luchilu/2414457426/">The bird and the moon II</a> &copy;&nbsp;<a href="http://flickr.com/people/luchilu/">Luz A. Villa</a> / <a title="used under Creative Commons Attribution 2.0 License" href="http://creativecommons.org/licenses/by/2.0/">CC</a></p> -</div> - -<p>My parents <a href="http://diveintomark.org/archives/2008/01/04/my-parents-desktop">gave up on Linux</a> and bought a Mac Mini. We bought an AppleTV for the kids and <a href="http://handbrake.fr/">filled it with their favorite DVDs</a>. 
I stood in line for three hours to buy my wife an iPhone 3G for her birthday. And <a href="http://diveintomark.org/archives/2008/01/17/of-canaries-and-coal-mines">nobody gives a shit</a> about freedom 0.</p> - -<p>Discuss.</p> - - - -Mark -http://diveintomark.org/ - -<![CDATA[A very serious question]]> - -tag:diveintomark.org,2008-08-07:/archives/20080807233337 -2008-08-12T01:23:03Z -2008-08-07T23:33:37Z -So, hypothetically speaking, let&#8217;s say you want to design a system where you had absolute control over which applications your customers were allowed to install on your device. Certainly you would want to ensure that you were the only source for applications. But for extraordinary cases, you might also need to create a [...] -<p>So, hypothetically speaking, let&#8217;s say you want to design a system where you had <a href="http://www.mikeash.com/?page=pyblog/welcome-to-iphone-your-crappy-mac-of-tomorrow-today.html">absolute control</a> over which applications your customers were allowed to install on <a href="http://www.russellbeattie.com/blog/iphone-reconciliation">your device</a>. Certainly you would want to <a href="http://blogs.oreilly.com/iphone/2008/07/one-little-article.html">ensure that you were the only source for applications</a>. But for extraordinary cases, you might also need to create <a href="http://arstechnica.com/journals/apple.ars/2008/08/07/the-goings-on-behind-the-iphone-application-blacklist">a blacklist of applications</a>.</p> - -<p>Each entry in the blacklist would also need a human-readable <b>title</b> &#8212; presumably the name of the app &#8212; and perhaps even a human-readable <b>description</b> to explain why the app was blacklisted. But each entry would also need a <b>unique identifier</b>, of course, so you don&#8217;t accidentally get confused between six apps named &#8220;TODO.&#8221; Finally, you would probably want to include the <b>date</b> that the entry was added to the list.</p> - -<p>Furthermore, since you anticipate <a href="http://www.wsu.edu/~brians/errors/continual.html">continually</a> <a href="http://www.macrumors.com/2008/08/01/netshare-tethering-app-reappears-on-app-store/">adding</a> <a href="http://forums.macrumors.com/showpost.php?p=5953784&amp;postcount=24">new</a> <a href="http://www.alleyinsider.com/2008/8/worthless-1000-i-am-rich-iphone-app-disappears">applications</a> to this blacklist to <a href="http://diveintomark.org/archives/2007/10/16/oh-good-grief#comment-10497">protect your and your partners&#8217; business model</a>, you would need your proprietary <a href="http://earthlingsoft.net/ssp/blog/2006/10/itunes_store_sucks">non-browser-based</a> client to <a href="http://www.appleinsider.com/articles/08/08/06/researcher_discovers_long_publicized_iphone_app_kill_switch.html">periodically poll the list for changes</a>.</p> - -<p>All of which raises a very serious question: <b><i>what data format should you use for the list?</i></b></p> - -<p>If you answered <abbr>&#8220;JSON&#8221;</abbr> then congratulations, you <del>win the <a href="https://iphone-services.apple.com/clbl/unauthorizedApps">Trendy Tech of the Month Award</a></del> <a href="http://diveintomark.org/archives/2008/08/07/a-very-serious-question#comment-12525">lose</a>! 
To collect your prize, please proceed through the door marked &#8220;<a href="http://en.wikipedia.org/wiki/Barnum%27s_American_Museum">This way to the egress</a>.&#8221; Some restrictions apply.</p> - -<p><b>Update</b>: OK, OK, <a href="http://daringfireball.net/2008/08/core_location_blacklist">it&#8217;s a &#8220;Core Location&#8221; blacklist</a>. Big deal. I&#8217;ll see your tree and <a href="http://online.wsj.com/article/SB121842341491928977.html?mod=googlenews_wsj">raise you a forest</a>:</p> - -<blockquote cite="http://online.wsj.com/article/SB121842341491928977.html?mod=googlenews_wsj"> -<p>&#8230; an independent engineer discovered code inside the iPhone that suggested iPhones routinely check an Apple Web site that could, in theory trigger the removal of the undesirable software from the devices.</p> - -<p>Mr. Jobs confirmed such a capability exists, but argued that Apple needs it in case it inadvertently allows a malicious program &#8212; one that stole users&#8217; personal data, for example &#8212; to be distributed to iPhones through the App Store.</p> -</blockquote> - -<p>As <a href="http://diveintomark.org/archives/2007/10/16/oh-good-grief#comment-10497">I&#8217;ve said before</a>, &#8220;protecting users from malicious programs&#8221; is code for &#8220;cryptographically enforcing restrictions on applications to protect our and our partners’ business model.&#8221; The bullshit about &#8220;stealing personal data&#8221; is just a rhetorical sleight of hand, like the <abbr>RIAA</abbr> claiming that piracy hurts &#8220;artists and other rights holders&#8221; when 99% of artists don&#8217;t own the rights to their own songs. How many apps has Apple de-listed over privacy concerns? <a href="http://gizmodo.com/5028459/aurora-feint-iphone-app-delisted-for-lousy-security-practices">Only one that I know of</a>, and it was quickly reinstated after a quick update. How many apps has Apple de-listed (or prevented being written in the first place) to protect their business? <a href="http://gizmodo.com/5027790/why-we-still-need-the-iphone-app-black-market">Lots</a> and <a href="http://lifehacker.com/400148/iphone-20-jailbreak-apps-you-cant-find-in-the-itunes-store">lots</a>.</p> - - - -Mark -http://diveintomark.org/ - -<![CDATA[The 4 “B”s]]> - -tag:diveintomark.org,2008-08-07:/archives/20080807025755 -2008-08-07T02:57:55Z -2008-08-07T02:57:55Z -Little boys are so easy. -<div class="punch" style="width:240px"> -<img src="http://wearehugh.com/public/2008/08/blocks.jpg" alt="[colored blocks]" title="" width="240" height="159"> -<p><a href="http://flickr.com/photos/jamoker/1258248356/">prehistoric toys</a> &copy;&nbsp;<a href="http://flickr.com/people/jamoker/">The Jamoker</a> / <a title="used under Creative Commons Attribution 2.0 License" href="http://creativecommons.org/licenses/by/2.0/">CC</a></p> -</div> - -<p>Little boys are so easy. It&#8217;s amazing how many games you can play with nothing but the 4 &#8220;B&#8221;s: blocks, balls, boxes, and bears.</p> - - - -Mark -http://diveintomark.org/ - -<![CDATA[Little drummer boy]]> - -tag:diveintomark.org,2008-08-06:/archives/20080806144009 -2008-08-06T14:40:09Z -2008-08-06T14:40:09Z -I never really understood how people found bugs like this... 
-<div class="punch" style="width:240px"> -<img src="http://wearehugh.com/public/2008/08/snare-drums.jpg" alt="[snare drums]" title="" width="240" height="292"> -<p><a href="http://flickr.com/photos/fayjo/338585382/">Pearl Masters</a> &copy;&nbsp;<a href="http://flickr.com/people/fayjo/">Jeremy Pharo</a> / <a title="used under Creative Commons Attribution 2.0 License" href="http://creativecommons.org/licenses/by/2.0/">CC</a></p> -</div> - -<p>For reasons that are not particularly interesting, I found myself reading <a href="http://www.jwz.org/xscreensaver/faq.html">the XScreensaver <abbr>FAQ</abbr></a> last night, which answers the question, &#8220;<a href="http://www.jwz.org/xscreensaver/faq.html#toolkits">The unlock dialog is funny looking, why not use GTK?</a>&#8221; That led me to JWZ&#8217;s mini-rant <a href="http://www.jwz.org/xscreensaver/toolkits.html">On Toolkits</a>, which reminded me of <a href="http://secunia.com/advisories/9184/">this ancient vulnerability in the OS X screensaver</a> where you could unlock the screen by typing 1280 characters in the password dialog. I remember reading about it at the time, but I never really understood how people found bugs like this until I had kids.</p> - - - -Mark -http://diveintomark.org/ - -<![CDATA[Placating people with options]]> - -tag:diveintomark.org,2008-08-05:/archives/20080805155619 -2008-08-05T19:09:22Z -2008-08-05T15:56:19Z -Google Code is -- and has always been -- a tool to fight license proliferation. It is only incidentally useful. -<p><cite>Matthew Paul Thomas</cite>: <a href="http://mpt.net.nz/archive/2008/08/01/free-software-usability">Why Free Software has poor usability, and how to improve it</a>. Many of the problems he lists apply to all software. Here&#8217;s one that&#8217;s been on my mind recently:</p> - -<blockquote cite="http://mpt.net.nz/archive/2008/08/01/free-software-usability"> -<p>10. Placating people with options.</p> -</blockquote> - -<p>In other news, <a href="http://www.linuxtoday.com/news_story.php3?ltsn=2008-08-01-012-35-NW">Google Code blacklists Mozilla Public License</a>. Some <a href="http://redmonk.com/sogrady/2008/08/04/links-for-2008-08-04-deliciouscom/">smart</a> <a href="http://blogs.sun.com/webmink/entry/links_for_2008_08_04">people</a> don&#8217;t seem to understand why Google would &#8220;reduce&#8221; users&#8217; &#8220;choices.&#8221; This is like complaining that the <abbr>GPL</abbr> is too &#8220;political&#8221; to be a software license because it &#8220;restricts&#8221; users&#8217; &#8220;freedom&#8221; to take without giving back. You&#8217;re missing the point; the <abbr>GPL</abbr> is <em>designed</em> to be a political manifesto (cleverly disguised as a software license).</p> - -<p>Another example: my <a href="http://feedparser.org/">Universal Feed Parser</a> was <a href="http://diveintomark.org/archives/2002/08/13/ultraliberal_rss_parser">conceived as a weapon</a> against what I considered the gravest error of XML: <a href="http://diveintomark.org/archives/2004/01/16/draconianism">draconian error handling</a>. Recently, someone asked me to implement a switch that makes it not fall back on lax parsing in the case of an XML wellformedness error. I said no, not because it would be difficult to implement, but because <em>that defeats its entire reason for being</em>.</p> - -<p>Google Project Hosting is &#8212; and has always been &#8212; <a href="http://google-opensource.blogspot.com/2008/05/standing-against-license-proliferation.html">a tool to fight license proliferation</a>. 
It is only incidentally useful.</p> - - - -Mark -http://diveintomark.org/ - -<![CDATA[Weblogs: content w/o context, collaboration, capital, or coruscation]]> - -tag:diveintomark.org,2008-08-05:/archives/20080805020410 -2008-08-05T02:04:10Z -2008-08-05T02:04:10Z -A response to danah boyd. -<p>On her weblog, danah boyd writes:</p> - -<blockquote cite="http://www.zephoria.org/thoughts/archives/2008/08/01/knol_content_wo.html"> -<p>[Compared to Wikipedia,] weblogs take an entirely opposite approach to knowledge production. A weblog&#8217;s entire structure is built around single authors, control and individualism. There aren&#8217;t even mechanisms for multiple authors and the tools available for collaboration are extremely limited. &#8220;Collaboration&#8221; still assumes a primary author.</p> - -<p>&#8230; Weblogs are quickly becoming a &#8220;unit of spam&#8221; instead of a unit of knowledge. Y&#8217;see - a system that is driven by individualism quickly becomes a tool for self-promoters.</p> - -<p>&#8230; Frankly, from my POV, weblogs look like an abysmal failure. There&#8217;s no life to the content. Already articles are being forgotten and left to rot, along with a lot of other web content. There&#8217;s no common format or standards and there&#8217;s a lot more crap than gems.</p> -</blockquote> - -<p>Oops, no, I&#8217;m sorry. She was <a href="http://www.zephoria.org/thoughts/archives/2008/08/01/knol_content_wo.html">talking about Google Knol</a>. My bad.</p> - - - -Mark -http://diveintomark.org/ - -<![CDATA[HOWTO feed medication to your dog]]> - -tag:diveintomark.org,2008-07-29:/archives/20080729021401 -2008-08-01T00:05:25Z -2008-07-29T02:14:01Z -I'm told that peanut butter also works. -<ol style="list-style:none;margin-left:0;padding-left:0"> -<li><p>Take a quarter slice of American cheese.</p> -<p><img src="http://wearehugh.com/public/2008/07/01-cheese.jpg" alt="[cheese slice in hand]" width="640" height="188"></p></li> -<li><p>Roll the cheese into a ball and insert the pills.</p> -<p><img src="http://wearehugh.com/public/2008/07/02-pills.jpg" alt="[cheese ball with pills inside]" width="640" height="188"></p></li> -<li><p>Feed the cheese ball to your dog.</p> -<p><img src="http://wearehugh.com/public/2008/07/03-feed.jpg" alt="[feeding the cheese to the dog]" width="640" height="188"></p></li> -</ol> - -<p>I&#8217;m told that peanut butter also works, but it&#8217;s messier.</p> - - - -Mark -http://diveintomark.org/ - -<![CDATA[A helluva thing]]> - -tag:diveintomark.org,2008-07-23:/archives/20080723030709 -2008-07-23T16:55:41Z -2008-07-23T03:07:09Z -Timothy McVeigh blew up a building and killed 168 people. What did we give him? A trial. -<p>It&#8217;s a helluva thing, a trial by jury. It was a radical notion 200 years ago, and it&#8217;s still a radical notion today.</p> - -<p>I served on a jury once. Getting called for jury duty sucks, because most of the time you just sit in the jury lounge all day and then go home. But actually serving on a jury is a totally different thing. I think everyone should experience it at least once. In one sense, it&#8217;s just like you see on TV, except everything takes longer and you can&#8217;t go to the bathroom every 12 minutes. But it&#8217;s weird, if you&#8217;ve seen it on TV, because you realize that you already know the script. 
&#8220;Burden of proof,&#8221; &#8220;innocent until proven guilty,&#8221; &#8220;proof beyond a reasonable doubt.&#8221; The judge lays it all out for you, from scratch, even though most people have seen the scene and heard the speech and read the words a hundred times.</p> - -<p>Everything is slanted towards the defense. Big stuff, little stuff, process stuff, everything. We learned later (during sentencing) that the defendant in our case had several prior convictions, but the <abbr>DA</abbr> wasn&#8217;t allowed to bring them up during the trial. Witnesses were always being cut off in mid-sentence, but the defendant was given a wide berth to tell his version of events. We didn&#8217;t even know exactly what the charge meant until the defense lawyer made his closing argument. That, in particular, was incredibly frustrating. I made all sorts of notes like &#8220;is this important? don&#8217;t know, check later.&#8221; The judge said it was to force us to listen to everyone and everything as fairly as possible. It was frustrating on purpose, but it worked.</p> - -<p>And I remember thinking at the time, &#8220;This thing. Right here, this thing. This is what we say America is about.&#8221; Everyone is innocent until proven guilty, and everybody gets their day in court. We suspected that <a href="http://en.wikipedia.org/wiki/Hans_Reiser">Hans Reiser killed his wife</a>. What did we give him? A trial. We suspected that <a href="http://en.wikipedia.org/wiki/Oklahoma_City_bombing">Timothy McVeigh blew up a building and killed 168 people</a>. What did we give him? A trial. (<abbr>BTW</abbr>, this is why <a href="http://www.aclu.org/safefree/detention/commissions.html">people get so upset over our mishandling of terror suspects</a>. What could possibly be more un-American than saying that some people don&#8217;t deserve a fair trial?)</p> - -<p>The case I served on wasn&#8217;t anything you&#8217;ll ever hear about, or read about, or see on TV. Just some neighbors who had ongoing petty feuds for years and years, until one day one of them went too far and made a real death threat. You&#8217;d say he &#8220;crossed the line.&#8221; I think the actual charge was called &#8220;commencement of threats,&#8221; which basically means he crossed one of the invisible lines that holds society together.</p> - -<p>Some people spend their whole life right on the edge of being able to function in a civilized society. I knew a guy like that, growing up outside Philadelphia. His name was Eric. He always seemed to be in trouble with the law. Never anything serious, and not your standard crimes like robbery or drugs or weapons. Just&#8230; not quite understanding the boundaries between himself and everyone else. He could make friends quickly, but then he lost them just as quickly. He never had a steady girlfriend. He couldn&#8217;t maintain any sort of long-term relationship. I think an ex-girlfriend got a restraining order against him one time. And he&#8217;d get arrested for stuff like &#8220;criminal trespass&#8221; and &#8220;commencement of threats.&#8221;</p> - -<p>Society is about drawing lines that everyone acknowledges and respects. Some people see the lines and cross them anyway and hope they don&#8217;t get caught. Eric didn&#8217;t even see the lines. They didn&#8217;t make any sense to him, so when he crossed them, he didn&#8217;t understand why he got in trouble. And you just wanted to smack him and say, &#8220;Just stay out of trouble, Eric! 
Just leave people alone.&#8221; And he did, most of the time. But &#8220;most of the time&#8221; is not &#8220;all of the time.&#8221; And it&#8217;s a crude word, but I think he was a little bit crazy. Not really crazy, like Hannibal Lecter crazy. He just&#8230; couldn&#8217;t see the lines. You don&#8217;t have to be crazy all the time, to be crazy.</p> - -<p>Anyway, our case came and went. The wheels of justice grind slowly, but there&#8217;s only so much you can say about neighbors yelling at each other. I think the whole trial only lasted a day and a half, from jury selection to sentencing. In the end, we deliberated and found the defendant guilty beyond a reasonable doubt. The victim asked, and the judge agreed, and we the jurors were pleased to hear, that he should get a suspended sentence and a mental health assessment, with mandatory followup counseling. I&#8217;d guess he was offered deals and plea bargains, and I&#8217;d guess that everyone in his life begged him to take it. But he wanted his day in court, and he wanted a trial by a jury of his peers, and so that&#8217;s what we did. And he mounted a vigorous defense, and he was innocent right up until the moment we decided he was guilty.</p> - -<p>I&#8217;m not going to get all puffed up about the glory of the system or the honor of performing my civic duty or whatever. The system is broken in a lot of ways, and there&#8217;s a reason they call it a duty &#8212; because it sounds a lot better in the abstract than it feels in the particular, slogging through downtown traffic and standing in line at the metal detectors and sitting quietly while people go on and on about their fucked up lives. But I also got a glimpse of a marvelous and precarious machine, built up and crusted over from two centuries of radical tradition, grinding ever so slowly forward. And it&#8217;s a helluva thing.</p> - - - -Mark -http://diveintomark.org/ - -<![CDATA[Things that are awesome]]> - -tag:diveintomark.org,2008-07-17:/archives/20080717044506 -2008-07-17T04:47:24Z -2008-07-17T04:45:06Z -Trader Joe's, because food wasn't expensive enough already. 
-<ul> -<li><a href="http://fontmatrix.net/">FontMatrix</a>, which helped me pick the God-awful fonts I&#8217;m foisting on the world</li> -<li><a href="http://www.lesswatts.org/projects/powertop/">PowerTOP</a>, because I love viewing new cross-sections of my computing environment</li> -<li><a href="http://wordle.net/">Wordle</a> (<a href="http://bigpicture.typepad.com/comments/2008/07/bernanke-word-c.html">example</a>), despite being a <i>Java applet written in 2008</i></li> -<li><a href="http://openjdk.java.net/">OpenJDK</a>, which is <a href="http://packages.debian.org/openjdk">now in Debian main</a></li> -<li><a href="http://developer.yahoo.com/yslow/">YSlow</a>, which proved that my <code>Cache-Control</code> headers really did blow goats</li> -<li><a href="http://ocaoimh.ie/wp-super-cache/">WP-SuperCache</a>, which fixed them (sorry, goats)</li> -<li><a href="http://www.youtube.com/watch?v=WGoi1MSGu64">Flight of the Conchords</a> (note: they are actually <a href="http://www.youtube.com/watch?v=wd8yxDivs5g">twice as awesome</a> as everything else on this list)</li> -<li><a href="http://www.traderjoes.com/">Trader Joe&#8217;s</a>, because food wasn&#8217;t expensive enough already</li> -<li><a href="http://www.toblerone.com/">Toblerone</a>, just because</li> -</ul> - -<p>And you?</p> - - - -Mark -http://diveintomark.org/ - -<![CDATA[Protocol buffers: the early reviews are in]]> - -tag:diveintomark.org,2008-07-13:/archives/20080713011654 -2008-07-13T04:19:40Z -2008-07-13T01:16:54Z -No wireless. Less space than a Nomad. Lame. -<p>Google (my current employer) has finally <a href="http://google-opensource.blogspot.com/2008/07/protocol-buffers-googles-data.html">open sourced protocol buffers</a>, the data interchange format we use for internal server-to-server communication. The blogosphere&#8217;s response? &#8220;<a href="http://www.oreillynet.com/xml/blog/2008/07/google_hates_xml.html?CMP=OTC-TY3388567169&amp;ATT=Google+hates+XML">No wireless</a>. <a href="http://blogs.tedneward.com/CommentView,guid,98ba2332-0f84-4697-b165-87ee357309cb.aspx">Less space than a Nomad</a>. <a href="http://steve.vinoski.net/blog/2008/07/11/protocol-buffers-no-big-deal/">Lame</a>.&#8221;</p> - -<p>Aaaaanyway&#8230;</p> - -<p>Protocol buffers are &#8220;<a href="http://c2.com/cgi/wiki?JustIsaDangerousWord">just</a>&#8221; cross-platform data structures. <a href="http://www.coriolinus.net/2008/07/08/protocol-buffers/">All you have to write is the schema</a> (a <code>.proto</code> file), then generate bindings in <a href="http://code.google.com/apis/protocolbuffers/docs/cpptutorial.html">C++</a>, <a href="http://code.google.com/apis/protocolbuffers/docs/javatutorial.html">Java</a>, or <a href="http://code.google.com/apis/protocolbuffers/docs/pythontutorial.html">Python</a>. (Or <a href="http://hackage.haskell.org/cgi-bin/hackage-scripts/package/protocol-buffers-0.0.5">Haskell</a>. Or <a href="http://groups.google.com/group/protobuf-perl">Perl</a>.) The <code>.proto</code> file is <a href="http://www.betanews.com/article/Google_releases_its_data_encoding_format_to_compete_with_XML/1215530589">just a schema</a>; it doesn&#8217;t contain any data except default values. All getting and setting is done in code. The serialized over-the-wire format is designed to minimize network traffic, and deserialization (especially in C++) is designed to maximize performance. I can&#8217;t begin to describe how much effort Google spends maximizing performance at every level. 
We would tear down our data centers and rewire them with <a href="http://news.cnet.com/8301-17938_105-9967991-1.html">$500 ethernet cables</a> if you could prove that it would reduce latency by 1%.</p> - -<p>Besides being blindingly fast, protocol buffers have lots of neat features. <a href="http://scottkirkwood.blogspot.com/2008/07/google-opensources-protocol-buffers.html">A zero-size <abbr>PB</abbr> returns default values</a>. <a href="http://zunger.livejournal.com/164024.html">You can nest <abbr>PB</abbr>s inside each other</a>. And most importantly, <abbr>PB</abbr>s are <a href="http://news.ycombinator.com/item?id=239445">both backward and forward compatible</a>, which means you can <a href="http://www.mattcutts.com/blog/google-releases-protocol-buffers/">upgrade servers gradually</a> and they can still talk to each other in the interim. (When you have as many machines as Google has, it&#8217;s always the interim somewhere.)</p> - -<p>Comparisons to other data formats was, I suppose, inevitable. <a href="http://www.thenewsbeforethenews.com/2008/07/10/everything-old-is-new-again/">Old-timers may remember <abbr>ASN.1</abbr></a> or <a href="http://tech.slashdot.org/tech/08/07/08/201245.shtml"><abbr>IIOP</abbr></a>. Kids these days seem to <a href="http://www.webmasterworld.com/xml/3693285.htm">compare everything to <abbr>XML</abbr></a> or <a href="http://www.freshblurbs.com/google-protocol-buffers-good-bad-and-ugly"><abbr>JSON</abbr></a>. They&#8217;re actually closer to Facebook&#8217;s <a href="http://stuartsierra.com/2008/07/10/thrift-vs-protocol-buffers">Thrift</a> (<a href="http://news.ycombinator.com/item?id=239276">written by ex-Googlers</a>) or <a href="http://www.25hoursaday.com/weblog/CommentView.aspx?guid=898f56ef-0439-4100-90da-08701be03c13">SQL Server&#8217;s <abbr>TDS</abbr></a>. Protocol buffers won&#8217;t <a href="http://zimboe.wordpress.com/2008/07/10/farewell-xml/">kill <abbr>XML</abbr></a> (no matter how much <a href="http://www.sharms.org/blog/?p=168">you wish they would</a>), nor will they replace <abbr>JSON</abbr>, <abbr>ASN.1</abbr>, or <a title="the world's first and only implementation of RFC 1149" href="http://www.blug.linux.no/rfc1149/">carrier pigeon</a>. But they&#8217;re simple and they&#8217;re fast and <a href="http://www.webmonkey.com/blog/Google_s_Open_Source_Protocol_Buffers_Offer_Scalability__Speed">they scale like crazy</a>, and that&#8217;s the way Google likes it.</p> - - - - -Mark -http://diveintomark.org/ - -<![CDATA[NY’s top child porn]]> - -tag:diveintomark.org,2008-07-12:/archives/20080712042845 -2008-07-12T04:28:45Z -2008-07-12T04:28:45Z -I really like pluralizing "Attorney General." And verbing nouns. And parenthesizing. -<p>Despite the promising URL, <a rel="nofollow" href="http://www.nystopchildporn.com/">NYsTopChildPorn.com</a> does not, in fact, contain NY&#8217;s top child porn. (&#8221;Package contained bobcat. <a href="http://xkcd.com/325/">Would not buy again</a>.&#8221;) It will, however, be blocked by <em>every web filtering program ever</em> based solely on the domain name &#8212; an irony which will sadly be lost on the Attorneys General. I bet they bring in a metric buttload of weird search traffic before they fold, though.</p> - -<p>Yeah, I know, I&#8217;m joking about child porn. 
Carlin is dead, Pesci bless him, and somebody&#8217;s gotta start <a href="http://www.youtube.com/watch?v=3av_qRR_DWc">picking up the slack</a>.</p> - -<p>(The site itself is not notable in any way except as a reason to point out the obvious design flaw &#8212; that <em>you&#8217;re declaring war on a protocol</em>, and, you know, good luck with that.)</p> - -<p>(I really like pluralizing &#8220;Attorney General.&#8221; And verbing nouns. And parenthesizing.)</p> - - - -Mark -http://diveintomark.org/ - -<![CDATA[New Orc City]]> - -tag:diveintomark.org,2008-07-06:/archives/20080706022239 -2008-07-06T04:12:31Z -2008-07-06T02:22:39Z - - -I found this example of Papyrus on my doorstep last night &#169;&#160;Sarah Marriage / CC - - -The wife and the first born are in New York City for the long weekend, except the second born pronounces it &#8220;New Orc City,&#8221; which sounds much more interesting if you ask me. This is not the first time Ethan [...] -<div class="punch" style="width:180px"> -<img src="http://wearehugh.com/public/2008/07/papyrus.jpg" alt="[example of Papyrus font]" title="" width="180" height="240"> -<p><a href="http://flickr.com/photos/semarr/125189245/">I found this example of Papyrus on my doorstep last night</a> &copy;&nbsp;<a href="http://flickr.com/people/semarr/">Sarah Marriage</a> / <a title="used under Creative Commons Attribution 2.0 License" href="http://creativecommons.org/licenses/by/2.0/">CC</a></p> -</div> - -<p>The wife and the first born are in New York City for the long weekend, except the second born pronounces it &#8220;New Orc City,&#8221; which sounds much more interesting if you ask me. This is not the first time Ethan has traveled, but it&#8217;s the first time that he and his brother have been separated for more than 8 hours. It has also given them their first opportunity to talk to each other on the telephone, long distance. I assume it won&#8217;t be the last.</p> - -<p>Wesley and I have been having a wonderful time doing all manner of things without, you know, competition. Watching his <a title="actually just one movie over and over" href="http://www.imdb.com/title/tt0055254/">favorite movies</a>, wading in the <a href="http://www.enoriver.org/Festival/">Eno River</a>, and spotting butterflies at the <a href="http://www.ncmls.org/">Museum of Life and Science</a>. No pics, sorry (the wife has the camera in New Orc City), but here&#8217;s some pictures from our trip 2 years ago: a <a href="http://flickr.com/photos/f8dy/121701744/">butterfly</a>, a <a href="http://flickr.com/photos/f8dy/121701637/">sheep</a>, and a <a href="http://flickr.com/photos/f8dy/121701704/">pot-bellied pig</a>. I swear the pig hasn&#8217;t moved in 2 years.</p> - -<p>On a coming-full-circle kind of note, the Festival of the Eno uses <a href="/archives/2008/06/21/minimalism#comment-12293">Papyrus</a> in their video materials. We saw it on the shuttle bus on the way back to the parking lot. My mother was sitting next to me, and I got all excited and poked her and pointed to the screen above us and said, &#8220;Hey, I know that font! Everybody hates that font!&#8221; Who says designers are useless?</p> - - - -Mark -http://diveintomark.org/ - -<![CDATA[Adobe 9]]> - -tag:diveintomark.org,2008-07-04:/archives/20080704050619 -2008-07-04T20:18:42Z -2008-07-04T05:06:19Z -You can't make this stuff up. 
-<div class="punch" style="width:240px"> -<img src="http://wearehugh.com/public/2008/07/upgrade-are-failed.jpg" alt="[upgrade are failed]" title="" width="240" height="180"> -<p><a href="http://flickr.com/photos/collinanderson/2413624779/">Upgrade are failed!</a> &copy;&nbsp;<a href="http://flickr.com/people/collinanderson/">Collin Anderson</a> / <a title="used under Creative Commons Attribution 2.0 License" href="http://creativecommons.org/licenses/by/2.0/">CC</a></p> -</div> - -<p>Adobe<a title="you may think I'm intentionally overusing the marks for comedic effect, but I'm not (overusing them, that is)" href="http://www.adobe.com/misc/trade.html">&reg;</a> Reader&reg; 9 is <a href="http://gusmueller.com/blog/archives/2008/07/adobe_reader_9_is_out!.html">out</a>. It&#8217;s now almost half as fast as <a href="http://www.foxitsoftware.com/pdf/reader_2/down_reader.htm">Foxit Reader</a>. It lets you <a href="http://www.download.com/8301-2007_4-9982192-12.html">embed Flash in PDF</a> and <a href="http://blogs.adobe.com/loridefurio/2008/07/pdf_widget_on_a.html">embed PDF in Flash</a>. Adobe supports both kinds of music, <a href="http://www.microsoft.com/">country</a> and <a href="http://www.apple.com/">western</a>. They&#8217;ve also &#8220;<a href="http://blogs.adobe.com/adobereader/2008/06/adobe_reader_9_is_here_1.html">conveniently</a>&#8221; bundled Adobe&reg; AIR&trade; <a href="http://www.tuaw.com/2008/07/02/adobe-reader-9-released/#c12973868">for no apparent reason</a> and added synergistic integration with <a href="https://www.acrobat.com/">their cloud</a>, which claims it doesn&#8217;t support my browser and then requires both Javascript and Flash to sign up for an Adobe&reg; ID, the use of which is governed by this <a type="application/pdf" href="http://www.adobe.com/go/acrobat_com_tou_en">draconian service agreement</a>, which is a PDF.</p> - -<p>You can&#8217;t make this stuff up. And apparently it gets worse if you try to, you know, <a href="http://blog.micropledge.com/2008/07/adobe-reader-9/">actually install it</a>.</p> - -<p>It occurs to me that, at some point in the not-too-<a href="http://www.youtube.com/watch?v=WGoi1MSGu64">distant future</a>, we&#8217;re going to achieve a harmonic convergence with these mega-platforms. &#8220;Adobe&reg; Acrobat&reg; version 9 with Adobe&reg; Flash&reg; version 10 with Adobe&reg; Photoshop&reg; CS3 with Adobe&reg; AIR&trade; beta 3&#8243; will get truncated to &#8220;Adobe 9.&#8221; Coming soon on <a href="http://www.overheardinnewyork.com/">Overheard in New York</a>: &#8220;Hey, are you on Adobe 9?&#8221; &#8220;No, I&#8217;m on Microsoft 14.&#8221; &#8220;Pity. I was hoping we could have sex.&#8221; Or something like that. Who knows, with these wacky kids today and their vendor-specific runtimes?</p> - - - -Mark -http://diveintomark.org/ - -<![CDATA[Microformats and accessibility: the soap opera that never ends]]> - -tag:diveintomark.org,2008-06-29:/archives/20080629044756 -2008-06-29T04:55:43Z -2008-06-29T04:47:56Z -As far as I can tell, the only thing that leading accessibility experts agree on is that nobody listens to leading accessibility experts. 
-<div class="punch" style="width:240px"> -<img src="http://wearehugh.com/public/2008/06/ducks-butt.jpg" alt="[duck with its head underwater]" title="" width="240" height="160"> -<p><a href="http://flickr.com/photos/spacepleb/249761636/">ducks butt</a> &copy;&nbsp;<a href="http://flickr.com/people/spacepleb/">Dave Gough</a> / <a title="used under Creative Commons Attribution 2.0 License" href="http://creativecommons.org/licenses/by/2.0/">CC</a></p> -</div> - -<p>As far as I can tell, the only thing that leading accessibility experts agree on is that <a href="http://adactio.com/journal/1451/">nobody listens to leading accessibility experts</a>, especially not <a href="http://www.webstandards.org/2008/06/23/haccessibility-redux/#comment-71549">the microformats cabal</a>, which <a href="http://microformats.org/wiki/datetime-design-pattern#Accessibility_issues">has never cared about accessibility</a>, <a href="http://microformats.org/wiki/assistive-technology-abbr-results">has never bothered to test it</a>, and <a href="http://www.isolani.co.uk/blog/access/AccessibilityOfDateTimeMicroformat">has never acknowledged</a> those <a href="http://lab.dotjay.co.uk/tests/screen-readers/microformats/datetime-design-pattern/">who have tested it</a>. In fact, <a href="http://www.bbc.co.uk/blogs/radiolabs/2008/06/removing_microformats_from_bbc.shtml">the BBC recently removed one microformat</a> from their site because one piece of it may be confusing to some screen reader users with a certain non-default configuration. This proves what leading accessibility experts have been saying all along, that <a href="http://adactio.com/journal/1457/">all microformats are inaccessible</a>, and we should <a href="http://ejohn.org/blog/bbc-removing-microformat-support/">all just use RDF</a>.</p> - -<p>Meanwhile, the devilish cabal is <a href="http://microformats.org/wiki/datetime-design-pattern#date_and_time_separation_using_value_excerption">secretly solving the problem</a> on their public wiki page, their public mailing list, and their public IRC channel. But will it be <a href="http://www.bbc.co.uk/blogs/radiolabs/2008/06/microformats_and_rdfa_and_rdf.shtml">enough for the BBC</a>? Be sure to tune in next week, when we&#8217;ll <a href="http://www.imdb.com/title/tt0071853/quotes">drown a leading accessibility expert</a> to see if she&#8217;s a witch.</p> - - diff --git a/hub/feed_diff_testdata/rdf_10_weirdness.xml b/hub/feed_diff_testdata/rdf_10_weirdness.xml deleted file mode 100644 index 94e5674..0000000 --- a/hub/feed_diff_testdata/rdf_10_weirdness.xml +++ /dev/null @@ -1,360 +0,0 @@ - - - - -Slashdot -http://slashdot.org/ -News for nerds, stuff that matters -en-us -Copyright 1997-2009, Geeknet, Inc. All Rights Reserved. -2010-06-03T21:20:16+00:00 -Geeknet, Inc. - -help@slashdot.org -Technology -hourly -1 -1970-01-01T00:00+00:00 - - - - - - - - - - - - - - - - - - - - - - - - - - -Slashdot - -http://a.fsdn.com/sd/topics/topicslashdot.gif -http://slashdot.org/ - - - -How To Get Rejected From the App Store -http://rss.slashdot.org/~r/Slashdot/slashdot/~3/tmmQTFyzGb8/How-To-Get-Rejected-From-the-App-Store -snydeq writes "Fatal Exception's Neil McAllister catalogs 12 sure-fire ways to get your app rejected from Apple's notoriously fickle App Store. From executing interpreted code, to using Apple's APIs without permission, to designing your UI, each transgression has been abstracted from real-life rejections &mdash; for the most part because Apple seems to be making up the rules as it goes along. 
'It'd be nice for Apple to make conditions for rejection clear,' McAllister writes. 'Apple has been tinkering with the language of its iPhone SDK license agreement lately, but that hasn't done much to clarify the rules &mdash; unless you're Adobe. For everyone else, the App Store's requirements seem as vague and capricious as ever.'"<p><a href="http://www.facebook.com/sharer.php?u=http%3A%2F%2Fapple.slashdot.org%2Fstory%2F10%2F06%2F03%2F2017220%2FHow-To-Get-Rejected-From-the-App-Store" target="_blank" title="Share on Facebook"><img src="http://a.fsdn.com/sd/facebook_icon_large.png"></a> - - - <a href="http://twitter.com/home?status=How+To+Get+Rejected+From+the+App+Store%3A+http%3A%2F%2Fbit.ly%2F9nfrph" target="_blank" title="Share on Twitter"><img src="http://a.fsdn.com/sd/twitter_icon_large.png"></a></p><p><a href="http://apple.slashdot.org/story/10/06/03/2017220/How-To-Get-Rejected-From-the-App-Store?from=rss">Read more of this story</a> at Slashdot.</p><iframe src="http://slashdot.org/slashdot-it.pl?op=discuss&amp;id=1674524&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> - -<p><a href="http://feedads.g.doubleclick.net/~at/qnH42oYOpn-HJu8_ddYFIXVOYgo/0/da"><img src="http://feedads.g.doubleclick.net/~at/qnH42oYOpn-HJu8_ddYFIXVOYgo/0/di" border="0" ismap="true"></img></a><br/> -<a href="http://feedads.g.doubleclick.net/~at/qnH42oYOpn-HJu8_ddYFIXVOYgo/1/da"><img src="http://feedads.g.doubleclick.net/~at/qnH42oYOpn-HJu8_ddYFIXVOYgo/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/Slashdot/slashdot/~4/tmmQTFyzGb8" height="1" width="1"/> -timothy -2010-06-03T20:54:00+00:00 - -business -feature-the-steve-in-green-turtleneck -apple -28 -28,24,23,16,5,1,0 -http://apple.slashdot.org/story/10/06/03/2017220/How-To-Get-Rejected-From-the-App-Store?from=rss - - -Frank Zappa's Influence On Linux and FOSS Development -http://rss.slashdot.org/~r/Slashdot/slashdot/~3/cshWw102qTE/Frank-Zappas-Influence-On-Linux-and-FOSS-Development - -Roblimo writes "Zappa's 'Dinah-Moe Hummm' is totally about Linux, at least in spirit, while the song 'Montana,' with its talk of zirconium-encrusted tweezers and dental floss, 'is obviously about Mac users.' Not only that: In the early '70s Zappa wrote a song called 'Penguin in Bondage,' an obvious foretelling of the anti-Linux lawsuits and threats from SCO, Microsoft, and other evildoers. Zappa was also a heavy user of the Synclavier, an electronic music machine that was a precursor to today's 'studio on a computer' recording and sound editing software. 
According to an article on DevX, today Zappa would no doubt be using Linux and Ardour for most of his recording and composition."<p><a href="http://www.facebook.com/sharer.php?u=http%3A%2F%2Flinux.slashdot.org%2Fstory%2F10%2F06%2F03%2F196234%2FFrank-Zappas-Influence-On-Linux-and-FOSS-Development" target="_blank" title="Share on Facebook"><img src="http://a.fsdn.com/sd/facebook_icon_large.png"></a> - - <a href="http://twitter.com/home?status=Frank+Zappa's+Influence+On+Linux+and+FOSS+Development%3A+http%3A%2F%2Fbit.ly%2F9s0FmS" target="_blank" title="Share on Twitter"><img src="http://a.fsdn.com/sd/twitter_icon_large.png"></a></p><p><a href="http://linux.slashdot.org/story/10/06/03/196234/Frank-Zappas-Influence-On-Linux-and-FOSS-Development?from=rss">Read more of this story</a> at Slashdot.</p><iframe src="http://slashdot.org/slashdot-it.pl?op=discuss&amp;id=1674468&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> - -<p><a href="http://feedads.g.doubleclick.net/~at/pqPkV5zqIqnjjKi4JwhUaWaKvSs/0/da"><img src="http://feedads.g.doubleclick.net/~at/pqPkV5zqIqnjjKi4JwhUaWaKvSs/0/di" border="0" ismap="true"></img></a><br/> -<a href="http://feedads.g.doubleclick.net/~at/pqPkV5zqIqnjjKi4JwhUaWaKvSs/1/da"><img src="http://feedads.g.doubleclick.net/~at/pqPkV5zqIqnjjKi4JwhUaWaKvSs/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/Slashdot/slashdot/~4/cshWw102qTE" height="1" width="1"/> -timothy -2010-06-03T20:11:00+00:00 - -gnu -oh-yeah-try-to-disprove-it -linux -85 -85,82,62,48,7,3,3 -http://linux.slashdot.org/story/10/06/03/196234/Frank-Zappas-Influence-On-Linux-and-FOSS-Development?from=rss - - -Six Major 3G and 4G Networks Tested Nationwide -http://rss.slashdot.org/~r/Slashdot/slashdot/~3/ziiB2Fy7vGY/Six-Major-3G-and-4G-Networks-Tested-Nationwide - -adeelarshad82 writes "PCMag recently tested six 3G and 4G networks to determine which ones were the fastest (and slowest) in 18 different US cities. They focused on data, not calls, and used their own testing script and methodology, which combined various kinds of uploads and downloads. Using laptops, more than a dozen people ran more than 10,000 tests; they found AT&amp;T is both the fastest national 3G network, and the least consistent. Sprint's 3G system was the slowest of the 'big four' carriers, but the most consistent. When the test results were broken down by regions, AT&amp;T led on speed in the Southeast, Central, and West, but T-Mobile took the crown in the Northeast region. Sprint's 4G network was fast where it was available, but it was surprisingly slower than 3G in some cities. The fastest AT&amp;T download seen, at 5.05 megabits/sec, was right behind Apple's headquarters at 1 Infinite Loop in Cupertino, CA. The fastest connection in any of the tests was a blazing 9.11 megabits down on Sprint 4G in the Midtown neighborhood of Atlanta, GA. 
The slowest city, on average, was Raleigh, with average 3G downloads of 880kbits/sec."<p><a href="http://www.facebook.com/sharer.php?u=http%3A%2F%2Ftech.slashdot.org%2Fstory%2F10%2F06%2F03%2F1836232%2FSix-Major-3G-and-4G-Networks-Tested-Nationwide" target="_blank" title="Share on Facebook"><img src="http://a.fsdn.com/sd/facebook_icon_large.png"></a> - - <a href="http://twitter.com/home?status=Six+Major+3G+and+4G+Networks+Tested+Nationwide%3A+http%3A%2F%2Fbit.ly%2F9KEFzt" target="_blank" title="Share on Twitter"><img src="http://a.fsdn.com/sd/twitter_icon_large.png"></a></p><p><a href="http://tech.slashdot.org/story/10/06/03/1836232/Six-Major-3G-and-4G-Networks-Tested-Nationwide?from=rss">Read more of this story</a> at Slashdot.</p><iframe src="http://slashdot.org/slashdot-it.pl?op=discuss&amp;id=1674452&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> - -<p><a href="http://feedads.g.doubleclick.net/~at/zEQ1VfCz-qeWOcC7iheee68Oneo/0/da"><img src="http://feedads.g.doubleclick.net/~at/zEQ1VfCz-qeWOcC7iheee68Oneo/0/di" border="0" ismap="true"></img></a><br/> -<a href="http://feedads.g.doubleclick.net/~at/zEQ1VfCz-qeWOcC7iheee68Oneo/1/da"><img src="http://feedads.g.doubleclick.net/~at/zEQ1VfCz-qeWOcC7iheee68Oneo/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/Slashdot/slashdot/~4/ziiB2Fy7vGY" height="1" width="1"/> -timothy -2010-06-03T19:32:00+00:00 - -internet -actually-tested-locally -technology -65 -65,63,52,39,11,8,4 -http://tech.slashdot.org/story/10/06/03/1836232/Six-Major-3G-and-4G-Networks-Tested-Nationwide?from=rss - - -Mobile Phones vs. Supercomputers of the Past -http://rss.slashdot.org/~r/Slashdot/slashdot/~3/D7AUWrUlSys/Mobile-Phones-vs-Supercomputers-of-the-Past - -An anonymous reader writes "The recently published Top 500 list of the world's fastest supercomputers is based on the Linpack benchmark developed decades ago by Jack Dongarra. This same test has been ported to Android mobile phones, which means that we can compare the performance of our phones against that of the supercomputers of the past. For example, a tweaked Motorola Droid can hit 52 Mflop/s, which is more than 15 times faster than the CPUs used in the 1979 Cray-1." 
But even today's most powerful cellphones don't come with an integrated bench.<p><a href="http://www.facebook.com/sharer.php?u=http%3A%2F%2Fhardware.slashdot.org%2Fstory%2F10%2F06%2F03%2F1740214%2FMobile-Phones-vs-Supercomputers-of-the-Past" target="_blank" title="Share on Facebook"><img src="http://a.fsdn.com/sd/facebook_icon_large.png"></a> - - <a href="http://twitter.com/home?status=Mobile+Phones+vs.+Supercomputers+of+the+Past%3A+http%3A%2F%2Fbit.ly%2FasUgMi" target="_blank" title="Share on Twitter"><img src="http://a.fsdn.com/sd/twitter_icon_large.png"></a></p><p><a href="http://hardware.slashdot.org/story/10/06/03/1740214/Mobile-Phones-vs-Supercomputers-of-the-Past?from=rss">Read more of this story</a> at Slashdot.</p><iframe src="http://slashdot.org/slashdot-it.pl?op=discuss&amp;id=1674396&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> - -<p><a href="http://feedads.g.doubleclick.net/~at/Tj3131YW0lSjS0yDrJlqPDmeQXA/0/da"><img src="http://feedads.g.doubleclick.net/~at/Tj3131YW0lSjS0yDrJlqPDmeQXA/0/di" border="0" ismap="true"></img></a><br/> -<a href="http://feedads.g.doubleclick.net/~at/Tj3131YW0lSjS0yDrJlqPDmeQXA/1/da"><img src="http://feedads.g.doubleclick.net/~at/Tj3131YW0lSjS0yDrJlqPDmeQXA/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/Slashdot/slashdot/~4/D7AUWrUlSys" height="1" width="1"/> -timothy -2010-06-03T18:49:00+00:00 - -supercomputing -crays-are-pure-sculpture -hardware -137 -137,135,115,90,21,11,6 -http://hardware.slashdot.org/story/10/06/03/1740214/Mobile-Phones-vs-Supercomputers-of-the-Past?from=rss - - -FTC Staff Discuss a Tax on Electronics To Support the News Business -http://rss.slashdot.org/~r/Slashdot/slashdot/~3/RWSAy1nVfYQ/FTC-Staff-Discuss-a-Tax-on-Electronics-To-Support-the-News-Business - -dptalia links to this piece describing a staff discussion draft from the Federal Trade Commission, writing "The FTC is concerned about the death of the 'news.' Specifically newspapers. Rather than look to how old media models can be adapted to the Internet, they instead suggest taxing consumer electronics to support a huge newspaper bailout. Additionally, they suggest making facts 'proprietary' and allowing news organizations to copyright them." - -Note, though, "The good news in all this is that the FTC's bureaucrats try hard to recommend little. They just discuss. 
And much of what the agency staff ponders are political impossibilities."<p><a href="http://www.facebook.com/sharer.php?u=http%3A%2F%2Fpolitics.slashdot.org%2Fstory%2F10%2F06%2F03%2F1730254%2FFTC-Staff-Discuss-a-Tax-on-Electronics-To-Support-the-News-Business" target="_blank" title="Share on Facebook"><img src="http://a.fsdn.com/sd/facebook_icon_large.png"></a> - - <a href="http://twitter.com/home?status=FTC+Staff+Discuss+a+Tax+on+Electronics+To+Support+the+News+Business%3A+http%3A%2F%2Fbit.ly%2FcWrHHx" target="_blank" title="Share on Twitter"><img src="http://a.fsdn.com/sd/twitter_icon_large.png"></a></p><p><a href="http://politics.slashdot.org/story/10/06/03/1730254/FTC-Staff-Discuss-a-Tax-on-Electronics-To-Support-the-News-Business?from=rss">Read more of this story</a> at Slashdot.</p><iframe src="http://slashdot.org/slashdot-it.pl?op=discuss&amp;id=1674376&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> - -<p><a href="http://feedads.g.doubleclick.net/~at/6FyJ4q29U8zi6oajNj1WizX1GQQ/0/da"><img src="http://feedads.g.doubleclick.net/~at/6FyJ4q29U8zi6oajNj1WizX1GQQ/0/di" border="0" ismap="true"></img></a><br/> -<a href="http://feedads.g.doubleclick.net/~at/6FyJ4q29U8zi6oajNj1WizX1GQQ/1/da"><img src="http://feedads.g.doubleclick.net/~at/6FyJ4q29U8zi6oajNj1WizX1GQQ/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/Slashdot/slashdot/~4/RWSAy1nVfYQ" height="1" width="1"/> -timothy -2010-06-03T18:03:00+00:00 - -themedia -now-that's-what-I-call-top-down-management -politics -270 -270,270,225,175,37,16,11 -http://politics.slashdot.org/story/10/06/03/1730254/FTC-Staff-Discuss-a-Tax-on-Electronics-To-Support-the-News-Business?from=rss - - -Yahoo Treading Carefully Before Exposing More Private Data -http://rss.slashdot.org/~r/Slashdot/slashdot/~3/wYu2IZPqdzM/Yahoo-Treading-Carefully-Before-Exposing-More-Private-Data - -crimeandpunishment writes "Yahoo hopes to turn on a new sharing option without turning off its users. The company is trying to avoid the privacy backlash that has befallen Facebook and Google. 
It's advising its email account holders, all 280M of them, to review their privacy settings in advance of Yahoo's new features that will share users' online activities and interests with people in their address books, unless they take steps to prevent it."<p><a href="http://www.facebook.com/sharer.php?u=http%3A%2F%2Fyro.slashdot.org%2Fstory%2F10%2F06%2F03%2F162247%2FYahoo-Treading-Carefully-Before-Exposing-More-Private-Data" target="_blank" title="Share on Facebook"><img src="http://a.fsdn.com/sd/facebook_icon_large.png"></a> - - <a href="http://twitter.com/home?status=Yahoo+Treading+Carefully+Before+Exposing+More+Private+Data%3A+http%3A%2F%2Fbit.ly%2FakVpBO" target="_blank" title="Share on Twitter"><img src="http://a.fsdn.com/sd/twitter_icon_large.png"></a></p><p><a href="http://yro.slashdot.org/story/10/06/03/162247/Yahoo-Treading-Carefully-Before-Exposing-More-Private-Data?from=rss">Read more of this story</a> at Slashdot.</p><iframe src="http://slashdot.org/slashdot-it.pl?op=discuss&amp;id=1674294&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> - -<p><a href="http://feedads.g.doubleclick.net/~at/ubm3QT0AQfS0J8kJ0Xde_GTSXuI/0/da"><img src="http://feedads.g.doubleclick.net/~at/ubm3QT0AQfS0J8kJ0Xde_GTSXuI/0/di" border="0" ismap="true"></img></a><br/> -<a href="http://feedads.g.doubleclick.net/~at/ubm3QT0AQfS0J8kJ0Xde_GTSXuI/1/da"><img src="http://feedads.g.doubleclick.net/~at/ubm3QT0AQfS0J8kJ0Xde_GTSXuI/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/Slashdot/slashdot/~4/wYu2IZPqdzM" height="1" width="1"/> -kdawson -2010-06-03T17:15:00+00:00 - -privacy -can-you-say-buzz -yro -81 -81,79,67,56,19,12,11 -http://yro.slashdot.org/story/10/06/03/162247/Yahoo-Treading-Carefully-Before-Exposing-More-Private-Data?from=rss - - -Police Officers Seek Right Not To Be Recorded -http://rss.slashdot.org/~r/Slashdot/slashdot/~3/OsEit_oEJG8/Police-Officers-Seek-Right-Not-To-Be-Recorded - -linzeal writes "When the police act as though cameras were the equivalent of guns pointed at them, there is a sense in which they are correct. Cameras have become the most effective weapon that ordinary people have to protect against and to expose police abuse. And the police want it to stop. 
Judges, juries, and legislatures support the police overwhelmingly on this issue, with only a few cases where those accused of 'shooting' the cops have been vindicated through the courts."<p><a href="http://www.facebook.com/sharer.php?u=http%3A%2F%2Fyro.slashdot.org%2Fstory%2F10%2F06%2F03%2F1548225%2FPolice-Officers-Seek-Right-Not-To-Be-Recorded" target="_blank" title="Share on Facebook"><img src="http://a.fsdn.com/sd/facebook_icon_large.png"></a> - - <a href="http://twitter.com/home?status=Police+Officers+Seek+Right+Not+To+Be+Recorded%3A+http%3A%2F%2Fbit.ly%2F9F5Zhn" target="_blank" title="Share on Twitter"><img src="http://a.fsdn.com/sd/twitter_icon_large.png"></a></p><p><a href="http://yro.slashdot.org/story/10/06/03/1548225/Police-Officers-Seek-Right-Not-To-Be-Recorded?from=rss">Read more of this story</a> at Slashdot.</p><iframe src="http://slashdot.org/slashdot-it.pl?op=discuss&amp;id=1674280&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> - -<p><a href="http://feedads.g.doubleclick.net/~at/d0x5YGFOTmjEqB4lo79jBGiqlWo/0/da"><img src="http://feedads.g.doubleclick.net/~at/d0x5YGFOTmjEqB4lo79jBGiqlWo/0/di" border="0" ismap="true"></img></a><br/> -<a href="http://feedads.g.doubleclick.net/~at/d0x5YGFOTmjEqB4lo79jBGiqlWo/1/da"><img src="http://feedads.g.doubleclick.net/~at/d0x5YGFOTmjEqB4lo79jBGiqlWo/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/Slashdot/slashdot/~4/OsEit_oEJG8" height="1" width="1"/> -kdawson -2010-06-03T16:28:00+00:00 - -crime -ain't-nobody's-bidness-if-we-do -yro -789 -789,781,636,502,122,84,64 -http://yro.slashdot.org/story/10/06/03/1548225/Police-Officers-Seek-Right-Not-To-Be-Recorded?from=rss - - -OH Senate Passes Bill Banning Human-Animal Hybrids -http://rss.slashdot.org/~r/Slashdot/slashdot/~3/qALRtOq3asc/OH-Senate-Passes-Bill-Banning-Human-Animal-Hybrids - -An anonymous reader writes "The sci-fi movie Splice seems to have scared Ohio State Senator Steve Buehrer. The Ohio Senate has passed Sen. Buehrer's bill banning 'the creation, transportation, or receipt of a human-animal hybrid, the transfer of a nonhuman embryo into a human womb, and the transfer of a human embryo into a nonhuman womb.' 
So much for Teenage Mutant Ninja Turtles."<p><a href="http://www.facebook.com/sharer.php?u=http%3A%2F%2Fidle.slashdot.org%2Fstory%2F10%2F06%2F03%2F1422213%2FOH-Senate-Passes-Bill-Banning-Human-Animal-Hybrids" target="_blank" title="Share on Facebook"><img src="http://a.fsdn.com/sd/facebook_icon_large.png"></a> - - <a href="http://twitter.com/home?status=OH+Senate+Passes+Bill+Banning+Human-Animal+Hybrids%3A+http%3A%2F%2Fbit.ly%2F9Rzi4u" target="_blank" title="Share on Twitter"><img src="http://a.fsdn.com/sd/twitter_icon_large.png"></a></p><p><a href="http://idle.slashdot.org/story/10/06/03/1422213/OH-Senate-Passes-Bill-Banning-Human-Animal-Hybrids?from=rss">Read more of this story</a> at Slashdot.</p><iframe src="http://slashdot.org/slashdot-it.pl?op=discuss&amp;id=1674156&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> - -<p><a href="http://feedads.g.doubleclick.net/~at/O2YUM8RDSLbHWAZ7FYD8uGjf9Gk/0/da"><img src="http://feedads.g.doubleclick.net/~at/O2YUM8RDSLbHWAZ7FYD8uGjf9Gk/0/di" border="0" ismap="true"></img></a><br/> -<a href="http://feedads.g.doubleclick.net/~at/O2YUM8RDSLbHWAZ7FYD8uGjf9Gk/1/da"><img src="http://feedads.g.doubleclick.net/~at/O2YUM8RDSLbHWAZ7FYD8uGjf9Gk/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/Slashdot/slashdot/~4/qALRtOq3asc" height="1" width="1"/> -samzenpus -2010-06-03T16:10:00+00:00 - -biotech -no-centaurs-allowed -idle -140 -140,138,116,87,16,10,5 -http://idle.slashdot.org/story/10/06/03/1422213/OH-Senate-Passes-Bill-Banning-Human-Animal-Hybrids?from=rss - - -Part-Human, Part-Machine Transistor Devised -http://rss.slashdot.org/~r/Slashdot/slashdot/~3/sH47rA1Mhcs/Part-Human-Part-Machine-Transistor-Devised - -asukasoryu writes "Man and machine can now be linked more intimately than ever, according to a new article in the journal ACS Nano Letters. Scientists have embedded a nano-sized transistor inside a cell-like membrane and powered it using the cell's own fuel. To create the implanted circuit, the UC scientists combined a carbon nanotube transistor, lipid bilayer coating, ion pump, and ATP. 
The ion pump changes the electrical charge inside the cell, which then changes the electrical charge going through the transistor, which the scientists could measure and monitor."<p><a href="http://www.facebook.com/sharer.php?u=http%3A%2F%2Fhardware.slashdot.org%2Fstory%2F10%2F06%2F03%2F157205%2FPart-Human-Part-Machine-Transistor-Devised" target="_blank" title="Share on Facebook"><img src="http://a.fsdn.com/sd/facebook_icon_large.png"></a> - - <a href="http://twitter.com/home?status=Part-Human%2C+Part-Machine+Transistor+Devised%3A+http%3A%2F%2Fbit.ly%2FaaeR9U" target="_blank" title="Share on Twitter"><img src="http://a.fsdn.com/sd/twitter_icon_large.png"></a></p><p><a href="http://hardware.slashdot.org/story/10/06/03/157205/Part-Human-Part-Machine-Transistor-Devised?from=rss">Read more of this story</a> at Slashdot.</p><iframe src="http://slashdot.org/slashdot-it.pl?op=discuss&amp;id=1674234&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> - -<p><a href="http://feedads.g.doubleclick.net/~at/Myrxz7xqD6ztwnzMUpnHiW0x7wU/0/da"><img src="http://feedads.g.doubleclick.net/~at/Myrxz7xqD6ztwnzMUpnHiW0x7wU/0/di" border="0" ismap="true"></img></a><br/> -<a href="http://feedads.g.doubleclick.net/~at/Myrxz7xqD6ztwnzMUpnHiW0x7wU/1/da"><img src="http://feedads.g.doubleclick.net/~at/Myrxz7xqD6ztwnzMUpnHiW0x7wU/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/Slashdot/slashdot/~4/sH47rA1Mhcs" height="1" width="1"/> -kdawson -2010-06-03T15:39:00+00:00 - -biotech -mitochondria-look-up -hardware -57 -57,54,46,33,15,10,6 -http://hardware.slashdot.org/story/10/06/03/157205/Part-Human-Part-Machine-Transistor-Devised?from=rss - - -Why Are Indian Kids So Good At Spelling? -http://rss.slashdot.org/~r/Slashdot/slashdot/~3/lAWnaU6YgIs/Why-Are-Indian-Kids-So-Good-At-Spelling - -theodp writes "Slate's Ben Paynter looks into why Indian kids dominate the Scripps National Spelling Bee, and concludes it's because they have their own minor-league spelling bee circuit (having the discipline to spell 7,000 to 8,000 words a day probably helps too!). Indian-Americans make up about 1% of the US population, notes Paynter, but this year an estimated 11% of the competitors at Scripps will hail from regional contests run by the North South Foundation. The NSF competitions function as a kind of nerd Olympiad for Indian-Americans &mdash; there are separate divisions for math, science, vocabulary, geography, essay writing, and even public speaking &mdash; and a way to raise money for college scholarships for underprivileged students in India. BTW, Strollerderby has the scoop on Whatever Happened to the Spellbound Kids? 
(RIP, Ted Brigham)."<p><a href="http://www.facebook.com/sharer.php?u=http%3A%2F%2Fnews.slashdot.org%2Fstory%2F10%2F06%2F03%2F1333252%2FWhy-Are-Indian-Kids-So-Good-At-Spelling" target="_blank" title="Share on Facebook"><img src="http://a.fsdn.com/sd/facebook_icon_large.png"></a> - - <a href="http://twitter.com/home?status=Why+Are+Indian+Kids+So+Good+At+Spelling%3F%3A+http%3A%2F%2Fbit.ly%2F9p6g5Z" target="_blank" title="Share on Twitter"><img src="http://a.fsdn.com/sd/twitter_icon_large.png"></a></p><p><a href="http://news.slashdot.org/story/10/06/03/1333252/Why-Are-Indian-Kids-So-Good-At-Spelling?from=rss">Read more of this story</a> at Slashdot.</p><iframe src="http://slashdot.org/slashdot-it.pl?op=discuss&amp;id=1674088&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> - -<p><a href="http://feedads.g.doubleclick.net/~at/ZByOTHGO1peZ7CsinQ_HpRRAXMA/0/da"><img src="http://feedads.g.doubleclick.net/~at/ZByOTHGO1peZ7CsinQ_HpRRAXMA/0/di" border="0" ismap="true"></img></a><br/> -<a href="http://feedads.g.doubleclick.net/~at/ZByOTHGO1peZ7CsinQ_HpRRAXMA/1/da"><img src="http://feedads.g.doubleclick.net/~at/ZByOTHGO1peZ7CsinQ_HpRRAXMA/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/Slashdot/slashdot/~4/lAWnaU6YgIs" height="1" width="1"/> -StoneLion -2010-06-03T14:48:00+00:00 - -education -i-n-t-e-l-l-i-g-e-n-t -news -386 -386,384,288,235,59,31,13 -http://news.slashdot.org/story/10/06/03/1333252/Why-Are-Indian-Kids-So-Good-At-Spelling?from=rss - - -Mars500 Mission Begins -http://rss.slashdot.org/~r/Slashdot/slashdot/~3/JcnvobAj0DE/Mars500-Mission-Begins - -krou writes "The six participants in the Mars500 project have entered their sealed facility. The project, which lasts for 18 months, is designed to try and simulate a mission to Mars, completely isolated and cut off from the outside world, with a '20-minute, one-way time-delay in communications to mirror the real lag in sending messages over the vast distance between Mars and Earth.' They also have limited consumables, with everything required being loaded onboard from the start. 
You can follow developments via the blog, or the Twitter feed of Diego Urbina, one of the would-be cosmonauts."<p><a href="http://www.facebook.com/sharer.php?u=http%3A%2F%2Fscience.slashdot.org%2Fstory%2F10%2F06%2F03%2F1326248%2FMars500-Mission-Begins" target="_blank" title="Share on Facebook"><img src="http://a.fsdn.com/sd/facebook_icon_large.png"></a> - - <a href="http://twitter.com/home?status=Mars500+Mission+Begins%3A+http%3A%2F%2Fbit.ly%2F8X46k9" target="_blank" title="Share on Twitter"><img src="http://a.fsdn.com/sd/twitter_icon_large.png"></a></p><p><a href="http://science.slashdot.org/story/10/06/03/1326248/Mars500-Mission-Begins?from=rss">Read more of this story</a> at Slashdot.</p><iframe src="http://slashdot.org/slashdot-it.pl?op=discuss&amp;id=1674074&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> - -<p><a href="http://feedads.g.doubleclick.net/~at/0m5VIgNdNIn3qTvqkeUm7Bm_5Ss/0/da"><img src="http://feedads.g.doubleclick.net/~at/0m5VIgNdNIn3qTvqkeUm7Bm_5Ss/0/di" border="0" ismap="true"></img></a><br/> -<a href="http://feedads.g.doubleclick.net/~at/0m5VIgNdNIn3qTvqkeUm7Bm_5Ss/1/da"><img src="http://feedads.g.doubleclick.net/~at/0m5VIgNdNIn3qTvqkeUm7Bm_5Ss/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/Slashdot/slashdot/~4/JcnvobAj0DE" height="1" width="1"/> -samzenpus -2010-06-03T14:00:00+00:00 - -mars -I-think-I-saw-this-movie -science -194 -194,186,158,129,39,16,8 -http://science.slashdot.org/story/10/06/03/1326248/Mars500-Mission-Begins?from=rss - - -Military Develops "Green" Cleaners For Terrorist Attack Sites -http://rss.slashdot.org/~r/Slashdot/slashdot/~3/WQA5Xo_6cwA/Military-Develops-Green-Cleaners-For-Terrorist-Attack-Sites - -An anonymous reader writes "Chemists with the U.S. military have developed a set of ultra-strength cleaners to be used in the aftermath of a terrorist attack. The formulas are reportedly tough enough to get rid of nerve gas, mustard gas, radioactive isotopes, and anthrax. 
But they are also non-toxic, based on ingredients found in foods, cosmetics, and other consumer products."<p><a href="http://www.facebook.com/sharer.php?u=http%3A%2F%2Fidle.slashdot.org%2Fstory%2F10%2F06%2F03%2F0256204%2FMilitary-Develops-Green-Cleaners-For-Terrorist-Attack-Sites" target="_blank" title="Share on Facebook"><img src="http://a.fsdn.com/sd/facebook_icon_large.png"></a> - - <a href="http://twitter.com/home?status=Military+Develops+%22Green%22+Cleaners+For+Terrorist+Attack+Sites%3A+http%3A%2F%2Fbit.ly%2Fd5B0na" target="_blank" title="Share on Twitter"><img src="http://a.fsdn.com/sd/twitter_icon_large.png"></a></p><p><a href="http://idle.slashdot.org/story/10/06/03/0256204/Military-Develops-Green-Cleaners-For-Terrorist-Attack-Sites?from=rss">Read more of this story</a> at Slashdot.</p><iframe src="http://slashdot.org/slashdot-it.pl?op=discuss&amp;id=1673468&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> - -<p><a href="http://feedads.g.doubleclick.net/~at/PDXSpTgm3sywZaYwq6gq_hqte1s/0/da"><img src="http://feedads.g.doubleclick.net/~at/PDXSpTgm3sywZaYwq6gq_hqte1s/0/di" border="0" ismap="true"></img></a><br/> -<a href="http://feedads.g.doubleclick.net/~at/PDXSpTgm3sywZaYwq6gq_hqte1s/1/da"><img src="http://feedads.g.doubleclick.net/~at/PDXSpTgm3sywZaYwq6gq_hqte1s/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/Slashdot/slashdot/~4/WQA5Xo_6cwA" height="1" width="1"/> -samzenpus -2010-06-03T13:10:00+00:00 - -military -ultra-concentrated -idle -81 -81,76,71,54,20,12,3 -http://idle.slashdot.org/story/10/06/03/0256204/Military-Develops-Green-Cleaners-For-Terrorist-Attack-Sites?from=rss - - -Bill Gives Feds "Emergency" Powers To Secure Civilian Nets -http://rss.slashdot.org/~r/Slashdot/slashdot/~3/iW74en1N7Ro/Bill-Gives-Feds-Emergency-Powers-To-Secure-Civilian-Nets - -ziani writes "Joe Lieberman wants to give the federal government the power to take over civilian networks' security if there's an 'imminent cyber threat.' From the article: 'Lieberman and Collins' solution is one of the more far-reaching proposals. In the Senators' draft bill, "the President may issue a declaration of an imminent cyber threat to covered critical infrastructure." 
Once such a declaration is made, the director of a DHS National Center for Cybersecurity and Communications is supposed to "develop and coordinate emergency measures or actions necessary to preserve the reliable operation, and mitigate or remediate the consequences of the potential disruption, of covered critical infrastructure."'"<p><a href="http://www.facebook.com/sharer.php?u=http%3A%2F%2Fnews.slashdot.org%2Fstory%2F10%2F06%2F03%2F038203%2FBill-Gives-Feds-Emergency-Powers-To-Secure-Civilian-Nets" target="_blank" title="Share on Facebook"><img src="http://a.fsdn.com/sd/facebook_icon_large.png"></a> - - <a href="http://twitter.com/home?status=Bill+Gives+Feds+%22Emergency%22+Powers+To+Secure+Civilian+Nets%3A+http%3A%2F%2Fbit.ly%2FbPhuuU" target="_blank" title="Share on Twitter"><img src="http://a.fsdn.com/sd/twitter_icon_large.png"></a></p><p><a href="http://news.slashdot.org/story/10/06/03/038203/Bill-Gives-Feds-Emergency-Powers-To-Secure-Civilian-Nets?from=rss">Read more of this story</a> at Slashdot.</p><iframe src="http://slashdot.org/slashdot-it.pl?op=discuss&amp;id=1673478&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> - -<p><a href="http://feedads.g.doubleclick.net/~at/rWboIySEo5PuQp4g7XzQ7ObY6BI/0/da"><img src="http://feedads.g.doubleclick.net/~at/rWboIySEo5PuQp4g7XzQ7ObY6BI/0/di" border="0" ismap="true"></img></a><br/> -<a href="http://feedads.g.doubleclick.net/~at/rWboIySEo5PuQp4g7XzQ7ObY6BI/1/da"><img src="http://feedads.g.doubleclick.net/~at/rWboIySEo5PuQp4g7XzQ7ObY6BI/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/Slashdot/slashdot/~4/iW74en1N7Ro" height="1" width="1"/> -samzenpus -2010-06-03T11:57:00+00:00 - -usa -all-your-nets-are-belong-to-us -news -444 -444,439,343,268,62,42,25 -http://news.slashdot.org/story/10/06/03/038203/Bill-Gives-Feds-Emergency-Powers-To-Secure-Civilian-Nets?from=rss - - -Iridium Pushes Ahead Satellite Project -http://rss.slashdot.org/~r/Slashdot/slashdot/~3/-gWGIAF-QGY/Iridium-Pushes-Ahead-Satellite-Project - -oxide7 writes "Iridium (IRDM) continues its push into the market for satellite data and telemetry services, as it announced the company that would build its second generation of satellites. Iridium's old network of 66 satellites was designed for voice calls; the new satellites will also be able to handle data more efficiently, and include cameras as well. 
The company also plans to share the satellite platforms with some scientists for use in studying the Earth."<p><a href="http://www.facebook.com/sharer.php?u=http%3A%2F%2Fscience.slashdot.org%2Fstory%2F10%2F06%2F03%2F006256%2FIridium-Pushes-Ahead-Satellite-Project" target="_blank" title="Share on Facebook"><img src="http://a.fsdn.com/sd/facebook_icon_large.png"></a> - - <a href="http://twitter.com/home?status=Iridium+Pushes+Ahead+Satellite+Project%3A+http%3A%2F%2Fbit.ly%2FconYqH" target="_blank" title="Share on Twitter"><img src="http://a.fsdn.com/sd/twitter_icon_large.png"></a></p><p><a href="http://science.slashdot.org/story/10/06/03/006256/Iridium-Pushes-Ahead-Satellite-Project?from=rss">Read more of this story</a> at Slashdot.</p><iframe src="http://slashdot.org/slashdot-it.pl?op=discuss&amp;id=1673374&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> - -<p><a href="http://feedads.g.doubleclick.net/~at/_4xqZu9UTaB0_CeYB0dLBjj7YDo/0/da"><img src="http://feedads.g.doubleclick.net/~at/_4xqZu9UTaB0_CeYB0dLBjj7YDo/0/di" border="0" ismap="true"></img></a><br/> -<a href="http://feedads.g.doubleclick.net/~at/_4xqZu9UTaB0_CeYB0dLBjj7YDo/1/da"><img src="http://feedads.g.doubleclick.net/~at/_4xqZu9UTaB0_CeYB0dLBjj7YDo/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/Slashdot/slashdot/~4/-gWGIAF-QGY" height="1" width="1"/> -samzenpus -2010-06-03T06:55:00+00:00 - -space -space-business -science -75 -75,74,65,58,29,16,11 -http://science.slashdot.org/story/10/06/03/006256/Iridium-Pushes-Ahead-Satellite-Project?from=rss - - -Doctor Slams Hospital's "Please" Policy -http://rss.slashdot.org/~r/Slashdot/slashdot/~3/Fyn4QmQHquA/Doctor-Slams-Hospitals-Please-Policy - -Administrators at England's Worthing Hospital are insisting that doctors say the magic word when writing orders for blood tests on weekends. If a doctor refuses to write "please" on the order, the test will be refused. From the article: "However, a doctor at the hospital said on condition of anonymity that he sees the policy as a money-saving measure that could prove dangerous for patients. 'I was shocked to come in on Sunday and find none of my bloods had been done from the night before because I'd not written "please,"' the doctor said. 'I had no results to guide treatment of patients. Myself and a senior nurse had to take the bloods ourselves, which added hours to our 12-hour shifts. This system puts patients' lives at risk. 
Doctors are wasting time doing the job of the technicians.'"<p><a href="http://www.facebook.com/sharer.php?u=http%3A%2F%2Fscience.slashdot.org%2Fstory%2F10%2F06%2F03%2F0118209%2FDoctor-Slams-Hospitals-Please-Policy" target="_blank" title="Share on Facebook"><img src="http://a.fsdn.com/sd/facebook_icon_large.png"></a> - - <a href="http://twitter.com/home?status=Doctor+Slams+Hospital's+%22Please%22+Policy%3A+http%3A%2F%2Fbit.ly%2FcEKPgY" target="_blank" title="Share on Twitter"><img src="http://a.fsdn.com/sd/twitter_icon_large.png"></a></p><p><a href="http://science.slashdot.org/story/10/06/03/0118209/Doctor-Slams-Hospitals-Please-Policy?from=rss">Read more of this story</a> at Slashdot.</p><iframe src="http://slashdot.org/slashdot-it.pl?op=discuss&amp;id=1673418&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> - -<p><a href="http://feedads.g.doubleclick.net/~at/zun8Y5ECDPrhhF4bOJhnZhw9wb0/0/da"><img src="http://feedads.g.doubleclick.net/~at/zun8Y5ECDPrhhF4bOJhnZhw9wb0/0/di" border="0" ismap="true"></img></a><br/> -<a href="http://feedads.g.doubleclick.net/~at/zun8Y5ECDPrhhF4bOJhnZhw9wb0/1/da"><img src="http://feedads.g.doubleclick.net/~at/zun8Y5ECDPrhhF4bOJhnZhw9wb0/1/di" border="0" ismap="true"></img></a></p><img src="http://feeds.feedburner.com/~r/Slashdot/slashdot/~4/Fyn4QmQHquA" height="1" width="1"/> -samzenpus -2010-06-03T05:07:00+00:00 - -medicine -paging-doctor-manners -science -500 -500,495,378,303,97,50,26 -http://science.slashdot.org/story/10/06/03/0118209/Doctor-Slams-Hospitals-Please-Policy?from=rss - - -Search Slashdot -Search Slashdot stories - -query -http://slashdot.org/search.pl - - - diff --git a/hub/feed_diff_testdata/rss2_only_link.xml b/hub/feed_diff_testdata/rss2_only_link.xml deleted file mode 100644 index ba254d6..0000000 --- a/hub/feed_diff_testdata/rss2_only_link.xml +++ /dev/null @@ -1,24 +0,0 @@ - - - - Liftoff News - http://liftoff.msfc.nasa.gov/ - Liftoff to Space Exploration. - en-us - Tue, 10 Jun 2003 04:00:00 GMT - Tue, 10 Jun 2003 09:41:01 GMT - http://blogs.law.harvard.edu/tech/rss - Weblog Editor 2.0 - editor@example.com - webmaster@example.com - - http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp - - - http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp - - - http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp - - - \ No newline at end of file diff --git a/hub/feed_diff_testdata/rss2_only_title.xml b/hub/feed_diff_testdata/rss2_only_title.xml deleted file mode 100644 index 1b2a2f2..0000000 --- a/hub/feed_diff_testdata/rss2_only_title.xml +++ /dev/null @@ -1,24 +0,0 @@ - - - - Liftoff News - http://liftoff.msfc.nasa.gov/ - Liftoff to Space Exploration. - en-us - Tue, 10 Jun 2003 04:00:00 GMT - Tue, 10 Jun 2003 09:41:01 GMT - http://blogs.law.harvard.edu/tech/rss - Weblog Editor 2.0 - editor@example.com - webmaster@example.com - - Star City - - - The Engine That Does More - - - Astronauts' Dirty Laundry - - - \ No newline at end of file diff --git a/hub/feed_diff_testdata/rss2sample.xml b/hub/feed_diff_testdata/rss2sample.xml deleted file mode 100644 index bcc516d..0000000 --- a/hub/feed_diff_testdata/rss2sample.xml +++ /dev/null @@ -1,42 +0,0 @@ - - - - Liftoff News - http://liftoff.msfc.nasa.gov/ - Liftoff to Space Exploration. 
- en-us - Tue, 10 Jun 2003 04:00:00 GMT - Tue, 10 Jun 2003 09:41:01 GMT - http://blogs.law.harvard.edu/tech/rss - Weblog Editor 2.0 - editor@example.com - webmaster@example.com - - - Star City - http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp - How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>. - Tue, 03 Jun 2003 09:39:21 GMT - http://liftoff.msfc.nasa.gov/2003/06/03.html#item573 - - - Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">partial eclipse of the Sun</a> on Saturday, May 31st. - Fri, 30 May 2003 11:06:42 GMT - http://liftoff.msfc.nasa.gov/2003/05/30.html#item572 - - - The Engine That Does More - http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp - Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that. - Tue, 27 May 2003 08:37:32 GMT - http://liftoff.msfc.nasa.gov/2003/05/27.html#item571 - - - Astronauts' Dirty Laundry - http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp - Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options. - Tue, 20 May 2003 08:56:02 GMT - http://liftoff.msfc.nasa.gov/2003/05/20.html#item570 - - - \ No newline at end of file diff --git a/hub/feed_diff_testdata/rss_no_link.xml b/hub/feed_diff_testdata/rss_no_link.xml deleted file mode 100644 index 1deec62..0000000 --- a/hub/feed_diff_testdata/rss_no_link.xml +++ /dev/null @@ -1,24 +0,0 @@ - - - - WriteTheWeb - News for web users that write back - en-us - Copyright 2000, WriteTheWeb team. - editor@writetheweb.com - webmaster@writetheweb.com - - WriteTheWeb - http://writetheweb.com/images/mynetscape88.gif - http://writetheweb.com - 88 - 31 - News for web users that write back - - - Giving the world a pluggable Gnutella - http://writetheweb.com/read.php?item=24 - WorldOS is a framework on which to build programs that work like Freenet or Gnutella -allowing distributed applications using peer-to-peer routing. - - - \ No newline at end of file diff --git a/hub/feed_diff_testdata/rss_rdf.xml b/hub/feed_diff_testdata/rss_rdf.xml deleted file mode 100644 index ecc92b6..0000000 --- a/hub/feed_diff_testdata/rss_rdf.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - - WriteTheWeb - http://writetheweb.com - News for web users that write back - en-us - Copyright 2000, WriteTheWeb team. - editor@writetheweb.com - webmaster@writetheweb.com - - WriteTheWeb - http://writetheweb.com/images/mynetscape88.gif - http://writetheweb.com - 88 - 31 - News for web users that write back - - - Giving the world a pluggable Gnutella - http://writetheweb.com/read.php?item=24 - WorldOS is a framework on which to build programs that work like Freenet or Gnutella -allowing distributed applications using peer-to-peer routing. - - - Syndication discussions hot up - http://writetheweb.com/read.php?item=23 - After a period of dormancy, the Syndication mailing list has become active again, with contributions from leaders in traditional media and Web syndication. 
- - - diff --git a/hub/feed_diff_testdata/sampleRss091.xml b/hub/feed_diff_testdata/sampleRss091.xml deleted file mode 100644 index b86ca2d..0000000 --- a/hub/feed_diff_testdata/sampleRss091.xml +++ /dev/null @@ -1,50 +0,0 @@ - - - - WriteTheWeb - http://writetheweb.com - News for web users that write back - en-us - Copyright 2000, WriteTheWeb team. - editor@writetheweb.com - webmaster@writetheweb.com - - WriteTheWeb - http://writetheweb.com/images/mynetscape88.gif - http://writetheweb.com - 88 - 31 - News for web users that write back - - - Giving the world a pluggable Gnutella - http://writetheweb.com/read.php?item=24 - WorldOS is a framework on which to build programs that work like Freenet or Gnutella -allowing distributed applications using peer-to-peer routing. - - - Syndication discussions hot up - http://writetheweb.com/read.php?item=23 - After a period of dormancy, the Syndication mailing list has become active again, with contributions from leaders in traditional media and Web syndication. - - - Personal web server integrates file sharing and messaging - http://writetheweb.com/read.php?item=22 - The Magi Project is an innovative project to create a combined personal web server and messaging system that enables the sharing and synchronization of information across desktop, laptop and palmtop devices. - - - Syndication and Metadata - http://writetheweb.com/read.php?item=21 - RSS is probably the best known metadata format around. RDF is probably one of the least understood. In this essay, published on my O'Reilly Network weblog, I argue that the next generation of RSS should be based on RDF. - - - UK bloggers get organised - http://writetheweb.com/read.php?item=20 - Looks like the weblogs scene is gathering pace beyond the shores of the US. There's now a UK-specific page on weblogs.com, and a mailing list at egroups. - - - Yournamehere.com more important than anything - http://writetheweb.com/read.php?item=19 - Whatever you're publishing on the web, your site name is the most valuable asset you have, according to Carl Steadman. - - - \ No newline at end of file diff --git a/hub/feed_diff_testdata/sampleRss092.xml b/hub/feed_diff_testdata/sampleRss092.xml deleted file mode 100644 index db26c52..0000000 --- a/hub/feed_diff_testdata/sampleRss092.xml +++ /dev/null @@ -1,49 +0,0 @@ - - - - - Dave Winer: Grateful Dead - http://www.scripting.com/blog/categories/gratefulDead.html - A high-fidelity Grateful Dead song every day. This is where we're experimenting with enclosures on RSS news items that download when you're not using your computer. If it works (it will) it will be the end of the Click-And-Wait multimedia experience on the Internet. - Fri, 13 Apr 2001 19:23:02 GMT - http://backend.userland.com/rss092 - dave@userland.com (Dave Winer) - dave@userland.com (Dave Winer) - - - It's been a few days since I added a song to the Grateful Dead channel. Now that there are all these new Radio users, many of whom are tuned into this channel (it's #16 on the hotlist of upstreaming Radio users, there's no way of knowing how many non-upstreaming users are subscribing, have to do something about this..). Anyway, tonight's song is a live version of Weather Report Suite from Dick's Picks Volume 7. It's wistful music. Of course a beautiful song, oft-quoted here on Scripting News. <i>A little change, the wind and rain.</i> - - - - - Kevin Drennan started a <a href="http://deadend.editthispage.com/">Grateful Dead Weblog</a>. 
Hey it's cool, he even has a <a href="http://deadend.editthispage.com/directory/61">directory</a>. <i>A Frontier 7 feature.</i> - Scripting News - - - <a href="http://arts.ucsc.edu/GDead/AGDL/other1.html">The Other One</a>, live instrumental, One From The Vault. Very rhythmic very spacy, you can listen to it many times, and enjoy something new every time. - - - - This is a test of a change I just made. Still diggin.. - - - The HTML rendering almost <a href="http://validator.w3.org/check/referer">validates</a>. Close. Hey I wonder if anyone has ever published a style guide for ALT attributes on images? What are you supposed to say in the ALT attribute? I sure don't know. If you're blind send me an email if u cn rd ths. - - - <a href="http://www.cs.cmu.edu/~mleone/gdead/dead-lyrics/Franklin's_Tower.txt">Franklin's Tower</a>, a live version from One From The Vault. - - - - Moshe Weitzman says Shakedown Street is what I'm lookin for for tonight. I'm listening right now. It's one of my favorites. "Don't tell me this town ain't got no heart." Too bright. I like the jazziness of Weather Report Suite. Dreamy and soft. How about The Other One? "Spanish lady come to me.." - Scripting News - - - <a href="http://www.scripting.com/mp3s/youWinAgain.mp3">The news is out</a>, all over town..<p> -You've been seen, out runnin round. <p> -The lyrics are <a href="http://www.cs.cmu.edu/~mleone/gdead/dead-lyrics/You_Win_Again.txt">here</a>, short and sweet. <p> -<i>You win again!</i> - - - - - \ No newline at end of file diff --git a/hub/feed_diff_testdata/whitespace_id.xml b/hub/feed_diff_testdata/whitespace_id.xml deleted file mode 100644 index 2a8fbb0..0000000 --- a/hub/feed_diff_testdata/whitespace_id.xml +++ /dev/null @@ -1,34 +0,0 @@ - - - -dive into mark -everything old is new again -2008-08-23T04:49:22Z - - - - - my feed id here - - - - - -Mark -http://diveintomark.org/ - -<![CDATA[The ampersands of Linux]]> - -tag:diveintomark.org,2008-08-14:/archives/20080814215936 -2008-08-14T23:08:54Z -2008-08-14T21:59:36Z -Please try to contain your excitement. -<p>Taking an idea from <a href="http://www.simplebits.com/notebook/2008/08/14/ampersands.html">Use the Best Available Ampersand</a> and a list of pre-installed fonts from the <a href="http://www.apaddedcell.com/web-fonts">Complete Guide to Pre-Installed Fonts in Linux, Mac, and Windows</a>, I present &#8220;The Ampersands of Linux&#8221;:</p> - -<p><img src="http://wearehugh.com/public/2008/08/ampersands-of-linux3.png" alt="[ampersands in 28 fonts]" height="900" width="600"></p> - -<p>(<a href="http://wearehugh.com/public/2008/08/ampersands-of-linux.html"><abbr>HTML</abbr></a>)</p> - -<p>Please try to contain your excitement.</p> - - diff --git a/hub/feed_diff_testdata/xhtml_entities.xml b/hub/feed_diff_testdata/xhtml_entities.xml deleted file mode 100644 index b33f8cf..0000000 --- a/hub/feed_diff_testdata/xhtml_entities.xml +++ /dev/null @@ -1,3 +0,0 @@ - - -© diff --git a/hub/feed_identifier.py b/hub/feed_identifier.py deleted file mode 100644 index 530b209..0000000 --- a/hub/feed_identifier.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2009 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -__author__ = 'bslatkin@gmail.com (Brett Slatkin)' - -"""Atom/RSS feed parser that determines a feed's canonical ID.""" - -import cStringIO -import logging -import re -import xml.sax -import xml.sax.handler -import xml.sax.saxutils - - -# Set to true to see stack level messages and other debugging information. -DEBUG = False - - -class TrivialEntityResolver(xml.sax.handler.EntityResolver): - """Pass-through entity resolver.""" - - def resolveEntity(self, publicId, systemId): - return cStringIO.StringIO() - - -class FeedIdentifier(xml.sax.handler.ContentHandler): - """Base SAX content handler for identifying feeds.""" - - target_tag_stack = None - - def __init__(self, parser): - """Initializer. - - Args: - parser: Instance of the xml.sax parser being used with this handler. - """ - self.parser = parser - self.link = [] - self.tag_stack = [] - self.capture_next_element = False - - # SAX methods - def startElement(self, name, attrs): - if not self.link: - if DEBUG: logging.debug('Start stack level for %r', name) - self.tag_stack.append(name) - if len(self.tag_stack) == len(self.target_tag_stack): - equal = True - for value, predicate in zip(self.tag_stack, self.target_tag_stack): - if not predicate(value): - equal = False - break - if equal: - self.capture_next_element = True - - def endElement(self, name): - if self.link: - self.capture_next_element = False - else: - if DEBUG: logging.debug('End stack level %r', name) - self.tag_stack.pop() - - def characters(self, content): - if self.capture_next_element: - self.link.append(content) - - def get_link(self): - if not self.link: - return None - else: - return ''.join(self.link).strip() - - -class AtomFeedIdentifier(FeedIdentifier): - """SAX content handler for identifying Atom feeds.""" - - target_tag_stack = [ - re.compile(k).match for k in ( - '([^:]+:)?feed$', - '([^:]+:)?id$')] - - -class RssFeedIdentifier(FeedIdentifier): - """SAX content handler for identifying RSS feeds.""" - - target_tag_stack = ( - [re.compile('^(?i)(rss)|(.*rdf)$').match] + - [re.compile(k).match for k in ('channel', 'link')]) - - -def identify(data, format): - """Identifies a feed. - - Args: - data: String containing the data of the XML feed to parse. - format: String naming the format of the data. Should be 'rss' or 'atom'. - - Returns: - The ID of the feed, or None if one could not be determined (due to parse - errors, etc). - - Raises: - xml.sax.SAXException on parse errors. 
- """ - data_stream = cStringIO.StringIO(data) - parser = xml.sax.make_parser() - - if format == 'atom': - handler = AtomFeedIdentifier(parser) - elif format == 'rss': - handler = RssFeedIdentifier(parser) - else: - assert False, 'Invalid feed format "%s"' % format - - parser.setContentHandler(handler) - parser.setEntityResolver(TrivialEntityResolver()) - parser.parse(data_stream) - - return handler.get_link() - - -__all__ = ['identify', 'DEBUG'] diff --git a/hub/feed_identifier_test.py b/hub/feed_identifier_test.py deleted file mode 100755 index 8c6dd46..0000000 --- a/hub/feed_identifier_test.py +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2009 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -__author__ = 'bslatkin@gmail.com (Brett Slatkin)' - -"""Tests for the feed_identifier module.""" - -import logging -import os -import unittest -import xml.sax - -import feed_identifier - - -class TestBase(unittest.TestCase): - - def setUp(self): - self.testdata = os.path.join(os.path.dirname(__file__), - 'feed_diff_testdata') - - def load(self, path): - return open(os.path.join(self.testdata, path)).read() - - -class AtomTest(TestBase): - """Tests for identifying Atom-formatted feeds.""" - - def testGood(self): - feed_id = feed_identifier.identify(self.load('parsing.xml'), 'atom') - self.assertEquals('tag:diveintomark.org,2001-07-29:/', feed_id) - - def testNoFeedId(self): - feed_id = feed_identifier.identify(self.load('atom_no_id.xml'), 'atom') - self.assertTrue(feed_id is None) - - def testIncorrectFormat(self): - feed_id = feed_identifier.identify(self.load('rss_rdf.xml'), 'atom') - self.assertTrue(feed_id is None) - - def testWhitespace(self): - feed_id = feed_identifier.identify(self.load('whitespace_id.xml'), 'atom') - self.assertEquals('my feed id here', feed_id) - - def testBadFormat(self): - self.assertRaises(xml.sax.SAXParseException, - feed_identifier.identify, - self.load('bad_feed.xml'), - 'atom') - - def testFullNamespace(self): - feed_id = feed_identifier.identify(self.load('atom_namespace.xml'), 'atom') - self.assertEquals('http://example.com/feeds/delta', feed_id) - - -class RssTest(TestBase): - """Tests for identifying RSS-formatted feeds.""" - - def testGood091(self): - feed_id = feed_identifier.identify(self.load('sampleRss091.xml'), 'rss') - self.assertEquals('http://writetheweb.com', feed_id) - - def testGood092(self): - feed_id = feed_identifier.identify(self.load('sampleRss092.xml'), 'rss') - self.assertEquals( - 'http://www.scripting.com/blog/categories/gratefulDead.html', - feed_id) - - def testGood20(self): - feed_id = feed_identifier.identify(self.load('rss2sample.xml'), 'rss') - self.assertEquals('http://liftoff.msfc.nasa.gov/', feed_id) - - def testGoodRdf(self): - feed_id = feed_identifier.identify(self.load('rss_rdf.xml'), 'rss') - self.assertEquals('http://writetheweb.com', feed_id) - - def testNoFeedId(self): - feed_id = feed_identifier.identify(self.load('rss_no_link.xml'), 'rss') - self.assertTrue(feed_id is None) - - def 
testIncorrectFormat(self): - feed_id = feed_identifier.identify(self.load('parsing.xml'), 'rss') - self.assertTrue(feed_id is None) - - def testBadFormat(self): - self.assertRaises(xml.sax.SAXParseException, - feed_identifier.identify, - self.load('bad_feed.xml'), - 'rss') - - -if __name__ == '__main__': - feed_identifier.DEBUG = True - unittest.main() diff --git a/hub/fork_join_queue.py b/hub/fork_join_queue.py deleted file mode 100644 index 1a97495..0000000 --- a/hub/fork_join_queue.py +++ /dev/null @@ -1,512 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2010 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Fork-join queue for App Engine. - -The Task Queue API executes tasks in a push manner instead of polling with -visibility time like Amazon SQS and other workqueue systems. However, often -you may need to process multiple pieces of queued work simultaneously in a -single App Engine request; the benefit being that you can minimize the -impact of high latency API calls that block and occupy a thread by doing many -asynchronous calls in parallel. - -Fork-join queues have three important parameters: - - * Batch Time: How often new task entities added to the queue should be - coalesced to run as a single unit in parallel. This should be low - enough to not drastically affect latency, but high enough that its - batching effects result in fewer overall occupied threads. - - * Batch Size: How many task entities to run at a time in a single request. - This should be tuned for your asynchronous work's maximum wall-clock time - and the maximum asynchronous API calls need to do in parallel. - - * Shard count: (optional) How many parallel shards to use for this queue. - This represents the minimum parallelism you'll see since you won't get - coalescing until you have at least as many tasks as shards. - -How it works: - -1. Incoming, Datastore entities representing work items are assigned an index -number and committed. A shard number is assigned for load-balancing based on -the index assigned. - -2. After the work entities are committed to the Datastore, corresponding -push-oriented taskqueue tasks are put on the push queue. These push tasks have -an ETA of the next highest time interval for the fork-queue based on the batch -time. The magical part here is that many task entities in the same batch time -will "dedupe" their push-task enqueueing by getting a tombstone/exists error -because they have overlapping task names (based on the work index). Thus, many -separate physical tasks entities will *fan-in* to a single logical task. - -3. The push task runs *after* all work item entities have been written to the -Datastore (guaranteed with a reader/writer lock). The task queries for work in -its particular work index region. It then handles these tasks (in user code) -and allows the task to complete. The task entities need not be deleted. - -4. 
(optional) When tasks are popped from the fork-join queue, a continuation -task will be enqueued immediately after the batch size is received to do -more work in parallel in smaller chunk sizes. - - -Obligatory diagram (where numbers correspond to batch generations): - -|---------|---------|---> time - ^ ^ ^ ^ ^ ^ ^ ^ ^ - 1 1 1 1 R 2 2 2 R - u u - n n - 1 2 - - -Nota Bene: A naive approach to pull-oriented queues (constantly query on an -'eta' parameter sorting by 'eta' descending, then delete finished entities) may -result in poor performance because of how the Datastore's garbage collection -interacts with Datastore queries and Bigtable's tablet splitting behavior. -Using contiguous row indexes on any work item properties can have the same -effect, so a hash of the sequential work index is used to ensure balancing -across tablets. -""" - -import datetime -import logging -import os -import random -import time - -from google.net.proto import ProtocolBuffer -from google.appengine.api import memcache -from google.appengine.api import taskqueue -from google.appengine.ext import db - -# TODO: Consider using multiple work indexes to alleviate the memcache -# hotspot for the writer path. - -################################################################################ - -def knuth_hash(number): - """A decent hash function for integers.""" - return (number * 2654435761) % 2**32 - - -def datetime_from_stamp(stamp): - """Converts a UNIX timestamp to a datetime.datetime including microseconds.""" - result = datetime.datetime.utcfromtimestamp(stamp) - result += datetime.timedelta(microseconds=10**6 * (stamp - int(stamp))) - return result - - -class Error(Exception): - """Base-class for exceptions in this module.""" - -class WriterLockError(Error): - """When the task adder could not increment the writer lock.""" - -class CannotGetIndexError(Error): - """When the task adder could not get a starting index from memcache.""" - -class TaskConflictError(Error): - """The added task has already ran, meaning the work index is invalid.""" - -class MemcacheError(Error): - """Enqueuing the work item in memcache failed.""" - - -class ForkJoinQueue(object): - """A fork-join queue for App Engine.""" - - FAKE_ZERO = 2**16 - LOCK_OFFSET = FAKE_ZERO / 2 - - def __init__(self, - model_class, - index_property, - task_path, - queue_name, - batch_size=None, - batch_period_ms=None, - lock_timeout_ms=None, - sync_timeout_ms=None, - stall_timeout_ms=None, - acquire_timeout_ms=None, - acquire_attempts=None): - """Initializer. - - Args: - model_class: The model class for work items. - index_property: The model class's property for work indexes. - task_path: Path where joined tasks should run. - queue_name: Queue on which joined tasks should run. - batch_size: How many work items to process at a time before spawning - another task generation to handle more. - batch_period_ms: How often, in milliseconds, to batch work items. - lock_timeout_ms: How long to wait, in milliseconds, for all writers - before a joined task executes. - sync_timeout_ms: How long it takes, in milliseconds, for writers to - finish enqueueing work before readers should attempt to acquire the - lock again. - stall_timeout_ms: How often task queue naming overlaps should be - rotated, in milliseconds, in order to prevent the queue stall caused - by memcache outages. - acquire_timeout_ms: How long to wait, in milliseconds, for writers to - acquire a new index on each attempt. 
- acquire_attempts: How many times writers should attempt to get new - indexes before raising an error. - """ - # TODO: Add validation. - self.model_class = model_class - self.name = 'fjq-' + model_class.kind() - self.index_property = index_property - self.task_path = task_path - self.queue_name = queue_name - self.batch_size = batch_size - self.lock_timeout = lock_timeout_ms / 1000.0 - self.sync_attempts = int(1.0 * lock_timeout_ms / sync_timeout_ms) - self.sync_timeout = sync_timeout_ms / 1000.0 - self.stall_timeout = stall_timeout_ms / 1000.0 - self.acquire_timeout = acquire_timeout_ms / 1000.0 - self.acquire_attempts = acquire_attempts - if batch_period_ms == 0: - self.batch_delta = None - else: - self.batch_delta = datetime.timedelta(microseconds=batch_period_ms * 1000) - - def get_queue_name(self, index): - """Returns the name of the queue to use based on the given work index.""" - return self.queue_name - - @property - def lock_name(self): - """Returns the lock key prefix for the current prefix name.""" - return self.name + '-lock' - - @property - def add_counter_template(self): - """Returns the add counter prefix template for the current prefix name.""" - return self.name + '-add-lock:%d' - - @property - def index_name(self): - """Returns the index key prefix for the current prefix name.""" - return self.name + '-index' - - def next_index(self, - memget=memcache.get, - memincr=memcache.incr, - memdecr=memcache.decr): - """Reserves the next work index. - - Args: - memget, memincr, memdecr: Used for testing. - - Returns: - The next work index to use for work. - """ - for i in xrange(self.acquire_attempts): - next_index = memget(self.index_name) - if next_index is None: - memcache.add(self.index_name, 1) - next_index = memget(self.index_name) - if next_index is None: - # Can't get it or add it, which means memcache is probably down. - # Handle this as a separate fast-path to prevent memcache overload - # during memcache failures. - raise CannotGetIndexError( - 'Cannot establish new task index in memcache.') - - next_index = knuth_hash(int(next_index)) - add_counter = self.add_counter_template % next_index - count = memincr(add_counter, 1, initial_value=self.FAKE_ZERO) - if count < self.FAKE_ZERO: - # When the counter is super negative that means this index has been - # locked and we can no longer add tasks to it. We need to "refund" the - # reader lock we took to ensure the worker doesn't wait for it. - memdecr(add_counter, 1) - else: - return next_index - time.sleep(self.acquire_timeout) - else: - # Force the index forward; here we're stuck in a loop where the memcache - # index was evicted and all new lock acqusitions are reusing old locks - # that were already closed off to new writers. - memincr(self.index_name) - raise WriterLockError('Task adder could not increment writer lock.') - - def add(self, index, gettime=time.time): - """Adds a task for a work index, decrementing the writer lock.""" - now_stamp = gettime() - # Nearest gap used to kickstart the queues when a task is dropped or - # memcache is evicted. This prevents new task names from overlapping with - # old ones. - nearest_gap = int(now_stamp / self.stall_timeout) - # Include major version in the task name to ensure that test tasks - # enqueued from a non-default major version will run in the new context - # instead of the default major version. 
- major_version, minor_version = os.environ['CURRENT_VERSION_ID'].split('.') - task_name = '%s-%s-%d-%d-%d' % ( - self.name, major_version, nearest_gap, index, 0) - - # When the batch_period_ms is zero, then there should be no ETA, the task - # should run immediately and the reader will busy wait for all writers. - if self.batch_delta is None: - eta = None - else: - eta = datetime_from_stamp(now_stamp) + self.batch_delta - - try: - taskqueue.Task( - method='POST', - name=task_name, - url=self.task_path, - eta=eta - ).add(self.get_queue_name(index)) - if self.batch_delta is None: - # When the batch_period_ms is zero, we want to immediately move the - # index to the next position as soon as the current batch finishes - # writing its task. This will only run for the first successful task - # inserter. - memcache.incr(self.index_name) - except taskqueue.TaskAlreadyExistsError: - # This is okay. It means the task has already been inserted by another - # add() call for this same batch. We're holding the lock at this point - # so we know that job won't start yet. - pass - except taskqueue.TombstonedTaskError, e: - # This is bad. This means 1) the lock we held expired and the task already - # ran, 2) this task name somehow overlaps with an old task. Return the - # error to the caller so they can try again. - raise TaskConflictError('Task named tombstoned: %s' % e) - finally: - # Don't bother checking the decr status; worst-case the worker job - # will time out after some number of seconds and proceed anyways. - memcache.decr(self.add_counter_template % index, 1) - - def _increment_index(self, last_index): - """Moves the work index forward and waits for all writers. - - Args: - last_index: The last index that was used for the reader/writer lock. - - Returns: - True if all writers were definitely finished; False if the reader/writer - lock timed out and we are proceeding anyways. - """ - # Increment the batch index counter so incoming jobs will use a new index. - # Don't bother setting an initial value here because next_index() will - # do this when it notices no current index is present. Do this *before* - # closing the reader/writer lock below to decrease active writers on the - # current index. - # We do this even in the case that batch_period_ms was zero, just in case - # that memcache operation failed for some reason, we'd rather have more - # batches then have the work index pipeline stall. - memcache.incr(self.index_name) - - # Prevent new writers by making the counter extremely negative. If the - # decrement fails here we can't recover anyways, so just let the worker go. - add_counter = self.add_counter_template % last_index - memcache.decr(add_counter, self.LOCK_OFFSET) - - for i in xrange(self.sync_attempts): - counter = memcache.get(add_counter) - # Less than or equal LOCK_OFFSET here in case a writer decrements twice - # due to rerunning failure tasks. - if counter is None or int(counter) <= self.LOCK_OFFSET: - # Worst-case the counter will be gone due to memcache eviction, which - # means the worker can procede with without waiting for writers - # and just process whatever it can find. This may drop some work. 
- return True - time.sleep(self.sync_timeout) - else: - logging.critical('Worker for %s gave up waiting for writers', self.name) - - return False - - def _query_work(self, index, cursor): - """Queries for work in the Datastore.""" - query = (self.model_class.all() - .filter('%s =' % self.index_property.name, index) - .order('__key__')) - if cursor: - query.with_cursor(cursor) - result_list = query.fetch(self.batch_size) - return result_list, query.cursor() - - def pop_request(self, request): - """Pops work to be done based on a task queue request. - - Args: - request: webapp.Request with the task payload. - - Returns: - A list of work items, if any. - """ - # TODO: Use request.headers['X-AppEngine-TaskName'] instead of environ. - return self.pop(os.environ['HTTP_X_APPENGINE_TASKNAME'], - request.get('cursor')) - - def pop(self, task_name, cursor=None): - """Pops work to be done based on just the task name. - - Args: - task_name: The name of the task. - cursor: The value of the cursor for this task (optional). - - Returns: - A list of work items, if any. - """ - rest, index, generation = task_name.rsplit('-', 2) - index, generation = int(index), int(generation) - - if not cursor: - # The root worker task already waited for all writers, so continuation - # tasks can start processing immediately. - self._increment_index(index) - - result_list, cursor = self._query_work(index, cursor) - - if len(result_list) == self.batch_size: - for i in xrange(3): - try: - taskqueue.Task( - method='POST', - name='%s-%d-%d' % (rest, index, generation + 1), - url=self.task_path, - params={'cursor': cursor} - ).add(self.get_queue_name(index)) - break - except (taskqueue.TaskAlreadyExistsError, - taskqueue.TombstonedTaskError): - # This means the continuation chain already started and this root - # task failed for some reason; no problem. - break - except (taskqueue.TransientError, taskqueue.InternalError): - # Ignore transient taskqueue errors. - if i == 2: - raise - - return result_list - - -class ShardedForkJoinQueue(ForkJoinQueue): - """A fork-join queue that shards actual work across multiple task queues.""" - - def __init__(self, *args, **kwargs): - """Initialized. - - Args: - *args, **kwargs: Passed to ForkJoinQueue. - shard_count: How many queues there are for sharding the incoming work. - """ - self.shard_count = kwargs.pop('shard_count') - ForkJoinQueue.__init__(self, *args, **kwargs) - - def get_queue_name(self, index): - return self.queue_name % {'shard': 1 + (index % self.shard_count)} - - -class MemcacheForkJoinQueue(ShardedForkJoinQueue): - """A fork-join queue that only stores work items in memcache. - - To use, call next_index() to get the work index then call the put() method, - passing one or more model instances to enqueued in memcache. - - Also a sharded queue for maximum throughput. - """ - - def __init__(self, *args, **kwargs): - """Initializer. - - Args: - *args, **kwargs: Passed to ShardedForkJoinQueue. - expiration_seconds: How long items inserted into memcache should remain - until they are evicted due to timeout. Default is 0, meaning they - will never be evicted. 
- """ - if 'expiration_seconds' in kwargs: - self.expiration_seconds = kwargs.pop('expiration_seconds') - else: - self.expiration_seconds = 0 - ShardedForkJoinQueue.__init__(self, *args, **kwargs) - - def _create_length_key(self, index): - """Creates a length memecache key for the length of the in-memory queue.""" - return '%s:length:%d' % (self.name, index) - - def _create_index_key(self, index, number): - """Creates an index memcache key for the given in-memory queue location.""" - return '%s:index:%d-%d' % (self.name, index, number) - - def put(self, - index, - entity_list, - memincr=memcache.incr, - memset=memcache.set_multi): - """Enqueue a model instance on this queue. - - Does not write to the Datastore. - - Args: - index: The work index for this entity. - entity_list: List of work entities to insert into the in-memory queue. - memincr, memset: Used for testing. - - Raises: - MemcacheError if the entities were not successfully added. - """ - length_key = self._create_length_key(index) - end = memincr(length_key, len(entity_list), initial_value=0) - if end is None: - raise MemcacheError('Could not increment length key %r' % length_key) - - start = end - len(entity_list) - key_map = {} - for number, entity in zip(xrange(start, end), entity_list): - key_map[self._create_index_key(index, number)] = db.model_to_protobuf( - entity) - - result = memset(key_map, time=self.expiration_seconds) - if result: - raise MemcacheError('Could not set memcache keys %r' % result) - - def _query_work(self, index, cursor): - """Queries for work in memcache.""" - if cursor: - try: - cursor = int(cursor) - except ValueError: - # This is an old style task that resides in the Datastore, not - # memcache. Use the parent implementation instead. - return super(MemcacheForkJoinQueue, self)._query_work(index, cursor) - else: - cursor = 0 - - key_list = [self._create_index_key(index, n) - for n in xrange(cursor, cursor + self.batch_size)] - results = memcache.get_multi(key_list) - - result_list = [] - for key in key_list: - proto = results.get(key) - if not proto: - continue - try: - result_list.append(db.model_from_protobuf(proto)) - except ProtocolBuffer.ProtocolBufferDecodeError: - logging.exception('Could not decode EntityPb at memcache key %r: %r', - key, proto) - - return result_list, cursor + self.batch_size diff --git a/hub/fork_join_queue_test.py b/hub/fork_join_queue_test.py deleted file mode 100755 index 3b351dc..0000000 --- a/hub/fork_join_queue_test.py +++ /dev/null @@ -1,539 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2010 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Tests for the fork_join_queue module.""" - -import datetime -import logging -logging.basicConfig(format='%(levelname)-8s %(filename)s] %(message)s') -import os -import random -import sys -import unittest - -import testutil -testutil.fix_path() - -from google.appengine.api import apiproxy_stub_map -from google.appengine.api import memcache -from google.appengine.ext import db -from google.appengine.ext import webapp - -import fork_join_queue - -################################################################################ - -class TestModel(db.Model): - work_index = db.IntegerProperty() - number = db.IntegerProperty() - - -TEST_QUEUE = fork_join_queue.ForkJoinQueue( - TestModel, - TestModel.work_index, - '/path/to/my/task', - 'default', - batch_size=3, - batch_period_ms=2200, - lock_timeout_ms=1000, - sync_timeout_ms=250, - stall_timeout_ms=30000, - acquire_timeout_ms=50, - acquire_attempts=20) - - -TEST_QUEUE_ZERO_BATCH_TIME = fork_join_queue.ForkJoinQueue( - TestModel, - TestModel.work_index, - '/path/to/my/task', - 'default', - batch_size=3, - batch_period_ms=0, - lock_timeout_ms=1000, - sync_timeout_ms=250, - stall_timeout_ms=30000, - acquire_timeout_ms=50, - acquire_attempts=20) - - -SHARDED_QUEUE = fork_join_queue.ShardedForkJoinQueue( - TestModel, - TestModel.work_index, - '/path/to/my/task', - 'default-%(shard)s', - batch_size=3, - batch_period_ms=2200, - lock_timeout_ms=1000, - sync_timeout_ms=250, - stall_timeout_ms=30000, - acquire_timeout_ms=50, - acquire_attempts=20, - shard_count=4) - - -MEMCACHE_QUEUE = fork_join_queue.MemcacheForkJoinQueue( - TestModel, - TestModel.work_index, - '/path/to/my/task', - 'default', - batch_size=3, - batch_period_ms=2200, - lock_timeout_ms=1000, - sync_timeout_ms=250, - stall_timeout_ms=30000, - acquire_timeout_ms=50, - acquire_attempts=20, - shard_count=4) - - -class ForkJoinQueueTest(unittest.TestCase): - """Tests for the ForkJoinQueue class.""" - - def setUp(self): - """Sets up the test harness.""" - testutil.setup_for_testing(require_indexes=False) - self.now1 = 1274078068.886692 - self.now2 = 1274078097.79072 - self.gettime1 = lambda: self.now1 - self.gettime2 = lambda: self.now2 - os.environ['CURRENT_VERSION_ID'] = 'myversion.1234' - if 'HTTP_X_APPENGINE_TASKNAME' in os.environ: - del os.environ['HTTP_X_APPENGINE_TASKNAME'] - - def expect_task(self, - index, - generation=0, - now_time=None, - cursor=None, - batch_period_ms=2200000): - """Creates an expected task dictionary.""" - if now_time is None: - now_time = self.now1 - gap_number = int(now_time / 30.0) - import math - work_item = { - 'name': 'fjq-TestModel-myversion-%d-%d-%d' % ( - gap_number, index, generation), - # Working around weird rounding behavior of task queue stub. - 'eta': (10**6 * now_time) + batch_period_ms, - } - if cursor is not None: - work_item['cursor'] = cursor - return work_item - - def assertTasksEqual(self, expected_tasks, found_tasks, - check_eta=True): - """Asserts two lists of tasks are equal.""" - found_tasks.sort(key=lambda t: t['eta']) - for expected, found in zip(expected_tasks, found_tasks): - self.assertEquals(expected['name'], found['name']) - if check_eta: - # Round these task ETAs to integers because the taskqueue stub does - # not support floating-point ETAs. 
- self.assertEquals(round(expected['eta'] / 10**6), - round(found['eta'] / 10**6)) - self.assertEquals(expected.get('cursor'), - found.get('params', {}).get('cursor')) - self.assertEquals('POST', found['method']) - self.assertEquals('/path/to/my/task', found['url']) - self.assertEquals(len(expected_tasks), len(found_tasks)) - - def testAddOne(self): - """Tests adding a single entity to the queue.""" - t = TestModel(number=1) - t.work_index = TEST_QUEUE.next_index() - t.put() - TEST_QUEUE.add(t.work_index, gettime=self.gettime1) - self.assertTasksEqual( - [self.expect_task(t.work_index)], - testutil.get_tasks('default', usec_eta=True)) - - def testAddMultiple(self): - """Tests adding multiple tasks on the same index.""" - t1 = TestModel(number=1) - t2 = TestModel(number=2) - t3 = TestModel(number=3) - - tasks = [t1, t2, t3] - work_index = TEST_QUEUE.next_index() - for t in tasks: - t.work_index = work_index - db.put(tasks) - TEST_QUEUE.add(work_index, gettime=self.gettime1) - - self.assertTasksEqual( - [self.expect_task(work_index)], - testutil.get_tasks('default', usec_eta=True)) - - def testAddMultipleDifferent(self): - """Tests adding multiple different tasks to different indexes.""" - t1 = TestModel(number=1) - t2 = TestModel(number=2) - t3 = TestModel(number=3) - - tasks1 = [t1, t2, t3] - work_index = TEST_QUEUE.next_index() - for t in tasks1: - t.work_index = work_index - db.put(tasks1) - TEST_QUEUE.add(work_index, gettime=self.gettime1) - - memcache.incr('fjq-TestModel-index') - - t4 = TestModel(number=4) - t5 = TestModel(number=5) - t6 = TestModel(number=6) - - tasks2 = [t4, t5, t6] - work_index2 = TEST_QUEUE.next_index() - for t in tasks2: - t.work_index = work_index2 - db.put(tasks2) - TEST_QUEUE.add(work_index2, gettime=self.gettime2) - - self.assertNotEqual(work_index, work_index2) - self.assertTasksEqual( - [self.expect_task(work_index), - self.expect_task(work_index2, now_time=self.now2)], - testutil.get_tasks('default', usec_eta=True)) - - def testAddAlreadyExists(self): - """Tests when the added task already exists.""" - t = TestModel(number=1) - t.work_index = TEST_QUEUE.next_index() - t.put() - TEST_QUEUE.add(t.work_index, gettime=self.gettime1) - TEST_QUEUE.add(t.work_index, gettime=self.gettime1) - - def testNextIndexError(self): - """Tests when the next index cannot be retrieved.""" - self.assertRaises( - fork_join_queue.CannotGetIndexError, - TEST_QUEUE.next_index, - memget=lambda x: None) - - def testNextIndexBusyWait(self): - """Tests busy waiting for a new index.""" - calls = [3] - def incr(*args, **kwargs): - result = memcache.incr(*args, **kwargs) - memcache.incr(TEST_QUEUE.index_name) - if calls[0] > 0: - calls[0] -= 1 - return 100 - else: - return result - - last_index = TEST_QUEUE.next_index(memincr=incr) - for i in xrange(1, 4): - self.assertEquals( - TEST_QUEUE.FAKE_ZERO, - int(memcache.get(TEST_QUEUE.add_counter_template % - fork_join_queue.knuth_hash(i)))) - self.assertEquals( - TEST_QUEUE.FAKE_ZERO + 1, # Extra 1 is for the writer lock! 
- int(memcache.get(TEST_QUEUE.add_counter_template % - fork_join_queue.knuth_hash(4)))) - - def testNextIndexBusyWaitFail(self): - """Tests when busy waiting for a new index fails.""" - seen_keys = [] - def fake_incr(key, *args, **kwargs): - seen_keys.append(key) - return 100 - self.assertRaises( - fork_join_queue.WriterLockError, - TEST_QUEUE.next_index, - memincr=fake_incr) - self.assertEquals( - (['fjq-TestModel-add-lock:2654435761'] * 20) + - ['fjq-TestModel-index'], - seen_keys) - - def testPopOne(self): - """Tests popping a single entity from the queue.""" - t1 = TestModel(work_index=TEST_QUEUE.next_index(), number=1) - - # Ensure these other tasks don't interfere. - t2 = TestModel(number=2) - t3 = TestModel(number=3) - db.put([t1, t2, t3]) - TEST_QUEUE.add(t1.work_index) - - request = testutil.create_test_request('POST', None) - os.environ['HTTP_X_APPENGINE_TASKNAME'] = \ - self.expect_task(t1.work_index)['name'] - result = TEST_QUEUE.pop_request(request) - - self.assertEquals(1, len(result)) - self.assertEquals(t1.key(), result[0].key()) - self.assertEquals(t1.work_index, result[0].work_index) - self.assertEquals(1, result[0].number) - - def testPopMultiple(self): - """Tests popping multiple entities from a queue. - - Should also cause a continuation task to be inserted to handle entities - after the current batch size. - """ - work_index = TEST_QUEUE.next_index() - tasks = [] - for i in xrange(6): - # Simplify tests by assigning the key names of the TestModel, making it - # so the values returned by pop_request() below are predictable. - key = db.Key.from_path(TestModel.kind(), i+1) - tasks.append(TestModel(key=key, work_index=work_index, number=i)) - db.put(tasks) - TEST_QUEUE.add(work_index, gettime=self.gettime1) - - # First pop request. - request = testutil.create_test_request('POST', None) - os.environ['HTTP_X_APPENGINE_TASKNAME'] = \ - self.expect_task(work_index)['name'] - result_list = TEST_QUEUE.pop_request(request) - - self.assertEquals(3, len(result_list)) - for i, result in enumerate(result_list): - self.assertEquals(work_index, result.work_index) - self.assertEquals(i, result.number) - - # Continuation one. - next_task = testutil.get_tasks('default', - expected_count=2, - index=1, - usec_eta=True) - self.assertTrue('cursor' in next_task['params']) - self.assertTrue(next_task['name'].endswith('-1')) - - request = testutil.create_test_request('POST', None, - *next_task['params'].items()) - os.environ['HTTP_X_APPENGINE_TASKNAME'] = next_task['name'] - result_list = TEST_QUEUE.pop_request(request) - self.assertEquals(3, len(result_list)) - for i, result in enumerate(result_list): - self.assertEquals(work_index, result.work_index) - self.assertEquals(i + 3, result.number) - - # Continuation two. 
- next_task = testutil.get_tasks('default', - expected_count=3, - index=2, - usec_eta=True) - next_params = next_task['params'] - self.assertTrue('cursor' in next_params) - self.assertTrue(next_task['name'].endswith('-2')) - - request = testutil.create_test_request('POST', None, - *next_task['params'].items()) - os.environ['HTTP_X_APPENGINE_TASKNAME'] = next_task['name'] - result_list = TEST_QUEUE.pop_request(request) - self.assertEquals([], result_list) - testutil.get_tasks('default', expected_count=3, usec_eta=True) - - def testPopAlreadyExists(self): - """Tests popping work items when the continuation task already exists.""" - work_index = TEST_QUEUE.next_index() - tasks = [] - for i in xrange(6): - tasks.append(TestModel(work_index=work_index, number=i)) - db.put(tasks) - TEST_QUEUE.add(work_index, gettime=self.gettime1) - testutil.get_tasks('default', expected_count=1) - - request = testutil.create_test_request('POST', None) - os.environ['HTTP_X_APPENGINE_TASKNAME'] = \ - self.expect_task(work_index)['name'] - - result_list = TEST_QUEUE.pop_request(request) - testutil.get_tasks('default', expected_count=2) - - result_list = TEST_QUEUE.pop_request(request) - testutil.get_tasks('default', expected_count=2) - - def testIncrementIndexFail(self): - """Tests when the reader-writer lock is lost.""" - work_index = TEST_QUEUE.next_index() - memcache.delete(TEST_QUEUE.add_counter_template % work_index) - self.assertTrue(TEST_QUEUE._increment_index(work_index)) - - def testIncrementIndexBusyWait(self): - """Tests busy waiting for the writer lock.""" - work_index = TEST_QUEUE.next_index() - self.assertFalse(TEST_QUEUE._increment_index(work_index)) - - def testZeroBatchTime(self): - """Tests that zero batch time results in no task ETA.""" - work_index = TEST_QUEUE_ZERO_BATCH_TIME.next_index() - task = TestModel(work_index=work_index, number=1) - db.put(task) - - before_index = memcache.get(TEST_QUEUE_ZERO_BATCH_TIME.index_name) - self.assertEquals( - work_index, fork_join_queue.knuth_hash(before_index)) - TEST_QUEUE_ZERO_BATCH_TIME.add(work_index, gettime=self.gettime1) - - # This confirms the behavior that batch_period_ms of zero will cause - # immediate increment after adding the tasks. - after_index = memcache.get(TEST_QUEUE_ZERO_BATCH_TIME.index_name) - self.assertEquals(before_index + 1, after_index) - - self.assertTasksEqual( - [self.expect_task(work_index)], - testutil.get_tasks('default', usec_eta=True), - check_eta=False) - - def testShardedQueue(self): - """Tests adding and popping from a sharded queue with continuation.""" - from google.appengine.api import apiproxy_stub_map - stub = apiproxy_stub_map.apiproxy.GetStub('taskqueue') - stub._queues[None]._all_queues_valid = True - try: - work_index = SHARDED_QUEUE.next_index() - tasks = [] - for i in xrange(5): - # Simplify tests by assigning the key names of the TestModel, making it - # so the values returned by pop_request() below are predictable. - key = db.Key.from_path(TestModel.kind(), i+1) - tasks.append(TestModel(key=key, work_index=work_index, number=i)) - db.put(tasks) - SHARDED_QUEUE.add(work_index, gettime=self.gettime1) - queue_name = 'default-%d' % (1 + (work_index % 4)) - testutil.get_tasks(queue_name, expected_count=1) - - # First pop request. 
- request = testutil.create_test_request('POST', None) - os.environ['HTTP_X_APPENGINE_TASKNAME'] = \ - self.expect_task(work_index)['name'] - result_list = SHARDED_QUEUE.pop_request(request) - - self.assertEquals(3, len(result_list)) - for i, result in enumerate(result_list): - self.assertEquals(work_index, result.work_index) - self.assertEquals(i, result.number) - - # Continuation one. - next_task = testutil.get_tasks(queue_name, - expected_count=2, - index=1) - self.assertTrue('cursor' in next_task['params']) - self.assertTrue(next_task['name'].endswith('-1')) - finally: - stub._queues[None]._all_queues_valid = False - - def testMemcacheQueue(self): - """Tests adding and popping from an in-memory queue with continuation.""" - work_index = MEMCACHE_QUEUE.next_index() - work_items = [TestModel(key=db.Key.from_path(TestModel.kind(), i), - work_index=work_index, number=i) - for i in xrange(1, 6)] - MEMCACHE_QUEUE.put(work_index, work_items) - MEMCACHE_QUEUE.add(work_index, gettime=self.gettime1) - testutil.get_tasks('default', expected_count=1) - - # First pop request. - request = testutil.create_test_request('POST', None) - os.environ['HTTP_X_APPENGINE_TASKNAME'] = \ - self.expect_task(work_index)['name'] - result_list = MEMCACHE_QUEUE.pop_request(request) - - self.assertEquals(3, len(result_list)) - for i, result in enumerate(result_list): - self.assertEquals(work_index, result.work_index) - self.assertEquals(i + 1, result.number) - - # Continuation task enqueued. - next_task = testutil.get_tasks('default', - expected_count=2, - index=1) - self.assertEquals(3, int(next_task['params']['cursor'])) - self.assertTrue(next_task['name'].endswith('-1')) - - # Second pop request. - request = testutil.create_test_request( - 'POST', None, *next_task['params'].items()) - os.environ['HTTP_X_APPENGINE_TASKNAME'] = next_task['name'] - result_list = MEMCACHE_QUEUE.pop_request(request) - - self.assertEquals(2, len(result_list)) - for i, result in enumerate(result_list): - self.assertEquals(work_index, result.work_index) - self.assertEquals(i + 4, result.number) - - def testMemcacheQueue_IncrError(self): - """Tests calling put() when memcache increment fails.""" - work_index = MEMCACHE_QUEUE.next_index() - entity = TestModel(work_index=work_index, number=0) - self.assertRaises(fork_join_queue.MemcacheError, - MEMCACHE_QUEUE.put, - work_index, [entity], - memincr=lambda *a, **k: None) - - def testMemcacheQueue_PutSetError(self): - """Tests calling put() when memcache set fails.""" - work_index = MEMCACHE_QUEUE.next_index() - entity = TestModel(work_index=work_index, number=0) - self.assertRaises(fork_join_queue.MemcacheError, - MEMCACHE_QUEUE.put, - work_index, [entity], - memset=lambda *a, **k: ['blah']) - - def testMemcacheQueue_PopError(self): - """Tests calling pop() when memcache is down.""" - work_index = MEMCACHE_QUEUE.next_index() - entity = TestModel(work_index=work_index, number=0) - MEMCACHE_QUEUE.put(work_index, [entity]) - memcache.flush_all() - - request = testutil.create_test_request('POST', None) - os.environ['HTTP_X_APPENGINE_TASKNAME'] = \ - self.expect_task(work_index)['name'] - result_list = MEMCACHE_QUEUE.pop_request(request) - self.assertEquals([], result_list) - - def testMemcacheQueue_PopHoles(self): - """Tests when there are holes in the memcache array.""" - work_index = MEMCACHE_QUEUE.next_index() - work_items = [TestModel(key=db.Key.from_path(TestModel.kind(), i), - work_index=work_index, number=i) - for i in xrange(1, 6)] - MEMCACHE_QUEUE.put(work_index, work_items) - 
memcache.delete(MEMCACHE_QUEUE._create_index_key(work_index, 1)) - - request = testutil.create_test_request('POST', None) - os.environ['HTTP_X_APPENGINE_TASKNAME'] = \ - self.expect_task(work_index)['name'] - result_list = MEMCACHE_QUEUE.pop_request(request) - self.assertEquals([1, 3], [r.number for r in result_list]) - - def testMemcacheQueue_PopDecodeError(self): - """Tests when proto decoding fails on the pop() call.""" - work_index = MEMCACHE_QUEUE.next_index() - work_items = [TestModel(key=db.Key.from_path(TestModel.kind(), i), - work_index=work_index, number=i) - for i in xrange(1, 6)] - MEMCACHE_QUEUE.put(work_index, work_items) - - memcache.set(MEMCACHE_QUEUE._create_index_key(work_index, 1), 'bad data') - request = testutil.create_test_request('POST', None) - os.environ['HTTP_X_APPENGINE_TASKNAME'] = \ - self.expect_task(work_index)['name'] - result_list = MEMCACHE_QUEUE.pop_request(request) - -################################################################################ - -if __name__ == '__main__': - unittest.main() diff --git a/hub/google3/__init__.py b/hub/google3/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/hub/google3/apphosting/__init__.py b/hub/google3/apphosting/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/hub/google3/apphosting/runtime/__init__.py b/hub/google3/apphosting/runtime/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/hub/google3/apphosting/runtime/_apphosting_runtime___python__apiproxy.py b/hub/google3/apphosting/runtime/_apphosting_runtime___python__apiproxy.py deleted file mode 100644 index db390f7..0000000 --- a/hub/google3/apphosting/runtime/_apphosting_runtime___python__apiproxy.py +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env python -"""THIS IS AN EMPTY FILE.""" \ No newline at end of file diff --git a/hub/index.yaml b/hub/index.yaml deleted file mode 100644 index 8ce7787..0000000 --- a/hub/index.yaml +++ /dev/null @@ -1,19 +0,0 @@ -indexes: - -# For iterating through subscribers to deliver events to. -- kind: Subscription - properties: - - name: subscription_state - - name: topic_hash - - name: callback_hash - - -# AUTOGENERATED - -# This index.yaml is automatically updated whenever the dev_appserver -# detects that a new type of query is run. If you want to manage the -# index.yaml file manually, remove the above marker line (the line -# saying "# AUTOGENERATED"). If you want to manage some indexes -# manually, move them above the marker line. The index.yaml file is -# automatically uploaded to the admin console when you next deploy -# your application using appcfg.py. diff --git a/hub/main.py b/hub/main.py deleted file mode 100644 index fc4a876..0000000 --- a/hub/main.py +++ /dev/null @@ -1,3562 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2008 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""PubSubHubbub protocol Hub implementation built on Google App Engine. - -=== Model classes: - -* Subscription: A single subscriber's lease on a topic URL. 
Also represents a - work item of a subscription that is awaiting confirmation (sub. or unsub). - -* FeedToFetch: Work item inserted when a publish event occurs. This will be - moved to the Task Queue API once available. - -* KnownFeed: Materialized view of all distinct topic URLs. Written by - background task every time a new subscription is made for a topic URL. - Used for mapping from input topic URLs to feed IDs and then back to topic - URLs, to properly handle any feed aliases. Also used for doing bootstrap - polling of feeds. - -* KnownFeedIdentity: Reverse index of feed ID to topic URLs. Used in - conjunction with KnownFeed to properly canonicalize feed aliases on - subscription and pinging. - -* KnownFeedStats: Statistics about a topic URL. Used to provide subscriber - counts to publishers on feed fetch. - -* FeedRecord: Metadata information about a feed, the last time it was polled, - and any headers that may affect future polling. Also contains any debugging - information about the last feed fetch and why it may have failed. - -* FeedEntryRecord: Record of a single entry in a single feed. May eventually - be garbage collected after enough time has passed since it was last seen. - -* EventToDeliver: Work item that contains the content to deliver for a feed - event. Maintains current position in subscribers and number of delivery - failures. Used to coordinate delivery retries. Will be deleted in successful - cases or stick around in the event of complete failures for debugging. - -* PollingMarker: Work item that keeps track of the last time all KnownFeed - instances were fetched. Used to do bootstrap polling. - - -=== Entity groups: - -Subscription entities are in their own entity group to allow for a high number -of simultaneous subscriptions for the same topic URL. FeedToFetch is also in -its own entity group for the same reason. FeedRecord, FeedEntryRecord, and -EventToDeliver entries are all in the same entity group, however, to ensure that -each feed polling is either full committed and delivered to subscribers or fails -and will be retried at a later time. - - ------------ - | FeedRecord | - -----+------ - | - | - +-------------+-------------+ - | | - | | - --------+-------- --------+------- -| FeedEntryRecord | | EventToDeliver | - ----------------- ---------------- -""" - -# Bigger TODOs (in priority order) -# -# - Improve polling algorithm to keep stats on each feed. -# -# - Do not poll a feed if we've gotten an event from the publisher in less -# than the polling period. 
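# ---------------------------------------------------------------------------
# Editor's illustrative sketch (not part of the original module): a minimal,
# hypothetical rendering of the entity-group layout described in the
# docstring above, using the old google.appengine.ext.db API. The class and
# property names below are simplified placeholders, not the real schema;
# only the parent/child (entity group) relationship is the point, i.e. that
# FeedEntryRecord and EventToDeliver share FeedRecord's entity group so a
# feed poll either commits fully or can be retried as a unit.
from google.appengine.ext import db as _db

class _FeedRecordSketch(_db.Model):
  """Root entity standing in for one topic URL."""
  topic = _db.TextProperty()

class _FeedEntryRecordSketch(_db.Model):
  """Child of the feed's root entity; committed with the poll results."""
  entry_id = _db.TextProperty()

class _EventToDeliverSketch(_db.Model):
  """Also a child of the root entity; committed in the same transaction."""
  payload = _db.TextProperty()

def _sketch_commit_poll(topic, entry_id, payload):
  """Shows one transaction covering both the entry record and the event."""
  root = _FeedRecordSketch(key_name=topic, topic=topic)
  root.put()
  def txn():
    # Both children have the same parent, so a single-group transaction
    # can write the parsed entry and its delivery work item together.
    _db.put([_FeedEntryRecordSketch(parent=root, entry_id=entry_id),
             _EventToDeliverSketch(parent=root, payload=payload)])
  _db.run_in_transaction(txn)
# ---------------------------------------------------------------------------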
- -import datetime -import gc -import hashlib -import hmac -import logging -import os -import random -import sgmllib -import time -import traceback -import urllib -import urlparse -import wsgiref.handlers -import xml.sax - -from google.appengine import runtime -from google.appengine.api import datastore_types -from google.appengine.api import memcache -from google.appengine.api import urlfetch -from google.appengine.api import urlfetch_errors -from google.appengine.api import taskqueue -from google.appengine.api import users -from google.appengine.ext import db -from google.appengine.ext import webapp -from google.appengine.ext.webapp import template -from google.appengine.runtime import apiproxy_errors - -import async_apiproxy -import dos -import feed_diff -import feed_identifier -import fork_join_queue -import urlfetch_async - -import mapreduce.control -import mapreduce.model - -async_proxy = async_apiproxy.AsyncAPIProxy() - -################################################################################ -# Config parameters - -DEBUG = True - -if DEBUG: - logging.getLogger().setLevel(logging.DEBUG) - -# How many subscribers to contact at a time when delivering events. -EVENT_SUBSCRIBER_CHUNK_SIZE = 50 - -# Maximum number of times to attempt a subscription retry. -MAX_SUBSCRIPTION_CONFIRM_FAILURES = 4 - -# Period to use for exponential backoff on subscription confirm retries. -SUBSCRIPTION_RETRY_PERIOD = 30 # seconds - -# Maximum number of times to attempt to pull a feed. -MAX_FEED_PULL_FAILURES = 4 - -# Period to use for exponential backoff on feed pulling. -FEED_PULL_RETRY_PERIOD = 30 # seconds - -# Maximum number of times to attempt to deliver a feed event. -MAX_DELIVERY_FAILURES = 4 - -# Period to use for exponential backoff on feed event delivery. -DELIVERY_RETRY_PERIOD = 30 # seconds - -# Period at which feed IDs should be refreshed. -FEED_IDENTITY_UPDATE_PERIOD = (20 * 24 * 60 * 60) # 20 days - -# Number of polling feeds to fetch from the Datastore at a time. -BOOSTRAP_FEED_CHUNK_SIZE = 50 - -# How many old Subscription instances to clean up at a time. -SUBSCRIPTION_CLEANUP_CHUNK_SIZE = 100 - -# How far before expiration to refresh subscriptions. -SUBSCRIPTION_CHECK_BUFFER_SECONDS = (24 * 60 * 60) # 24 hours - -# How many mapper shards to use for reconfirming subscriptions. -SUBSCRIPTION_RECONFIRM_SHARD_COUNT = 4 - -# How often to poll feeds. -POLLING_BOOTSTRAP_PERIOD = 10800 # in seconds; 3 hours - -# Default expiration time of a lease. -DEFAULT_LEASE_SECONDS = (5 * 24 * 60 * 60) # 5 days - -# Maximum expiration time of a lease. -MAX_LEASE_SECONDS = (10 * 24 * 60 * 60) # 10 days - -# Maximum number of redirects to follow when feed fetching. -MAX_REDIRECTS = 7 - -# Maximum time to wait for fetching a feed in seconds. -MAX_FETCH_SECONDS = 10 - -# Number of times to try to split FeedEntryRecord, EventToDeliver, and -# FeedRecord entities when putting them and their size is too large. -PUT_SPLITTING_ATTEMPTS = 10 - -# Maximum number of FeedEntryRecord entries to look up in parallel. -MAX_FEED_ENTRY_RECORD_LOOKUPS = 500 - -# Maximum number of FeedEntryRecord entries to save at the same time when -# a new EventToDeliver is being written. -MAX_FEED_RECORD_SAVES = 100 - -# Maximum number of new FeedEntryRecords to process and insert at a time. Any -# remaining will be split into another EventToDeliver instance. 
-MAX_NEW_FEED_ENTRY_RECORDS = 200 - -################################################################################ -# URL scoring Parameters - -# Fetching feeds -FETCH_SCORER = dos.UrlScorer( - period=300, # Seconds - min_requests=5, # per second - max_failure_percentage=0.8, - prefix='pull_feed') - -# Pushing events -DELIVERY_SCORER = dos.UrlScorer( - period=300, # Seconds - min_requests=0.5, # per second - max_failure_percentage=0.8, - prefix='deliver_events') - - -################################################################################ -# Fetching samplers - -FETCH_URL_SAMPLE_MINUTE = dos.ReservoirConfig( - 'fetch_url_1m', - period=60, - samples=10000, - by_url=True, - value_units='% errors') - -FETCH_URL_SAMPLE_30_MINUTE = dos.ReservoirConfig( - 'fetch_url_30m', - period=1800, - samples=10000, - by_url=True, - value_units='% errors') - -FETCH_URL_SAMPLE_HOUR = dos.ReservoirConfig( - 'fetch_url_1h', - period=3600, - samples=10000, - by_url=True, - value_units='% errors') - -FETCH_URL_SAMPLE_DAY = dos.ReservoirConfig( - 'fetch_url_1d', - period=86400, - samples=10000, - by_url=True, - value_units='% errors') - -FETCH_DOMAIN_SAMPLE_MINUTE = dos.ReservoirConfig( - 'fetch_domain_1m', - period=60, - samples=10000, - by_domain=True, - value_units='% errors') - -FETCH_DOMAIN_SAMPLE_30_MINUTE = dos.ReservoirConfig( - 'fetch_domain_30m', - period=1800, - samples=10000, - by_domain=True, - value_units='% errors') - -FETCH_DOMAIN_SAMPLE_HOUR = dos.ReservoirConfig( - 'fetch_domain_1h', - period=3600, - samples=10000, - by_domain=True, - value_units='% errors') - -FETCH_DOMAIN_SAMPLE_DAY = dos.ReservoirConfig( - 'fetch_domain_1d', - period=86400, - samples=10000, - by_domain=True, - value_units='% errors') - -FETCH_URL_SAMPLE_MINUTE_LATENCY = dos.ReservoirConfig( - 'fetch_url_1m_latency', - period=60, - samples=10000, - by_url=True, - value_units='ms') - -FETCH_URL_SAMPLE_30_MINUTE_LATENCY = dos.ReservoirConfig( - 'fetch_url_30m_latency', - period=1800, - samples=10000, - by_url=True, - value_units='ms') - -FETCH_URL_SAMPLE_HOUR_LATENCY = dos.ReservoirConfig( - 'fetch_url_1h_latency', - period=3600, - samples=10000, - by_url=True, - value_units='ms') - -FETCH_URL_SAMPLE_DAY_LATENCY = dos.ReservoirConfig( - 'fetch_url_1d_latency', - period=86400, - samples=10000, - by_url=True, - value_units='ms') - -FETCH_DOMAIN_SAMPLE_MINUTE_LATENCY = dos.ReservoirConfig( - 'fetch_domain_1m_latency', - period=60, - samples=10000, - by_domain=True, - value_units='ms') - -FETCH_DOMAIN_SAMPLE_30_MINUTE_LATENCY = dos.ReservoirConfig( - 'fetch_domain_30m_latency', - period=1800, - samples=10000, - by_domain=True, - value_units='ms') - -FETCH_DOMAIN_SAMPLE_HOUR_LATENCY = dos.ReservoirConfig( - 'fetch_domain_1h_latency', - period=3600, - samples=10000, - by_domain=True, - value_units='ms') - -FETCH_DOMAIN_SAMPLE_DAY_LATENCY = dos.ReservoirConfig( - 'fetch_domain_1d_latency', - period=86400, - samples=10000, - by_domain=True, - value_units='ms') - - -def report_fetch(reporter, url, success, latency): - """Reports statistics information for a feed fetch. - - Args: - reporter: dos.Reporter instance. - url: The URL of the topic URL that was fetched. - success: True if the fetch was successful, False otherwise. - latency: End-to-end fetch latency in milliseconds. 
- """ - value = 100 * int(not success) - reporter.set(url, FETCH_URL_SAMPLE_MINUTE, value) - reporter.set(url, FETCH_URL_SAMPLE_30_MINUTE, value) - reporter.set(url, FETCH_URL_SAMPLE_HOUR, value) - reporter.set(url, FETCH_URL_SAMPLE_DAY, value) - reporter.set(url, FETCH_DOMAIN_SAMPLE_MINUTE, value) - reporter.set(url, FETCH_DOMAIN_SAMPLE_30_MINUTE, value) - reporter.set(url, FETCH_DOMAIN_SAMPLE_HOUR, value) - reporter.set(url, FETCH_DOMAIN_SAMPLE_DAY, value) - reporter.set(url, FETCH_URL_SAMPLE_MINUTE_LATENCY, latency) - reporter.set(url, FETCH_URL_SAMPLE_30_MINUTE_LATENCY, latency) - reporter.set(url, FETCH_URL_SAMPLE_HOUR_LATENCY, latency) - reporter.set(url, FETCH_URL_SAMPLE_DAY_LATENCY, latency) - reporter.set(url, FETCH_DOMAIN_SAMPLE_MINUTE_LATENCY, latency) - reporter.set(url, FETCH_DOMAIN_SAMPLE_30_MINUTE_LATENCY, latency) - reporter.set(url, FETCH_DOMAIN_SAMPLE_HOUR_LATENCY, latency) - reporter.set(url, FETCH_DOMAIN_SAMPLE_DAY_LATENCY, latency) - - -FETCH_SAMPLER = dos.MultiSampler([ - FETCH_URL_SAMPLE_MINUTE, - FETCH_URL_SAMPLE_30_MINUTE, - FETCH_URL_SAMPLE_HOUR, - FETCH_URL_SAMPLE_DAY, - FETCH_DOMAIN_SAMPLE_MINUTE, - FETCH_DOMAIN_SAMPLE_30_MINUTE, - FETCH_DOMAIN_SAMPLE_HOUR, - FETCH_DOMAIN_SAMPLE_DAY, - FETCH_URL_SAMPLE_MINUTE_LATENCY, - FETCH_URL_SAMPLE_30_MINUTE_LATENCY, - FETCH_URL_SAMPLE_HOUR_LATENCY, - FETCH_URL_SAMPLE_DAY_LATENCY, - FETCH_DOMAIN_SAMPLE_MINUTE_LATENCY, - FETCH_DOMAIN_SAMPLE_30_MINUTE_LATENCY, - FETCH_DOMAIN_SAMPLE_HOUR_LATENCY, - FETCH_DOMAIN_SAMPLE_DAY_LATENCY, -]) - -################################################################################ -# Delivery samplers - -DELIVERY_URL_SAMPLE_MINUTE = dos.ReservoirConfig( - 'delivery_url_1m', - period=60, - samples=10000, - by_url=True, - value_units='% errors') - -DELIVERY_URL_SAMPLE_30_MINUTE = dos.ReservoirConfig( - 'delivery_url_30m', - period=1800, - samples=10000, - by_url=True, - value_units='% errors') - -DELIVERY_URL_SAMPLE_HOUR = dos.ReservoirConfig( - 'delivery_url_1h', - period=3600, - samples=10000, - by_url=True, - value_units='% errors') - -DELIVERY_URL_SAMPLE_DAY = dos.ReservoirConfig( - 'delivery_url_1d', - period=86400, - samples=10000, - by_url=True, - value_units='% errors') - -DELIVERY_DOMAIN_SAMPLE_MINUTE = dos.ReservoirConfig( - 'delivery_domain_1m', - period=60, - samples=10000, - by_domain=True, - value_units='% errors') - -DELIVERY_DOMAIN_SAMPLE_30_MINUTE = dos.ReservoirConfig( - 'delivery_domain_30m', - period=1800, - samples=10000, - by_domain=True, - value_units='% errors') - -DELIVERY_DOMAIN_SAMPLE_HOUR = dos.ReservoirConfig( - 'delivery_domain_1h', - period=3600, - samples=10000, - by_domain=True, - value_units='% errors') - -DELIVERY_DOMAIN_SAMPLE_DAY = dos.ReservoirConfig( - 'delivery_domain_1d', - period=86400, - samples=10000, - by_domain=True, - value_units='% errors') - -DELIVERY_URL_SAMPLE_MINUTE_LATENCY = dos.ReservoirConfig( - 'delivery_url_1m_latency', - period=60, - samples=10000, - by_url=True, - value_units='ms') - -DELIVERY_URL_SAMPLE_30_MINUTE_LATENCY = dos.ReservoirConfig( - 'delivery_url_30m_latency', - period=1800, - samples=10000, - by_url=True, - value_units='ms') - -DELIVERY_URL_SAMPLE_HOUR_LATENCY = dos.ReservoirConfig( - 'delivery_url_1h_latency', - period=3600, - samples=10000, - by_url=True, - value_units='ms') - -DELIVERY_URL_SAMPLE_DAY_LATENCY = dos.ReservoirConfig( - 'delivery_url_1d_latency', - period=86400, - samples=10000, - by_url=True, - value_units='ms') - -DELIVERY_DOMAIN_SAMPLE_MINUTE_LATENCY = dos.ReservoirConfig( - 
'delivery_domain_1m_latency', - period=60, - samples=10000, - by_domain=True, - value_units='ms') - -DELIVERY_DOMAIN_SAMPLE_30_MINUTE_LATENCY = dos.ReservoirConfig( - 'delivery_domain_30m_latency', - period=1800, - samples=10000, - by_domain=True, - value_units='ms') - -DELIVERY_DOMAIN_SAMPLE_HOUR_LATENCY = dos.ReservoirConfig( - 'delivery_domain_1h_latency', - period=3600, - samples=10000, - by_domain=True, - value_units='ms') - -DELIVERY_DOMAIN_SAMPLE_DAY_LATENCY = dos.ReservoirConfig( - 'delivery_domain_1d_latency', - period=86400, - samples=10000, - by_domain=True, - value_units='ms') - - -def report_delivery(reporter, url, success, latency): - """Reports statistics information for a event delivery to a callback. - - Args: - reporter: dos.Reporter instance. - url: The URL of the callback that received the event. - success: True if the delivery was successful, False otherwise. - latency: End-to-end fetch latency in milliseconds. - """ - value = 100 * int(not success) - reporter.set(url, DELIVERY_URL_SAMPLE_MINUTE, value) - reporter.set(url, DELIVERY_URL_SAMPLE_30_MINUTE, value) - reporter.set(url, DELIVERY_URL_SAMPLE_HOUR, value) - reporter.set(url, DELIVERY_URL_SAMPLE_DAY, value) - reporter.set(url, DELIVERY_DOMAIN_SAMPLE_MINUTE, value) - reporter.set(url, DELIVERY_DOMAIN_SAMPLE_30_MINUTE, value) - reporter.set(url, DELIVERY_DOMAIN_SAMPLE_HOUR, value) - reporter.set(url, DELIVERY_DOMAIN_SAMPLE_DAY, value) - reporter.set(url, DELIVERY_URL_SAMPLE_MINUTE_LATENCY, latency) - reporter.set(url, DELIVERY_URL_SAMPLE_30_MINUTE_LATENCY, latency) - reporter.set(url, DELIVERY_URL_SAMPLE_HOUR_LATENCY, latency) - reporter.set(url, DELIVERY_URL_SAMPLE_DAY_LATENCY, latency) - reporter.set(url, DELIVERY_DOMAIN_SAMPLE_MINUTE_LATENCY, latency) - reporter.set(url, DELIVERY_DOMAIN_SAMPLE_30_MINUTE_LATENCY, latency) - reporter.set(url, DELIVERY_DOMAIN_SAMPLE_HOUR_LATENCY, latency) - reporter.set(url, DELIVERY_DOMAIN_SAMPLE_DAY_LATENCY, latency) - - -DELIVERY_SAMPLER = dos.MultiSampler([ - DELIVERY_URL_SAMPLE_MINUTE, - DELIVERY_URL_SAMPLE_30_MINUTE, - DELIVERY_URL_SAMPLE_HOUR, - DELIVERY_URL_SAMPLE_DAY, - DELIVERY_DOMAIN_SAMPLE_MINUTE, - DELIVERY_DOMAIN_SAMPLE_30_MINUTE, - DELIVERY_DOMAIN_SAMPLE_HOUR, - DELIVERY_DOMAIN_SAMPLE_DAY, - DELIVERY_URL_SAMPLE_MINUTE_LATENCY, - DELIVERY_URL_SAMPLE_30_MINUTE_LATENCY, - DELIVERY_URL_SAMPLE_HOUR_LATENCY, - DELIVERY_URL_SAMPLE_DAY_LATENCY, - DELIVERY_DOMAIN_SAMPLE_MINUTE_LATENCY, - DELIVERY_DOMAIN_SAMPLE_30_MINUTE_LATENCY, - DELIVERY_DOMAIN_SAMPLE_HOUR_LATENCY, - DELIVERY_DOMAIN_SAMPLE_DAY_LATENCY, -]) - -################################################################################ -# Constants - -ATOM = 'atom' -RSS = 'rss' -ARBITRARY = 'arbitrary' - -VALID_PORTS = frozenset([ - '80', '443', '4443', '8080', '8081', '8082', '8083', '8084', '8085', - '8086', '8087', '8088', '8089', '8188', '8444', '8990']) - -EVENT_QUEUE = 'event-delivery' - -EVENT_RETRIES_QUEUE = 'event-delivery-retries' - -FEED_QUEUE = 'feed-pulls' - -FEED_RETRIES_QUEUE = 'feed-pulls-retries' - -POLLING_QUEUE = 'polling' - -SUBSCRIPTION_QUEUE = 'subscriptions' - -MAPPINGS_QUEUE = 'mappings' - -################################################################################ -# Helper functions - -def utf8encoded(data): - """Encodes a string as utf-8 data and returns an ascii string. - - Args: - data: The string data to encode. - - Returns: - An ascii string, or None if the 'data' parameter was None. 
- """ - if data is None: - return None - if isinstance(data, unicode): - return unicode(data).encode('utf-8') - else: - return data - - -def normalize_iri(url): - """Converts a URL (possibly containing unicode characters) to an IRI. - - Args: - url: String (normal or unicode) containing a URL, presumably having - already been percent-decoded by a web framework receiving request - parameters in a POST body or GET request's URL. - - Returns: - A properly encoded IRI (see RFC 3987). - """ - def chr_or_escape(unicode_char): - if ord(unicode_char) > 0x7f: - return urllib.quote(unicode_char.encode('utf-8')) - else: - return unicode_char - return ''.join(chr_or_escape(c) for c in unicode(url)) - - -def sha1_hash(value): - """Returns the sha1 hash of the supplied value.""" - return hashlib.sha1(utf8encoded(value)).hexdigest() - - -def get_hash_key_name(value): - """Returns a valid entity key_name that's a hash of the supplied value.""" - return 'hash_' + sha1_hash(value) - - -def sha1_hmac(secret, data): - """Returns the sha1 hmac for a chunk of data and a secret.""" - # For Python 2.6, which can only compute hmacs on non-unicode data. - secret = utf8encoded(secret) - data = utf8encoded(data) - return hmac.new(secret, data, hashlib.sha1).hexdigest() - - -def is_dev_env(): - """Returns True if we're running in the development environment.""" - return 'Dev' in os.environ.get('SERVER_SOFTWARE', '') - - -def work_queue_only(func): - """Decorator that only allows a request if from cron job, task, or an admin. - - Also allows access if running in development server environment. - - Args: - func: A webapp.RequestHandler method. - - Returns: - Function that will return a 401 error if not from an authorized source. - """ - def decorated(myself, *args, **kwargs): - if ('X-AppEngine-Cron' in myself.request.headers or - 'X-AppEngine-TaskName' in myself.request.headers or - is_dev_env() or users.is_current_user_admin()): - return func(myself, *args, **kwargs) - elif users.get_current_user() is None: - myself.redirect(users.create_login_url(myself.request.url)) - else: - myself.response.set_status(401) - myself.response.out.write('Handler only accessible for work queues') - return decorated - - -def is_valid_url(url): - """Returns True if the URL is valid, False otherwise.""" - split = urlparse.urlparse(url) - if not split.scheme in ('http', 'https'): - logging.debug('URL scheme is invalid: %s', url) - return False - - netloc, port = (split.netloc.split(':', 1) + [''])[:2] - if port and not is_dev_env() and port not in VALID_PORTS: - logging.debug('URL port is invalid: %s', url) - return False - - if split.fragment: - logging.debug('URL includes fragment: %s', url) - return False - - return True - - -_VALID_CHARS = ( - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', - 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', - 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', - 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', - '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_', -) - - -def get_random_challenge(): - """Returns a string containing a random challenge token.""" - return ''.join(random.choice(_VALID_CHARS) for i in xrange(128)) - -################################################################################ -# Models - -class Subscription(db.Model): - """Represents a single subscription to a topic for a callback URL.""" - - STATE_NOT_VERIFIED = 'not_verified' - STATE_VERIFIED = 'verified' - STATE_TO_DELETE = 'to_delete' - STATES = 
frozenset([ - STATE_NOT_VERIFIED, - STATE_VERIFIED, - STATE_TO_DELETE, - ]) - - callback = db.TextProperty(required=True) - callback_hash = db.StringProperty(required=True) - topic = db.TextProperty(required=True) - topic_hash = db.StringProperty(required=True) - created_time = db.DateTimeProperty(auto_now_add=True) - last_modified = db.DateTimeProperty(auto_now=True) - lease_seconds = db.IntegerProperty(default=DEFAULT_LEASE_SECONDS) - expiration_time = db.DateTimeProperty(required=True) - eta = db.DateTimeProperty(auto_now_add=True) - confirm_failures = db.IntegerProperty(default=0) - verify_token = db.TextProperty() - secret = db.TextProperty() - hmac_algorithm = db.TextProperty() - subscription_state = db.StringProperty(default=STATE_NOT_VERIFIED, - choices=STATES) - - @staticmethod - def create_key_name(callback, topic): - """Returns the key name for a Subscription entity. - - Args: - callback: URL of the callback subscriber. - topic: URL of the topic being subscribed to. - - Returns: - String containing the key name for the corresponding Subscription. - """ - return get_hash_key_name(u'%s\n%s' % (callback, topic)) - - @classmethod - def insert(cls, - callback, - topic, - verify_token, - secret, - hash_func='sha1', - lease_seconds=DEFAULT_LEASE_SECONDS, - now=datetime.datetime.now): - """Marks a callback URL as being subscribed to a topic. - - Creates a new subscription if None already exists. Forces any existing, - pending request (i.e., async) to immediately enter the verified state. - - Args: - callback: URL that will receive callbacks. - topic: The topic to subscribe to. - verify_token: The verification token to use to confirm the - subscription request. - secret: Shared secret used for HMACs. - hash_func: String with the name of the hash function to use for HMACs. - lease_seconds: Number of seconds the client would like the subscription - to last before expiring. Must be a number. - now: Callable that returns the current time as a datetime instance. Used - for testing - - Returns: - True if the subscription was newly created, False otherwise. - """ - key_name = cls.create_key_name(callback, topic) - now_time = now() - def txn(): - sub_is_new = False - sub = cls.get_by_key_name(key_name) - if sub is None: - sub_is_new = True - sub = cls(key_name=key_name, - callback=callback, - callback_hash=sha1_hash(callback), - topic=topic, - topic_hash=sha1_hash(topic), - verify_token=verify_token, - secret=secret, - hash_func=hash_func, - lease_seconds=lease_seconds, - expiration_time=now_time) - sub.subscription_state = cls.STATE_VERIFIED - sub.expiration_time = now_time + datetime.timedelta(seconds=lease_seconds) - sub.confirm_failures = 0 - sub.verify_token = verify_token - sub.secret = secret - sub.put() - return sub_is_new - return db.run_in_transaction(txn) - - @classmethod - def request_insert(cls, - callback, - topic, - verify_token, - secret, - auto_reconfirm=False, - hash_func='sha1', - lease_seconds=DEFAULT_LEASE_SECONDS, - now=datetime.datetime.now): - """Records that a callback URL needs verification before being subscribed. - - Creates a new subscription request (for asynchronous verification) if None - already exists. Any existing subscription request will be overridden; - for instance, if a subscription has already been verified, this method - will cause it to be reconfirmed. - - Args: - callback: URL that will receive callbacks. - topic: The topic to subscribe to. - verify_token: The verification token to use to confirm the - subscription request. 
- secret: Shared secret used for HMACs. - auto_reconfirm: True if this task is being run by the auto-reconfirmation - offline process; False if this is a user-requested task. Defaults - to False. - hash_func: String with the name of the hash function to use for HMACs. - lease_seconds: Number of seconds the client would like the subscription - to last before expiring. Must be a number. - now: Callable that returns the current time as a datetime instance. Used - for testing - - Returns: - True if the subscription request was newly created, False otherwise. - """ - key_name = cls.create_key_name(callback, topic) - def txn(): - sub_is_new = False - sub = cls.get_by_key_name(key_name) - if sub is None: - sub_is_new = True - sub = cls(key_name=key_name, - callback=callback, - callback_hash=sha1_hash(callback), - topic=topic, - topic_hash=sha1_hash(topic), - secret=secret, - hash_func=hash_func, - verify_token=verify_token, - lease_seconds=lease_seconds, - expiration_time=( - now() + datetime.timedelta(seconds=lease_seconds))) - sub.confirm_failures = 0 - sub.put() - sub.enqueue_task(cls.STATE_VERIFIED, - verify_token, - secret=secret, - auto_reconfirm=auto_reconfirm) - return sub_is_new - return db.run_in_transaction(txn) - - @classmethod - def remove(cls, callback, topic): - """Causes a callback URL to no longer be subscribed to a topic. - - If the callback was not already subscribed to the topic, this method - will do nothing. Otherwise, the subscription will immediately be removed. - - Args: - callback: URL that will receive callbacks. - topic: The topic to subscribe to. - - Returns: - True if the subscription had previously existed, False otherwise. - """ - key_name = cls.create_key_name(callback, topic) - def txn(): - sub = cls.get_by_key_name(key_name) - if sub is not None: - sub.delete() - return True - return False - return db.run_in_transaction(txn) - - @classmethod - def request_remove(cls, callback, topic, verify_token): - """Records that a callback URL needs to be unsubscribed. - - Creates a new request to unsubscribe a callback URL from a topic (where - verification should happen asynchronously). If an unsubscribe request - has already been made, this method will do nothing. - - Args: - callback: URL that will receive callbacks. - topic: The topic to subscribe to. - verify_token: The verification token to use to confirm the - unsubscription request. - - Returns: - True if the Subscription to remove actually exists, False otherwise. - """ - key_name = cls.create_key_name(callback, topic) - def txn(): - sub = cls.get_by_key_name(key_name) - if sub is not None: - sub.confirm_failures = 0 - sub.put() - sub.enqueue_task(cls.STATE_TO_DELETE, verify_token) - return True - else: - return False - return db.run_in_transaction(txn) - - @classmethod - def archive(cls, callback, topic): - """Archives a subscription as no longer active. - - Args: - callback: URL that will receive callbacks. - topic: The topic to subscribe to. - """ - key_name = cls.create_key_name(callback, topic) - def txn(): - sub = cls.get_by_key_name(key_name) - if sub is not None: - sub.subscription_state = cls.STATE_TO_DELETE - sub.confirm_failures = 0 - sub.put() - return db.run_in_transaction(txn) - - @classmethod - def has_subscribers(cls, topic): - """Check if a topic URL has verified subscribers. - - Args: - topic: The topic URL to check for subscribers. - - Returns: - True if it has verified subscribers, False otherwise. 
- """ - if (cls.all(keys_only=True).filter('topic_hash =', sha1_hash(topic)) - .filter('subscription_state =', cls.STATE_VERIFIED).get() is not None): - return True - else: - return False - - @classmethod - def get_subscribers(cls, topic, count, starting_at_callback=None): - """Gets the list of subscribers starting at an offset. - - Args: - topic: The topic URL to retrieve subscribers for. - count: How many subscribers to retrieve. - starting_at_callback: A string containing the callback hash to offset - to when retrieving more subscribers. The callback at the given offset - *will* be included in the results. If None, then subscribers will - be retrieved from the beginning. - - Returns: - List of Subscription objects that were found, or an empty list if none - were found. - """ - query = cls.all() - query.filter('topic_hash =', sha1_hash(topic)) - query.filter('subscription_state = ', cls.STATE_VERIFIED) - if starting_at_callback: - query.filter('callback_hash >=', sha1_hash(starting_at_callback)) - query.order('callback_hash') - - return query.fetch(count) - - def enqueue_task(self, - next_state, - verify_token, - auto_reconfirm=False, - secret=None): - """Enqueues a task to confirm this Subscription. - - Args: - next_state: The next state this subscription should be in. - verify_token: The verify_token to use when confirming this request. - auto_reconfirm: True if this task is being run by the auto-reconfirmation - offline process; False if this is a user-requested task. Defaults - to False. - secret: Only required for subscription confirmation (not unsubscribe). - The new secret to use for this subscription after successful - confirmation. - """ - RETRIES = 3 - if auto_reconfirm: - target_queue = POLLING_QUEUE - else: - target_queue = SUBSCRIPTION_QUEUE - for i in xrange(RETRIES): - try: - taskqueue.Task( - url='/work/subscriptions', - eta=self.eta, - params={'subscription_key_name': self.key().name(), - 'next_state': next_state, - 'verify_token': verify_token, - 'secret': secret or '', - 'auto_reconfirm': str(auto_reconfirm)} - ).add(target_queue, transactional=True) - except (taskqueue.Error, apiproxy_errors.Error): - logging.exception('Could not insert task to confirm ' - 'topic = %s, callback = %s', - self.topic, self.callback) - if i == (RETRIES - 1): - raise - else: - return - - def confirm_failed(self, - next_state, - verify_token, - auto_reconfirm=False, - secret=None, - max_failures=MAX_SUBSCRIPTION_CONFIRM_FAILURES, - retry_period=SUBSCRIPTION_RETRY_PERIOD, - now=datetime.datetime.utcnow): - """Reports that an asynchronous confirmation request has failed. - - This will delete this entity if the maximum number of failures has been - exceeded. - - Args: - next_state: The next state this subscription should be in. - verify_token: The verify_token to use when confirming this request. - auto_reconfirm: True if this task is being run by the auto-reconfirmation - offline process; False if this is a user-requested task. - secret: The new secret to use for this subscription after successful - confirmation. - max_failures: Maximum failures to allow before giving up. - retry_period: Initial period for doing exponential (base-2) backoff. - now: Returns the current time as a UTC datetime. - - Returns: - True if this Subscription confirmation should be retried again. Returns - False if we should give up and never try again. 
- """ - def txn(): - if self.confirm_failures >= max_failures: - logging.debug('Max subscription failures exceeded, giving up.') - return False - else: - retry_delay = retry_period * (2 ** self.confirm_failures) - self.eta = now() + datetime.timedelta(seconds=retry_delay) - self.confirm_failures += 1 - self.put() - self.enqueue_task(next_state, - verify_token, - auto_reconfirm=auto_reconfirm, - secret=secret) - return True - return db.run_in_transaction(txn) - - -class FeedToFetch(db.Expando): - """A feed that has new data that needs to be pulled. - - The key name of this entity is a get_hash_key_name() hash of the topic URL, so - multiple inserts will only ever write a single entity. - """ - - topic = db.TextProperty(required=True) - eta = db.DateTimeProperty(auto_now_add=True, indexed=False) - fetching_failures = db.IntegerProperty(default=0, indexed=False) - totally_failed = db.BooleanProperty(default=False, indexed=False) - source_keys = db.StringListProperty(indexed=False) - source_values = db.StringListProperty(indexed=False) - work_index = db.IntegerProperty() - - # TODO(bslatkin): Add fetching failure reason (urlfetch, parsing, etc) and - # surface it on the topic details page. - - FORK_JOIN_QUEUE = None - - @classmethod - def get_by_topic(cls, topic): - """Retrives a FeedToFetch by the topic URL. - - Args: - topic: The URL for the feed. - - Returns: - The FeedToFetch or None if it does not exist. - """ - return cls.get_by_key_name(get_hash_key_name(topic)) - - @classmethod - def insert(cls, topic_list, source_dict=None, memory_only=True): - """Inserts a set of FeedToFetch entities for a set of topics. - - Overwrites any existing entities that are already there. - - Args: - topic_list: List of the topic URLs of feeds that need to be fetched. - source_dict: Dictionary of sources for the feed. Defaults to an empty - dictionary. - memory_only: Only save FeedToFetch records to memory, not to disk. - - Returns: - The list of FeedToFetch records that was created. - """ - if not topic_list: - return - - if source_dict: - source_keys, source_values = zip(*source_dict.items()) # Yay Python! - else: - source_keys, source_values = [], [] - - if os.environ.get('HTTP_X_APPENGINE_QUEUENAME') == POLLING_QUEUE: - cls.FORK_JOIN_QUEUE.queue_name = POLLING_QUEUE - else: - cls.FORK_JOIN_QUEUE.queue_name = FEED_QUEUE - - if memory_only: - work_index = cls.FORK_JOIN_QUEUE.next_index() - else: - work_index = None - try: - feed_list = [ - cls(key=db.Key.from_path(cls.kind(), get_hash_key_name(topic)), - topic=topic, - source_keys=list(source_keys), - source_values=list(source_values), - work_index=work_index) - for topic in set(topic_list)] - if memory_only: - cls.FORK_JOIN_QUEUE.put(work_index, feed_list) - else: - # TODO(bslatkin): Insert fetching tasks here to fix the polling - # mode for this codebase. - db.put(feed_list) - finally: - if memory_only: - cls.FORK_JOIN_QUEUE.add(work_index) - - return feed_list - - def fetch_failed(self, - max_failures=MAX_FEED_PULL_FAILURES, - retry_period=FEED_PULL_RETRY_PERIOD, - now=datetime.datetime.utcnow): - """Reports that feed fetching failed. - - This will mark this feed as failing to fetch. This feed will not be - refetched until insert() is called again. - - Args: - max_failures: Maximum failures to allow before giving up. - retry_period: Initial period for doing exponential (base-2) backoff. - now: Returns the current time as a UTC datetime. 
- """ - orig_failures = self.fetching_failures - def txn(): - if self.fetching_failures >= max_failures: - logging.debug('Max fetching failures exceeded, giving up.') - self.totally_failed = True - else: - retry_delay = retry_period * (2 ** orig_failures) - logging.debug('Fetching failed. Will retry in %s seconds', - retry_delay) - self.eta = now() + datetime.timedelta(seconds=retry_delay) - self.fetching_failures = orig_failures + 1 - self._enqueue_retry_task() - self.put() - try: - db.run_in_transaction_custom_retries(2, txn) - except: - logging.exception('Could not mark feed fetching as a failure: topic=%r', - self.topic) - - def done(self): - """The feed fetch has completed successfully. - - This will delete this FeedToFetch entity iff the ETA has not changed, - meaning a subsequent publish event did not happen for this topic URL. If - the ETA has changed, then we can safely assume there is a pending Task to - take care of this FeedToFetch and we should leave the entry. - - Returns: - True if the entity was deleted, False otherwise. In the case the - FeedToFetch record never made it into the Datastore (because it only - ever lived in the in-memory cache), this function will return False. - """ - def txn(): - other = db.get(self.key()) - if other and other.eta == self.eta: - other.delete() - return True - else: - return False - return db.run_in_transaction(txn) - - def _enqueue_retry_task(self): - """Enqueues a task to retry fetching this feed.""" - RETRIES = 3 - - if os.environ.get('HTTP_X_APPENGINE_QUEUENAME') == POLLING_QUEUE: - queue_name = POLLING_QUEUE - else: - queue_name = FEED_RETRIES_QUEUE - - for i in xrange(RETRIES): - try: - taskqueue.Task( - url='/work/pull_feeds', - eta=self.eta, - params={'topic': self.topic}).add(queue_name, transactional=True) - except (taskqueue.Error, apiproxy_errors.Error): - if i == (RETRIES - 1): - raise - else: - return - - -FeedToFetch.FORK_JOIN_QUEUE = fork_join_queue.MemcacheForkJoinQueue( - FeedToFetch, - FeedToFetch.work_index, - '/work/pull_feeds', - FEED_QUEUE, - batch_size=15, - batch_period_ms=500, - lock_timeout_ms=10000, - sync_timeout_ms=250, - stall_timeout_ms=30000, - acquire_timeout_ms=10, - acquire_attempts=50, - shard_count=1, - expiration_seconds=600) # Give up on fetches after 10 minutes. - - -class FeedRecord(db.Model): - """Represents record of the feed from when it has been polled. - - This contains everything in a feed except for the entry data. That means any - footers, top-level XML elements, namespace declarations, etc, will be - captured in this entity. - - The key name of this entity is a get_hash_key_name() of the topic URL. - """ - - topic = db.TextProperty(required=True) - header_footer = db.TextProperty() - last_updated = db.DateTimeProperty(auto_now=True, indexed=False) - format = db.TextProperty() # 'atom', 'rss', or 'arbitrary' - - # Content-related headers served by the feed's host. - content_type = db.TextProperty() - last_modified = db.TextProperty() - etag = db.TextProperty() - - @staticmethod - def create_key_name(topic): - """Creates a key name for a FeedRecord for a topic. - - Args: - topic: The topic URL for the FeedRecord. - - Returns: - String containing the key name. - """ - return get_hash_key_name(topic) - - @classmethod - def get_or_create_all(cls, topic_list): - """Retrieves and/or creates FeedRecord entities for the supplied topics. - - Args: - topic_list: List of topics to retrieve. 
- - Returns: - The list of FeedRecords corresponding to the input topic list in the - same order they were supplied. - """ - key_list = [db.Key.from_path(cls.kind(), cls.create_key_name(t)) - for t in topic_list] - found_list = db.get(key_list) - results = [] - for topic, key, found in zip(topic_list, key_list, found_list): - if found: - results.append(found) - else: - results.append(cls(key=key, topic=topic)) - return results - - @classmethod - def get_or_create(cls, topic): - """Retrieves a FeedRecord by its topic or creates it if non-existent. - - Args: - topic: The topic URL to retrieve the FeedRecord for. - - Returns: - The FeedRecord found for this topic or a new one if it did not already - exist. - """ - return cls.get_or_insert(FeedRecord.create_key_name(topic), topic=topic) - - def update(self, headers, header_footer=None, format=None): - """Updates the polling record of this feed. - - This method will *not* insert this instance into the Datastore. - - Args: - headers: Dictionary of response headers from the feed that should be used - to determine how to poll the feed in the future. - header_footer: Contents of the feed's XML document minus the entry data. - if not supplied, the old value will remain. Only saved for feeds. - format: The last parsing format that worked correctly for this feed. - Should be 'rss', 'atom', or 'arbitrary'. - """ - try: - self.content_type = headers.get('Content-Type', '').lower() - except UnicodeDecodeError: - logging.exception('Content-Type header had bad encoding') - - try: - self.last_modified = headers.get('Last-Modified') - except UnicodeDecodeError: - logging.exception('Last-Modified header had bad encoding') - - try: - self.etag = headers.get('ETag') - except UnicodeDecodeError: - logging.exception('ETag header had bad encoding') - - if format is not None: - self.format = format - if header_footer is not None and self.format != ARBITRARY: - self.header_footer = header_footer - - def get_request_headers(self, subscriber_count): - """Returns the request headers that should be used to pull this feed. - - Args: - subscriber_count: The number of subscribers this feed has. - - Returns: - Dictionary of request header values. - """ - headers = { - 'Cache-Control': 'no-cache no-store max-age=1', - 'Connection': 'cache-control', - 'Accept': '*/*', - } - if self.last_modified: - headers['If-Modified-Since'] = self.last_modified - if self.etag: - headers['If-None-Match'] = self.etag - if subscriber_count: - headers['User-Agent'] = ( - 'Public Hub (+http://pubsubhubbub.appspot.com; %d subscribers)' % - subscriber_count) - return headers - - -class FeedEntryRecord(db.Expando): - """Represents a feed entry that has been seen. - - The key name of this entity is a get_hash_key_name() hash of the entry_id. - """ - entry_content_hash = db.StringProperty(indexed=False) - update_time = db.DateTimeProperty(auto_now=True, indexed=False) - - @property - def id_hash(self): - """Returns the sha1 hash of the entry ID.""" - return self.key().name()[len('hash_'):] - - @classmethod - def create_key(cls, topic, entry_id): - """Creates a new Key for a FeedEntryRecord entity. - - Args: - topic: The topic URL to retrieve entries for. - entry_id: String containing the entry_id. - - Returns: - Key instance for this FeedEntryRecord. 
- """ - return db.Key.from_path( - FeedRecord.kind(), - FeedRecord.create_key_name(topic), - cls.kind(), - get_hash_key_name(entry_id)) - - @classmethod - def get_entries_for_topic(cls, topic, entry_id_list): - """Gets multiple FeedEntryRecord entities for a topic by their entry_ids. - - Args: - topic: The topic URL to retrieve entries for. - entry_id_list: Sequence of entry_ids to retrieve. - - Returns: - List of FeedEntryRecords that were found, if any. - """ - results = cls.get([cls.create_key(topic, entry_id) - for entry_id in entry_id_list]) - # Filter out those pesky Nones. - return [r for r in results if r] - - @classmethod - def create_entry_for_topic(cls, topic, entry_id, content_hash): - """Creates multiple FeedEntryRecords entities for a topic. - - Does not actually insert the entities into the Datastore. This is left to - the caller so they can do it as part of a larger batch put(). - - Args: - topic: The topic URL to insert entities for. - entry_id: String containing the ID of the entry. - content_hash: Sha1 hash of the entry's entire XML content. For example, - with Atom this would apply to everything from to with - the surrounding tags included. With RSS it would be everything from - to . - - Returns: - A new FeedEntryRecord that should be inserted into the Datastore. - """ - key = cls.create_key(topic, entry_id) - return cls(key=key, entry_content_hash=content_hash) - - -class EventToDeliver(db.Expando): - """Represents a publishing event to deliver to subscribers. - - This model is meant to be used together with Subscription entities. When a - feed has new published data and needs to be pushed to subscribers, one of - these entities will be inserted. The background worker should iterate - through all Subscription entities for this topic, sending them the event - payload. The update() method should be used to track the progress of the - background worker as well as any Subscription entities that failed delivery. - - The key_name for each of these entities is unique. It is up to the event - injection side of the system to de-dupe events to deliver. For example, when - a publish event comes in, that publish request should be de-duped immediately. - Later, when the feed puller comes through to grab feed diffs, it should insert - a single event to deliver, collapsing any overlapping publish events during - the delay from publish time to feed pulling time. - """ - - DELIVERY_MODES = ('normal', 'retry') - NORMAL = 'normal' - RETRY = 'retry' - - topic = db.TextProperty(required=True) - topic_hash = db.StringProperty(required=True) - last_callback = db.TextProperty(default='') # For paging Subscriptions - failed_callbacks = db.ListProperty(db.Key) # Refs to Subscription entities - delivery_mode = db.StringProperty(default=NORMAL, choices=DELIVERY_MODES, - indexed=False) - retry_attempts = db.IntegerProperty(default=0, indexed=False) - last_modified = db.DateTimeProperty(required=True, indexed=False) - totally_failed = db.BooleanProperty(default=False, indexed=False) - content_type = db.TextProperty(default='') - max_failures = db.IntegerProperty(indexed=False) - - @classmethod - def create_event_for_topic(cls, - topic, - format, - content_type, - header_footer, - entry_payloads, - now=datetime.datetime.utcnow, - set_parent=True, - max_failures=None): - """Creates an event to deliver for a topic and set of published entries. - - Args: - topic: The topic that had the event. - format: Format of the feed, 'atom', 'rss', or 'arbitrary'. 
content_type: The original content type of the feed, fetched from the - server, if any. May be empty. - header_footer: The header and footer of the published feed into which - the entry list will be spliced. For arbitrary content this is the - full body of the resource. - entry_payloads: List of strings containing entry payloads (i.e., all - XML data for each entry, including surrounding tags) in order of newest - to oldest. - now: Returns the current time as a UTC datetime. Used in tests. - set_parent: Set the parent to the FeedRecord for the given topic. This is - necessary for the parse_feed flow's transaction. Default is True. Set - to False if this EventToDeliver will be written outside of the - FeedRecord transaction. - max_failures: Maximum number of failures to allow for this event. When - None (the default) it will use the MAX_DELIVERY_FAILURES constant. - - Returns: - A new EventToDeliver instance that has not been stored. - """ - if format in (ATOM, RSS): - # This is feed XML. - close_index = header_footer.rfind('</') - assert close_index != -1, 'Could not find "</" in feed envelope' - end_tag = header_footer[close_index:] - if 'rss' in end_tag: - # RSS feeds close with </channel></rss>, so we need to traverse one - # level higher. - close_index = header_footer[:close_index].rfind('</') - assert close_index != -1, 'Could not find "</channel>" in feed envelope' - end_tag = header_footer[close_index:] - content_type = 'application/rss+xml' - elif 'feed' in end_tag: - content_type = 'application/atom+xml' - elif 'rdf' in end_tag: - content_type = 'application/rdf+xml' - - payload_list = ['<?xml version="1.0" encoding="utf-8"?>', - header_footer[:close_index]] - payload_list.extend(entry_payloads) - payload_list.append(header_footer[close_index:]) - payload = '\n'.join(payload_list) - elif format == ARBITRARY: - # This is an arbitrary payload. - payload = header_footer - - if set_parent: - parent = db.Key.from_path( - FeedRecord.kind(), FeedRecord.create_key_name(topic)) - else: - parent = None - - if isinstance(payload, unicode): - payload = payload.encode('utf-8') - - return cls( - parent=parent, - topic=topic, - topic_hash=sha1_hash(topic), - payload=db.Blob(payload), - last_modified=now(), - content_type=content_type, - max_failures=max_failures) - - def get_next_subscribers(self, chunk_size=None): - """Retrieve the next set of subscribers to attempt delivery for this event. - - Args: - chunk_size: How many subscribers to retrieve at a time while delivering - the event. Defaults to EVENT_SUBSCRIBER_CHUNK_SIZE. - - Returns: - Tuple (more_subscribers, subscription_list) where: - more_subscribers: True if there are more subscribers to deliver to - after the returned 'subscription_list' has been contacted; this value - should be passed to update() after the delivery is attempted. - subscription_list: List of Subscription entities to attempt to contact - for this event. - """ - if chunk_size is None: - chunk_size = EVENT_SUBSCRIBER_CHUNK_SIZE - - if self.delivery_mode == EventToDeliver.NORMAL: - all_subscribers = Subscription.get_subscribers( - self.topic, chunk_size + 1, starting_at_callback=self.last_callback) - if all_subscribers: - self.last_callback = all_subscribers[-1].callback - else: - self.last_callback = '' - - more_subscribers = len(all_subscribers) > chunk_size - subscription_list = all_subscribers[:chunk_size] - elif self.delivery_mode == EventToDeliver.RETRY: - next_chunk = self.failed_callbacks[:chunk_size] - more_subscribers = len(self.failed_callbacks) > len(next_chunk) - - if self.last_callback: - # If the final index is present in the next chunk, that means we've - # wrapped back around to the beginning and will need to do more - # exponential backoff.
This also requires updating the last_callback - # in the update() method, since we do not know which callbacks from - # the next chunk will end up failing. - final_subscription_key = datastore_types.Key.from_path( - Subscription.__name__, - Subscription.create_key_name(self.last_callback, self.topic)) - try: - final_index = next_chunk.index(final_subscription_key) - except ValueError: - pass - else: - more_subscribers = False - next_chunk = next_chunk[:final_index] - - subscription_list = [x for x in db.get(next_chunk) if x is not None] - if subscription_list and not self.last_callback: - # This must be the first time through the current iteration where we do - # not yet know a sentinal value in the list that represents the starting - # point. - self.last_callback = subscription_list[0].callback - - # If the failed callbacks fail again, they will be added back to the - # end of the list. - self.failed_callbacks = self.failed_callbacks[len(next_chunk):] - - return more_subscribers, subscription_list - - def update(self, - more_callbacks, - more_failed_callbacks, - now=datetime.datetime.utcnow, - max_failures=MAX_DELIVERY_FAILURES, - retry_period=DELIVERY_RETRY_PERIOD): - """Updates an event with work progress or deletes it if it's done. - - Reschedules another Task to run to handle this event delivery if needed. - - Args: - more_callbacks: True if there are more callbacks to deliver, False if - there are no more subscribers to deliver for this feed. - more_failed_callbacks: Iterable of Subscription entities for this event - that failed to deliver. - max_failures: Maximum failures to allow before giving up. - retry_period: Initial period for doing exponential (base-2) backoff. - now: Returns the current time as a UTC datetime. - """ - self.last_modified = now() - - # Ensure the list of failed callbacks is in sorted order so we keep track - # of the last callback seen in alphabetical order of callback URL hashes. 
- more_failed_callbacks = sorted(more_failed_callbacks, - key=lambda x: x.callback_hash) - - self.failed_callbacks.extend(e.key() for e in more_failed_callbacks) - if not more_callbacks and not self.failed_callbacks: - logging.info('EventToDeliver complete: topic = %s, delivery_mode = %s', - self.topic, self.delivery_mode) - self.delete() - return - elif not more_callbacks: - self.last_callback = '' - self.retry_attempts += 1 - if self.max_failures is not None: - max_failures = self.max_failures - if self.retry_attempts > max_failures: - self.totally_failed = True - else: - retry_delay = retry_period * (2 ** (self.retry_attempts-1)) - try: - self.last_modified += datetime.timedelta(seconds=retry_delay) - except OverflowError: - pass - - if self.delivery_mode == EventToDeliver.NORMAL: - logging.debug('Normal delivery done; %d broken callbacks remain', - len(self.failed_callbacks)) - self.delivery_mode = EventToDeliver.RETRY - else: - logging.debug('End of attempt %d; topic = %s, subscribers = %d, ' - 'waiting until %s or totally_failed = %s', - self.retry_attempts, self.topic, - len(self.failed_callbacks), self.last_modified, - self.totally_failed) - - def txn(): - self.put() - if not self.totally_failed: - self.enqueue() - db.run_in_transaction(txn) - - def enqueue(self): - """Enqueues a Task that will execute this EventToDeliver.""" - RETRIES = 3 - if self.delivery_mode == EventToDeliver.RETRY: - target_queue = EVENT_RETRIES_QUEUE - elif os.environ.get('HTTP_X_APPENGINE_QUEUENAME') == POLLING_QUEUE: - target_queue = POLLING_QUEUE - else: - target_queue = EVENT_QUEUE - for i in xrange(RETRIES): - try: - taskqueue.Task( - url='/work/push_events', - eta=self.last_modified, - params={'event_key': self.key()} - ).add(target_queue, transactional=True) - except (taskqueue.Error, apiproxy_errors.Error): - logging.exception('Could not insert task to deliver ' - 'events for topic = %s', self.topic) - if i == (RETRIES - 1): - raise - else: - return - - -class KnownFeed(db.Model): - """Represents a feed that we know exists. - - This entity will be overwritten anytime someone subscribes to this feed. The - benefit is we have a single entity per known feed, allowing us to quickly - iterate through all of them. This may have issues if the subscription rate - for a single feed is over one per second. - """ - - topic = db.TextProperty(required=True) - feed_id = db.TextProperty() - update_time = db.DateTimeProperty(auto_now=True) - - @classmethod - def create(cls, topic): - """Creates a new KnownFeed. - - Args: - topic: The feed's topic URL. - - Returns: - The KnownFeed instance that hasn't been added to the Datastore. - """ - return cls(key_name=get_hash_key_name(topic), topic=topic) - - @classmethod - def record(cls, topic): - """Enqueues a task to create a new KnownFeed and initiate feed ID discovery. - - Args: - topic: The feed's topic URL. - """ - RETRIES = 3 - target_queue = MAPPINGS_QUEUE - for i in xrange(RETRIES): - try: - taskqueue.Task( - url='/work/record_feeds', - params={'topic': topic} - ).add(target_queue) - except (taskqueue.Error, apiproxy_errors.Error): - logging.exception('Could not insert task to do feed ID ' - 'discovery for topic = %s', topic) - if i == (RETRIES - 1): - raise - else: - return - - @classmethod - def create_key(cls, topic): - """Creates a key for a KnownFeed. - - Args: - topic: The feed's topic URL. - - Returns: - Key instance for this feed. 
- """ - return datastore_types.Key.from_path(cls.kind(), get_hash_key_name(topic)) - - @classmethod - def check_exists(cls, topics): - """Checks if the supplied topic URLs are known feeds. - - Args: - topics: Iterable of topic URLs. - - Returns: - List of topic URLs with KnownFeed entries. If none are known, this list - will be empty. The returned order is arbitrary. - """ - result = [] - for known_feed in cls.get([cls.create_key(url) for url in set(topics)]): - if known_feed is not None: - result.append(known_feed.topic) - return result - - -class KnownFeedStats(db.Model): - """Represents stats about a feed we know that exists. - - Parent is the KnownFeed entity for a given topic URL. - """ - - subscriber_count = db.IntegerProperty() - update_time = db.DateTimeProperty(auto_now=True) - - @classmethod - def create_key(cls, topic_url=None, topic_hash=None): - """Creates a key for a KnownFeedStats instance. - - Args: - topic_url: The topic URL to create the key for. - topic_hash: The hash of the topic URL to create the key for. May only - be supplied if topic_url is None. - - Returns: - db.Key of the KnownFeedStats instance. - """ - if topic_url and topic_hash: - raise TypeError('Must specify topic_url or topic_hash.') - if topic_url: - topic_hash = sha1_hash(topic_url) - - return db.Key.from_path(KnownFeed.kind(), topic_hash, - cls.kind(), 'overall') - - @classmethod - def get_or_create_all(cls, topic_list): - """Retrieves and/or creates KnownFeedStats entities for the supplied topics. - - Args: - topic_list: List of topics to retrieve. - - Returns: - The list of KnownFeedStats corresponding to the input topic list in - the same order they were supplied. - """ - key_list = [cls.create_key(t) for t in topic_list] - found_list = db.get(key_list) - results = [] - for topic, key, found in zip(topic_list, key_list, found_list): - if found: - results.append(found) - else: - results.append(cls(key=key, subscriber_count=0)) - return results - - -class PollingMarker(db.Model): - """Keeps track of the current position in the bootstrap polling process.""" - - last_start = db.DateTimeProperty() - next_start = db.DateTimeProperty(required=True) - - @classmethod - def get(cls, now=datetime.datetime.utcnow): - """Returns the current PollingMarker, creating it if it doesn't exist. - - Args: - now: Returns the current time as a UTC datetime. - """ - key_name = 'The Mark' - the_mark = db.get(datastore_types.Key.from_path(cls.kind(), key_name)) - if the_mark is None: - next_start = now() - datetime.timedelta(seconds=60) - the_mark = PollingMarker(key_name=key_name, - next_start=next_start, - current_key=None) - return the_mark - - def should_progress(self, - period=POLLING_BOOTSTRAP_PERIOD, - now=datetime.datetime.utcnow): - """Returns True if the bootstrap polling should progress. - - May modify this PollingMarker to when the next polling should start. - - Args: - period: The poll period for bootstrapping. - now: Returns the current time as a UTC datetime. 
- """ - now_time = now() - if self.next_start < now_time: - logging.info('Polling starting afresh for start time %s', self.next_start) - self.last_start = self.next_start - self.next_start = now_time + datetime.timedelta(seconds=period) - return True - else: - return False - - -class KnownFeedIdentity(db.Model): - """Stores a set of known URL aliases for a particular feed.""" - - feed_id = db.TextProperty(required=True) - topics = db.ListProperty(db.Text) - last_update = db.DateTimeProperty() - - @classmethod - def create_key(cls, feed_id): - """Creates a key for a KnownFeedIdentity. - - Args: - feed_id: The feed's identity. For Atom this is the //feed/id element; - for RSS it is the //rss/channel/link element. If for whatever reason - the ID is missing, then the feed URL itself should be used. - - Returns: - Key instance for this feed identity. - """ - return datastore_types.Key.from_path(cls.kind(), get_hash_key_name(feed_id)) - - @classmethod - def update(cls, feed_id, topic): - """Updates a KnownFeedIdentity to have a topic URL mapping. - - Args: - feed_id: The identity of the feed to update with the mapping. - topic: The topic URL to add to the feed's list of aliases. - - Returns: - The KnownFeedIdentity that has been created or updated. - """ - def txn(): - known_feed = db.get(cls.create_key(feed_id)) - if not known_feed: - known_feed = cls(feed_id=feed_id, key_name=get_hash_key_name(feed_id)) - if topic not in known_feed.topics: - known_feed.topics.append(db.Text(topic)) - known_feed.last_update = datetime.datetime.now() - known_feed.put() - return known_feed - try: - return db.run_in_transaction(txn) - except (db.BadRequestError, apiproxy_errors.RequestTooLargeError): - logging.exception( - 'Could not update feed_id=%r; expansion is already too large', - feed_id) - - @classmethod - def remove(cls, feed_id, topic): - """Updates a KnownFeedIdentity to no longer have a topic URL mapping. - - Args: - feed_id: The identity of the feed to update with the mapping. - topic: The topic URL to remove from the feed's list of aliases. - - Returns: - The KnownFeedIdentity that has been updated or None if the mapping - did not exist previously or has now been deleted because it has no - active mappings. - """ - def txn(): - known_feed = db.get(cls.create_key(feed_id)) - if not known_feed: - return None - try: - known_feed.topics.remove(db.Text(topic)) - except ValueError: - return None - - if not known_feed.topics: - known_feed.delete() - return None - else: - known_feed.last_update = datetime.datetime.now() - known_feed.put() - return known_feed - return db.run_in_transaction(txn) - - @classmethod - def derive_additional_topics(cls, topics): - """Derives topic URL aliases from a set of topics by using feed IDs. - - If a topic URL has a KnownFeed entry but no valid feed_id or - KnownFeedIdentity record, the input topic will be echoed in the output - dictionary directly. This properly handles the case where the feed_id has - not yet been recorded for the feed. - - Args: - topics: Iterable of topic URLs. - - Returns: - Dictionary mapping input topic URLs to their full set of aliases, - including the input topic URL. - """ - topics = set(topics) - output_dict = {} - known_feeds = KnownFeed.get([KnownFeed.create_key(t) for t in topics]) - - topics = [] - feed_ids = [] - for feed in known_feeds: - if feed is None: - # In case the KnownFeed hasn't been written yet, don't deliver an event; - # we need the KnownFeed cache to make subscription checking fast. 
- continue - - fix_feed_id = feed.feed_id - if fix_feed_id is not None: - fix_feed_id = fix_feed_id.strip() - - # No expansion for feeds that have no known topic -> feed_id relation, but - # record those with KnownFeed as having a mapping from topic -> topic for - # backwards compatibility with existing production data. - if fix_feed_id: - topics.append(feed.topic) - feed_ids.append(feed.feed_id) - else: - output_dict[feed.topic] = set([feed.topic]) - - known_feed_ids = cls.get([cls.create_key(f) for f in feed_ids]) - - for known_topic, identified in zip(topics, known_feed_ids): - if identified: - topic_set = output_dict.get(known_topic) - if topic_set is None: - topic_set = set([known_topic]) - output_dict[known_topic] = topic_set - # TODO(bslatkin): Test this. - if len(identified.topics) > 25: - logging.debug('Too many expansion feeds for topic %s: %s', - known_topic, identified.topics) - else: - topic_set.update(identified.topics) - - return output_dict - -################################################################################ -# Subscription handlers and workers - -def confirm_subscription(mode, topic, callback, verify_token, - secret, lease_seconds, record_topic=True): - """Confirms a subscription request and updates a Subscription instance. - - Args: - mode: The mode of subscription confirmation ('subscribe' or 'unsubscribe'). - topic: URL of the topic being subscribed to. - callback: URL of the callback handler to confirm the subscription with. - verify_token: Opaque token passed to the callback. - secret: Shared secret used for HMACs. - lease_seconds: Number of seconds the client would like the subscription - to last before expiring. If more than max_lease_seconds, will be capped - to that value. Should be an integer number. - record_topic: When True, also cause the topic's feed ID to be recorded - if this is a new subscription. - - Returns: - True if the subscription was confirmed properly, False if the subscription - request encountered an error or any other error has hit. - """ - logging.debug('Attempting to confirm %s for topic = %r, callback = %r, ' - 'verify_token = %r, secret = %r, lease_seconds = %s', - mode, topic, callback, verify_token, secret, lease_seconds) - - parsed_url = list(urlparse.urlparse(utf8encoded(callback))) - challenge = get_random_challenge() - real_lease_seconds = min(lease_seconds, MAX_LEASE_SECONDS) - params = { - 'hub.mode': mode, - 'hub.topic': utf8encoded(topic), - 'hub.challenge': challenge, - 'hub.lease_seconds': real_lease_seconds, - } - if verify_token: - params['hub.verify_token'] = utf8encoded(verify_token) - - if parsed_url[4]: - # Preserve subscriber-supplied callback parameters. - parsed_url[4] = '%s&%s' % (parsed_url[4], urllib.urlencode(params)) - else: - parsed_url[4] = urllib.urlencode(params) - - adjusted_url = urlparse.urlunparse(parsed_url) - - try: - response = urlfetch.fetch(adjusted_url, method='get', - follow_redirects=False, - deadline=MAX_FETCH_SECONDS) - except urlfetch_errors.Error: - error_traceback = traceback.format_exc() - logging.debug('Error encountered while confirming subscription ' - 'to %s for callback %s:\n%s', - topic, callback, error_traceback) - return False - - if 200 <= response.status_code < 300 and response.content == challenge: - if mode == 'subscribe': - Subscription.insert(callback, topic, verify_token, secret, - lease_seconds=real_lease_seconds) - if record_topic: - # Enqueue a task to record the feed and do discovery for it's ID. 
- KnownFeed.record(topic) - else: - Subscription.remove(callback, topic) - logging.info('Subscription action verified, ' - 'callback = %s, topic = %s: %s', callback, topic, mode) - return True - elif mode == 'subscribe' and response.status_code == 404: - Subscription.archive(callback, topic) - logging.info('Subscribe request returned 404 for callback = %s, ' - 'topic = %s; subscription archived', callback, topic) - return True - else: - logging.debug('Could not confirm subscription; encountered ' - 'status %d with content: %s', response.status_code, - response.content) - return False - - -class SubscribeHandler(webapp.RequestHandler): - """End-user accessible handler for Subscribe and Unsubscribe events.""" - - def get(self): - self.response.out.write(template.render('subscribe_debug.html', {})) - - @dos.limit(param='hub.callback', count=10, period=1) - def post(self): - self.response.headers['Content-Type'] = 'text/plain' - - callback = self.request.get('hub.callback', '') - topic = self.request.get('hub.topic', '') - verify_type_list = [s.lower() for s in self.request.get_all('hub.verify')] - verify_token = unicode(self.request.get('hub.verify_token', '')) - secret = unicode(self.request.get('hub.secret', '')) or None - lease_seconds = ( - self.request.get('hub.lease_seconds', '') or str(DEFAULT_LEASE_SECONDS)) - mode = self.request.get('hub.mode', '').lower() - - error_message = None - if not callback or not is_valid_url(callback): - error_message = ('Invalid parameter: hub.callback; ' - 'must be valid URI with no fragment and ' - 'optional port %s' % ','.join(VALID_PORTS)) - else: - callback = normalize_iri(callback) - - if not topic or not is_valid_url(topic): - error_message = ('Invalid parameter: hub.topic; ' - 'must be valid URI with no fragment and ' - 'optional port %s' % ','.join(VALID_PORTS)) - else: - topic = normalize_iri(topic) - - enabled_types = [vt for vt in verify_type_list if vt in ('async', 'sync')] - if not enabled_types: - error_message = 'Invalid values for hub.verify: %s' % (verify_type_list,) - else: - verify_type = enabled_types[0] - - if mode not in ('subscribe', 'unsubscribe'): - error_message = 'Invalid value for hub.mode: %s' % mode - - if lease_seconds: - try: - old_lease_seconds = lease_seconds - lease_seconds = int(old_lease_seconds) - if not old_lease_seconds == str(lease_seconds): - raise ValueError - except ValueError: - error_message = ('Invalid value for hub.lease_seconds: %s' % - old_lease_seconds) - - if error_message: - logging.debug('Bad request for mode = %s, topic = %s, ' - 'callback = %s, verify_token = %s, lease_seconds = %s: %s', - mode, topic, callback, verify_token, - lease_seconds, error_message) - self.response.out.write(error_message) - return self.response.set_status(400) - - try: - # Retrieve any existing subscription for this callback. - sub = Subscription.get_by_key_name( - Subscription.create_key_name(callback, topic)) - - # Deletions for non-existant subscriptions will be ignored. - if mode == 'unsubscribe' and not sub: - return self.response.set_status(204) - - # Enqueue a background verification task, or immediately confirm. - # We prefer synchronous confirmation. 
- if verify_type == 'sync': - if hooks.execute(confirm_subscription, - mode, topic, callback, verify_token, secret, lease_seconds): - return self.response.set_status(204) - else: - self.response.out.write('Error trying to confirm subscription') - return self.response.set_status(409) - else: - if mode == 'subscribe': - Subscription.request_insert(callback, topic, verify_token, secret, - lease_seconds=lease_seconds) - else: - Subscription.request_remove(callback, topic, verify_token) - logging.debug('Queued %s request for callback = %s, ' - 'topic = %s, verify_token = "%s", lease_seconds= %s', - mode, callback, topic, verify_token, lease_seconds) - return self.response.set_status(202) - - except (apiproxy_errors.Error, db.Error, - runtime.DeadlineExceededError, taskqueue.Error), e: - logging.debug('Could not verify subscription request. %s: %s', - e.__class__.__name__, e) - self.response.headers['Retry-After'] = '120' - return self.response.set_status(503) - - -class SubscriptionConfirmHandler(webapp.RequestHandler): - """Background worker for asynchronously confirming subscriptions.""" - - @work_queue_only - def post(self): - sub_key_name = self.request.get('subscription_key_name') - next_state = self.request.get('next_state') - verify_token = self.request.get('verify_token') - secret = self.request.get('secret') or None - auto_reconfirm = self.request.get('auto_reconfirm', 'False') == 'True' - sub = Subscription.get_by_key_name(sub_key_name) - if not sub: - logging.debug('No subscriptions to confirm ' - 'for subscription_key_name = %s', sub_key_name) - return - - if next_state == Subscription.STATE_TO_DELETE: - mode = 'unsubscribe' - else: - # NOTE: If next_state wasn't specified, this is probably an old task from - # the last version of this code. Handle these tasks by assuming they - # meant subscribe, which will probably cause less damage. - mode = 'subscribe' - - if not hooks.execute(confirm_subscription, - mode, sub.topic, sub.callback, - verify_token, secret, sub.lease_seconds, - record_topic=False): - # After repeated re-confirmation failures for a subscription, assume that - # the callback is dead and archive it. End-user-initiated subscription - # requests cannot possibly follow this code path, preventing attacks - # from unsubscribing callbacks without ownership. - if (not sub.confirm_failed(next_state, verify_token, - auto_reconfirm=auto_reconfirm, - secret=secret) and - auto_reconfirm and mode == 'subscribe'): - logging.info('Auto-renewal subscribe request failed the maximum ' - 'number of times for callback = %s, topic = %s; ' - 'subscription archived', sub.callback, sub.topic) - Subscription.archive(sub.callback, sub.topic) - - -class SubscriptionReconfirmHandler(webapp.RequestHandler): - """Periodic handler that triggers reconfirmation of almost-expired subscriptions.""" - - def __init__(self, now=time.time, start_map=mapreduce.control.start_map): - """Initializer.""" - webapp.RequestHandler.__init__(self) - self.now = now - self.start_map = start_map - - @work_queue_only - def get(self): - # Use the task name to ensure only one of these tasks runs per calendar day.
- name = 'reconfirm-%s' % time.strftime('%Y-%m-%d' , time.gmtime(self.now())) - try: - taskqueue.Task( - url='/work/reconfirm_subscriptions', - name=name - ).add(POLLING_QUEUE) - except (taskqueue.TaskAlreadyExistsError, taskqueue.TombstonedTaskError): - logging.exception('Could not enqueue FIRST reconfirmation task; ' - 'must have already run today.') - - @work_queue_only - def post(self): - self.start_map( - name='Reconfirm expiring subscriptions', - handler_spec='offline_jobs.SubscriptionReconfirmMapper.run', - reader_spec='mapreduce.input_readers.DatastoreInputReader', - mapper_parameters=dict( - processing_rate=100000, - entity_kind='main.Subscription', - threshold_timestamp=int( - self.now() + SUBSCRIPTION_CHECK_BUFFER_SECONDS)), - shard_count=SUBSCRIPTION_RECONFIRM_SHARD_COUNT, - queue_name=POLLING_QUEUE, - mapreduce_parameters=dict( - done_callback='/work/cleanup_mapper', - done_callback_queue=POLLING_QUEUE)) - - -# TODO(bslatkin): Move this to an offline job. -class SubscriptionCleanupHandler(webapp.RequestHandler): - """Background worker for cleaning up deleted Subscription instances.""" - - @work_queue_only - def get(self): - subscriptions = (Subscription.all() - .filter('subscription_state =', Subscription.STATE_TO_DELETE) - .fetch(SUBSCRIPTION_CLEANUP_CHUNK_SIZE)) - if subscriptions: - logging.info('Cleaning up %d subscriptions', len(subscriptions)) - try: - db.delete(subscriptions) - except (db.Error, apiproxy_errors.Error, runtime.DeadlineExceededError): - logging.exception('Could not clean-up Subscription instances') - - -class CleanupMapperHandler(webapp.RequestHandler): - """Cleans up all data from a Mapper job run.""" - - @work_queue_only - def post(self): - mapreduce_id = self.request.headers.get('mapreduce-id') - # TODO: Use Mapper Cleanup API once available. - db.delete(mapreduce.model.MapreduceControl.get_key_by_job_id(mapreduce_id)) - shards = mapreduce.model.ShardState.find_by_mapreduce_id(mapreduce_id) - db.delete(shards) - db.delete(mapreduce.model.MapreduceState.get_key_by_job_id(mapreduce_id)) - -################################################################################ -# Publishing handlers - -def preprocess_urls(urls): - """Preprocesses URLs doing any necessary canonicalization. - - Args: - urls: Set of URLs. - - Returns: - Iterable of URLs that have been modified. - """ - return urls - - -def derive_sources(request_handler, urls): - """Derives feed sources for a publish event. - - Args: - request_handler: webapp.RequestHandler instance for the publish event. - urls: Set of URLs that were published. - """ - return {} - - -class PublishHandlerBase(webapp.RequestHandler): - """Base-class for publish ping receiving handlers.""" - - def receive_publish(self, urls, success_code, param_name): - """Receives a publishing event for a set of topic URLs. - - Serves 400 errors on invalid input, 503 retries on insertion failures. - - Args: - urls: Iterable of URLs that have been published. - success_code: HTTP status code to return on success. - param_name: Name of the parameter that will be validated. - - Returns: - The error message, or an empty string if there are no errors. - """ - urls = hooks.execute(preprocess_urls, urls) - for url in urls: - if not is_valid_url(url): - self.response.set_status(400) - return '%s invalid: %s' % (param_name, url) - - # Normalize all URLs. This assumes our web framework has already decoded - # any POST-body encoded URLs that were passed in to the 'urls' parameter. 
- urls = set(normalize_iri(u) for u in urls) - - # Only insert FeedToFetch entities for feeds that are known to have - # subscribers. The rest will be ignored. - topic_map = KnownFeedIdentity.derive_additional_topics(urls) - if not topic_map: - urls = set() - else: - # Expand topic URLs by their feed ID to properly handle any aliases - # this feed may have active subscriptions for. - urls = set() - for topic, value in topic_map.iteritems(): - urls.update(value) - logging.info('Topics with known subscribers: %s', urls) - - source_dict = hooks.execute(derive_sources, self, urls) - - # Record all FeedToFetch requests here. The background Pull worker will - # double-check if there are any subscribers that need event delivery and - # will skip any unused feeds. - try: - FeedToFetch.insert(urls, source_dict) - except (taskqueue.Error, apiproxy_errors.Error, db.Error, - runtime.DeadlineExceededError, fork_join_queue.Error): - logging.exception('Failed to insert FeedToFetch records') - self.response.headers['Retry-After'] = '120' - self.response.set_status(503) - return 'Transient error; please try again later' - else: - self.response.set_status(success_code) - return '' - - -class PublishHandler(PublishHandlerBase): - """End-user accessible handler for the Publish event.""" - - def get(self): - self.response.out.write(template.render('publish_debug.html', {})) - - @dos.limit(count=100, period=1) - def post(self): - self.response.headers['Content-Type'] = 'text/plain' - - mode = self.request.get('hub.mode') - if mode.lower() != 'publish': - self.response.set_status(400) - self.response.out.write('hub.mode MUST be "publish"') - return - - urls = set(self.request.get_all('hub.url')) - if not urls: - self.response.set_status(400) - self.response.out.write('MUST supply at least one hub.url parameter') - return - - logging.debug('Publish event for %d URLs (showing first 25): %s', - len(urls), list(urls)[:25]) - error = self.receive_publish(urls, 204, 'hub.url') - if error: - self.response.out.write(error) - -################################################################################ -# Pulling - -def find_feed_updates(topic, format, feed_content, - filter_feed=feed_diff.filter): - """Determines the updated entries for a feed and returns their records. - - Args: - topic: The topic URL of the feed. - format: The string 'atom', 'rss', or 'arbitrary'. - feed_content: The content of the feed, which may include unicode characters. - For arbitrary content, this is just the content itself. - filter_feed: Used for dependency injection. - - Returns: - Tuple (header_footer, entry_list, entry_payloads) where: - header_footer: The header/footer data of the feed. - entry_list: List of FeedEntryRecord instances, if any, that represent - the changes that have occurred on the feed. These records do *not* - include the payload data for the entry. - entry_payloads: List of strings containing entry payloads (i.e., the XML - data for the Atom <entry> or RSS <item>). - - Raises: - xml.sax.SAXException if there is a parse error. - feed_diff.Error if the feed could not be diffed for any other reason. - """ - if format == ARBITRARY: - return (feed_content, [], []) - - header_footer, entries_map = filter_feed(feed_content, format) - - # Find the new entries we've never seen before, and any entries that we - # knew about that have been updated.
- STEP = MAX_FEED_ENTRY_RECORD_LOOKUPS - all_keys = entries_map.keys() - existing_entries = [] - for position in xrange(0, len(all_keys), STEP): - key_set = all_keys[position:position+STEP] - existing_entries.extend(FeedEntryRecord.get_entries_for_topic( - topic, key_set)) - - existing_dict = dict((e.id_hash, e.entry_content_hash) - for e in existing_entries if e) - logging.debug('Retrieved %d feed entries, %d of which have been seen before', - len(entries_map), len(existing_dict)) - - entities_to_save = [] - entry_payloads = [] - for entry_id, new_content in entries_map.iteritems(): - new_content_hash = sha1_hash(new_content) - new_entry_id_hash = sha1_hash(entry_id) - # Mark the entry as new if the sha1 hash is different. - try: - old_content_hash = existing_dict[new_entry_id_hash] - if old_content_hash == new_content_hash: - continue - except KeyError: - pass - - entry_payloads.append(new_content) - entities_to_save.append(FeedEntryRecord.create_entry_for_topic( - topic, entry_id, new_content_hash)) - - return header_footer, entities_to_save, entry_payloads - - -def pull_feed(feed_to_fetch, fetch_url, headers): - """Pulls a feed. - - Args: - feed_to_fetch: FeedToFetch instance to pull. - fetch_url: The URL to fetch. Should be the same as the topic stored on - the FeedToFetch instance, but may be different due to redirects. - headers: Dictionary of headers to use for doing the feed fetch. - - Returns: - Tuple (status_code, response_headers, content) where: - status_code: The response status code. - response_headers: Caseless dictionary of response headers. - content: The body of the response. - - Raises: - apiproxy_errors.Error if any RPC errors are encountered. urlfetch.Error if - there are any fetching API errors. - """ - response = urlfetch.fetch( - fetch_url, - headers=headers, - follow_redirects=False, - deadline=MAX_FETCH_SECONDS) - return response.status_code, response.headers, response.content - - -def pull_feed_async(feed_to_fetch, fetch_url, headers, async_proxy, callback): - """Pulls a feed asynchronously. - - The callback's prototype is: - Args: - status_code: The response status code. - response_headers: Caseless dictionary of response headers. - content: The body of the response. - exception: apiproxy_errors.Error if any RPC errors are encountered. - urlfetch.Error if there are any fetching API errors. None if there - were no errors. - - Args: - feed_to_fetch: FeedToFetch instance to pull. - fetch_url: The URL to fetch. Should be the same as the topic stored on - the FeedToFetch instance, but may be different due to redirects. - headers: Dictionary of headers to use for doing the feed fetch. - async_proxy: AsyncAPIProxy to use for fetching and waiting. - callback: Callback function to call after a response has been received. - """ - def wrapper(response, exception): - callback(getattr(response, 'status_code', None), - getattr(response, 'headers', None), - getattr(response, 'content', None), - exception) - urlfetch_async.fetch(fetch_url, - headers=headers, - follow_redirects=False, - async_proxy=async_proxy, - callback=wrapper, - deadline=MAX_FETCH_SECONDS) - - -def inform_event(event_to_deliver, alternate_topics): - """Helper hook informs the Hub of new notifications. - - This can be used to take an action on every notification processed. - - Args: - event_to_deliver: The new event to deliver, already submitted. - alternate_topics: A list of alternative Feed topics that this event - should be delievered for in addition to the 'event_to_deliver's topic. 
- """ - pass - - -def parse_feed(feed_record, - headers, - content, - true_on_bad_feed=True, - alternate_topics=None): - """Parses a feed's content, determines changes, enqueues notifications. - - This function will only enqueue new notifications if the feed has changed. - - Args: - feed_record: The FeedRecord object of the topic that has new content. - headers: Dictionary of response headers found during feed fetching (may - be empty). - content: The feed document possibly containing new entries. - true_on_bad_feed: When True, return True when the feed's format is - beyond hope and there's no chance of parsing it correctly. When - False the error will be propagated up to the caller with a False - response to this function. - alternate_topics: A list of alternative Feed topics that this parsed event - should be delievered for in addition to the main FeedRecord's topic. - - Returns: - True if successfully parsed the feed content; False on error. - """ - # The content-type header is extremely unreliable for determining the feed's - # content-type. Using a regex search for " MAX_NEW_FEED_ENTRY_RECORDS: - logging.warning('Found more entities than we can process for topic %r; ' - 'splitting', feed_record.topic) - entities_to_save = entities_to_save[:MAX_NEW_FEED_ENTRY_RECORDS] - entry_payloads = entry_payloads[:MAX_NEW_FEED_ENTRY_RECORDS] - parse_successful = False - else: - feed_record.update(headers, header_footer, format) - parse_successful = True - - if format != ARBITRARY and not entities_to_save: - logging.debug('No new entries found') - event_to_deliver = None - else: - logging.info( - 'Saving %d new/updated entries for content ' - 'format=%r, content_type=%r, header_footer_bytes=%d', - len(entities_to_save), format, feed_record.content_type, - len(header_footer)) - event_to_deliver = EventToDeliver.create_event_for_topic( - feed_record.topic, format, feed_record.content_type, - header_footer, entry_payloads) - entities_to_save.insert(0, event_to_deliver) - - entities_to_save.insert(0, feed_record) - - # Segment all entities into smaller groups to reduce the chance of memory - # errors or too large of requests when the entities are put in a single - # call to the Datastore API. - all_entities = [] - STEP = MAX_FEED_RECORD_SAVES - for position in xrange(0, len(entities_to_save), STEP): - next_entities = entities_to_save[position:position+STEP] - all_entities.append(next_entities) - - # Doing this put in a transaction ensures that we have written all - # FeedEntryRecords, updated the FeedRecord, and written the EventToDeliver - # at the same time. Otherwise, if any of these fails individually we could - # drop messages on the floor. If this transaction fails, the whole fetch - # will be redone and find the same entries again (thus it is idempotent). - def txn(): - while all_entities: - group = all_entities.pop(0) - try: - db.put(group) - except (db.BadRequestError, apiproxy_errors.RequestTooLargeError): - logging.exception('Could not insert %d entities for topic %r; ' - 'splitting in half', len(group), feed_record.topic) - # Insert the first half at the beginning since we need to make sure that - # the EventToDeliver gets inserted first. 
- all_entities.insert(0, group[len(group)/2:]) - all_entities.insert(0, group[:len(group)/2]) - raise - if event_to_deliver: - event_to_deliver.enqueue() - - try: - for i in xrange(PUT_SPLITTING_ATTEMPTS): - try: - db.run_in_transaction(txn) - break - except (db.BadRequestError, apiproxy_errors.RequestTooLargeError): - pass - else: - logging.critical('Insertion of event to delivery *still* failing due to ' - 'request size; dropping event for %s', feed_record.topic) - return true_on_bad_feed - except (db.TransactionFailedError, db.Timeout): - # Datastore failure will cause a refetch and reparse of the feed as if - # the fetch attempt failed, instead of relying on the task queue to do - # this retry for us. This ensures the queue throughputs stay consistent. - logging.exception('Could not submit transaction for topic %r', - feed_record.topic) - return False - - # Inform any hooks that there will is a new event to deliver that has - # been recorded and delivery has begun. - hooks.execute(inform_event, event_to_deliver, alternate_topics) - - return parse_successful - - -class PullFeedHandler(webapp.RequestHandler): - """Background worker for pulling feeds.""" - - def _handle_fetches(self, feed_list): - """Handles a set of FeedToFetch records that need to be fetched.""" - ready_feed_list = [] - scorer_results = FETCH_SCORER.filter([f.topic for f in feed_list]) - for to_fetch, (allow, percent) in zip(feed_list, scorer_results): - if not allow: - logging.warning('Scoring prevented fetch of %r ' - 'with failure rate %.2f%%', - to_fetch.topic, 100 * percent) - to_fetch.done() - elif not Subscription.has_subscribers(to_fetch.topic): - logging.debug('Ignoring event because there are no subscribers ' - 'for topic %s', to_fetch.topic) - to_fetch.done() - else: - ready_feed_list.append(to_fetch) - - if not ready_feed_list: - return - - topic_list = [f.topic for f in ready_feed_list] - feed_record_list = FeedRecord.get_or_create_all(topic_list) - feed_stats_list = KnownFeedStats.get_or_create_all(topic_list) - start_time = time.time() - reporter = dos.Reporter() - successful_topics = [] - failed_topics = [] - - def create_callback(feed_record, feed_stats, work, fetch_url, attempts): - return lambda *args: callback( - feed_record, feed_stats, work, fetch_url, attempts, *args) - - def callback(feed_record, feed_stats, work, fetch_url, attempts, - status_code, headers, content, exception): - should_parse = False - fetch_success = False - if exception: - if isinstance(exception, urlfetch.ResponseTooLargeError): - logging.warning('Feed response too large for topic %r at url %r; ' - 'skipping', work.topic, fetch_url) - work.done() - elif isinstance(exception, urlfetch.InvalidURLError): - logging.warning('Invalid redirection for topic %r to url %r; ' - 'skipping', work.topic, fetch_url) - work.done() - elif isinstance(exception, (apiproxy_errors.Error, urlfetch.Error)): - logging.warning('Failed to fetch topic %r at url %r. %s: %s', - work.topic, fetch_url, exception.__class__, exception) - work.fetch_failed() - else: - logging.critical('Unexpected exception fetching topic %r. 
%s: %s', - work.topic, exception.__class__, exception) - work.fetch_failed() - else: - if status_code == 200: - should_parse = True - elif status_code in (301, 302, 303, 307) and 'Location' in headers: - fetch_url = headers['Location'] - logging.debug('Feed publisher for topic %r returned %d ' - 'redirect to %r', work.topic, status_code, fetch_url) - if attempts >= MAX_REDIRECTS: - logging.warning('Too many redirects for topic %r', work.topic) - work.fetch_failed() - else: - # Recurse to do the refetch. - hooks.execute(pull_feed_async, - work, - fetch_url, - feed_record.get_request_headers(feed_stats.subscriber_count), - async_proxy, - create_callback(feed_record, feed_stats, work, fetch_url, - attempts + 1)) - return - elif status_code == 304: - logging.debug('Feed publisher for topic %r returned ' - '304 response (cache hit)', work.topic) - work.done() - fetch_success = True - else: - logging.debug('Received bad response for topic = %r, ' - 'status_code = %s, response_headers = %r', - work.topic, status_code, headers) - work.fetch_failed() - - # Fetch is done one way or another. - end_time = time.time() - latency = int((end_time - start_time) * 1000) - if should_parse: - if parse_feed(feed_record, headers, content): - fetch_success = True - work.done() - else: - work.fetch_failed() - - if fetch_success: - successful_topics.append(work.topic) - else: - failed_topics.append(work.topic) - report_fetch(reporter, work.topic, fetch_success, latency) - # End callback - - # Fire off a fetch for every work item and wait for all callbacks. - for work, feed_record, feed_stats in zip( - ready_feed_list, feed_record_list, feed_stats_list): - hooks.execute(pull_feed_async, - work, - work.topic, - feed_record.get_request_headers(feed_stats.subscriber_count), - async_proxy, - create_callback(feed_record, feed_stats, work, work.topic, 1)) - - try: - async_proxy.wait() - except runtime.DeadlineExceededError: - logging.error('Could not finish all fetches due to deadline.') - else: - # Only update stats if we are not dealing with a deadlined request. - FETCH_SCORER.report(successful_topics, failed_topics) - FETCH_SAMPLER.sample(reporter) - - @work_queue_only - def post(self): - topic = self.request.get('topic') - if topic: - # For compatibility with old tasks and retry tasks. - work = FeedToFetch.get_by_topic(topic) - if not work: - logging.debug('No feeds to fetch for topic = %s', topic) - return - self._handle_fetches([work]) - else: - work_list = FeedToFetch.FORK_JOIN_QUEUE.pop_request(self.request) - self._handle_fetches(work_list) - -################################################################################ -# Event delivery - -def push_event(sub, headers, payload, async_proxy, callback): - """Pushes an event to a single subscriber using an asynchronous API call. - - Args: - sub: The Subscription instance to push the event to. - headers: Request headers to use when pushing the event. - payload: The content body the request should have. - async_proxy: AsyncAPIProxy to use for registering RPCs. - callback: Python callable to execute on success or failure. This callback - has the signature func(sub, result, exception) where sub is the - Subscription instance, result is the urlfetch.Response instance, and - exception is any exception encountered, if any. 
- """ - urlfetch_async.fetch(sub.callback, - method='POST', - headers=headers, - payload=payload, - async_proxy=async_proxy, - callback=callback, - deadline=MAX_FETCH_SECONDS) - - -class PushEventHandler(webapp.RequestHandler): - """Background worker for pushing events to subscribers.""" - - @work_queue_only - def post(self): - work = EventToDeliver.get(self.request.get('event_key')) - if not work: - logging.debug('No events to deliver.') - return - - # Retrieve the first N + 1 subscribers; note if we have more to contact. - more_subscribers, subscription_list = work.get_next_subscribers() - logging.info('%d more subscribers to contact for: ' - 'topic = %s, delivery_mode = %s', - len(subscription_list), work.topic, work.delivery_mode) - - # Keep track of failed callbacks. Do this instead of tracking successful - # callbacks because the asynchronous API calls could be interrupted by a - # deadline error. If that happens we'll want to mark all outstanding - # callback urls as still pending (and thus failed). - all_callbacks = set(subscription_list) - failed_callbacks = all_callbacks.copy() - reporter = dos.Reporter() - start_time = time.time() - - def callback(sub, result, exception): - end_time = time.time() - latency = int((end_time - start_time) * 1000) - if exception or not (200 <= result.status_code <= 299): - logging.debug('Could not deliver to target url %s: ' - 'Exception = %r, status_code = %s', - sub.callback, exception, - getattr(result, 'status_code', 'unknown')) - report_delivery(reporter, sub.callback, False, latency) - else: - failed_callbacks.remove(sub) - report_delivery(reporter, sub.callback, True, latency) - - def create_callback(sub): - return lambda *args: callback(sub, *args) - - payload_utf8 = utf8encoded(work.payload) - scores = DELIVERY_SCORER.filter(s.callback for s in all_callbacks) - for sub, (allowed, percent) in zip(all_callbacks, scores): - if not allowed: - logging.warning( - 'Scoring prevented delivery of %s to %s with failure rate %.2f%%', - work.topic, sub.callback, 100 * percent) - # Remove it from the list of all callbacks and failured callbacks. - # When a callback domain is hurting, we do not further penalize it - # with more failures, but we leave its standing the same. So it's - # as if this callback was never even seen. At the beginning of - # the next scoring period this callback will be allowed again. - all_callbacks.remove(sub) - failed_callbacks.remove(sub) - continue - - headers = { - # In case there was no content type header. - 'Content-Type': work.content_type or 'text/xml', - # TODO(bslatkin): add a better test for verify_token here. - 'X-Hub-Signature': 'sha1=%s' % sha1_hmac( - sub.secret or sub.verify_token or '', payload_utf8), - } - hooks.execute(push_event, - sub, headers, payload_utf8, async_proxy, create_callback(sub)) - - try: - async_proxy.wait() - except runtime.DeadlineExceededError: - logging.error('Could not finish all callbacks due to deadline. ' - 'Remaining are: %r', [s.callback for s in failed_callbacks]) - else: - # Only update stats if we're not dealing with a terminating request. - DELIVERY_SCORER.report( - [s.callback for s in (all_callbacks - failed_callbacks)], - [s.callback for s in failed_callbacks]) - DELIVERY_SAMPLER.sample(reporter) - - work.update(more_subscribers, failed_callbacks) - -################################################################################ - -def take_polling_action(topic_list, poll_type): - """Takes an action on a set of topics to be polled. 
- - Args: - topic_list: The iterable of topic URLs to take a polling action on. - poll_type: The type of polling to do. - """ - try: - if poll_type == 'record': - for topic in topic_list: - KnownFeed.record(topic) - else: - # Force these FeedToFetch records to be written to disk so we ensure - # that we will eventually poll the feeds. - FeedToFetch.insert(topic_list, memory_only=False) - except (taskqueue.Error, apiproxy_errors.Error, - db.Error, runtime.DeadlineExceededError, - fork_join_queue.Error): - logging.exception('Could not take polling action ' - 'of type %r for topics: %s', poll_type, topic_list) - - -class PollBootstrapHandler(webapp.RequestHandler): - """Bootstrap handler that automatically polls feeds.""" - - @work_queue_only - def get(self): - poll_type = self.request.get('poll_type', 'bootstrap') - the_mark = PollingMarker.get() - if the_mark.should_progress(): - # Naming the task based on the current start time here allows us to - # enqueue the *next* task in the polling chain before we've enqueued - # any of the actual FeedToFetch tasks. This is great because it lets us - # queue up a ton of tasks in parallel (since the task queue is reentrant). - # - # Without the task name present, each intermittent failure in the polling - # chain would cause an *alternate* sequence of tasks to execute. This - # causes exponential explosion in the number of tasks (think of an - # NP diagram or the "multiverse" of time/space). Yikes. - name = 'poll-' + str(int(time.mktime(the_mark.last_start.utctimetuple()))) - try: - taskqueue.Task( - url='/work/poll_bootstrap', - name=name, - params=dict(sequence=name, poll_type=poll_type) - ).add(POLLING_QUEUE) - except (taskqueue.TaskAlreadyExistsError, taskqueue.TombstonedTaskError): - logging.exception('Could not enqueue FIRST polling task') - - the_mark.put() - - @work_queue_only - def post(self): - sequence = self.request.get('sequence') - current_key = self.request.get('current_key') - poll_type = self.request.get('poll_type') - logging.info('Handling polling for sequence = %s, ' - 'current_key = %r, poll_type = %r', - sequence, current_key, poll_type) - - query = KnownFeed.all() - if current_key: - query.filter('__key__ >', datastore_types.Key(current_key)) - known_feeds = query.fetch(BOOSTRAP_FEED_CHUNK_SIZE) - - if known_feeds: - current_key = str(known_feeds[-1].key()) - logging.info('Found %s more feeds to poll, ended at %s', - len(known_feeds), known_feeds[-1].topic) - try: - taskqueue.Task( - url='/work/poll_bootstrap', - name='%s-%s' % (sequence, sha1_hash(current_key)), - params=dict(sequence=sequence, - current_key=current_key, - poll_type=poll_type)).add(POLLING_QUEUE) - except (taskqueue.TaskAlreadyExistsError, taskqueue.TombstonedTaskError): - logging.exception('Continued polling task already present; ' - 'this work has already been done') - return - - # TODO(bslatkin): Do more intelligent retrying of polling actions. - hooks.execute(take_polling_action, - [k.topic for k in known_feeds], - poll_type) - - else: - logging.info('Polling cycle complete') - current_key = None - -################################################################################ -# Feed canonicalization - -class RecordFeedHandler(webapp.RequestHandler): - """Background worker for categorizing/classifying feed URLs by their ID.""" - - def __init__(self, now=datetime.datetime.now): - """Initializer. - - Args: - now: Callable that returns the current time as a datetime.datetime.
- """ - webapp.RequestHandler.__init__(self) - self.now = now - - @work_queue_only - def post(self): - topic = self.request.get('topic') - logging.debug('Recording topic = %s', topic) - - known_feed_key = KnownFeed.create_key(topic) - known_feed = KnownFeed.get(known_feed_key) - if known_feed: - seconds_since_update = self.now() - known_feed.update_time - if known_feed.feed_id and (seconds_since_update < - datetime.timedelta(seconds=FEED_IDENTITY_UPDATE_PERIOD)): - logging.debug('Ignoring feed identity update for topic = %s ' - 'due to update %s ago', topic, seconds_since_update) - return - else: - known_feed = KnownFeed.create(topic) - - try: - response = urlfetch.fetch(topic) - except (apiproxy_errors.Error, urlfetch.Error), e: - logging.warning('Could not fetch topic = %s for feed ID. %s: %s', - topic, e.__class__.__name__, e) - known_feed.put() - return - - # TODO(bslatkin): Add more intelligent retrying of feed identification. - if response.status_code != 200: - logging.warning('Fetching topic = %s for feed ID returned response %s', - topic, response.status_code) - known_feed.put() - return - - order = (ATOM, RSS) - parse_failures = 0 - error_traceback = 'Could not determine feed_id' - feed_id = None - for feed_type in order: - try: - feed_id = feed_identifier.identify(response.content, feed_type) - if feed_id is not None: - break - else: - parse_failures += 1 - except Exception: - error_traceback = traceback.format_exc() - logging.debug( - 'Could not parse feed for content of %d bytes in format "%s":\n%s', - len(response.content), feed_type, error_traceback) - parse_failures += 1 - - if parse_failures == len(order) or not feed_id: - logging.warning('Could not record feed ID for topic=%r, feed_id=%r:\n%s', - topic, feed_id, error_traceback) - known_feed.put() - # Just give up, since we can't parse it. This case also covers when - # the character encoding for the document is unsupported or the document - # is of an arbitrary content type. 
- return - - logging.info('For topic = %s found new feed ID %r; old feed ID was %r', - topic, feed_id, known_feed.feed_id) - - if known_feed.feed_id and known_feed.feed_id != feed_id: - logging.info('Removing old feed_id relation from ' - 'topic = %r to feed_id = %r', topic, known_feed.feed_id) - KnownFeedIdentity.remove(known_feed.feed_id, topic) - - KnownFeedIdentity.update(feed_id, topic) - known_feed.feed_id = feed_id - known_feed.put() - -################################################################################ - -class HubHandler(webapp.RequestHandler): - """Handler to multiplex subscribe and publish events on the same URL.""" - - def get(self): - context = { - 'host': self.request.host, - } - self.response.out.write(template.render('welcome.html', context)) - - def post(self): - mode = self.request.get('hub.mode', '').lower() - if mode == 'publish': - handler = PublishHandler() - elif mode in ('subscribe', 'unsubscribe'): - handler = SubscribeHandler() - else: - self.response.set_status(400) - self.response.out.write('hub.mode is invalid') - return - - handler.initialize(self.request, self.response) - handler.post() - - -class TopicDetailHandler(webapp.RequestHandler): - """Handler that serves topic debugging information to end-users.""" - - @dos.limit(count=5, period=60) - def get(self): - topic_url = normalize_iri(self.request.get('hub.url')) - feed = FeedRecord.get_by_key_name(FeedRecord.create_key_name(topic_url)) - if not feed: - self.response.set_status(400) - context = { - 'topic_url': topic_url, - 'error': 'Could not find any record for topic URL: ' + topic_url, - } - else: - fetch_score = FETCH_SCORER.filter([topic_url])[0] - context = { - 'topic_url': topic_url, - 'last_successful_fetch': feed.last_updated, - 'last_content_type': feed.content_type, - 'last_etag': feed.etag, - 'last_modified': feed.last_modified, - 'last_header_footer': feed.header_footer, - 'fetch_blocked': not fetch_score[0], - 'fetch_errors': fetch_score[1] * 100, - 'fetch_url_error': FETCH_SAMPLER.get_chain( - FETCH_URL_SAMPLE_MINUTE, - FETCH_URL_SAMPLE_30_MINUTE, - FETCH_URL_SAMPLE_HOUR, - FETCH_URL_SAMPLE_DAY, - single_key=topic_url), - 'fetch_url_latency': FETCH_SAMPLER.get_chain( - FETCH_URL_SAMPLE_MINUTE_LATENCY, - FETCH_URL_SAMPLE_30_MINUTE_LATENCY, - FETCH_URL_SAMPLE_HOUR_LATENCY, - FETCH_URL_SAMPLE_DAY_LATENCY, - single_key=topic_url), - } - - if users.is_current_user_admin(): - feed_stats = db.get(KnownFeedStats.create_key(topic_url=topic_url)) - if feed_stats: - context.update({ - 'subscriber_count': feed_stats.subscriber_count, - 'feed_stats_update_time': feed_stats.update_time, - }) - - fetch = FeedToFetch.get_by_topic(topic_url) - if fetch: - context.update({ - 'next_fetch': fetch.eta, - 'fetch_attempts': fetch.fetching_failures, - 'totally_failed': fetch.totally_failed, - }) - self.response.out.write(template.render('topic_details.html', context)) - - -class SubscriptionDetailHandler(webapp.RequestHandler): - """Handler that serves details about subscriber deliveries to end-users.""" - - @dos.limit(count=5, period=60) - def get(self): - topic_url = normalize_iri(self.request.get('hub.topic')) - callback_url = normalize_iri(self.request.get('hub.callback')) - secret = normalize_iri(self.request.get('hub.secret')) - subscription = Subscription.get_by_key_name( - Subscription.create_key_name(callback_url, topic_url)) - callback_domain = dos.get_url_domain(callback_url) - - context = { - 'topic_url': topic_url, - 'callback_url': callback_url, - 'callback_domain': callback_domain, - } - 
- if not subscription or ( - not users.is_current_user_admin() and - subscription.secret and - subscription.secret != secret): - context.update({ - 'error': 'Could not find any subscription for ' - 'the given (callback, topic, secret) tuple' - }) - else: - failed_events = (EventToDeliver.all() - .filter('failed_callbacks =', subscription.key()) - .fetch(25)) - delivery_score = DELIVERY_SCORER.filter([callback_url])[0] - - context.update({ - 'created_time': subscription.created_time, - 'last_modified': subscription.last_modified, - 'lease_seconds': subscription.lease_seconds, - 'expiration_time': subscription.expiration_time, - 'confirm_failures': subscription.confirm_failures, - 'subscription_state': subscription.subscription_state, - 'failed_events': [ - { - 'last_modified': e.last_modified, - 'retry_attempts': e.retry_attempts, - 'totally_failed': e.totally_failed, - 'content_type': e.content_type, - 'payload_trunc': e.payload[:10000], - } - for e in failed_events], - 'delivery_blocked': not delivery_score[0], - 'delivery_errors': delivery_score[1] * 100, - 'delivery_url_error': DELIVERY_SAMPLER.get_chain( - DELIVERY_URL_SAMPLE_MINUTE, - DELIVERY_URL_SAMPLE_30_MINUTE, - DELIVERY_URL_SAMPLE_HOUR, - DELIVERY_URL_SAMPLE_DAY, - single_key=callback_url), - 'delivery_url_latency': DELIVERY_SAMPLER.get_chain( - DELIVERY_URL_SAMPLE_MINUTE_LATENCY, - DELIVERY_URL_SAMPLE_30_MINUTE_LATENCY, - DELIVERY_URL_SAMPLE_HOUR_LATENCY, - DELIVERY_URL_SAMPLE_DAY_LATENCY, - single_key=callback_url), - }) - # Only show the domain stats when the subscription had a secret. - if subscription.secret or users.is_current_user_admin(): - context.update({ - 'delivery_domain_error': DELIVERY_SAMPLER.get_chain( - DELIVERY_DOMAIN_SAMPLE_MINUTE, - DELIVERY_DOMAIN_SAMPLE_30_MINUTE, - DELIVERY_DOMAIN_SAMPLE_HOUR, - DELIVERY_DOMAIN_SAMPLE_DAY, - single_key=callback_url), - 'delivery_domain_latency': DELIVERY_SAMPLER.get_chain( - DELIVERY_DOMAIN_SAMPLE_MINUTE_LATENCY, - DELIVERY_DOMAIN_SAMPLE_30_MINUTE_LATENCY, - DELIVERY_DOMAIN_SAMPLE_HOUR_LATENCY, - DELIVERY_DOMAIN_SAMPLE_DAY_LATENCY, - single_key=callback_url), - }) - - self.response.out.write(template.render('event_details.html', context)) - - -class StatsHandler(webapp.RequestHandler): - """Handler that serves DoS statistics information.""" - - def post(self): - if self.request.get('action').lower() == 'flush': - logging.critical('Flushing memcache!') - memcache.flush_all() - self.redirect('/stats') - - def get(self): - context = { - 'fetch_url_error': FETCH_SAMPLER.get_chain( - FETCH_URL_SAMPLE_MINUTE, - FETCH_URL_SAMPLE_30_MINUTE, - FETCH_URL_SAMPLE_HOUR, - FETCH_URL_SAMPLE_DAY), - 'fetch_url_latency': FETCH_SAMPLER.get_chain( - FETCH_URL_SAMPLE_MINUTE_LATENCY, - FETCH_URL_SAMPLE_30_MINUTE_LATENCY, - FETCH_URL_SAMPLE_HOUR_LATENCY, - FETCH_URL_SAMPLE_DAY_LATENCY), - 'fetch_domain_error': FETCH_SAMPLER.get_chain( - FETCH_DOMAIN_SAMPLE_MINUTE, - FETCH_DOMAIN_SAMPLE_30_MINUTE, - FETCH_DOMAIN_SAMPLE_HOUR, - FETCH_DOMAIN_SAMPLE_DAY), - 'fetch_domain_latency': FETCH_SAMPLER.get_chain( - FETCH_DOMAIN_SAMPLE_MINUTE_LATENCY, - FETCH_DOMAIN_SAMPLE_30_MINUTE_LATENCY, - FETCH_DOMAIN_SAMPLE_HOUR_LATENCY, - FETCH_DOMAIN_SAMPLE_DAY_LATENCY), - 'delivery_url_error': DELIVERY_SAMPLER.get_chain( - DELIVERY_URL_SAMPLE_MINUTE, - DELIVERY_URL_SAMPLE_30_MINUTE, - DELIVERY_URL_SAMPLE_HOUR, - DELIVERY_URL_SAMPLE_DAY), - 'delivery_url_latency': DELIVERY_SAMPLER.get_chain( - DELIVERY_URL_SAMPLE_MINUTE_LATENCY, - DELIVERY_URL_SAMPLE_30_MINUTE_LATENCY, - DELIVERY_URL_SAMPLE_HOUR_LATENCY, - 
DELIVERY_URL_SAMPLE_DAY_LATENCY), - 'delivery_domain_error': DELIVERY_SAMPLER.get_chain( - DELIVERY_DOMAIN_SAMPLE_MINUTE, - DELIVERY_DOMAIN_SAMPLE_30_MINUTE, - DELIVERY_DOMAIN_SAMPLE_HOUR, - DELIVERY_DOMAIN_SAMPLE_DAY), - 'delivery_domain_latency': DELIVERY_SAMPLER.get_chain( - DELIVERY_DOMAIN_SAMPLE_MINUTE_LATENCY, - DELIVERY_DOMAIN_SAMPLE_30_MINUTE_LATENCY, - DELIVERY_DOMAIN_SAMPLE_HOUR_LATENCY, - DELIVERY_DOMAIN_SAMPLE_DAY_LATENCY), - } - all_configs = [] - all_configs.extend(FETCH_SAMPLER.configs) - all_configs.extend(DELIVERY_SAMPLER.configs) - context.update({ - 'all_configs': all_configs, - 'show_everything': True, - }) - self.response.out.write(template.render('all_stats.html', context)) - -################################################################################ -# Hook system - -class InvalidHookError(Exception): - """A module has tried to access a hook for an unknown function.""" - - -class Hook(object): - """A conditional hook that overrides or modifies Hub behavior. - - Each Hook corresponds to a single Python callable that may be overridden - by the hook system. Multiple Hooks may inspect or modify the parameters, but - only a single callable may elect to actually handle the call. The inspect() - method will be called for each hook in the order the hooks are imported - by the HookManager. The final set of parameters will be passed to the - targeted hook's __call__() method. If more than one Hook elects to execute - a hooked function, a warning message will be logged and the *first* Hook - encountered will be executed. - """ - - def inspect(self, args, kwargs): - """Inspects a hooked function's parameters, possibly modifying them. - - Args: - args: List of positional arguments for the hook call. - kwargs: Dictionary of keyword arguments for the hook call. - - Returns: - True if this Hook should handle the call, False otherwise. - """ - return False - - def __call__(self, *args, **kwargs): - """Handles the hook call. - - Args: - *args, **kwargs: Parameters matching the original function's signature. - - Returns: - The return value expected by the original function. - """ - assert False, '__call__ method not defined for %s' % self.__class__ - - -class HookManager(object): - """Manages registering and loading Hooks from external modules. - - Hook modules will have a copy of this 'main' module's contents in their - globals dictionary and the Hook class to be sub-classed. They will also - have the 'register' method, which the hook module should use to register any - Hook sub-classes that it defines. - - The 'register' method has the same signature as the _register method of - this class, but without the leading 'filename' argument; that value is - curried by the HookManager. - """ - - def __init__(self): - """Initializer.""" - # Maps hook functions to a list of (filename, Hook) tuples. - self._mapping = {} - - def load(self, hooks_path='hooks', globals_dict=None): - """Loads all hooks from a particular directory. - - Args: - hooks_path: Optional. Relative path to the application directory or - absolute path to load hook modules from. - globals_dict: Dictionary of global variables to use when loading the - hook module. If None, defaults to the contents of this 'main' module. - Only for use in testing!
- """ - if globals_dict is None: - globals_dict = globals() - - hook_directory = os.path.join(os.getcwd(), hooks_path) - if not os.path.exists(hook_directory): - return - module_list = os.listdir(hook_directory) - for module_name in sorted(module_list): - if not module_name.endswith('.py'): - continue - module_path = os.path.join(hook_directory, module_name) - context_dict = globals_dict.copy() - context_dict.update({ - 'Hook': Hook, - 'register': lambda *a, **k: self._register(module_name, *a, **k) - }) - logging.debug('Loading hook "%s" from %s', module_name, module_path) - try: - exec open(module_path) in context_dict - except: - logging.exception('Error loading hook "%s" from %s', - module_name, module_path) - raise - - def declare(self, original): - """Declares a function as being hookable. - - Args: - original: Python callable that may be hooked. - """ - self._mapping[original] = [] - - def execute(self, original, *args, **kwargs): - """Executes a hookable method, possibly invoking a registered Hook. - - Args: - original: The original hooked callable. - args: Positional arguments to pass to the callable. - kwargs: Keyword arguments to pass to the callable. - - Returns: - Whatever value is returned by the hooked call. - """ - try: - hook_list = self._mapping[original] - except KeyError, e: - raise InvalidHookError(e) - - modifiable_args = list(args) - modifiable_kwargs = dict(kwargs) - matches = [] - for filename, hook in hook_list: - if hook.inspect(modifiable_args, modifiable_kwargs): - matches.append((filename, hook)) - - filename = __name__ - designated_hook = original - if len(matches) >= 1: - filename, designated_hook = matches[0] - - if len(matches) > 1: - logging.critical( - 'Found multiple matching hooks for %s in files: %s. ' - 'Will use the first hook encountered: %s', - original, [f for (f, hook) in matches], filename) - - return designated_hook(*args, **kwargs) - - def _register(self, filename, original, hook): - """Registers a Hook to inspect and potentially execute a hooked function. - - Args: - filename: The name of the hook module this Hook is defined in. - original: The Python callable of the original hooked function. - hook: The Hook to register for this hooked function. - - Raises: - InvalidHookError if the original hook function is not known. - """ - try: - self._mapping[original].append((filename, hook)) - except KeyError, e: - raise InvalidHookError(e) - - def override_for_test(self, original, test): - """Adds a hook function for testing. - - Args: - original: The Python callable of the original hooked function. - test: The callable to use to override the original for this hook function. - """ - class OverrideHook(Hook): - def inspect(self, args, kwargs): - return True - def __call__(self, *args, **kwargs): - return test(*args, **kwargs) - self._register(__name__, original, OverrideHook()) - - def reset_for_test(self, original): - """Clears the configured test hook for a hooked function. - - Args: - original: The Python callable of the original hooked function. - """ - self._mapping[original].pop() - -################################################################################ - -HANDLERS = [] - - -def modify_handlers(handlers): - """Modifies the set of web request handlers. - - Args: - handlers: List of (path_regex, webapp.RequestHandler) instances that are - configured for this application. - - Returns: - Modified list of handlers, with some possibly removed and others added. 
- """ - return handlers - - -def main(): - global HANDLERS - if not HANDLERS: - HANDLERS = hooks.execute(modify_handlers, [ - # External interfaces - (r'/', HubHandler), - (r'/publish', PublishHandler), - (r'/subscribe', SubscribeHandler), - (r'/topic-details', TopicDetailHandler), - (r'/subscription-details', SubscriptionDetailHandler), - (r'/stats', StatsHandler), - # Low-latency workers - (r'/work/subscriptions', SubscriptionConfirmHandler), - (r'/work/pull_feeds', PullFeedHandler), - (r'/work/push_events', PushEventHandler), - (r'/work/record_feeds', RecordFeedHandler), - # Periodic workers - (r'/work/poll_bootstrap', PollBootstrapHandler), - (r'/work/subscription_cleanup', SubscriptionCleanupHandler), - (r'/work/reconfirm_subscriptions', SubscriptionReconfirmHandler), - (r'/work/cleanup_mapper', CleanupMapperHandler), - ]) - application = webapp.WSGIApplication(HANDLERS, debug=DEBUG) - wsgiref.handlers.CGIHandler().run(application) - -################################################################################ -# Declare and load external hooks. - -hooks = HookManager() -hooks.declare(confirm_subscription) -hooks.declare(derive_sources) -hooks.declare(inform_event) -hooks.declare(modify_handlers) -hooks.declare(preprocess_urls) -hooks.declare(pull_feed) -hooks.declare(pull_feed_async) -hooks.declare(push_event) -hooks.declare(take_polling_action) -hooks.load() - - -if __name__ == '__main__': - main() diff --git a/hub/main_test.py b/hub/main_test.py deleted file mode 100755 index c2b7a93..0000000 --- a/hub/main_test.py +++ /dev/null @@ -1,4350 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2009 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Tests for the main module.""" - -import datetime -import logging -logging.basicConfig(format='%(levelname)-8s %(filename)s] %(message)s') -import os -import shutil -import sys -import time -import tempfile -import unittest -import urllib -import xml.sax - -import testutil -testutil.fix_path() - - -from google.appengine import runtime -from google.appengine.api import memcache -from google.appengine.ext import db -from google.appengine.ext import webapp -from google.appengine.runtime import apiproxy_errors - -import async_apiproxy -import dos -import feed_diff -import main -import urlfetch_test_stub - -import mapreduce.control -import mapreduce.model - -################################################################################ -# For convenience - -sha1_hash = main.sha1_hash -get_hash_key_name = main.get_hash_key_name - -OTHER_STRING = '/~one:two/&=' -FUNNY = '/CaSeSeNsItIvE' -FUNNY_UNICODE = u'/blah/\u30d6\u30ed\u30b0\u8846' -FUNNY_UTF8 = '/blah/\xe3\x83\x96\xe3\x83\xad\xe3\x82\xb0\xe8\xa1\x86' -FUNNY_IRI = '/blah/%E3%83%96%E3%83%AD%E3%82%B0%E8%A1%86' - -################################################################################ - -class UtilityFunctionTest(unittest.TestCase): - """Tests for utility functions.""" - - def setUp(self): - """Sets up the test harness.""" - testutil.setup_for_testing() - - def testSha1Hash(self): - self.assertEquals('09f2c66851e75a7800748808ae7d855869b0c9d7', - main.sha1_hash('this is my test data')) - - def testGetHashKeyName(self): - self.assertEquals('hash_54f6638eb67ad389b66bbc3fa65f7392b0c2d270', - get_hash_key_name('and now testing a key')) - - def testSha1Hmac(self): - self.assertEquals('d95abcea4b2a8b0219da7cb04c261639a7bd8c94', - main.sha1_hmac('secrat', 'mydatahere')) - - def testIsValidUrl(self): - self.assertTrue(main.is_valid_url( - 'https://example.com:443/path/to?handler=1&b=2')) - self.assertTrue(main.is_valid_url('http://example.com:8080')) - self.assertFalse(main.is_valid_url('httpm://example.com')) - self.assertFalse(main.is_valid_url('http://example.com:9999')) - self.assertFalse(main.is_valid_url('http://example.com/blah#bad')) - - def testNormalizeIri(self): - uri_with_port = u'http://foo.com:9120/url/with/a/port' - self.assertEquals(uri_with_port, main.normalize_iri(uri_with_port)) - - uri_with_query = u'http://foo.com:9120/url?doh=this&port=1' - self.assertEquals(uri_with_query, main.normalize_iri(uri_with_query)) - - uri_with_funny = u'http://foo.com/~myuser/@url!with#nice;delimiter:chars' - self.assertEquals(uri_with_funny, main.normalize_iri(uri_with_funny)) - - not_unicode = 'http://foo.com:9120/url/with/a/port' - self.assertEquals(not_unicode, main.normalize_iri(not_unicode)) - - uri_with_port = u'http://foo.com:9120/url/with/a/port' - self.assertEquals(uri_with_port, main.normalize_iri(uri_with_port)) - - good_iri = ( - 'http://www.google.com/reader/public/atom/user' - '/07256788297315478906/label/%E3%83%96%E3%83%AD%E3%82%B0%E8%A1%86') - iri = (u'http://www.google.com/reader/public/atom/user' - u'/07256788297315478906/label/\u30d6\u30ed\u30b0\u8846') - self.assertEquals(good_iri, main.normalize_iri(iri)) - -################################################################################ - -class TestWorkQueueHandler(webapp.RequestHandler): - @main.work_queue_only - def get(self): - self.response.out.write('Pass') - - -class WorkQueueOnlyTest(testutil.HandlerTestBase): - """Tests the @work_queue_only decorator.""" - - handler_class = TestWorkQueueHandler - - def testNotLoggedIn(self): - 
os.environ['SERVER_SOFTWARE'] = 'Production' - self.handle('get') - self.assertEquals(302, self.response_code()) - - def testCronHeader(self): - os.environ['SERVER_SOFTWARE'] = 'Production' - os.environ['HTTP_X_APPENGINE_CRON'] = 'True' - try: - self.handle('get') - self.assertEquals('Pass', self.response_body()) - finally: - del os.environ['HTTP_X_APPENGINE_CRON'] - - def testDevelopmentEnvironment(self): - os.environ['SERVER_SOFTWARE'] = 'Development/1.0' - self.handle('get') - self.assertEquals('Pass', self.response_body()) - - def testAdminUser(self): - os.environ['SERVER_SOFTWARE'] = 'Production' - os.environ['USER_EMAIL'] = 'foo@example.com' - os.environ['USER_IS_ADMIN'] = '1' - try: - self.handle('get') - self.assertEquals('Pass', self.response_body()) - finally: - del os.environ['USER_IS_ADMIN'] - - def testNonAdminUser(self): - os.environ['SERVER_SOFTWARE'] = 'Production' - os.environ['USER_EMAIL'] = 'foo@example.com' - os.environ['USER_IS_ADMIN'] = '0' - try: - self.handle('get') - self.assertEquals(401, self.response_code()) - finally: - del os.environ['USER_IS_ADMIN'] - - def testTaskQueueHeader(self): - os.environ['SERVER_SOFTWARE'] = 'Production' - os.environ['HTTP_X_APPENGINE_TASKNAME'] = 'Foobar' - try: - self.handle('get') - self.assertEquals('Pass', self.response_body()) - finally: - del os.environ['HTTP_X_APPENGINE_TASKNAME'] - -################################################################################ - -KnownFeed = main.KnownFeed - -class KnownFeedTest(unittest.TestCase): - """Tests for the KnownFeed model class.""" - - def setUp(self): - """Sets up the test harness.""" - testutil.setup_for_testing() - self.topic = 'http://example.com/my-topic' - self.topic2 = 'http://example.com/my-topic2' - self.topic3 = 'http://example.com/my-topic3' - - def testCreateAndDelete(self): - known_feed = KnownFeed.create(self.topic) - self.assertEquals(self.topic, known_feed.topic) - db.put(known_feed) - - found_feed = db.get(KnownFeed.create_key(self.topic)) - self.assertEquals(found_feed.key(), known_feed.key()) - self.assertEquals(found_feed.topic, known_feed.topic) - - db.delete(KnownFeed.create_key(self.topic)) - self.assertTrue(db.get(KnownFeed.create_key(self.topic)) is None) - - def testCheckExistsMissing(self): - self.assertEquals([], KnownFeed.check_exists([])) - self.assertEquals([], KnownFeed.check_exists([self.topic])) - self.assertEquals([], KnownFeed.check_exists( - [self.topic, self.topic2, self.topic3])) - self.assertEquals([], KnownFeed.check_exists( - [self.topic, self.topic, self.topic, self.topic2, self.topic2])) - - def testCheckExists(self): - KnownFeed.create(self.topic).put() - KnownFeed.create(self.topic2).put() - KnownFeed.create(self.topic3).put() - self.assertEquals([self.topic], KnownFeed.check_exists([self.topic])) - self.assertEquals([self.topic2], KnownFeed.check_exists([self.topic2])) - self.assertEquals([self.topic3], KnownFeed.check_exists([self.topic3])) - self.assertEquals( - sorted([self.topic, self.topic2, self.topic3]), - sorted(KnownFeed.check_exists([self.topic, self.topic2, self.topic3]))) - self.assertEquals( - sorted([self.topic, self.topic2]), - sorted(KnownFeed.check_exists( - [self.topic, self.topic, self.topic, self.topic2, self.topic2]))) - - def testCheckExistsSubset(self): - KnownFeed.create(self.topic).put() - KnownFeed.create(self.topic3).put() - self.assertEquals( - sorted([self.topic, self.topic3]), - sorted(KnownFeed.check_exists([self.topic, self.topic2, self.topic3]))) - self.assertEquals( - sorted([self.topic, 
self.topic3]), - sorted(KnownFeed.check_exists( - [self.topic, self.topic, self.topic, - self.topic2, self.topic2, - self.topic3, self.topic3]))) - - def testRecord(self): - """Tests the method for recording a feed's identity.""" - KnownFeed.record(self.topic) - task = testutil.get_tasks(main.MAPPINGS_QUEUE, index=0, expected_count=1) - self.assertEquals(self.topic, task['params']['topic']) - -################################################################################ - -KnownFeedIdentity = main.KnownFeedIdentity - -class KnownFeedIdentityTest(unittest.TestCase): - """Tests for the KnownFeedIdentity class.""" - - def setUp(self): - testutil.setup_for_testing() - self.feed_id = 'my;feed;id' - self.feed_id2 = 'my;feed;id;2' - self.topic = 'http://example.com/foobar1' - self.topic2 = 'http://example.com/meep2' - self.topic3 = 'http://example.com/stuff3' - self.topic4 = 'http://example.com/blah4' - self.topic5 = 'http://example.com/woot5' - self.topic6 = 'http://example.com/neehaw6' - - def testUpdate(self): - """Tests the update method.""" - feed = KnownFeedIdentity.update(self.feed_id, self.topic) - feed_key = KnownFeedIdentity.create_key(self.feed_id) - self.assertEquals(feed_key, feed.key()) - self.assertEquals(self.feed_id, feed.feed_id) - self.assertEquals([self.topic], feed.topics) - - feed = KnownFeedIdentity.update(self.feed_id, self.topic2) - self.assertEquals(self.feed_id, feed.feed_id) - self.assertEquals([self.topic, self.topic2], feed.topics) - - def testRemove(self): - """Tests the remove method.""" - # Removing a mapping from an unknown ID does nothing. - self.assertTrue(KnownFeedIdentity.remove(self.feed_id, self.topic) is None) - - KnownFeedIdentity.update(self.feed_id, self.topic) - KnownFeedIdentity.update(self.feed_id, self.topic2) - - # Removing an unknown mapping for a known ID does nothing. - self.assertTrue(KnownFeedIdentity.remove(self.feed_id, self.topic3) is None) - - # Removing from a known ID returns the updated copy. - feed = KnownFeedIdentity.remove(self.feed_id, self.topic2) - self.assertEquals([self.topic], feed.topics) - - # Removing a second time does nothing. - self.assertTrue(KnownFeedIdentity.remove(self.feed_id, self.topic2) is None) - feed = KnownFeedIdentity.get(KnownFeedIdentity.create_key(self.feed_id)) - self.assertEquals([self.topic], feed.topics) - - # Removing the last one will delete the mapping completely. - self.assertTrue(KnownFeedIdentity.remove(self.feed_id, self.topic) is None) - feed = KnownFeedIdentity.get(KnownFeedIdentity.create_key(self.feed_id)) - self.assertTrue(feed is None) - - def testDeriveAdditionalTopics(self): - """Tests the derive_additional_topics method.""" - # topic, topic2 -> feed_id - for topic in (self.topic, self.topic2): - feed = KnownFeed.create(topic) - feed.feed_id = self.feed_id - feed.put() - KnownFeedIdentity.update(self.feed_id, self.topic) - KnownFeedIdentity.update(self.feed_id, self.topic2) - - # topic3, topic4 -> feed_id2 - for topic in (self.topic3, self.topic4): - feed = KnownFeed.create(topic) - feed.feed_id = self.feed_id2 - feed.put() - KnownFeedIdentity.update(self.feed_id2, self.topic3) - KnownFeedIdentity.update(self.feed_id2, self.topic4) - - # topic5 -> KnownFeed missing; should not be expanded at all - # topic6 -> KnownFeed where feed_id = None; default to simple mapping - KnownFeed.create(self.topic6).put() - - # Put missing topics first to provoke potential ordering errors in the - # iteration order of the retrieval loop. 
- result = KnownFeedIdentity.derive_additional_topics([ - self.topic5, self.topic6, self.topic, - self.topic2, self.topic3, self.topic4]) - - expected = { - 'http://example.com/foobar1': - set(['http://example.com/foobar1', 'http://example.com/meep2']), - 'http://example.com/meep2': - set(['http://example.com/foobar1', 'http://example.com/meep2']), - 'http://example.com/blah4': - set(['http://example.com/blah4', 'http://example.com/stuff3']), - 'http://example.com/neehaw6': - set(['http://example.com/neehaw6']), - 'http://example.com/stuff3': - set(['http://example.com/blah4', 'http://example.com/stuff3']) - } - self.assertEquals(expected, result) - - def testDeriveAdditionalTopicsWhitespace(self): - """Tests when the feed ID contains whitespace it is handled correctly. - - This test is only required because the 'feed_identifier' module did not - properly strip whitespace in its initial version. - """ - # topic -> feed_id with whitespace - feed = KnownFeed.create(self.topic) - feed.feed_id = self.feed_id - feed.put() - KnownFeedIdentity.update(self.feed_id, self.topic) - - # topic2 -> feed_id without whitespace - feed = KnownFeed.create(self.topic2) - feed.feed_id = '\n %s \n \n' % self.feed_id - feed.put() - KnownFeedIdentity.update(self.feed_id, self.topic2) - - # topic3 -> KnownFeed where feed_id = all whitespace - feed = KnownFeed.create(self.topic3) - feed.feed_id = '\n \n \n' - feed.put() - - result = KnownFeedIdentity.derive_additional_topics([ - self.topic, self.topic2, self.topic3]) - - expected = { - 'http://example.com/foobar1': - set(['http://example.com/foobar1', 'http://example.com/meep2']), - 'http://example.com/stuff3': - set(['http://example.com/stuff3']), - } - self.assertEquals(expected, result) - - def testKnownFeedIdentityTooLarge(self): - """Tests when the fan-out expansion of the KnownFeedIdentity is too big.""" - feed = KnownFeedIdentity.update(self.feed_id, self.topic) - KnownFeedIdentity.update( - self.feed_id, - 'http://super-extra-long-topic/' + ('a' * 10000000)) - # Doesn't explode and the update time stays the same. - new_feed = db.get(feed.key()) - self.assertEquals(feed.last_update, new_feed.last_update) - -################################################################################ - -Subscription = main.Subscription - - -class SubscriptionTest(unittest.TestCase): - """Tests for the Subscription model class.""" - - def setUp(self): - """Sets up the test harness.""" - testutil.setup_for_testing() - self.callback = 'http://example.com/my-callback-url' - self.callback2 = 'http://example.com/second-callback-url' - self.callback3 = 'http://example.com/third-callback-url' - self.topic = 'http://example.com/my-topic-url' - self.topic2 = 'http://example.com/second-topic-url' - self.token = 'token' - self.secret = 'my secrat' - self.callback_key_map = dict( - (Subscription.create_key_name(cb, self.topic), cb) - for cb in (self.callback, self.callback2, self.callback3)) - - def get_subscription(self): - """Returns the subscription for the test callback and topic.""" - return Subscription.get_by_key_name( - Subscription.create_key_name(self.callback, self.topic)) - - def verify_tasks(self, next_state, verify_token, secret, **kwargs): - """Verifies the required tasks have been submitted. - - Args: - next_state: The next state the Subscription should have. - verify_token: The token that should be used to confirm the - subscription action. - **kwargs: Passed to testutil.get_tasks(). 
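-      secret: The secret expected in the enqueued subscription task's parameters.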
- """ - task = testutil.get_tasks(main.SUBSCRIPTION_QUEUE, **kwargs) - self.assertEquals(next_state, task['params']['next_state']) - self.assertEquals(verify_token, task['params']['verify_token']) - self.assertEquals(secret, task['params']['secret']) - - def testRequestInsert_defaults(self): - now_datetime = datetime.datetime.now() - now = lambda: now_datetime - lease_seconds = 1234 - - self.assertTrue(Subscription.request_insert( - self.callback, self.topic, self.token, - self.secret, lease_seconds=lease_seconds, now=now)) - self.verify_tasks(Subscription.STATE_VERIFIED, self.token, self.secret, - expected_count=1, index=0) - self.assertFalse(Subscription.request_insert( - self.callback, self.topic, self.token, - self.secret, lease_seconds=lease_seconds, now=now)) - self.verify_tasks(Subscription.STATE_VERIFIED, self.token, self.secret, - expected_count=2, index=1) - - sub = self.get_subscription() - self.assertEquals(Subscription.STATE_NOT_VERIFIED, sub.subscription_state) - self.assertEquals(self.callback, sub.callback) - self.assertEquals(sha1_hash(self.callback), sub.callback_hash) - self.assertEquals(self.topic, sub.topic) - self.assertEquals(sha1_hash(self.topic), sub.topic_hash) - self.assertEquals(self.token, sub.verify_token) - self.assertEquals(self.secret, sub.secret) - self.assertEquals(0, sub.confirm_failures) - self.assertEquals(now_datetime + datetime.timedelta(seconds=lease_seconds), - sub.expiration_time) - self.assertEquals(lease_seconds, sub.lease_seconds) - - def testInsert_defaults(self): - now_datetime = datetime.datetime.now() - now = lambda: now_datetime - lease_seconds = 1234 - - self.assertTrue(Subscription.insert( - self.callback, self.topic, self.token, self.secret, - lease_seconds=lease_seconds, now=now)) - self.assertFalse(Subscription.insert( - self.callback, self.topic, self.token, self.secret, - lease_seconds=lease_seconds, now=now)) - testutil.get_tasks(main.SUBSCRIPTION_QUEUE, expected_count=0) - - sub = self.get_subscription() - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - self.assertEquals(self.callback, sub.callback) - self.assertEquals(sha1_hash(self.callback), sub.callback_hash) - self.assertEquals(self.topic, sub.topic) - self.assertEquals(sha1_hash(self.topic), sub.topic_hash) - self.assertEquals(self.token, sub.verify_token) - self.assertEquals(self.secret, sub.secret) - self.assertEquals(0, sub.confirm_failures) - self.assertEquals(now_datetime + datetime.timedelta(seconds=lease_seconds), - sub.expiration_time) - self.assertEquals(lease_seconds, sub.lease_seconds) - - def testInsertOverride(self): - """Tests that insert will override the existing Subscription fields.""" - self.assertTrue(Subscription.request_insert( - self.callback, self.topic, self.token, self.secret)) - self.assertEquals(Subscription.STATE_NOT_VERIFIED, - self.get_subscription().subscription_state) - - second_token = 'second token' - second_secret = 'second secret' - sub = self.get_subscription() - sub.confirm_failures = 123 - sub.put() - self.assertFalse(Subscription.insert( - self.callback, self.topic, second_token, second_secret)) - - sub = self.get_subscription() - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - self.assertEquals(0, sub.confirm_failures) - self.assertEquals(second_token, sub.verify_token) - self.assertEquals(second_secret, sub.secret) - - self.verify_tasks(Subscription.STATE_VERIFIED, self.token, self.secret, - expected_count=1, index=0) - - def testInsert_expiration(self): - """Tests that the expiration 
time is updated on repeated insert() calls.""" - self.assertTrue(Subscription.insert( - self.callback, self.topic, self.token, self.secret)) - sub = Subscription.all().get() - expiration1 = sub.expiration_time - time.sleep(0.5) - self.assertFalse(Subscription.insert( - self.callback, self.topic, self.token, self.secret)) - sub = db.get(sub.key()) - expiration2 = sub.expiration_time - self.assertTrue(expiration2 > expiration1) - - def testRemove(self): - self.assertFalse(Subscription.remove(self.callback, self.topic)) - self.assertTrue(Subscription.request_insert( - self.callback, self.topic, self.token, self.secret)) - self.assertTrue(Subscription.remove(self.callback, self.topic)) - self.assertFalse(Subscription.remove(self.callback, self.topic)) - # Only task should be the initial insertion request. - self.verify_tasks(Subscription.STATE_VERIFIED, self.token, self.secret, - expected_count=1, index=0) - - def testRequestRemove(self): - """Tests the request remove method.""" - self.assertFalse(Subscription.request_remove( - self.callback, self.topic, self.token)) - # No tasks should be enqueued and this request should do nothing because - # no subscription currently exists. - testutil.get_tasks(main.SUBSCRIPTION_QUEUE, expected_count=0) - - self.assertTrue(Subscription.request_insert( - self.callback, self.topic, self.token, self.secret)) - second_token = 'this is the second token' - self.assertTrue(Subscription.request_remove( - self.callback, self.topic, second_token)) - - sub = self.get_subscription() - self.assertEquals(self.token, sub.verify_token) - self.assertEquals(Subscription.STATE_NOT_VERIFIED, sub.subscription_state) - - self.verify_tasks(Subscription.STATE_VERIFIED, self.token, self.secret, - expected_count=2, index=0) - self.verify_tasks(Subscription.STATE_TO_DELETE, second_token, '', - expected_count=2, index=1) - - def testRequestInsertOverride(self): - """Tests that requesting insertion does not override the verify_token.""" - self.assertTrue(Subscription.insert( - self.callback, self.topic, self.token, self.secret)) - second_token = 'this is the second token' - second_secret = 'another secret here' - self.assertFalse(Subscription.request_insert( - self.callback, self.topic, second_token, second_secret)) - - sub = self.get_subscription() - self.assertEquals(self.token, sub.verify_token) - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - - self.verify_tasks(Subscription.STATE_VERIFIED, second_token, second_secret, - expected_count=1, index=0) - - def testHasSubscribers_unverified(self): - """Tests that unverified subscribers do not make the subscription active.""" - self.assertFalse(Subscription.has_subscribers(self.topic)) - self.assertTrue(Subscription.request_insert( - self.callback, self.topic, self.token, self.secret)) - self.assertFalse(Subscription.has_subscribers(self.topic)) - - def testHasSubscribers_verified(self): - self.assertTrue(Subscription.insert( - self.callback, self.topic, self.token, self.secret)) - self.assertTrue(Subscription.has_subscribers(self.topic)) - self.assertTrue(Subscription.remove(self.callback, self.topic)) - self.assertFalse(Subscription.has_subscribers(self.topic)) - - def testGetSubscribers_unverified(self): - """Tests that unverified subscribers will not be retrieved.""" - self.assertEquals([], Subscription.get_subscribers(self.topic, 10)) - self.assertTrue(Subscription.request_insert( - self.callback, self.topic, self.token, self.secret)) - self.assertTrue(Subscription.request_insert( - self.callback2, 
self.topic, self.token, self.secret)) - self.assertTrue(Subscription.request_insert( - self.callback3, self.topic, self.token, self.secret)) - self.assertEquals([], Subscription.get_subscribers(self.topic, 10)) - - def testGetSubscribers_verified(self): - self.assertEquals([], Subscription.get_subscribers(self.topic, 10)) - self.assertTrue(Subscription.insert( - self.callback, self.topic, self.token, self.secret)) - self.assertTrue(Subscription.insert( - self.callback2, self.topic, self.token, self.secret)) - self.assertTrue(Subscription.insert( - self.callback3, self.topic, self.token, self.secret)) - sub_list = Subscription.get_subscribers(self.topic, 10) - found_keys = set(s.key().name() for s in sub_list) - self.assertEquals(set(self.callback_key_map.keys()), found_keys) - - def testGetSubscribers_count(self): - self.assertTrue(Subscription.insert( - self.callback, self.topic, self.token, self.secret)) - self.assertTrue(Subscription.insert( - self.callback2, self.topic, self.token, self.secret)) - self.assertTrue(Subscription.insert( - self.callback3, self.topic, self.token, self.secret)) - sub_list = Subscription.get_subscribers(self.topic, 1) - self.assertEquals(1, len(sub_list)) - - def testGetSubscribers_withOffset(self): - """Tests the behavior of the starting_at_callback offset parameter.""" - # In the order the query will sort them. - all_hashes = [ - u'87a74994e48399251782eb401e9a61bd1d55aeee', - u'01518f29da9db10888a92e9f0211ac0c98ec7ecb', - u'f745d00a9806a5cdd39f16cd9eff80e8f064cfee', - ] - all_keys = ['hash_' + h for h in all_hashes] - all_callbacks = [self.callback_key_map[k] for k in all_keys] - - self.assertTrue(Subscription.insert( - self.callback, self.topic, self.token, self.secret)) - self.assertTrue(Subscription.insert( - self.callback2, self.topic, self.token, self.secret)) - self.assertTrue(Subscription.insert( - self.callback3, self.topic, self.token, self.secret)) - - def key_list(starting_at_callback): - sub_list = Subscription.get_subscribers( - self.topic, 10, starting_at_callback=starting_at_callback) - return [s.key().name() for s in sub_list] - - self.assertEquals(all_keys, key_list(None)) - self.assertEquals(all_keys, key_list(all_callbacks[0])) - self.assertEquals(all_keys[1:], key_list(all_callbacks[1])) - self.assertEquals(all_keys[2:], key_list(all_callbacks[2])) - - def testGetSubscribers_multipleTopics(self): - """Tests that separate topics do not overlap in subscriber queries.""" - self.assertEquals([], Subscription.get_subscribers(self.topic2, 10)) - self.assertTrue(Subscription.insert( - self.callback, self.topic, self.token, self.secret)) - self.assertTrue(Subscription.insert( - self.callback2, self.topic, self.token, self.secret)) - self.assertTrue(Subscription.insert( - self.callback3, self.topic, self.token, self.secret)) - self.assertEquals([], Subscription.get_subscribers(self.topic2, 10)) - - self.assertTrue(Subscription.insert( - self.callback2, self.topic2, self.token, self.secret)) - self.assertTrue(Subscription.insert( - self.callback3, self.topic2, self.token, self.secret)) - sub_list = Subscription.get_subscribers(self.topic2, 10) - found_keys = set(s.key().name() for s in sub_list) - self.assertEquals( - set(Subscription.create_key_name(cb, self.topic2) - for cb in (self.callback2, self.callback3)), - found_keys) - self.assertEquals(3, len(Subscription.get_subscribers(self.topic, 10))) - - def testConfirmFailed(self): - """Tests retry delay periods when a subscription confirmation fails.""" - start = datetime.datetime.utcnow() - def 
now(): - return start - - sub_key = Subscription.create_key_name(self.callback, self.topic) - self.assertTrue(Subscription.request_insert( - self.callback, self.topic, self.token, self.secret)) - sub_key = Subscription.create_key_name(self.callback, self.topic) - sub = Subscription.get_by_key_name(sub_key) - self.assertEquals(0, sub.confirm_failures) - - for i, delay in enumerate((5, 10, 20, 40, 80)): - self.assertTrue( - sub.confirm_failed(Subscription.STATE_VERIFIED, self.token, False, - max_failures=5, retry_period=5, now=now)) - self.assertEquals(sub.eta, start + datetime.timedelta(seconds=delay)) - self.assertEquals(i+1, sub.confirm_failures) - - # It will give up on the last try. - self.assertFalse( - sub.confirm_failed(Subscription.STATE_VERIFIED, self.token, False, - max_failures=5, retry_period=5)) - sub = Subscription.get_by_key_name(sub_key) - self.assertEquals(Subscription.STATE_NOT_VERIFIED, sub.subscription_state) - testutil.get_tasks(main.SUBSCRIPTION_QUEUE, index=0, expected_count=6) - - def testQueueSelected(self): - """Tests that auto_reconfirm will put the task on the polling queue.""" - self.assertTrue(Subscription.request_insert( - self.callback, self.topic, self.token, self.secret, - auto_reconfirm=True)) - testutil.get_tasks(main.SUBSCRIPTION_QUEUE, expected_count=0) - testutil.get_tasks(main.POLLING_QUEUE, expected_count=1) - - self.assertFalse(Subscription.request_insert( - self.callback, self.topic, self.token, self.secret, - auto_reconfirm=False)) - testutil.get_tasks(main.SUBSCRIPTION_QUEUE, expected_count=1) - testutil.get_tasks(main.POLLING_QUEUE, expected_count=1) - - def testArchiveExists(self): - """Tests the archive method when the subscription exists.""" - Subscription.insert(self.callback, self.topic, self.token, self.secret) - sub_key = Subscription.create_key_name(self.callback, self.topic) - sub = Subscription.get_by_key_name(sub_key) - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - Subscription.archive(self.callback, self.topic) - sub = Subscription.get_by_key_name(sub_key) - self.assertEquals(Subscription.STATE_TO_DELETE, sub.subscription_state) - - def testArchiveMissing(self): - """Tests the archive method when the subscription does not exist.""" - sub_key = Subscription.create_key_name(self.callback, self.topic) - self.assertTrue(Subscription.get_by_key_name(sub_key) is None) - Subscription.archive(self.callback, self.topic) - self.assertTrue(Subscription.get_by_key_name(sub_key) is None) - -################################################################################ - -FeedToFetch = main.FeedToFetch - -class FeedToFetchTest(unittest.TestCase): - - def setUp(self): - """Sets up the test harness.""" - testutil.setup_for_testing() - self.topic = 'http://example.com/topic-one' - self.topic2 = 'http://example.com/topic-two' - self.topic3 = 'http://example.com/topic-three' - - def testInsertAndGet(self): - """Tests inserting and getting work.""" - all_topics = [self.topic, self.topic2, self.topic3] - found_feeds = FeedToFetch.insert(all_topics) - task = testutil.get_tasks(main.FEED_QUEUE, index=0, expected_count=1) - self.assertTrue(task['name'].endswith('%d-0' % found_feeds[0].work_index)) - - for topic, feed_to_fetch in zip(all_topics, found_feeds): - self.assertEquals(topic, feed_to_fetch.topic) - self.assertEquals([], feed_to_fetch.source_keys) - self.assertEquals([], feed_to_fetch.source_values) - self.assertEquals(found_feeds[0].work_index, feed_to_fetch.work_index) - - def testEmpty(self): - """Tests when the list 
of urls is empty.""" - FeedToFetch.insert([]) - self.assertEquals([], testutil.get_tasks(main.FEED_QUEUE)) - - def testDuplicates(self): - """Tests duplicate urls.""" - all_topics = [self.topic, self.topic, self.topic2, self.topic2] - found_feeds = FeedToFetch.insert(all_topics) - found_topics = set(t.topic for t in found_feeds) - self.assertEquals(set(all_topics), found_topics) - task = testutil.get_tasks(main.FEED_QUEUE, index=0, expected_count=1) - self.assertTrue(task['name'].endswith('%d-0' % found_feeds[0].work_index)) - - def testDone(self): - """Tests marking the feed as completed.""" - (feed,) = FeedToFetch.insert([self.topic]) - self.assertFalse(feed.done()) - self.assertTrue(FeedToFetch.get_by_topic(self.topic) is None) - - def testDoneAfterFailure(self): - """Tests done() after a fetch_failed() writes the FeedToFetch to disk.""" - (feed,) = FeedToFetch.insert([self.topic]) - feed.fetch_failed() - self.assertTrue(feed.done()) - self.assertTrue(FeedToFetch.get_by_topic(self.topic) is None) - - def testDoneConflict(self): - """Tests when another entity was written over the top of this one.""" - (feed1,) = FeedToFetch.insert([self.topic]) - feed1.put() - (feed2,) = FeedToFetch.insert([self.topic]) - feed2.put() - - self.assertFalse(feed1.done()) - self.assertTrue(FeedToFetch.get_by_topic(self.topic) is not None) - - def testFetchFailed(self): - """Tests when the fetch fails and should be retried.""" - start = datetime.datetime.utcnow() - now = lambda: start - - (feed,) = FeedToFetch.insert([self.topic]) - etas = [] - for i, delay in enumerate((5, 10, 20, 40, 80)): - feed = FeedToFetch.get_by_topic(self.topic) or feed - feed.fetch_failed(max_failures=5, retry_period=5, now=now) - expected_eta = start + datetime.timedelta(seconds=delay) - self.assertEquals(expected_eta, feed.eta) - etas.append(testutil.task_eta(feed.eta)) - self.assertEquals(i+1, feed.fetching_failures) - self.assertEquals(False, feed.totally_failed) - - feed.fetch_failed(max_failures=5, retry_period=5, now=now) - self.assertEquals(True, feed.totally_failed) - - tasks = testutil.get_tasks(main.FEED_QUEUE, expected_count=1) - tasks.extend(testutil.get_tasks(main.FEED_RETRIES_QUEUE, expected_count=5)) - found_etas = [t['eta'] for t in tasks[1:]] # First task is from insert() - self.assertEquals(etas, found_etas) - - def testQueuePreserved(self): - """Tests the request's polling queue is preserved for new FeedToFetch.""" - FeedToFetch.insert([self.topic]) - testutil.get_tasks(main.FEED_QUEUE, expected_count=1) - - os.environ['HTTP_X_APPENGINE_QUEUENAME'] = main.POLLING_QUEUE - try: - (feed,) = FeedToFetch.insert([self.topic]) - testutil.get_tasks(main.FEED_QUEUE, expected_count=1) - testutil.get_tasks(main.POLLING_QUEUE, expected_count=1) - finally: - del os.environ['HTTP_X_APPENGINE_QUEUENAME'] - - def testSources(self): - """Tests when sources are supplied.""" - source_dict = {'foo': 'bar', 'meepa': 'stuff'} - all_topics = [self.topic, self.topic2, self.topic3] - feed_list = FeedToFetch.insert(all_topics, source_dict=source_dict) - for feed_to_fetch in feed_list: - found_source_dict = dict(zip(feed_to_fetch.source_keys, - feed_to_fetch.source_values)) - self.assertEquals(source_dict, found_source_dict) - -################################################################################ - -FeedEntryRecord = main.FeedEntryRecord -EventToDeliver = main.EventToDeliver - - -class EventToDeliverTest(unittest.TestCase): - - def setUp(self): - """Sets up the test harness.""" - testutil.setup_for_testing() - self.topic = 
'http://example.com/my-topic' - # Order out of the datastore will be done by callback hash, not alphabetical - self.callback = 'http://example.com/my-callback' - self.callback2 = 'http://example.com/second-callback' - self.callback3 = 'http://example.com/third-callback-123' - self.callback4 = 'http://example.com/fourth-callback-1205' - self.header_footer = '\nblah\n' - self.token = 'verify token' - self.secret = 'some secret' - self.test_payloads = [ - 'article1', - 'article2', - 'article3', - ] - - def insert_subscriptions(self): - """Inserts Subscription instances and an EventToDeliver for testing. - - Returns: - Tuple (event, work_key, sub_list, sub_keys) where: - event: The EventToDeliver that was inserted. - work_key: Key for the 'event' - sub_list: List of Subscription instances that were created in order - of their callback hashes. - sub_keys: Key instances corresponding to the entries in 'sub_list'. - """ - event = EventToDeliver.create_event_for_topic( - self.topic, main.ATOM, 'application/atom+xml', - self.header_footer, self.test_payloads) - event.put() - work_key = event.key() - - Subscription.insert( - self.callback, self.topic, self.token, self.secret) - Subscription.insert( - self.callback2, self.topic, self.token, self.secret) - Subscription.insert( - self.callback3, self.topic, self.token, self.secret) - Subscription.insert( - self.callback4, self.topic, self.token, self.secret) - sub_list = Subscription.get_subscribers(self.topic, 10) - sub_keys = [s.key() for s in sub_list] - self.assertEquals(4, len(sub_list)) - - return (event, work_key, sub_list, sub_keys) - - def testCreateEventForTopic(self): - """Tests that the payload of an event is properly formed.""" - event = EventToDeliver.create_event_for_topic( - self.topic, main.ATOM, 'application/atom+xml', - self.header_footer, self.test_payloads) - expected_data = \ -u""" - -blah - -article1 -article2 -article3 -""" - self.assertEquals(expected_data, event.payload) - self.assertEquals('application/atom+xml', event.content_type) - - def testCreateEventForTopic_Rss(self): - """Tests that the RSS payload is properly formed.""" - self.test_payloads = [ - 'article1', - 'article2', - 'article3', - ] - self.header_footer = ( - '\n\nblah\n\n') - event = EventToDeliver.create_event_for_topic( - self.topic, main.RSS, 'application/rss+xml', - self.header_footer, self.test_payloads) - expected_data = \ -u""" - - -blah - -article1 -article2 -article3 - -""" - self.assertEquals(expected_data, event.payload) - self.assertEquals('application/rss+xml', event.content_type) - - def testCreateEventForTopic_Abitrary(self): - """Tests that an arbitrary payload is properly formed.""" - self.test_payloads = [] - self.header_footer = 'this is my data here' - event = EventToDeliver.create_event_for_topic( - self.topic, main.ARBITRARY, 'my crazy content type', - self.header_footer, self.test_payloads) - expected_data = 'this is my data here' - self.assertEquals(expected_data, event.payload) - self.assertEquals('my crazy content type', event.content_type) - - def testCreateEvent_badHeaderFooter(self): - """Tests when the header/footer data in an event is invalid.""" - self.assertRaises(AssertionError, EventToDeliver.create_event_for_topic, - self.topic, main.ATOM, 'content type unused', - 'has no end tag', self.test_payloads) - - def testNormal_noFailures(self): - """Tests that event delivery with no failures will delete the event.""" - event, work_key, sub_list, sub_keys = self.insert_subscriptions() - more, subs = event.get_next_subscribers() - 
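# An empty failed-callbacks list means every delivery succeeded, so update() deletes the event. -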
event.update(more, []) - event = EventToDeliver.get(work_key) - self.assertTrue(event is None) - - def testUpdate_failWithNoSubscribersLeft(self): - """Tests that failures are written correctly by EventToDeliver.update. - - This tests the common case of completing the failed callbacks list extending - when there are new Subscriptions that have been found in the latest work - queue query. - """ - event, work_key, sub_list, sub_keys = self.insert_subscriptions() - - # Assert that the callback offset is updated and any failed callbacks - # are recorded. - more, subs = event.get_next_subscribers(chunk_size=1) - event.update(more, [sub_list[0]]) - event = EventToDeliver.get(event.key()) - self.assertEquals(EventToDeliver.NORMAL, event.delivery_mode) - self.assertEquals([sub_list[0].key()], event.failed_callbacks) - self.assertEquals(self.callback2, event.last_callback) - - more, subs = event.get_next_subscribers(chunk_size=3) - event.update(more, sub_list[1:]) - event = EventToDeliver.get(event.key()) - self.assertTrue(event is not None) - self.assertEquals(EventToDeliver.RETRY, event.delivery_mode) - self.assertEquals('', event.last_callback) - - self.assertEquals([s.key() for s in sub_list], event.failed_callbacks) - tasks = testutil.get_tasks(main.EVENT_QUEUE, expected_count=1) - tasks.extend(testutil.get_tasks(main.EVENT_RETRIES_QUEUE, expected_count=1)) - self.assertEquals([str(work_key)] * 2, - [t['params']['event_key'] for t in tasks]) - - def testUpdate_actuallyNoMoreCallbacks(self): - """Tests when the normal update delivery has no Subscriptions left. - - This tests the case where update is called with no Subscribers in the - list of Subscriptions. This can happen if a Subscription is deleted - between when an update happens and when the work queue is invoked again. - """ - event, work_key, sub_list, sub_keys = self.insert_subscriptions() - - more, subs = event.get_next_subscribers(chunk_size=3) - event.update(more, subs) - event = EventToDeliver.get(event.key()) - self.assertEquals(self.callback4, event.last_callback) - self.assertEquals(EventToDeliver.NORMAL, event.delivery_mode) - - # This final call to update will transition to retry properly. - Subscription.remove(self.callback4, self.topic) - more, subs = event.get_next_subscribers(chunk_size=1) - event.update(more, []) - event = EventToDeliver.get(event.key()) - self.assertEquals([], subs) - self.assertTrue(event is not None) - self.assertEquals(EventToDeliver.RETRY, event.delivery_mode) - - tasks = testutil.get_tasks(main.EVENT_QUEUE, expected_count=1) - tasks.extend(testutil.get_tasks(main.EVENT_RETRIES_QUEUE, expected_count=1)) - self.assertEquals([str(work_key)] * 2, - [t['params']['event_key'] for t in tasks]) - - def testGetNextSubscribers_retriesFinallySuccessful(self): - """Tests retries until all subscribers are successful.""" - event, work_key, sub_list, sub_keys = self.insert_subscriptions() - - # Simulate that callback 2 is successful and the rest fail. 
- more, subs = event.get_next_subscribers(chunk_size=2) - event.update(more, sub_list[:1]) - event = EventToDeliver.get(event.key()) - self.assertTrue(more) - self.assertEquals(self.callback3, event.last_callback) - self.assertEquals(EventToDeliver.NORMAL, event.delivery_mode) - - more, subs = event.get_next_subscribers(chunk_size=2) - event.update(more, sub_list[2:]) - event = EventToDeliver.get(event.key()) - self.assertEquals('', event.last_callback) - self.assertFalse(more) - self.assertEquals(EventToDeliver.RETRY, event.delivery_mode) - - # Now getting the next subscribers will returned the failed ones. - more, subs = event.get_next_subscribers(chunk_size=2) - expected = sub_keys[:1] + sub_keys[2:3] - self.assertEquals(expected, [s.key() for s in subs]) - event.update(more, subs) - event = EventToDeliver.get(event.key()) - self.assertTrue(more) - self.assertEquals(self.callback, event.last_callback) - self.assertEquals(EventToDeliver.RETRY, event.delivery_mode) - - # This will get the last of the failed subscribers but *not* include the - # sentinel value of event.last_callback, since that marks the end of this - # attempt. - more, subs = event.get_next_subscribers(chunk_size=2) - expected = sub_keys[3:] - self.assertEquals(expected, [s.key() for s in subs]) - event.update(more, subs) - event = EventToDeliver.get(event.key()) - self.assertFalse(more) - self.assertEquals('', event.last_callback) - self.assertEquals(EventToDeliver.RETRY, event.delivery_mode) - self.assertEquals(sub_keys[:1] + sub_keys[2:], event.failed_callbacks) - - # Now simulate all retries being successful one chunk at a time. - more, subs = event.get_next_subscribers(chunk_size=2) - expected = sub_keys[:1] + sub_keys[2:3] - self.assertEquals(expected, [s.key() for s in subs]) - event.update(more, []) - event = EventToDeliver.get(event.key()) - self.assertTrue(more) - self.assertEquals(self.callback, event.last_callback) - self.assertEquals(EventToDeliver.RETRY, event.delivery_mode) - self.assertEquals(sub_keys[3:], event.failed_callbacks) - - more, subs = event.get_next_subscribers(chunk_size=2) - expected = sub_keys[3:] - self.assertEquals(expected, [s.key() for s in subs]) - event.update(more, []) - self.assertFalse(more) - - tasks = testutil.get_tasks(main.EVENT_QUEUE, expected_count=1) - tasks.extend(testutil.get_tasks(main.EVENT_RETRIES_QUEUE, expected_count=4)) - self.assertEquals([str(work_key)] * 5, - [t['params']['event_key'] for t in tasks]) - - def testGetNextSubscribers_failedFewerThanChunkSize(self): - """Tests when there are fewer failed callbacks than the chunk size. - - Ensures that we step through retry attempts when there is only a single - chunk to go through on each retry iteration. - """ - event, work_key, sub_list, sub_keys = self.insert_subscriptions() - - # Simulate that callback 2 is successful and the rest fail. - more, subs = event.get_next_subscribers(chunk_size=2) - event.update(more, sub_list[:1]) - event = EventToDeliver.get(event.key()) - self.assertTrue(more) - self.assertEquals(self.callback3, event.last_callback) - self.assertEquals(EventToDeliver.NORMAL, event.delivery_mode) - - more, subs = event.get_next_subscribers(chunk_size=2) - event.update(more, sub_list[2:]) - event = EventToDeliver.get(event.key()) - self.assertEquals('', event.last_callback) - self.assertFalse(more) - self.assertEquals(EventToDeliver.RETRY, event.delivery_mode) - self.assertEquals(1, event.retry_attempts) - - # Now attempt a retry with a chunk size equal to the number of callbacks. 
- more, subs = event.get_next_subscribers(chunk_size=3) - event.update(more, subs) - event = EventToDeliver.get(event.key()) - self.assertFalse(more) - self.assertEquals(EventToDeliver.RETRY, event.delivery_mode) - self.assertEquals(2, event.retry_attempts) - - tasks = testutil.get_tasks(main.EVENT_QUEUE, expected_count=1) - tasks.extend(testutil.get_tasks(main.EVENT_RETRIES_QUEUE, expected_count=2)) - self.assertEquals([str(work_key)] * 3, - [t['params']['event_key'] for t in tasks]) - - def testGetNextSubscribers_giveUp(self): - """Tests retry delay amounts until we finally give up on event delivery. - - Verifies retry delay logic works properly. - """ - event, work_key, sub_list, sub_keys = self.insert_subscriptions() - - start = datetime.datetime.utcnow() - now = lambda: start - - etas = [] - for i, delay in enumerate((5, 10, 20, 40, 80, 160, 320, 640)): - more, subs = event.get_next_subscribers(chunk_size=4) - event.update(more, subs, retry_period=5, now=now, max_failures=8) - event = EventToDeliver.get(event.key()) - self.assertEquals(i+1, event.retry_attempts) - expected_eta = start + datetime.timedelta(seconds=delay) - self.assertEquals(expected_eta, event.last_modified) - etas.append(testutil.task_eta(event.last_modified)) - self.assertFalse(event.totally_failed) - - more, subs = event.get_next_subscribers(chunk_size=4) - event.update(more, subs) - event = EventToDeliver.get(event.key()) - self.assertTrue(event.totally_failed) - - tasks = testutil.get_tasks(main.EVENT_RETRIES_QUEUE, expected_count=8) - found_etas = [t['eta'] for t in tasks] - self.assertEquals(etas, found_etas) - - def testQueuePreserved(self): - """Tests that enqueueing an EventToDeliver preserves the polling queue.""" - event, work_key, sub_list, sub_keys = self.insert_subscriptions() - def txn(): - event.enqueue() - db.run_in_transaction(txn) - - testutil.get_tasks(main.EVENT_QUEUE, expected_count=1) - os.environ['HTTP_X_APPENGINE_QUEUENAME'] = main.POLLING_QUEUE - try: - db.run_in_transaction(txn) - finally: - del os.environ['HTTP_X_APPENGINE_QUEUENAME'] - - testutil.get_tasks(main.EVENT_QUEUE, expected_count=1) - testutil.get_tasks(main.POLLING_QUEUE, expected_count=1) - - def testMaxFailuresOverride(self): - """Tests the max_failures override value.""" - event = EventToDeliver.create_event_for_topic( - self.topic, main.ATOM, 'application/atom+xml', - self.header_footer, self.test_payloads) - self.assertEquals(None, event.max_failures) - - event = EventToDeliver.create_event_for_topic( - self.topic, main.ATOM, 'application/atom+xml', - self.header_footer, self.test_payloads, - max_failures=1) - self.assertEquals(1, event.max_failures) - - Subscription.insert( - self.callback, self.topic, self.token, self.secret) - subscription_list = list(Subscription.all()) - - event.put() - event.update(False, subscription_list) - event2 = db.get(event.key()) - self.assertFalse(event2.totally_failed) - - event2.update(False, []) - event3 = db.get(event.key()) - self.assertTrue(event3.totally_failed) - -################################################################################ - -class PublishHandlerTest(testutil.HandlerTestBase): - - handler_class = main.PublishHandler - - def setUp(self): - testutil.HandlerTestBase.setUp(self) - self.topic = 'http://example.com/first-url' - self.topic2 = 'http://example.com/second-url' - self.topic3 = 'http://example.com/third-url' - - def get_feeds_to_fetch(self): - """Gets the enqueued FeedToFetch records.""" - return FeedToFetch.FORK_JOIN_QUEUE.pop( - 
testutil.get_tasks(main.FEED_QUEUE, index=0, expected_count=1)['name']) - - def testDebugFormRenders(self): - self.handle('get') - self.assertTrue('' in self.response_body()) - - def testBadMode(self): - self.handle('post', - ('hub.mode', 'invalid'), - ('hub.url', 'http://example.com')) - self.assertEquals(400, self.response_code()) - self.assertTrue('hub.mode' in self.response_body()) - - def testNoUrls(self): - self.handle('post', ('hub.mode', 'publish')) - self.assertEquals(400, self.response_code()) - self.assertTrue('hub.url' in self.response_body()) - - def testBadUrls(self): - self.handle('post', - ('hub.mode', 'PuBLisH'), - ('hub.url', 'http://example.com/bad_url#fragment')) - self.assertEquals(400, self.response_code()) - self.assertTrue('hub.url invalid' in self.response_body()) - - def testInsertion(self): - db.put([KnownFeed.create(self.topic), - KnownFeed.create(self.topic2), - KnownFeed.create(self.topic3)]) - self.handle('post', - ('hub.mode', 'PuBLisH'), - ('hub.url', self.topic), - ('hub.url', self.topic2), - ('hub.url', self.topic3)) - self.assertEquals(204, self.response_code()) - expected_topics = set([self.topic, self.topic2, self.topic3]) - feed_list = self.get_feeds_to_fetch() - inserted_topics = set(f.topic for f in feed_list) - self.assertEquals(expected_topics, inserted_topics) - - def testIgnoreUnknownFeed(self): - self.handle('post', - ('hub.mode', 'PuBLisH'), - ('hub.url', self.topic), - ('hub.url', self.topic2), - ('hub.url', self.topic3)) - self.assertEquals(204, self.response_code()) - testutil.get_tasks(main.FEED_QUEUE, expected_count=0) - - def testDuplicateUrls(self): - db.put([KnownFeed.create(self.topic), - KnownFeed.create(self.topic2)]) - self.handle('post', - ('hub.mode', 'PuBLisH'), - ('hub.url', self.topic), - ('hub.url', self.topic), - ('hub.url', self.topic), - ('hub.url', self.topic), - ('hub.url', self.topic), - ('hub.url', self.topic), - ('hub.url', self.topic), - ('hub.url', self.topic2), - ('hub.url', self.topic2), - ('hub.url', self.topic2), - ('hub.url', self.topic2), - ('hub.url', self.topic2), - ('hub.url', self.topic2), - ('hub.url', self.topic2)) - self.assertEquals(204, self.response_code()) - expected_topics = set([self.topic, self.topic2]) - inserted_topics = set(f.topic for f in self.get_feeds_to_fetch()) - self.assertEquals(expected_topics, inserted_topics) - - def testInsertFailure(self): - """Tests when a publish event fails insertion.""" - old_insert = FeedToFetch.insert - try: - for exception in (db.Error(), apiproxy_errors.Error(), - runtime.DeadlineExceededError()): - @classmethod - def new_insert(cls, *args): - raise exception - FeedToFetch.insert = new_insert - self.handle('post', - ('hub.mode', 'PuBLisH'), - ('hub.url', 'http://example.com/first-url'), - ('hub.url', 'http://example.com/second-url'), - ('hub.url', 'http://example.com/third-url')) - self.assertEquals(503, self.response_code()) - finally: - FeedToFetch.insert = old_insert - - def testCaseSensitive(self): - """Tests that cases for topics URLs are preserved.""" - self.topic += FUNNY - self.topic2 += FUNNY - self.topic3 += FUNNY - db.put([KnownFeed.create(self.topic), - KnownFeed.create(self.topic2), - KnownFeed.create(self.topic3)]) - self.handle('post', - ('hub.mode', 'PuBLisH'), - ('hub.url', self.topic), - ('hub.url', self.topic2), - ('hub.url', self.topic3)) - self.assertEquals(204, self.response_code()) - expected_topics = set([self.topic, self.topic2, self.topic3]) - inserted_topics = set(f.topic for f in self.get_feeds_to_fetch()) - 
self.assertEquals(expected_topics, inserted_topics) - - def testNormalization(self): - """Tests that URLs are properly normalized.""" - self.topic += OTHER_STRING - self.topic2 += OTHER_STRING - self.topic3 += OTHER_STRING - normalized = [ - main.normalize_iri(t) - for t in [self.topic, self.topic2, self.topic3]] - db.put([KnownFeed.create(t) for t in normalized]) - self.handle('post', - ('hub.mode', 'PuBLisH'), - ('hub.url', self.topic), - ('hub.url', self.topic2), - ('hub.url', self.topic3)) - self.assertEquals(204, self.response_code()) - inserted_topics = set(f.topic for f in self.get_feeds_to_fetch()) - self.assertEquals(set(normalized), inserted_topics) - - def testIri(self): - """Tests publishing with an IRI with international characters.""" - topic = main.normalize_iri(self.topic + FUNNY_UNICODE) - topic2 = main.normalize_iri(self.topic2 + FUNNY_UNICODE) - topic3 = main.normalize_iri(self.topic3 + FUNNY_UNICODE) - normalized = [topic, topic2, topic3] - db.put([KnownFeed.create(t) for t in normalized]) - self.handle('post', - ('hub.mode', 'PuBLisH'), - ('hub.url', self.topic + FUNNY_UTF8), - ('hub.url', self.topic2 + FUNNY_UTF8), - ('hub.url', self.topic3 + FUNNY_UTF8)) - self.assertEquals(204, self.response_code()) - inserted_topics = set(f.topic for f in self.get_feeds_to_fetch()) - self.assertEquals(set(normalized), inserted_topics) - - def testUnicode(self): - """Tests publishing with a URL that has unicode characters.""" - topic = main.normalize_iri(self.topic + FUNNY_UNICODE) - topic2 = main.normalize_iri(self.topic2 + FUNNY_UNICODE) - topic3 = main.normalize_iri(self.topic3 + FUNNY_UNICODE) - normalized = [topic, topic2, topic3] - db.put([KnownFeed.create(t) for t in normalized]) - - payload = ( - 'hub.mode=publish' - '&hub.url=' + urllib.quote(self.topic) + FUNNY_UTF8 + - '&hub.url=' + urllib.quote(self.topic2) + FUNNY_UTF8 + - '&hub.url=' + urllib.quote(self.topic3) + FUNNY_UTF8) - self.handle_body('post', payload) - self.assertEquals(204, self.response_code()) - inserted_topics = set(f.topic for f in self.get_feeds_to_fetch()) - self.assertEquals(set(normalized), inserted_topics) - - def testSources(self): - """Tests that derived sources are properly set on FeedToFetch instances.""" - db.put([KnownFeed.create(self.topic), - KnownFeed.create(self.topic2), - KnownFeed.create(self.topic3)]) - source_dict = {'one': 'two', 'three': 'four'} - topics = [self.topic, self.topic2, self.topic3] - def derive_sources(handler, urls): - self.assertEquals(set(topics), set(urls)) - self.assertEquals('testvalue', handler.request.get('the-real-thing')) - return source_dict - - main.hooks.override_for_test(main.derive_sources, derive_sources) - try: - self.handle('post', - ('hub.mode', 'PuBLisH'), - ('hub.url', self.topic), - ('hub.url', self.topic2), - ('hub.url', self.topic3), - ('the-real-thing', 'testvalue')) - self.assertEquals(204, self.response_code()) - for feed_to_fetch in self.get_feeds_to_fetch(): - found_source_dict = dict(zip(feed_to_fetch.source_keys, - feed_to_fetch.source_values)) - self.assertEquals(source_dict, found_source_dict) - finally: - main.hooks.reset_for_test(main.derive_sources) - - -class PublishHandlerThroughHubUrlTest(PublishHandlerTest): - - handler_class = main.HubHandler - -################################################################################ - -class FindFeedUpdatesTest(unittest.TestCase): - - def setUp(self): - """Sets up the test harness.""" - testutil.setup_for_testing() - self.topic = 'http://example.com/my-topic-here' - self.header_footer = 
'this is my test header footer' - self.entries_map = { - 'id1': 'content1', - 'id2': 'content2', - 'id3': 'content3', - } - self.content = 'the expected response data' - def my_filter(content, ignored_format): - self.assertEquals(self.content, content) - return self.header_footer, self.entries_map - self.my_filter = my_filter - - def run_test(self): - """Runs a test.""" - header_footer, entry_list, entry_payloads = main.find_feed_updates( - self.topic, main.ATOM, self.content, filter_feed=self.my_filter) - self.assertEquals(self.header_footer, header_footer) - return entry_list, entry_payloads - - @staticmethod - def get_entry(entry_id, entry_list): - """Finds the entry with the given ID in the list of entries.""" - return [e for e in entry_list if e.id_hash == sha1_hash(entry_id)][0] - - def testAllNewContent(self): - """Tests when al pulled feed content is new.""" - entry_list, entry_payloads = self.run_test() - entry_id_hash_set = set(f.id_hash for f in entry_list) - self.assertEquals(set(sha1_hash(k) for k in self.entries_map.keys()), - entry_id_hash_set) - self.assertEquals(self.entries_map.values(), entry_payloads) - - def testSomeExistingEntries(self): - """Tests when some entries are already known.""" - FeedEntryRecord.create_entry_for_topic( - self.topic, 'id1', sha1_hash('content1')).put() - FeedEntryRecord.create_entry_for_topic( - self.topic, 'id2', sha1_hash('content2')).put() - - entry_list, entry_payloads = self.run_test() - entry_id_hash_set = set(f.id_hash for f in entry_list) - self.assertEquals(set(sha1_hash(k) for k in ['id3']), entry_id_hash_set) - self.assertEquals(['content3'], entry_payloads) - - def testPulledEntryNewer(self): - """Tests when an entry is already known but has been updated recently.""" - FeedEntryRecord.create_entry_for_topic( - self.topic, 'id1', sha1_hash('content1')).put() - FeedEntryRecord.create_entry_for_topic( - self.topic, 'id2', sha1_hash('content2')).put() - self.entries_map['id1'] = 'newcontent1' - - entry_list, entry_payloads = self.run_test() - entry_id_hash_set = set(f.id_hash for f in entry_list) - self.assertEquals(set(sha1_hash(k) for k in ['id1', 'id3']), - entry_id_hash_set) - - # Verify the old entry would be overwritten. 
- entry1 = self.get_entry('id1', entry_list) - self.assertEquals(sha1_hash('newcontent1'), entry1.entry_content_hash) - self.assertEquals(['content3', 'newcontent1'], entry_payloads) - - def testUnicodeContent(self): - """Tests when the content contains unicode characters.""" - self.entries_map['id2'] = u'\u2019 asdf' - entry_list, entry_payloads = self.run_test() - entry_id_hash_set = set(f.id_hash for f in entry_list) - self.assertEquals(set(sha1_hash(k) for k in self.entries_map.keys()), - entry_id_hash_set) - - def testMultipleParallelBatches(self): - """Tests that retrieving FeedEntryRecords is done in multiple batches.""" - old_get_feed_record = main.FeedEntryRecord.get_entries_for_topic - calls = [0] - @staticmethod - def fake_get_record(*args, **kwargs): - calls[0] += 1 - return old_get_feed_record(*args, **kwargs) - - old_lookups = main.MAX_FEED_ENTRY_RECORD_LOOKUPS - main.FeedEntryRecord.get_entries_for_topic = fake_get_record - main.MAX_FEED_ENTRY_RECORD_LOOKUPS = 1 - try: - entry_list, entry_payloads = self.run_test() - entry_id_hash_set = set(f.id_hash for f in entry_list) - self.assertEquals(set(sha1_hash(k) for k in self.entries_map.keys()), - entry_id_hash_set) - self.assertEquals(self.entries_map.values(), entry_payloads) - self.assertEquals(3, calls[0]) - finally: - main.MAX_FEED_ENTRY_RECORD_LOOKUPS = old_lookups - main.FeedEntryRecord.get_entries_for_topic = old_get_feed_record - -################################################################################ - -FeedRecord = main.FeedRecord -KnownFeedStats = main.KnownFeedStats - -class PullFeedHandlerTest(testutil.HandlerTestBase): - - handler_class = main.PullFeedHandler - - def setUp(self): - """Sets up the test harness.""" - testutil.HandlerTestBase.setUp(self) - - self.topic = 'http://example.com/my-topic-here' - self.header_footer = 'this is my test header footer' - self.all_ids = ['1', '2', '3'] - self.entry_payloads = [ - 'content%s' % entry_id for entry_id in self.all_ids - ] - self.entry_list = [ - FeedEntryRecord.create_entry_for_topic( - self.topic, entry_id, 'content%s' % entry_id) - for entry_id in self.all_ids - ] - self.expected_response = 'the expected response data' - self.etag = 'something unique' - self.last_modified = 'some time' - self.headers = { - 'ETag': self.etag, - 'Last-Modified': self.last_modified, - 'Content-Type': 'application/atom+xml', - } - self.expected_exceptions = [] - - def my_find_updates(ignored_topic, ignored_format, content): - self.assertEquals(self.expected_response, content) - if self.expected_exceptions: - raise self.expected_exceptions.pop(0) - return self.header_footer, self.entry_list, self.entry_payloads - - self.old_find_feed_updates = main.find_feed_updates - main.find_feed_updates = my_find_updates - - self.callback = 'http://example.com/my-subscriber' - self.assertTrue(Subscription.insert( - self.callback, self.topic, 'token', 'secret')) - - def tearDown(self): - """Tears down the test harness.""" - main.find_feed_updates = self.old_find_feed_updates - urlfetch_test_stub.instance.verify_and_reset() - - def run_fetch_task(self, index=0): - """Runs the currently enqueued fetch task.""" - task = testutil.get_tasks(main.FEED_QUEUE, index=index) - os.environ['HTTP_X_APPENGINE_TASKNAME'] = task['name'] - try: - self.handle('post') - finally: - del os.environ['HTTP_X_APPENGINE_TASKNAME'] - - def testNoWork(self): - self.handle('post', ('topic', self.topic)) - - def testNewEntries_Atom(self): - """Tests when new entries are found.""" - FeedToFetch.insert([self.topic]) - 
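# Serve a canned 200 response with the test headers for the enqueued fetch. -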
urlfetch_test_stub.instance.expect( - 'get', self.topic, 200, self.expected_response, - response_headers=self.headers) - self.run_fetch_task() - - # Verify that all feed entry records have been written along with the - # EventToDeliver and FeedRecord. - feed_entries = FeedEntryRecord.get_entries_for_topic( - self.topic, self.all_ids) - self.assertEquals( - [sha1_hash(k) for k in self.all_ids], - [e.id_hash for e in feed_entries]) - - work = EventToDeliver.all().get() - event_key = work.key() - self.assertEquals(self.topic, work.topic) - self.assertTrue('content1\ncontent2\ncontent3' in work.payload) - work.delete() - - record = FeedRecord.get_or_create(self.topic) - self.assertEquals(self.header_footer, record.header_footer) - self.assertEquals(self.etag, record.etag) - self.assertEquals(self.last_modified, record.last_modified) - self.assertEquals('application/atom+xml', record.content_type) - - task = testutil.get_tasks(main.EVENT_QUEUE, index=0, expected_count=1) - self.assertEquals(str(event_key), task['params']['event_key']) - - self.assertEquals([(1, 0)], main.FETCH_SCORER.get_scores([self.topic])) - - def testRssFailBack(self): - """Tests when parsing as Atom fails and it uses RSS instead.""" - self.expected_exceptions.append(feed_diff.Error('whoops')) - self.header_footer = 'this is my test' - self.headers['Content-Type'] = 'application/xml' - - FeedToFetch.insert([self.topic]) - urlfetch_test_stub.instance.expect( - 'get', self.topic, 200, self.expected_response, - response_headers=self.headers) - self.run_fetch_task() - - feed_entries = FeedEntryRecord.get_entries_for_topic( - self.topic, self.all_ids) - self.assertEquals( - [sha1_hash(k) for k in self.all_ids], - [e.id_hash for e in feed_entries]) - - work = EventToDeliver.all().get() - event_key = work.key() - self.assertEquals(self.topic, work.topic) - self.assertTrue('content1\ncontent2\ncontent3' in work.payload) - work.delete() - - record = FeedRecord.get_or_create(self.topic) - self.assertEquals('application/xml', record.content_type) - - task = testutil.get_tasks(main.EVENT_QUEUE, index=0, expected_count=1) - self.assertEquals(str(event_key), task['params']['event_key']) - - self.assertEquals([(1, 0)], main.FETCH_SCORER.get_scores([self.topic])) - - def testAtomFailBack(self): - """Tests when parsing as RSS fails and it uses Atom instead.""" - self.expected_exceptions.append(feed_diff.Error('whoops')) - self.headers.clear() - self.headers['Content-Type'] = 'application/rss+xml' - info = FeedRecord.get_or_create(self.topic) - info.update(self.headers) - info.put() - - FeedToFetch.insert([self.topic]) - urlfetch_test_stub.instance.expect( - 'get', self.topic, 200, self.expected_response, - response_headers=self.headers) - self.run_fetch_task() - - feed_entries = FeedEntryRecord.get_entries_for_topic( - self.topic, self.all_ids) - self.assertEquals( - [sha1_hash(k) for k in self.all_ids], - [e.id_hash for e in feed_entries]) - - work = EventToDeliver.all().get() - event_key = work.key() - self.assertEquals(self.topic, work.topic) - self.assertTrue('content1\ncontent2\ncontent3' in work.payload) - work.delete() - - record = FeedRecord.get_or_create(self.topic) - self.assertEquals('application/rss+xml', record.content_type) - - task = testutil.get_tasks(main.EVENT_QUEUE, index=0, expected_count=1) - self.assertEquals(str(event_key), task['params']['event_key']) - - self.assertEquals([(1, 0)], main.FETCH_SCORER.get_scores([self.topic])) - - def testArbitraryContent(self): - """Tests when the feed cannot be parsed as Atom or 
RSS.""" - self.entry_list = [] - self.entry_payloads = [] - self.header_footer = 'this is all of the content' - self.expected_exceptions.append(feed_diff.Error('whoops')) - self.expected_exceptions.append(feed_diff.Error('whoops')) - FeedToFetch.insert([self.topic]) - self.headers['content-type'] = 'My Crazy Content Type' - urlfetch_test_stub.instance.expect( - 'get', self.topic, 200, self.expected_response, - response_headers=self.headers) - self.run_fetch_task() - - feed = FeedToFetch.get_by_key_name(get_hash_key_name(self.topic)) - self.assertTrue(feed is None) - self.assertEquals(0, len(list(FeedEntryRecord.all()))) - - work = EventToDeliver.all().get() - event_key = work.key() - self.assertEquals(self.topic, work.topic) - self.assertEquals('this is all of the content', work.payload) - work.delete() - - record = FeedRecord.get_or_create(self.topic) - # header_footer not saved for arbitrary data - self.assertEquals(None, record.header_footer) - self.assertEquals(self.etag, record.etag) - self.assertEquals(self.last_modified, record.last_modified) - self.assertEquals('my crazy content type', record.content_type) - - task = testutil.get_tasks(main.EVENT_QUEUE, index=0, expected_count=1) - self.assertEquals(str(event_key), task['params']['event_key']) - - self.assertEquals([(1, 0)], main.FETCH_SCORER.get_scores([self.topic])) - - testutil.get_tasks(main.FEED_RETRIES_QUEUE, expected_count=0) - - self.assertEquals([(1, 0)], main.FETCH_SCORER.get_scores([self.topic])) - - def testCacheHit(self): - """Tests when the fetched feed matches the last cached version of it.""" - info = FeedRecord.get_or_create(self.topic) - info.update(self.headers) - info.put() - - request_headers = { - 'If-None-Match': self.etag, - 'If-Modified-Since': self.last_modified, - } - - FeedToFetch.insert([self.topic]) - urlfetch_test_stub.instance.expect( - 'get', self.topic, 304, '', - request_headers=request_headers, - response_headers=self.headers) - self.run_fetch_task() - self.assertTrue(EventToDeliver.all().get() is None) - testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - - self.assertEquals([(1, 0)], main.FETCH_SCORER.get_scores([self.topic])) - - def testStatsUserAgent(self): - """Tests that the user agent string includes feed stats.""" - info = FeedRecord.get_or_create(self.topic) - info.update(self.headers) - info.put() - - KnownFeedStats( - key=KnownFeedStats.create_key(self.topic), - subscriber_count=123).put() - - request_headers = { - 'User-Agent': - 'Public Hub (+http://pubsubhubbub.appspot.com; 123 subscribers)', - } - - FeedToFetch.insert([self.topic]) - self.entry_list = [] - urlfetch_test_stub.instance.expect( - 'get', self.topic, 200, self.expected_response, - request_headers=request_headers, - response_headers=self.headers) - self.run_fetch_task() - self.assertTrue(EventToDeliver.all().get() is None) - testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - - record = FeedRecord.get_or_create(self.topic) - self.assertEquals(self.header_footer, record.header_footer) - self.assertEquals(self.etag, record.etag) - self.assertEquals(self.last_modified, record.last_modified) - self.assertEquals('application/atom+xml', record.content_type) - - self.assertEquals([(1, 0)], main.FETCH_SCORER.get_scores([self.topic])) - - def testNoNewEntries(self): - """Tests when there are no new entries.""" - FeedToFetch.insert([self.topic]) - self.entry_list = [] - urlfetch_test_stub.instance.expect( - 'get', self.topic, 200, self.expected_response, - response_headers=self.headers) - self.run_fetch_task() - 
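# With no new entries found, no EventToDeliver should be created or enqueued. -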
self.assertTrue(EventToDeliver.all().get() is None) - testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - - record = FeedRecord.get_or_create(self.topic) - self.assertEquals(self.header_footer, record.header_footer) - self.assertEquals(self.etag, record.etag) - self.assertEquals(self.last_modified, record.last_modified) - self.assertEquals('application/atom+xml', record.content_type) - - self.assertEquals([(1, 0)], main.FETCH_SCORER.get_scores([self.topic])) - - def testPullError(self): - """Tests when URLFetch raises an exception.""" - FeedToFetch.insert([self.topic]) - urlfetch_test_stub.instance.expect( - 'get', self.topic, 200, self.expected_response, urlfetch_error=True) - self.run_fetch_task() - feed = FeedToFetch.get_by_key_name(get_hash_key_name(self.topic)) - self.assertEquals(1, feed.fetching_failures) - testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - testutil.get_tasks(main.FEED_QUEUE, expected_count=1) - task = testutil.get_tasks(main.FEED_RETRIES_QUEUE, - index=0, expected_count=1) - self.assertEquals(self.topic, task['params']['topic']) - self.assertEquals([(0, 1)], main.FETCH_SCORER.get_scores([self.topic])) - - def testPullRetry(self): - """Tests that the task enqueued after a failure will run properly.""" - FeedToFetch.insert([self.topic]) - urlfetch_test_stub.instance.expect( - 'get', self.topic, 200, self.expected_response, urlfetch_error=True) - self.run_fetch_task() - - # Verify the failed feed was written to the Datastore. - feed = FeedToFetch.get_by_key_name(get_hash_key_name(self.topic)) - self.assertEquals(1, feed.fetching_failures) - testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - testutil.get_tasks(main.FEED_QUEUE, expected_count=1) - testutil.get_tasks(main.FEED_RETRIES_QUEUE, expected_count=1) - task = testutil.get_tasks(main.FEED_RETRIES_QUEUE, - index=0, expected_count=1) - self.assertEquals(self.topic, task['params']['topic']) - self.assertEquals([(0, 1)], main.FETCH_SCORER.get_scores([self.topic])) - - urlfetch_test_stub.instance.expect( - 'get', self.topic, 200, self.expected_response, urlfetch_error=True) - self.handle('post', *task['params'].items()) - feed = FeedToFetch.get_by_key_name(get_hash_key_name(self.topic)) - self.assertEquals(2, feed.fetching_failures) - testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - testutil.get_tasks(main.FEED_QUEUE, expected_count=1) - testutil.get_tasks(main.FEED_RETRIES_QUEUE, expected_count=2) - - def testPullBadStatusCode(self): - """Tests when the response status is bad.""" - FeedToFetch.insert([self.topic]) - urlfetch_test_stub.instance.expect( - 'get', self.topic, 500, self.expected_response) - self.run_fetch_task() - feed = FeedToFetch.get_by_key_name(get_hash_key_name(self.topic)) - self.assertEquals(1, feed.fetching_failures) - - testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - testutil.get_tasks(main.FEED_QUEUE, expected_count=1) - task = testutil.get_tasks(main.FEED_RETRIES_QUEUE, - index=0, expected_count=1) - self.assertEquals(self.topic, task['params']['topic']) - self.assertEquals([(0, 1)], main.FETCH_SCORER.get_scores([self.topic])) - - def testApiProxyError(self): - """Tests when the APIProxy raises an error.""" - FeedToFetch.insert([self.topic]) - urlfetch_test_stub.instance.expect( - 'get', self.topic, 200, self.expected_response, apiproxy_error=True) - self.run_fetch_task() - feed = FeedToFetch.get_by_key_name(get_hash_key_name(self.topic)) - self.assertEquals(1, feed.fetching_failures) - - testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - 
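# The fetch-failure cases above (testPullError, testPullRetry,
# testPullBadStatusCode, testApiProxyError) all assert the same bookkeeping: a
# failed poll increments the feed's fetching_failures counter and enqueues
# another attempt on the retries queue instead of dropping the topic. A rough
# sketch of that retry accounting, assuming an exponential backoff (the real
# module's limits and delays may differ):

BASE_RETRY_DELAY_SECONDS = 60   # assumed starting delay, for illustration
MAX_FETCH_FAILURES = 5          # assumed give-up threshold, for illustration

def record_fetch_failure(feed):
  """Bumps the failure count; returns the retry delay or None to give up."""
  feed.fetching_failures += 1
  if feed.fetching_failures > MAX_FETCH_FAILURES:
    return None
  # Back off exponentially with the number of consecutive failures.
  return BASE_RETRY_DELAY_SECONDS * (2 ** (feed.fetching_failures - 1))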
testutil.get_tasks(main.FEED_QUEUE, expected_count=1) - task = testutil.get_tasks(main.FEED_RETRIES_QUEUE, - index=0, expected_count=1) - self.assertEquals(self.topic, task['params']['topic']) - self.assertEquals([(0, 1)], main.FETCH_SCORER.get_scores([self.topic])) - - def testNoSubscribers(self): - """Tests that when a feed has no subscribers we do not pull it.""" - self.assertTrue(Subscription.remove(self.callback, self.topic)) - db.put(KnownFeed.create(self.topic)) - self.assertTrue(db.get(KnownFeed.create_key(self.topic)) is not None) - self.entry_list = [] - FeedToFetch.insert([self.topic]) - self.run_fetch_task() - - # Verify that *no* feed entry records have been written. - self.assertEquals([], FeedEntryRecord.get_entries_for_topic( - self.topic, self.all_ids)) - - # And there is no EventToDeliver or tasks. - testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - tasks = testutil.get_tasks(main.FEED_QUEUE, expected_count=1) - - # And no scoring. - self.assertEquals([(0, 0)], main.FETCH_SCORER.get_scores([self.topic])) - - def testRedirects(self): - """Tests when redirects are encountered.""" - info = FeedRecord.get_or_create(self.topic) - info.update(self.headers) - info.put() - FeedToFetch.insert([self.topic]) - - real_topic = 'http://example.com/real-topic-location' - self.headers['Location'] = real_topic - urlfetch_test_stub.instance.expect( - 'get', self.topic, 302, '', - response_headers=self.headers.copy()) - - del self.headers['Location'] - urlfetch_test_stub.instance.expect( - 'get', real_topic, 200, self.expected_response, - response_headers=self.headers) - - self.run_fetch_task() - self.assertTrue(EventToDeliver.all().get() is not None) - testutil.get_tasks(main.EVENT_QUEUE, expected_count=1) - - self.assertEquals([(1, 0)], main.FETCH_SCORER.get_scores([self.topic])) - - def testTooManyRedirects(self): - """Tests when too many redirects are encountered.""" - info = FeedRecord.get_or_create(self.topic) - info.update(self.headers) - info.put() - FeedToFetch.insert([self.topic]) - - last_topic = self.topic - real_topic = 'http://example.com/real-topic-location' - for i in xrange(main.MAX_REDIRECTS): - next_topic = real_topic + str(i) - self.headers['Location'] = next_topic - urlfetch_test_stub.instance.expect( - 'get', last_topic, 302, '', - response_headers=self.headers.copy()) - last_topic = next_topic - - self.run_fetch_task() - self.assertTrue(EventToDeliver.all().get() is None) - - testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - testutil.get_tasks(main.FEED_QUEUE, expected_count=1) - task = testutil.get_tasks(main.FEED_RETRIES_QUEUE, - index=0, expected_count=1) - self.assertEquals(self.topic, task['params']['topic']) - - self.assertEquals([(0, 1)], main.FETCH_SCORER.get_scores([self.topic])) - - def testRedirectToBadUrl(self): - """Tests when the redirect URL is bad.""" - info = FeedRecord.get_or_create(self.topic) - info.update(self.headers) - info.put() - FeedToFetch.insert([self.topic]) - - real_topic = '/not/a/valid-redirect-location' - self.headers['Location'] = real_topic - urlfetch_test_stub.instance.expect( - 'get', self.topic, 302, '', - response_headers=self.headers.copy()) - - self.run_fetch_task() - self.assertTrue(EventToDeliver.all().get() is None) - testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - - self.assertEquals([(0, 1)], main.FETCH_SCORER.get_scores([self.topic])) - - def testPutSplitting(self): - """Tests that put() calls for feed records are split when too large.""" - # Make the content way too big. 
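# testRedirects, testTooManyRedirects and testRedirectToBadUrl above cover how
# the fetcher handles Location headers: it follows absolute redirects up to a
# fixed cap (main.MAX_REDIRECTS) and treats a relative target or an over-long
# chain as a fetch failure. A compact, illustrative version of that loop,
# assuming a fetch(url) helper that returns (status_code, headers, body):

MAX_REDIRECTS = 7   # assumed value; the module defines its own constant

def fetch_following_redirects(url, fetch):
  """Follows absolute redirects, giving up after MAX_REDIRECTS hops."""
  for _ in range(MAX_REDIRECTS):
    status, headers, body = fetch(url)
    if status not in (301, 302, 303, 307):
      return status, headers, body
    location = headers.get('Location', '')
    if not location.startswith(('http://', 'https://')):
      raise ValueError('Invalid redirect location: %r' % location)
    url = location
  raise ValueError('Too many redirects fetching %s' % url)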
- content_template = ('content' * 100 + '%s') - self.all_ids = [str(i) for i in xrange(1000)] - self.entry_payloads = [ - (content_template % entry_id) for entry_id in self.all_ids - ] - self.entry_list = [ - FeedEntryRecord.create_entry_for_topic( - self.topic, entry_id, 'content%s' % entry_id) - for entry_id in self.all_ids - ] - - FeedToFetch.insert([self.topic]) - urlfetch_test_stub.instance.expect( - 'get', self.topic, 200, self.expected_response, - response_headers=self.headers) - - old_max_new = main.MAX_NEW_FEED_ENTRY_RECORDS - main.MAX_NEW_FEED_ENTRY_RECORDS = len(self.all_ids) + 1 - try: - self.run_fetch_task() - finally: - main.MAX_NEW_FEED_ENTRY_RECORDS = old_max_new - - # Verify that all feed entry records have been written along with the - # EventToDeliver and FeedRecord. - feed_entries = list(FeedEntryRecord.all()) - self.assertEquals( - set(sha1_hash(k) for k in self.all_ids), - set(e.id_hash for e in feed_entries)) - - work = EventToDeliver.all().get() - event_key = work.key() - self.assertEquals(self.topic, work.topic) - self.assertTrue('\n'.join(self.entry_payloads) in work.payload) - work.delete() - - record = FeedRecord.get_or_create(self.topic) - self.assertEquals(self.header_footer, record.header_footer) - self.assertEquals(self.etag, record.etag) - self.assertEquals(self.last_modified, record.last_modified) - self.assertEquals('application/atom+xml', record.content_type) - - task = testutil.get_tasks(main.EVENT_QUEUE, index=0, expected_count=1) - self.assertEquals(str(event_key), task['params']['event_key']) - testutil.get_tasks(main.FEED_QUEUE, expected_count=1) - testutil.get_tasks(main.FEED_RETRIES_QUEUE, expected_count=0) - self.assertEquals([(1, 0)], main.FETCH_SCORER.get_scores([self.topic])) - - def testPutSplittingFails(self): - """Tests when splitting put() calls still doesn't help and we give up.""" - # Make the content way too big. - content_template = ('content' * 150 + '%s') - self.all_ids = [str(i) for i in xrange(1000)] - self.entry_payloads = [ - (content_template % entry_id) for entry_id in self.all_ids - ] - self.entry_list = [ - FeedEntryRecord.create_entry_for_topic( - self.topic, entry_id, 'content%s' % entry_id) - for entry_id in self.all_ids - ] - - FeedToFetch.insert([self.topic]) - urlfetch_test_stub.instance.expect( - 'get', self.topic, 200, self.expected_response, - response_headers=self.headers) - - old_splitting_attempts = main.PUT_SPLITTING_ATTEMPTS - old_max_saves = main.MAX_FEED_RECORD_SAVES - old_max_new = main.MAX_NEW_FEED_ENTRY_RECORDS - main.PUT_SPLITTING_ATTEMPTS = 1 - main.MAX_FEED_RECORD_SAVES = len(self.entry_list) + 1 - main.MAX_NEW_FEED_ENTRY_RECORDS = main.MAX_FEED_RECORD_SAVES - try: - self.run_fetch_task() - finally: - main.PUT_SPLITTING_ATTEMPTS = old_splitting_attempts - main.MAX_FEED_RECORD_SAVES = old_max_saves - main.MAX_NEW_FEED_ENTRY_RECORDS = old_max_new - - # Verify that *NO* FeedEntryRecords or EventToDeliver has been written, - # the FeedRecord wasn't updated, and no tasks were enqueued. - self.assertEquals([], list(FeedEntryRecord.all())) - self.assertEquals(None, EventToDeliver.all().get()) - - record = FeedRecord.all().get() - self.assertEquals(None, record) - - testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - - # Put splitting failure does not count against the feed. 
- self.assertEquals([(1, 0)], main.FETCH_SCORER.get_scores([self.topic])) - - def testFeedTooLarge(self): - """Tests when the pulled feed's content size is too large.""" - FeedToFetch.insert([self.topic]) - urlfetch_test_stub.instance.expect( - 'get', self.topic, 200, '', - response_headers=self.headers, - urlfetch_size_error=True) - self.run_fetch_task() - self.assertEquals([], list(FeedEntryRecord.all())) - self.assertEquals(None, EventToDeliver.all().get()) - testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - - self.assertEquals([(0, 1)], main.FETCH_SCORER.get_scores([self.topic])) - - def testTooManyNewEntries(self): - """Tests when there are more new entries than we can handle at once.""" - self.all_ids = [str(i) for i in xrange(1000)] - self.entry_payloads = [ - 'content%s' % entry_id for entry_id in self.all_ids - ] - self.entry_list = [ - FeedEntryRecord.create_entry_for_topic( - self.topic, entry_id, 'content%s' % entry_id) - for entry_id in self.all_ids - ] - - FeedToFetch.insert([self.topic]) - urlfetch_test_stub.instance.expect( - 'get', self.topic, 200, self.expected_response, - response_headers=self.headers) - - self.run_fetch_task() - - # Verify that a subset of the entry records are present and the payload - # only has the first N entries. - feed_entries = FeedEntryRecord.get_entries_for_topic( - self.topic, self.all_ids) - expected_records = main.MAX_NEW_FEED_ENTRY_RECORDS - self.assertEquals( - [sha1_hash(k) for k in self.all_ids[:expected_records]], - [e.id_hash for e in feed_entries]) - - work = EventToDeliver.all().get() - event_key = work.key() - self.assertEquals(self.topic, work.topic) - expected_content = '\n'.join(self.entry_payloads[:expected_records]) - self.assertTrue(expected_content in work.payload) - self.assertFalse('content%d' % expected_records in work.payload) - work.delete() - - record = FeedRecord.all().get() - self.assertNotEquals(self.etag, record.etag) - - task = testutil.get_tasks(main.EVENT_QUEUE, index=0, expected_count=1) - self.assertEquals(str(event_key), task['params']['event_key']) - testutil.get_tasks(main.FEED_QUEUE, expected_count=1) - task = testutil.get_tasks(main.FEED_RETRIES_QUEUE, - index=0, expected_count=1) - self.assertEquals(self.topic, task['params']['topic']) - self.assertEquals([(0, 1)], main.FETCH_SCORER.get_scores([self.topic])) - - def testNotAllowed(self): - """Tests when the URL fetch is blocked due to URL scoring.""" - dos.DISABLE_FOR_TESTING = False - try: - main.FETCH_SCORER.blackhole([self.topic]) - start_scores = main.FETCH_SCORER.get_scores([self.topic]) - - info = FeedRecord.get_or_create(self.topic) - info.update(self.headers) - info.put() - FeedToFetch.insert([self.topic]) - self.run_fetch_task() - - # Verify that *no* feed entry records have been written. - self.assertEquals([], FeedEntryRecord.get_entries_for_topic( - self.topic, self.all_ids)) - - # And there is no EventToDeliver or tasks. 
- testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - tasks = testutil.get_tasks(main.FEED_QUEUE, expected_count=1) - - self.assertEquals( - start_scores, - main.FETCH_SCORER.get_scores([self.topic])) - finally: - dos.DISABLE_FOR_TESTING = True - - -class PullFeedHandlerTestWithParsing(testutil.HandlerTestBase): - - handler_class = main.PullFeedHandler - - def run_fetch_task(self, index=0): - """Runs the currently enqueued fetch task.""" - task = testutil.get_tasks(main.FEED_QUEUE, index=index) - os.environ['HTTP_X_APPENGINE_TASKNAME'] = task['name'] - try: - self.handle('post') - finally: - del os.environ['HTTP_X_APPENGINE_TASKNAME'] - - def testPullBadContent(self): - """Tests when the content doesn't parse correctly.""" - topic = 'http://example.com/my-topic' - callback = 'http://example.com/my-subscriber' - self.assertTrue(Subscription.insert(callback, topic, 'token', 'secret')) - FeedToFetch.insert([topic]) - urlfetch_test_stub.instance.expect( - 'get', topic, 200, 'this does not parse') - self.run_fetch_task() - # No retry task should be written. - feed = FeedToFetch.get_by_key_name(get_hash_key_name(topic)) - self.assertTrue(feed is None) - - def testPullBadFeed(self): - """Tests when the content parses, but is not a good Atom document.""" - data = ('\n' - 'wooh') - topic = 'http://example.com/my-topic' - callback = 'http://example.com/my-subscriber' - self.assertTrue(Subscription.insert(callback, topic, 'token', 'secret')) - FeedToFetch.insert([topic]) - urlfetch_test_stub.instance.expect('get', topic, 200, data) - self.run_fetch_task() - # No retry task should be written. - feed = FeedToFetch.get_by_key_name(get_hash_key_name(topic)) - self.assertTrue(feed is None) - - def testPullBadEncoding(self): - """Tests when the content has a bad character encoding.""" - data = ('\n' - '' - '1123wooh') - topic = 'http://example.com/my-topic' - callback = 'http://example.com/my-subscriber' - self.assertTrue(Subscription.insert(callback, topic, 'token', 'secret')) - FeedToFetch.insert([topic]) - urlfetch_test_stub.instance.expect('get', topic, 200, data) - self.run_fetch_task() - # No retry task should be written. - feed = FeedToFetch.get_by_key_name(get_hash_key_name(topic)) - self.assertTrue(feed is None) - - def testPullGoodAtom(self): - """Tests when the Atom XML can parse just fine.""" - data = ('\n' - '1123wooh') - topic = 'http://example.com/my-topic' - callback = 'http://example.com/my-subscriber' - self.assertTrue(Subscription.insert(callback, topic, 'token', 'secret')) - FeedToFetch.insert([topic]) - urlfetch_test_stub.instance.expect('get', topic, 200, data) - self.run_fetch_task() - feed = FeedToFetch.get_by_key_name(get_hash_key_name(topic)) - self.assertTrue(feed is None) - event = EventToDeliver.all().get() - self.assertEquals(data.replace('\n', ''), event.payload.replace('\n', '')) - self.assertEquals('application/atom+xml', event.content_type) - self.assertEquals('atom', FeedRecord.all().get().format) - - def testPullWithUnicodeEtag(self): - """Tests when the ETag header has a unicode value. - - The ETag value should be ignored because non-ascii ETag values are invalid. 
- """ - data = ('\n' - '1123wooh') - topic = 'http://example.com/my-topic' - callback = 'http://example.com/my-subscriber' - self.assertTrue(Subscription.insert(callback, topic, 'token', 'secret')) - FeedToFetch.insert([topic]) - urlfetch_test_stub.instance.expect('get', topic, 200, data, - response_headers={ - 'ETag': '\xe3\x83\x96\xe3\x83\xad\xe3\x82\xb0\xe8\xa1\x86', - 'Content-Type': 'application/atom+xml', - }) - self.run_fetch_task() - feed = FeedToFetch.get_by_key_name(get_hash_key_name(topic)) - self.assertTrue(feed is None) - event = EventToDeliver.all().get() - self.assertEquals(data.replace('\n', ''), event.payload.replace('\n', '')) - self.assertEquals('application/atom+xml', event.content_type) - self.assertEquals( - {'Accept': '*/*', - 'Connection': 'cache-control', - 'Cache-Control': 'no-cache no-store max-age=1'}, - FeedRecord.all().get().get_request_headers(0)) - - def testPullGoodRss(self): - """Tests when the RSS XML can parse just fine.""" - data = ('\n' - '' - '1123wooh' - '') - topic = 'http://example.com/my-topic' - callback = 'http://example.com/my-subscriber' - self.assertTrue(Subscription.insert(callback, topic, 'token', 'secret')) - FeedToFetch.insert([topic]) - urlfetch_test_stub.instance.expect('get', topic, 200, data) - self.run_fetch_task() - feed = FeedToFetch.get_by_key_name(get_hash_key_name(topic)) - self.assertTrue(feed is None) - event = EventToDeliver.all().get() - self.assertEquals(data.replace('\n', ''), event.payload.replace('\n', '')) - self.assertEquals('application/rss+xml', event.content_type) - self.assertEquals('rss', FeedRecord.all().get().format) - - def testPullGoodRdf(self): - """Tests when the RDF (RSS 1.0) XML can parse just fine.""" - data = ('\n' - '' - '' - '1123wooh' - '') - topic = 'http://example.com/my-topic' - callback = 'http://example.com/my-subscriber' - self.assertTrue(Subscription.insert(callback, topic, 'token', 'secret')) - FeedToFetch.insert([topic]) - urlfetch_test_stub.instance.expect('get', topic, 200, data) - self.run_fetch_task() - feed = FeedToFetch.get_by_key_name(get_hash_key_name(topic)) - self.assertTrue(feed is None) - event = EventToDeliver.all().get() - self.assertEquals(data.replace('\n', ''), event.payload.replace('\n', '')) - self.assertEquals('application/rdf+xml', event.content_type) - self.assertEquals('rss', FeedRecord.all().get().format) - - def testPullArbitrary(self): - """Tests pulling content of an arbitrary type.""" - data = 'this is my random payload of data' - topic = 'http://example.com/my-topic' - callback = 'http://example.com/my-subscriber' - self.assertTrue(Subscription.insert(callback, topic, 'token', 'secret')) - FeedToFetch.insert([topic]) - urlfetch_test_stub.instance.expect( - 'get', topic, 200, data, - response_headers={'Content-Type': 'my crazy content type'}) - self.run_fetch_task() - feed = FeedToFetch.get_by_key_name(get_hash_key_name(topic)) - self.assertTrue(feed is None) - event = EventToDeliver.all().get() - self.assertEquals(data, event.payload) - self.assertEquals('my crazy content type', event.content_type) - self.assertEquals('arbitrary', FeedRecord.all().get().format) - - def testPullBinaryContent(self): - """Tests pulling binary content.""" - data = '\xff\x12 some binary data' - topic = 'http://example.com/my-topic' - callback = 'http://example.com/my-subscriber' - self.assertTrue(Subscription.insert(callback, topic, 'token', 'secret')) - FeedToFetch.insert([topic]) - urlfetch_test_stub.instance.expect( - 'get', topic, 200, data, - response_headers={'Content-Type': 'my 
crazy content type'}) - self.run_fetch_task() - feed = FeedToFetch.get_by_key_name(get_hash_key_name(topic)) - self.assertTrue(feed is None) - event = EventToDeliver.all().get() - self.assertEquals(data, event.payload) - self.assertEquals('my crazy content type', event.content_type) - self.assertEquals('arbitrary', FeedRecord.all().get().format) - - def testMultipleFetch(self): - """Tests doing multiple fetches asynchronously in parallel. - - Exercises the fork-join queue part of the fetching pipeline. - """ - data = ('\n' - '1123wooh') - topic_base = 'http://example.com/my-topic' - callback = 'http://example.com/my-subscriber' - topic_list = [topic_base + '1', topic_base + '2', topic_base + '3'] - FeedToFetch.insert(topic_list) - for topic in topic_list: - urlfetch_test_stub.instance.expect('get', topic, 200, data) - self.assertTrue(Subscription.insert(callback, topic, 'token', 'secret')) - - os.environ['HTTP_X_APPENGINE_TASKNAME'] = testutil.get_tasks( - main.FEED_QUEUE, index=0, expected_count=1)['name'] - try: - self.handle('post') - finally: - del os.environ['HTTP_X_APPENGINE_TASKNAME'] - - # Feed to fetch removed. - self.assertEquals([], list(FeedToFetch.all())) - self.assertEquals([(3, 0), (3, 0), (3, 0)], # 3 because of shared domain - main.FETCH_SCORER.get_scores(topic_list)) - - # All events written and correct. - all_events = list(EventToDeliver.all()) - all_topics = [e.topic for e in all_events] - self.assertEquals(3, len(all_events)) - self.assertEquals(set(topic_list), set(all_topics)) - event_tasks = testutil.get_tasks(main.EVENT_QUEUE, expected_count=3) - self.assertEquals(set(str(e.key()) for e in all_events), - set(task['params']['event_key'] for task in event_tasks)) - - # All feed records written. - all_records = list(FeedEntryRecord.all()) - all_parents = set(db.Key.from_path(FeedRecord.kind(), - FeedRecord.create_key_name(topic)) - for topic in topic_list) - found_parents = set(r.parent().key() for r in all_records) - self.assertEquals(3, len(found_parents)) - self.assertEquals(found_parents, all_parents) - -################################################################################ - -class PushEventHandlerTest(testutil.HandlerTestBase): - - handler_class = main.PushEventHandler - - def setUp(self): - """Sets up the test harness.""" - testutil.HandlerTestBase.setUp(self) - - self.chunk_size = main.EVENT_SUBSCRIBER_CHUNK_SIZE - self.topic = 'http://example.com/hamster-topic' - # Order of these URL fetches is determined by the ordering of the hashes - # of the callback URLs, so we need random extra strings here to get - # alphabetical hash order. 
- self.callback1 = 'http://example1.com/hamster-callback1-12' - self.callback2 = 'http://example2.com/hamster-callback2' - self.callback3 = 'http://example3.com/hamster-callback3-123456' - self.callback4 = 'http://example4.com/hamster-callback4-123' - self.header_footer = '\nblah\n' - self.test_payloads = [ - 'article1', - 'article2', - 'article3', - ] - self.expected_payload = ( - '\n' - '\n' - 'blah\n' - '\n' - 'article1\n' - 'article2\n' - 'article3\n' - '' - ) - - self.header_footer_rss = '' - self.test_payloads_rss = [ - 'article1', - 'article2', - 'article3', - ] - self.expected_payload_rss = ( - '\n' - '\n' - 'article1\n' - 'article2\n' - 'article3\n' - '' - ) - - self.bad_key = db.Key.from_path(EventToDeliver.kind(), 'does_not_exist') - - def tearDown(self): - """Resets any external modules modified for testing.""" - main.EVENT_SUBSCRIBER_CHUNK_SIZE = self.chunk_size - urlfetch_test_stub.instance.verify_and_reset() - - def testNoWork(self): - self.handle('post', ('event_key', str(self.bad_key))) - - def testNoExtraSubscribers(self): - """Tests when a single chunk of delivery is enough.""" - self.assertTrue(Subscription.insert( - self.callback1, self.topic, 'token', 'secret')) - self.assertTrue(Subscription.insert( - self.callback2, self.topic, 'token', 'secret')) - self.assertTrue(Subscription.insert( - self.callback3, self.topic, 'token', 'secret')) - main.EVENT_SUBSCRIBER_CHUNK_SIZE = 3 - urlfetch_test_stub.instance.expect( - 'post', self.callback1, 200, '', request_payload=self.expected_payload) - urlfetch_test_stub.instance.expect( - 'post', self.callback2, 204, '', request_payload=self.expected_payload) - urlfetch_test_stub.instance.expect( - 'post', self.callback3, 299, '', request_payload=self.expected_payload) - event = EventToDeliver.create_event_for_topic( - self.topic, main.ATOM, 'application/atom+xml', - self.header_footer, self.test_payloads) - event.put() - self.handle('post', ('event_key', str(event.key()))) - self.assertEquals([], list(EventToDeliver.all())) - testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - - self.assertEquals( - [(1, 0), (1, 0), (1, 0)], - main.DELIVERY_SCORER.get_scores( - [self.callback1, self.callback2, self.callback3])) - - def testHmacData(self): - """Tests that the content is properly signed with an HMAC.""" - self.assertTrue(Subscription.insert( - self.callback1, self.topic, 'token', 'secret3')) - # Secret is empty on purpose here, so the verify_token will be used instead. 
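# testHmacData, begun just above, checks the X-Hub-Signature header attached
# to pushed content. Per the PubSubHubbub spec this is 'sha1=' followed by the
# hex HMAC-SHA1 of the request body, keyed with the secret the subscriber
# supplied when subscribing (falling back to the verify token when no secret
# was given, as the comment above notes). A minimal sketch of computing and
# checking that header (byte strings assumed, as in Python 2):

import hashlib
import hmac

def sign_payload(secret, payload):
  """Returns the X-Hub-Signature value for an outgoing notification."""
  return 'sha1=' + hmac.new(secret, payload, hashlib.sha1).hexdigest()

def subscriber_accepts(secret, payload, signature_header):
  """What a subscriber does on receipt: recompute the digest and compare."""
  return signature_header == sign_payload(secret, payload)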
- self.assertTrue(Subscription.insert( - self.callback2, self.topic, 'my-token', '')) - self.assertTrue(Subscription.insert( - self.callback3, self.topic, 'token', 'secret-stuff')) - main.EVENT_SUBSCRIBER_CHUNK_SIZE = 3 - urlfetch_test_stub.instance.expect( - 'post', self.callback1, 204, '', - request_payload=self.expected_payload, - request_headers={ - 'Content-Type': 'application/atom+xml', - 'X-Hub-Signature': 'sha1=3e9caf971b0833d15393022f5f01a47adf597af5'}) - urlfetch_test_stub.instance.expect( - 'post', self.callback2, 200, '', - request_payload=self.expected_payload, - request_headers={ - 'Content-Type': 'application/atom+xml', - 'X-Hub-Signature': 'sha1=4847815aae8578eff55d351bc84a159b9bd8846e'}) - urlfetch_test_stub.instance.expect( - 'post', self.callback3, 204, '', - request_payload=self.expected_payload, - request_headers={ - 'Content-Type': 'application/atom+xml', - 'X-Hub-Signature': 'sha1=8b0a9da7204afa8ae04fc9439755c556b1e38d99'}) - event = EventToDeliver.create_event_for_topic( - self.topic, main.ATOM, 'application/atom+xml', - self.header_footer, self.test_payloads) - event.put() - self.handle('post', ('event_key', str(event.key()))) - self.assertEquals([], list(EventToDeliver.all())) - testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - - def testRssContentType(self): - """Tests that the content type of an RSS feed is properly supplied.""" - self.assertTrue(Subscription.insert( - self.callback1, self.topic, 'token', 'secret')) - main.EVENT_SUBSCRIBER_CHUNK_SIZE = 3 - urlfetch_test_stub.instance.expect( - 'post', self.callback1, 204, '', - request_payload=self.expected_payload_rss, - request_headers={ - 'Content-Type': 'application/rss+xml', - 'X-Hub-Signature': 'sha1=1607313b6195af74f29158421f0a31aa25d680da'}) - event = EventToDeliver.create_event_for_topic( - self.topic, main.RSS, 'application/rss+xml', - self.header_footer_rss, self.test_payloads_rss) - event.put() - self.handle('post', ('event_key', str(event.key()))) - self.assertEquals([], list(EventToDeliver.all())) - testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - - def testExtraSubscribers(self): - """Tests when there are more subscribers to contact after delivery.""" - self.assertTrue(Subscription.insert( - self.callback1, self.topic, 'token', 'secret')) - self.assertTrue(Subscription.insert( - self.callback2, self.topic, 'token', 'secret')) - self.assertTrue(Subscription.insert( - self.callback3, self.topic, 'token', 'secret')) - main.EVENT_SUBSCRIBER_CHUNK_SIZE = 1 - event = EventToDeliver.create_event_for_topic( - self.topic, main.ATOM, 'application/atom+xml', - self.header_footer, self.test_payloads) - event.put() - event_key = str(event.key()) - - urlfetch_test_stub.instance.expect( - 'post', self.callback1, 204, '', request_payload=self.expected_payload) - self.handle('post', ('event_key', event_key)) - urlfetch_test_stub.instance.verify_and_reset() - - urlfetch_test_stub.instance.expect( - 'post', self.callback2, 200, '', request_payload=self.expected_payload) - self.handle('post', ('event_key', event_key)) - urlfetch_test_stub.instance.verify_and_reset() - - self.assertEquals( - [(1, 0), (1, 0), (0, 0)], - main.DELIVERY_SCORER.get_scores( - [self.callback1, self.callback2, self.callback3])) - - urlfetch_test_stub.instance.expect( - 'post', self.callback3, 204, '', request_payload=self.expected_payload) - self.handle('post', ('event_key', event_key)) - urlfetch_test_stub.instance.verify_and_reset() - self.assertEquals([], list(EventToDeliver.all())) - - tasks = 
testutil.get_tasks(main.EVENT_QUEUE, expected_count=2) - self.assertEquals([event_key] * 2, - [t['params']['event_key'] for t in tasks]) - - self.assertEquals( - [(1, 0), (1, 0), (1, 0)], - main.DELIVERY_SCORER.get_scores( - [self.callback1, self.callback2, self.callback3])) - - def testBrokenCallbacks(self): - """Tests that when callbacks return errors and are saved for later.""" - self.assertTrue(Subscription.insert( - self.callback1, self.topic, 'token', 'secret')) - self.assertTrue(Subscription.insert( - self.callback2, self.topic, 'token', 'secret')) - self.assertTrue(Subscription.insert( - self.callback3, self.topic, 'token', 'secret')) - main.EVENT_SUBSCRIBER_CHUNK_SIZE = 2 - event = EventToDeliver.create_event_for_topic( - self.topic, main.ATOM, 'application/atom+xml', - self.header_footer, self.test_payloads) - event.put() - event_key = str(event.key()) - - urlfetch_test_stub.instance.expect( - 'post', self.callback1, 302, '', request_payload=self.expected_payload) - urlfetch_test_stub.instance.expect( - 'post', self.callback2, 404, '', request_payload=self.expected_payload) - self.handle('post', ('event_key', event_key)) - urlfetch_test_stub.instance.verify_and_reset() - - self.assertEquals( - [(0, 1), (0, 1), (0, 0)], - main.DELIVERY_SCORER.get_scores( - [self.callback1, self.callback2, self.callback3])) - - urlfetch_test_stub.instance.expect( - 'post', self.callback3, 500, '', request_payload=self.expected_payload) - self.handle('post', ('event_key', event_key)) - urlfetch_test_stub.instance.verify_and_reset() - - self.assertEquals( - [(0, 1), (0, 1), (0, 1)], - main.DELIVERY_SCORER.get_scores( - [self.callback1, self.callback2, self.callback3])) - - work = EventToDeliver.all().get() - sub_list = Subscription.get(work.failed_callbacks) - callback_list = [sub.callback for sub in sub_list] - self.assertEquals([self.callback1, self.callback2, self.callback3], - callback_list) - - tasks = testutil.get_tasks(main.EVENT_QUEUE, expected_count=1) - tasks.extend(testutil.get_tasks(main.EVENT_RETRIES_QUEUE, expected_count=1)) - self.assertEquals([event_key] * 2, - [t['params']['event_key'] for t in tasks]) - - def testDeadlineError(self): - """Tests that callbacks in flight at deadline will be marked as failed.""" - try: - def deadline(): - raise runtime.DeadlineExceededError() - main.async_proxy.wait = deadline - - self.assertTrue(Subscription.insert( - self.callback1, self.topic, 'token', 'secret')) - self.assertTrue(Subscription.insert( - self.callback2, self.topic, 'token', 'secret')) - self.assertTrue(Subscription.insert( - self.callback3, self.topic, 'token', 'secret')) - main.EVENT_SUBSCRIBER_CHUNK_SIZE = 2 - event = EventToDeliver.create_event_for_topic( - self.topic, main.ATOM, 'application/atom+xml', - self.header_footer, self.test_payloads) - event.put() - event_key = str(event.key()) - self.handle('post', ('event_key', event_key)) - - # All events should be marked as failed even though no urlfetches - # were made. - work = EventToDeliver.all().get() - sub_list = Subscription.get(work.failed_callbacks) - callback_list = [sub.callback for sub in sub_list] - self.assertEquals([self.callback1, self.callback2], callback_list) - - self.assertEquals(event_key, testutil.get_tasks( - main.EVENT_QUEUE, index=0, expected_count=1)['params']['event_key']) - - # In this case no reporting should happen, since we do not have - # any more time in the runtime to report stats. 
- self.assertEquals( - [(0, 0), (0, 0), (0, 0)], - main.DELIVERY_SCORER.get_scores( - [self.callback1, self.callback2, self.callback3])) - finally: - main.async_proxy = async_apiproxy.AsyncAPIProxy() - - def testRetryLogic(self): - """Tests that failed urls will be retried after subsequent failures. - - This is an end-to-end test for push delivery failures and retries. We'll - simulate multiple times through the failure list. - """ - self.assertTrue(Subscription.insert( - self.callback1, self.topic, 'token', 'secret')) - self.assertTrue(Subscription.insert( - self.callback2, self.topic, 'token', 'secret')) - self.assertTrue(Subscription.insert( - self.callback3, self.topic, 'token', 'secret')) - self.assertTrue(Subscription.insert( - self.callback4, self.topic, 'token', 'secret')) - main.EVENT_SUBSCRIBER_CHUNK_SIZE = 3 - event = EventToDeliver.create_event_for_topic( - self.topic, main.ATOM, 'application/atom+xml', - self.header_footer, self.test_payloads) - event.put() - event_key = str(event.key()) - - # First pass through all URLs goes full speed for two chunks. - urlfetch_test_stub.instance.expect( - 'post', self.callback1, 404, '', request_payload=self.expected_payload) - urlfetch_test_stub.instance.expect( - 'post', self.callback2, 204, '', request_payload=self.expected_payload) - urlfetch_test_stub.instance.expect( - 'post', self.callback3, 302, '', request_payload=self.expected_payload) - self.handle('post', ('event_key', event_key)) - urlfetch_test_stub.instance.verify_and_reset() - - self.assertEquals( - [(0, 1), (1, 0), (0, 1), (0, 0)], - main.DELIVERY_SCORER.get_scores( - [self.callback1, self.callback2, self.callback3, self.callback4])) - - urlfetch_test_stub.instance.expect( - 'post', self.callback4, 500, '', request_payload=self.expected_payload) - self.handle('post', ('event_key', event_key)) - urlfetch_test_stub.instance.verify_and_reset() - - self.assertEquals( - [(0, 1), (1, 0), (0, 1), (0, 1)], - main.DELIVERY_SCORER.get_scores( - [self.callback1, self.callback2, self.callback3, self.callback4])) - - # Now the retries. 
- urlfetch_test_stub.instance.expect( - 'post', self.callback1, 404, '', request_payload=self.expected_payload) - urlfetch_test_stub.instance.expect( - 'post', self.callback3, 302, '', request_payload=self.expected_payload) - urlfetch_test_stub.instance.expect( - 'post', self.callback4, 500, '', request_payload=self.expected_payload) - self.handle('post', ('event_key', event_key)) - urlfetch_test_stub.instance.verify_and_reset() - - self.assertEquals( - [(0, 2), (1, 0), (0, 2), (0, 2)], - main.DELIVERY_SCORER.get_scores( - [self.callback1, self.callback2, self.callback3, self.callback4])) - - urlfetch_test_stub.instance.expect( - 'post', self.callback1, 204, '', request_payload=self.expected_payload) - urlfetch_test_stub.instance.expect( - 'post', self.callback3, 302, '', request_payload=self.expected_payload) - urlfetch_test_stub.instance.expect( - 'post', self.callback4, 200, '', request_payload=self.expected_payload) - self.handle('post', ('event_key', event_key)) - urlfetch_test_stub.instance.verify_and_reset() - - self.assertEquals( - [(1, 2), (1, 0), (0, 3), (1, 2)], - main.DELIVERY_SCORER.get_scores( - [self.callback1, self.callback2, self.callback3, self.callback4])) - - urlfetch_test_stub.instance.expect( - 'post', self.callback3, 204, '', request_payload=self.expected_payload) - self.handle('post', ('event_key', event_key)) - urlfetch_test_stub.instance.verify_and_reset() - - self.assertEquals( - [(1, 2), (1, 0), (1, 3), (1, 2)], - main.DELIVERY_SCORER.get_scores( - [self.callback1, self.callback2, self.callback3, self.callback4])) - - self.assertEquals([], list(EventToDeliver.all())) - tasks = testutil.get_tasks(main.EVENT_QUEUE, expected_count=1) - tasks.extend(testutil.get_tasks(main.EVENT_RETRIES_QUEUE, expected_count=3)) - self.assertEquals([event_key] * 4, - [t['params']['event_key'] for t in tasks]) - - def testUrlFetchFailure(self): - """Tests the UrlFetch API raising exceptions while sending notifications.""" - self.assertTrue(Subscription.insert( - self.callback1, self.topic, 'token', 'secret')) - self.assertTrue(Subscription.insert( - self.callback2, self.topic, 'token', 'secret')) - main.EVENT_SUBSCRIBER_CHUNK_SIZE = 3 - event = EventToDeliver.create_event_for_topic( - self.topic, main.ATOM, 'application/atom+xml', - self.header_footer, self.test_payloads) - event.put() - event_key = str(event.key()) - - urlfetch_test_stub.instance.expect( - 'post', self.callback1, 200, '', - request_payload=self.expected_payload, urlfetch_error=True) - urlfetch_test_stub.instance.expect( - 'post', self.callback2, 200, '', - request_payload=self.expected_payload, apiproxy_error=True) - self.handle('post', ('event_key', event_key)) - urlfetch_test_stub.instance.verify_and_reset() - - work = EventToDeliver.all().get() - sub_list = Subscription.get(work.failed_callbacks) - callback_list = [sub.callback for sub in sub_list] - self.assertEquals([self.callback1, self.callback2], callback_list) - - self.assertEquals(event_key, testutil.get_tasks( - main.EVENT_RETRIES_QUEUE, index=0, expected_count=1) - ['params']['event_key']) - - self.assertEquals( - [(0, 1), (0, 1)], - main.DELIVERY_SCORER.get_scores( - [self.callback1, self.callback2])) - - def testNotAllowed(self): - """Tests pushing events to a URL that's not allowed due to scoring.""" - dos.DISABLE_FOR_TESTING = False - try: - main.DELIVERY_SCORER.blackhole([self.callback2]) - start_scores = main.DELIVERY_SCORER.get_scores([self.callback2]) - - self.assertTrue(Subscription.insert( - self.callback1, self.topic, 'token', 'secret')) - 
self.assertTrue(Subscription.insert( - self.callback2, self.topic, 'token', 'secret')) - self.assertTrue(Subscription.insert( - self.callback3, self.topic, 'token', 'secret')) - main.EVENT_SUBSCRIBER_CHUNK_SIZE = 3 - urlfetch_test_stub.instance.expect( - 'post', self.callback1, 204, '', - request_payload=self.expected_payload) - urlfetch_test_stub.instance.expect( - 'post', self.callback3, 204, '', - request_payload=self.expected_payload) - event = EventToDeliver.create_event_for_topic( - self.topic, main.ATOM, 'application/atom+xml', - self.header_footer, self.test_payloads) - event.put() - self.handle('post', ('event_key', str(event.key()))) - self.assertEquals([], list(EventToDeliver.all())) - testutil.get_tasks(main.EVENT_QUEUE, expected_count=0) - - self.assertEquals( - [(1, 0)] + start_scores + [(1, 0)], - main.DELIVERY_SCORER.get_scores( - [self.callback1, self.callback2, self.callback3])) - finally: - dos.DISABLE_FOR_TESTING = True - -################################################################################ - -class SubscribeHandlerTest(testutil.HandlerTestBase): - - handler_class = main.SubscribeHandler - - def setUp(self): - """Tests up the test harness.""" - testutil.HandlerTestBase.setUp(self) - self.challenge = 'this_is_my_fake_challenge_string' - self.old_get_challenge = main.get_random_challenge - main.get_random_challenge = lambda: self.challenge - self.callback = 'http://example.com/good-callback' - self.topic = 'http://example.com/the-topic' - self.verify_token = 'the_token' - self.verify_callback_querystring_template = ( - self.callback + - '?hub.verify_token=the_token' - '&hub.challenge=this_is_my_fake_challenge_string' - '&hub.topic=http%%3A%%2F%%2Fexample.com%%2Fthe-topic' - '&hub.mode=%s' - '&hub.lease_seconds=432000') - - def tearDown(self): - """Tears down the test harness.""" - testutil.HandlerTestBase.tearDown(self) - main.get_random_challenge = self.old_get_challenge - - def verify_record_task(self, topic): - """Tests there is a valid KnownFeedIdentity task enqueued. - - Args: - topic: The topic the task should be for. - - Raises: - AssertionError if the task isn't there. 
- """ - task = testutil.get_tasks(main.MAPPINGS_QUEUE, index=0, expected_count=1) - self.assertEquals(topic, task['params']['topic']) - - def testDebugFormRenders(self): - self.handle('get') - self.assertTrue('' in self.response_body()) - - def testValidation(self): - """Tests form validation.""" - # Bad mode - self.handle('post', - ('hub.mode', 'bad'), - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.verify', 'async'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(400, self.response_code()) - self.assertTrue('hub.mode' in self.response_body()) - - # Empty callback - self.handle('post', - ('hub.mode', 'subscribe'), - ('hub.callback', ''), - ('hub.topic', self.topic), - ('hub.verify', 'async'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(400, self.response_code()) - self.assertTrue('hub.callback' in self.response_body()) - - # Bad callback URL - self.handle('post', - ('hub.mode', 'subscribe'), - ('hub.callback', 'httpf://example.com'), - ('hub.topic', self.topic), - ('hub.verify', 'async'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(400, self.response_code()) - self.assertTrue('hub.callback' in self.response_body()) - - # Empty topic - self.handle('post', - ('hub.mode', 'subscribe'), - ('hub.callback', self.callback), - ('hub.topic', ''), - ('hub.verify', 'async'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(400, self.response_code()) - self.assertTrue('hub.topic' in self.response_body()) - - # Bad topic URL - self.handle('post', - ('hub.mode', 'subscribe'), - ('hub.callback', self.callback), - ('hub.topic', 'httpf://example.com'), - ('hub.verify', 'async'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(400, self.response_code()) - self.assertTrue('hub.topic' in self.response_body()) - - # Bad verify - self.handle('post', - ('hub.mode', 'subscribe'), - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.verify', 'meep'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(400, self.response_code()) - self.assertTrue('hub.verify' in self.response_body()) - - # Bad lease_seconds - self.handle('post', - ('hub.mode', 'subscribe'), - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.verify', 'async'), - ('hub.verify_token', 'asdf'), - ('hub.lease_seconds', 'stuff')) - self.assertEquals(400, self.response_code()) - self.assertTrue('hub.lease_seconds' in self.response_body()) - - # Bad lease_seconds zero padding will break things - self.handle('post', - ('hub.mode', 'subscribe'), - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.verify', 'async'), - ('hub.verify_token', 'asdf'), - ('hub.lease_seconds', '000010')) - self.assertEquals(400, self.response_code()) - self.assertTrue('hub.lease_seconds' in self.response_body()) - - def testUnsubscribeMissingSubscription(self): - """Tests that deleting a non-existent subscription does nothing.""" - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.verify', 'sync'), - ('hub.mode', 'unsubscribe'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(204, self.response_code()) - - def testSynchronous(self): - """Tests synchronous subscribe and unsubscribe.""" - sub_key = Subscription.create_key_name(self.callback, self.topic) - self.assertTrue(Subscription.get_by_key_name(sub_key) is None) - - urlfetch_test_stub.instance.expect( - 'get', self.verify_callback_querystring_template % 'subscribe', 200, - self.challenge) - 
self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'subscribe'), - ('hub.verify', 'sync'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(204, self.response_code()) - sub = Subscription.get_by_key_name(sub_key) - self.assertTrue(sub is not None) - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - self.verify_record_task(self.topic) - - urlfetch_test_stub.instance.expect( - 'get', self.verify_callback_querystring_template % 'unsubscribe', 200, - self.challenge) - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'unsubscribe'), - ('hub.verify', 'sync'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(204, self.response_code()) - self.assertTrue(Subscription.get_by_key_name(sub_key) is None) - - def testAsynchronous(self): - """Tests sync and async subscriptions cause the correct state transitions. - - Also tests that synchronous subscribes and unsubscribes will overwrite - asynchronous requests. - """ - sub_key = Subscription.create_key_name(self.callback, self.topic) - self.assertTrue(Subscription.get_by_key_name(sub_key) is None) - - # Async subscription. - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'subscribe'), - ('hub.verify', 'async'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(202, self.response_code()) - sub = Subscription.get_by_key_name(sub_key) - self.assertTrue(sub is not None) - self.assertEquals(Subscription.STATE_NOT_VERIFIED, sub.subscription_state) - - # Sync subscription overwrites. - urlfetch_test_stub.instance.expect( - 'get', self.verify_callback_querystring_template % 'subscribe', 200, - self.challenge) - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'subscribe'), - ('hub.verify', 'sync'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(204, self.response_code()) - sub = Subscription.get_by_key_name(sub_key) - self.assertTrue(sub is not None) - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - self.verify_record_task(self.topic) - - # Async unsubscribe queues removal, but does not change former state. - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'unsubscribe'), - ('hub.verify', 'async'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(202, self.response_code()) - sub = Subscription.get_by_key_name(sub_key) - self.assertTrue(sub is not None) - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - - # Synch unsubscribe overwrites. - urlfetch_test_stub.instance.expect( - 'get', self.verify_callback_querystring_template % 'unsubscribe', 200, - self.challenge) - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'unsubscribe'), - ('hub.verify', 'sync'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(204, self.response_code()) - self.assertTrue(Subscription.get_by_key_name(sub_key) is None) - - def testResubscribe(self): - """Tests that subscribe requests will reset pending unsubscribes.""" - sub_key = Subscription.create_key_name(self.callback, self.topic) - self.assertTrue(Subscription.get_by_key_name(sub_key) is None) - - # Async subscription. 
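# The subscribe tests in this class exercise both verification modes the hub
# accepts: hub.verify=sync, where the hub confirms the callback before
# answering and replies 204 No Content (or 409 Conflict when confirmation
# fails), and hub.verify=async, where the hub answers 202 Accepted right away
# and leaves the subscription unverified until a later confirmation. A tiny
# sketch of that dispatch (illustrative only; the real handler does much more):

def respond_to_subscription_request(verify_mode, confirm_now, enqueue_confirmation):
  """Returns the HTTP status the hub sends for an (un)subscribe request."""
  if verify_mode == 'sync':
    # Synchronous: confirm with the subscriber before answering.
    return 204 if confirm_now() else 409
  # Asynchronous: accept immediately, confirm later in the background.
  enqueue_confirmation()
  return 202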
- self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'subscribe'), - ('hub.verify', 'async'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(202, self.response_code()) - sub = Subscription.get_by_key_name(sub_key) - self.assertTrue(sub is not None) - self.assertEquals(Subscription.STATE_NOT_VERIFIED, sub.subscription_state) - - # Async un-subscription does not change previous subscription state. - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'unsubscribe'), - ('hub.verify', 'async'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(202, self.response_code()) - sub = Subscription.get_by_key_name(sub_key) - self.assertTrue(sub is not None) - self.assertEquals(Subscription.STATE_NOT_VERIFIED, sub.subscription_state) - - # Synchronous subscription overwrites. - urlfetch_test_stub.instance.expect( - 'get', self.verify_callback_querystring_template % 'subscribe', 200, - self.challenge) - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'subscribe'), - ('hub.verify', 'sync'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(204, self.response_code()) - sub = Subscription.get_by_key_name(sub_key) - self.assertTrue(sub is not None) - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - self.verify_record_task(self.topic) - - def testMaxLeaseSeconds(self): - """Tests when the max lease period is specified.""" - sub_key = Subscription.create_key_name(self.callback, self.topic) - self.assertTrue(Subscription.get_by_key_name(sub_key) is None) - - self.verify_callback_querystring_template = ( - self.callback + - '?hub.verify_token=the_token' - '&hub.challenge=this_is_my_fake_challenge_string' - '&hub.topic=http%%3A%%2F%%2Fexample.com%%2Fthe-topic' - '&hub.mode=%s' - '&hub.lease_seconds=864000') - urlfetch_test_stub.instance.expect( - 'get', self.verify_callback_querystring_template % 'subscribe', 200, - self.challenge) - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'subscribe'), - ('hub.verify', 'sync'), - ('hub.verify_token', self.verify_token), - ('hub.lease_seconds', '1000000000000000000')) - self.assertEquals(204, self.response_code()) - sub = Subscription.get_by_key_name(sub_key) - self.assertTrue(sub is not None) - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - self.verify_record_task(self.topic) - - def testDefaultLeaseSeconds(self): - """Tests when the lease_seconds parameter is ommitted.""" - sub_key = Subscription.create_key_name(self.callback, self.topic) - self.assertTrue(Subscription.get_by_key_name(sub_key) is None) - - self.verify_callback_querystring_template = ( - self.callback + - '?hub.verify_token=the_token' - '&hub.challenge=this_is_my_fake_challenge_string' - '&hub.topic=http%%3A%%2F%%2Fexample.com%%2Fthe-topic' - '&hub.mode=%s' - '&hub.lease_seconds=432000') - urlfetch_test_stub.instance.expect( - 'get', self.verify_callback_querystring_template % 'subscribe', 200, - self.challenge) - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'subscribe'), - ('hub.verify', 'sync'), - ('hub.verify_token', self.verify_token), - ('hub.lease_seconds', '')) - self.assertEquals(204, self.response_code()) - sub = Subscription.get_by_key_name(sub_key) - self.assertTrue(sub is not None) - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - 
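# testMaxLeaseSeconds and testDefaultLeaseSeconds above pin down how the hub
# normalizes hub.lease_seconds: an omitted or empty value falls back to the
# default seen in these fixtures (432000 seconds) and an oversized request is
# clamped to the maximum (864000 seconds), while malformed or zero-padded
# values are rejected outright in testValidation. A small sketch of that
# normalization using the values the tests assert:

DEFAULT_LEASE_SECONDS = 432000  # per the default expected query strings
MAX_LEASE_SECONDS = 864000      # per testMaxLeaseSeconds

def normalize_lease_seconds(raw_value):
  """Defaults an empty lease, clamps an oversized one, rejects bad input."""
  if not raw_value:
    return DEFAULT_LEASE_SECONDS
  if not raw_value.isdigit() or raw_value != str(int(raw_value)):
    # Non-numeric or zero-padded input is a client error (HTTP 400).
    raise ValueError('Invalid hub.lease_seconds: %r' % raw_value)
  return min(int(raw_value), MAX_LEASE_SECONDS)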
self.verify_record_task(self.topic) - - def testInvalidChallenge(self): - """Tests when the returned challenge is bad.""" - sub_key = Subscription.create_key_name(self.callback, self.topic) - self.assertTrue(Subscription.get_by_key_name(sub_key) is None) - urlfetch_test_stub.instance.expect('get', - self.verify_callback_querystring_template % 'subscribe', 200, 'bad') - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'subscribe'), - ('hub.verify', 'sync'), - ('hub.verify_token', self.verify_token)) - self.assertTrue(Subscription.get_by_key_name(sub_key) is None) - self.assertTrue(db.get(KnownFeed.create_key(self.topic)) is None) - self.assertEquals(409, self.response_code()) - - def testSynchronousConfirmFailure(self): - """Tests when synchronous confirmations fail.""" - # Subscribe - sub_key = Subscription.create_key_name(self.callback, self.topic) - self.assertTrue(Subscription.get_by_key_name(sub_key) is None) - urlfetch_test_stub.instance.expect('get', - self.verify_callback_querystring_template % 'subscribe', 500, '') - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'subscribe'), - ('hub.verify', 'sync'), - ('hub.verify_token', self.verify_token)) - self.assertTrue(Subscription.get_by_key_name(sub_key) is None) - self.assertTrue(db.get(KnownFeed.create_key(self.topic)) is None) - self.assertEquals(409, self.response_code()) - - # Unsubscribe - Subscription.insert(self.callback, self.topic, self.verify_token, 'secret') - urlfetch_test_stub.instance.expect('get', - self.verify_callback_querystring_template % 'unsubscribe', 500, '') - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'unsubscribe'), - ('hub.verify', 'sync'), - ('hub.verify_token', self.verify_token)) - self.assertTrue(Subscription.get_by_key_name(sub_key) is not None) - self.assertEquals(409, self.response_code()) - - def testAfterSubscriptionError(self): - """Tests when an exception occurs after subscription.""" - for exception in (runtime.DeadlineExceededError(), db.Error(), - apiproxy_errors.Error()): - def new_confirm(*args): - raise exception - main.hooks.override_for_test(main.confirm_subscription, new_confirm) - try: - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'subscribe'), - ('hub.verify', 'sync'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(503, self.response_code()) - finally: - main.hooks.reset_for_test(main.confirm_subscription) - - def testSubscriptionError(self): - """Tests when errors occurs during subscription.""" - # URLFetch errors are probably the subscriber's fault, so we'll serve these - # as a conflict. - urlfetch_test_stub.instance.expect( - 'get', self.verify_callback_querystring_template % 'subscribe', - None, '', urlfetch_error=True) - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'subscribe'), - ('hub.verify', 'sync'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(409, self.response_code()) - - # An apiproxy error or deadline error will fall through and serve a 503, - # since that means there's something wrong with our service. 
- urlfetch_test_stub.instance.expect( - 'get', self.verify_callback_querystring_template % 'subscribe', - None, '', apiproxy_error=True) - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'subscribe'), - ('hub.verify', 'sync'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(503, self.response_code()) - - urlfetch_test_stub.instance.expect( - 'get', self.verify_callback_querystring_template % 'subscribe', - None, '', deadline_error=True) - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'subscribe'), - ('hub.verify', 'sync'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(503, self.response_code()) - - def testCaseSensitive(self): - """Tests that the case of topics, callbacks, and tokens are preserved.""" - self.topic += FUNNY - self.callback += FUNNY - self.verify_token += FUNNY - sub_key = Subscription.create_key_name(self.callback, self.topic) - self.assertTrue(Subscription.get_by_key_name(sub_key) is None) - self.verify_callback_querystring_template = ( - self.callback + - '?hub.verify_token=the_token%%2FCaSeSeNsItIvE' - '&hub.challenge=this_is_my_fake_challenge_string' - '&hub.topic=http%%3A%%2F%%2Fexample.com%%2Fthe-topic%%2FCaSeSeNsItIvE' - '&hub.mode=%s' - '&hub.lease_seconds=432000') - urlfetch_test_stub.instance.expect( - 'get', self.verify_callback_querystring_template % 'subscribe', 200, - self.challenge) - - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'subscribe'), - ('hub.verify', 'sync'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(204, self.response_code()) - sub = Subscription.get_by_key_name(sub_key) - self.assertTrue(sub is not None) - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - self.verify_record_task(self.topic) - - def testSubscribeNormalization(self): - """Tests that the topic and callback URLs are properly normalized.""" - self.topic += OTHER_STRING - orig_callback = self.callback - self.callback += OTHER_STRING - sub_key = Subscription.create_key_name( - main.normalize_iri(self.callback), - main.normalize_iri(self.topic)) - self.assertTrue(Subscription.get_by_key_name(sub_key) is None) - self.verify_callback_querystring_template = ( - orig_callback + '/~one:two/&=' - '?hub.verify_token=the_token' - '&hub.challenge=this_is_my_fake_challenge_string' - '&hub.topic=http%%3A%%2F%%2Fexample.com%%2Fthe-topic' - '%%2F%%7Eone%%3Atwo%%2F%%26%%3D' - '&hub.mode=%s' - '&hub.lease_seconds=432000') - urlfetch_test_stub.instance.expect( - 'get', self.verify_callback_querystring_template % 'subscribe', 200, - self.challenge) - - self.handle('post', - ('hub.callback', self.callback), - ('hub.topic', self.topic), - ('hub.mode', 'subscribe'), - ('hub.verify', 'sync'), - ('hub.verify_token', self.verify_token)) - self.assertEquals(204, self.response_code()) - sub = Subscription.get_by_key_name(sub_key) - self.assertTrue(sub is not None) - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - self.verify_record_task(main.normalize_iri(self.topic)) - - def testSubscribeIri(self): - """Tests when the topic, callback, verify_token, and secrets are IRIs.""" - topic = self.topic + FUNNY_UNICODE - topic_utf8 = self.topic + FUNNY_UTF8 - callback = self.callback + FUNNY_UNICODE - callback_utf8 = self.callback + FUNNY_UTF8 - verify_token = self.verify_token + FUNNY_UNICODE - verify_token_utf8 = self.verify_token + FUNNY_UTF8 - - sub_key = 
Subscription.create_key_name( - main.normalize_iri(callback), - main.normalize_iri(topic)) - self.assertTrue(Subscription.get_by_key_name(sub_key) is None) - self.verify_callback_querystring_template = ( - self.callback + - '/blah/%%E3%%83%%96%%E3%%83%%AD%%E3%%82%%B0%%E8%%A1%%86' - '?hub.verify_token=the_token%%2F' - 'blah%%2F%%E3%%83%%96%%E3%%83%%AD%%E3%%82%%B0%%E8%%A1%%86' - '&hub.challenge=this_is_my_fake_challenge_string' - '&hub.topic=http%%3A%%2F%%2Fexample.com%%2Fthe-topic%%2F' - 'blah%%2F%%25E3%%2583%%2596%%25E3%%2583%%25AD' - '%%25E3%%2582%%25B0%%25E8%%25A1%%2586' - '&hub.mode=%s' - '&hub.lease_seconds=432000') - urlfetch_test_stub.instance.expect( - 'get', self.verify_callback_querystring_template % 'subscribe', 200, - self.challenge) - - self.handle('post', - ('hub.callback', callback_utf8), - ('hub.topic', topic_utf8), - ('hub.mode', 'subscribe'), - ('hub.verify', 'sync'), - ('hub.verify_token', verify_token_utf8)) - self.assertEquals(204, self.response_code()) - sub = Subscription.get_by_key_name(sub_key) - self.assertTrue(sub is not None) - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - self.verify_record_task(self.topic + FUNNY_IRI) - - def testSubscribeUnicode(self): - """Tests when UTF-8 encoded bytes show up in the requests. - - Technically this isn't well-formed or allowed by the HTTP/URI spec, but - people do it anyways and we may as well allow it. - """ - quoted_topic = urllib.quote(self.topic) - topic = self.topic + FUNNY_UNICODE - topic_utf8 = self.topic + FUNNY_UTF8 - quoted_callback = urllib.quote(self.callback) - callback = self.callback + FUNNY_UNICODE - callback_utf8 = self.callback + FUNNY_UTF8 - quoted_verify_token = urllib.quote(self.verify_token) - verify_token = self.verify_token + FUNNY_UNICODE - verify_token_utf8 = self.verify_token + FUNNY_UTF8 - - sub_key = Subscription.create_key_name( - main.normalize_iri(callback), - main.normalize_iri(topic)) - self.assertTrue(Subscription.get_by_key_name(sub_key) is None) - self.verify_callback_querystring_template = ( - self.callback + - '/blah/%%E3%%83%%96%%E3%%83%%AD%%E3%%82%%B0%%E8%%A1%%86' - '?hub.verify_token=the_token%%2F' - 'blah%%2F%%E3%%83%%96%%E3%%83%%AD%%E3%%82%%B0%%E8%%A1%%86' - '&hub.challenge=this_is_my_fake_challenge_string' - '&hub.topic=http%%3A%%2F%%2Fexample.com%%2Fthe-topic%%2F' - 'blah%%2F%%25E3%%2583%%2596%%25E3%%2583%%25AD' - '%%25E3%%2582%%25B0%%25E8%%25A1%%2586' - '&hub.mode=%s' - '&hub.lease_seconds=432000') - urlfetch_test_stub.instance.expect( - 'get', self.verify_callback_querystring_template % 'subscribe', 200, - self.challenge) - - payload = ( - 'hub.callback=' + quoted_callback + FUNNY_UTF8 + - '&hub.topic=' + quoted_topic + FUNNY_UTF8 + - '&hub.mode=subscribe' - '&hub.verify=sync' - '&hub.verify_token=' + quoted_verify_token + FUNNY_UTF8) - - self.handle_body('post', payload) - self.assertEquals(204, self.response_code()) - sub = Subscription.get_by_key_name(sub_key) - self.assertTrue(sub is not None) - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - self.verify_record_task(self.topic + FUNNY_IRI) - - -class SubscribeHandlerThroughHubUrlTest(SubscribeHandlerTest): - - handler_class = main.HubHandler - -################################################################################ - -class SubscriptionConfirmHandlerTest(testutil.HandlerTestBase): - - handler_class = main.SubscriptionConfirmHandler - - def setUp(self): - """Sets up the test fixture.""" - testutil.HandlerTestBase.setUp(self) - self.callback = 
'http://example.com/good-callback' - self.topic = 'http://example.com/the-topic' - self.challenge = 'this_is_my_fake_challenge_string' - self.old_get_challenge = main.get_random_challenge - main.get_random_challenge = lambda: self.challenge - self.sub_key = Subscription.create_key_name(self.callback, self.topic) - self.verify_token = 'the_token' - self.secret = 'teh secrat' - self.verify_callback_querystring_template = ( - self.callback + - '?hub.verify_token=the_token' - '&hub.challenge=this_is_my_fake_challenge_string' - '&hub.topic=http%%3A%%2F%%2Fexample.com%%2Fthe-topic' - '&hub.mode=%s' - '&hub.lease_seconds=432000') - - def tearDown(self): - """Verify that all URL fetches occurred.""" - testutil.HandlerTestBase.tearDown(self) - main.get_random_challenge = self.old_get_challenge - urlfetch_test_stub.instance.verify_and_reset() - - def verify_task(self, next_state): - """Verifies that a subscription worker task is present. - - Args: - next_state: The next state the task should cause the Subscription to have. - """ - task = testutil.get_tasks(main.SUBSCRIPTION_QUEUE, - index=0, expected_count=1) - params = task['params'] - self.assertEquals(self.sub_key, params['subscription_key_name']) - self.assertEquals(next_state, params['next_state']) - - def verify_retry_task(self, - eta, - next_state, - verify_token=None, - secret=None, - auto_reconfirm=False): - """Verifies that a subscription worker retry task is present. - - Args: - eta: The ETA the retry task should have. - next_state: The next state the task should cause the Subscription to have. - verify_token: The verify token the retry task should have. Defaults to - the current token. - secret: The secret the retry task should have. Defaults to the - current secret. - auto_reconfirm: The confirmation type the retry task should have. - """ - task = testutil.get_tasks(main.SUBSCRIPTION_QUEUE, - index=1, expected_count=2) - params = task['params'] - self.assertEquals(testutil.task_eta(eta), task['eta']) - self.assertEquals(self.sub_key, params['subscription_key_name']) - self.assertEquals(next_state, params['next_state']) - self.assertEquals(verify_token or self.verify_token, params['verify_token']) - self.assertEquals(secret or self.secret, params['secret']) - self.assertEquals(str(auto_reconfirm), params['auto_reconfirm']) - - def verify_no_record_task(self): - """Tests there is not KnownFeedIdentity task enqueued. - - Raises: - AssertionError if the task is there. 
- """ - task = testutil.get_tasks(main.MAPPINGS_QUEUE, expected_count=0) - - def testNoWork(self): - """Tests when a task is enqueued for a Subscription that doesn't exist.""" - self.handle('post', ('subscription_key_name', 'unknown'), - ('next_state', Subscription.STATE_VERIFIED)) - - def testSubscribeSuccessful(self): - """Tests when a subscription task is successful.""" - self.assertTrue(db.get(KnownFeed.create_key(self.topic)) is None) - self.assertTrue(Subscription.get_by_key_name(self.sub_key) is None) - Subscription.request_insert( - self.callback, self.topic, self.verify_token, self.secret) - urlfetch_test_stub.instance.expect( - 'get', self.verify_callback_querystring_template % 'subscribe', 200, - self.challenge) - self.handle('post', ('subscription_key_name', self.sub_key), - ('verify_token', self.verify_token), - ('secret', self.secret), - ('next_state', Subscription.STATE_VERIFIED)) - self.verify_task(Subscription.STATE_VERIFIED) - self.verify_no_record_task() - - sub = Subscription.get_by_key_name(self.sub_key) - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - self.assertEquals(self.verify_token, sub.verify_token) - self.assertEquals(self.secret, sub.secret) - - def testSubscribeSuccessfulQueryStringArgs(self): - """Tests a subscription callback with querystring args.""" - self.callback += '?some=query&string=params&to=mess&it=up' - self.sub_key = Subscription.create_key_name(self.callback, self.topic) - self.assertTrue(db.get(KnownFeed.create_key(self.topic)) is None) - self.assertTrue(Subscription.get_by_key_name(self.sub_key) is None) - Subscription.request_insert( - self.callback, self.topic, self.verify_token, self.secret) - self.verify_callback_querystring_template = ( - self.callback + - '&hub.verify_token=the_token' - '&hub.challenge=this_is_my_fake_challenge_string' - '&hub.topic=http%%3A%%2F%%2Fexample.com%%2Fthe-topic' - '&hub.mode=%s' - '&hub.lease_seconds=432000') - - urlfetch_test_stub.instance.expect( - 'get', self.verify_callback_querystring_template % 'subscribe', 200, - self.challenge) - self.handle('post', ('subscription_key_name', self.sub_key), - ('verify_token', self.verify_token), - ('secret', self.secret), - ('next_state', Subscription.STATE_VERIFIED)) - self.verify_task(Subscription.STATE_VERIFIED) - self.verify_no_record_task() - - sub = Subscription.get_by_key_name(self.sub_key) - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - self.assertEquals(self.verify_token, sub.verify_token) - self.assertEquals(self.secret, sub.secret) - - def testSubscribeFailed(self): - """Tests when a subscription task fails.""" - self.assertTrue(Subscription.get_by_key_name(self.sub_key) is None) - Subscription.request_insert( - self.callback, self.topic, self.verify_token, self.secret) - urlfetch_test_stub.instance.expect('get', - self.verify_callback_querystring_template % 'subscribe', 500, '') - self.handle('post', ('subscription_key_name', self.sub_key), - ('verify_token', self.verify_token), - ('secret', self.secret), - ('next_state', Subscription.STATE_VERIFIED)) - sub = Subscription.get_by_key_name(self.sub_key) - self.assertEquals(Subscription.STATE_NOT_VERIFIED, sub.subscription_state) - self.assertEquals(1, sub.confirm_failures) - self.assertEquals(self.verify_token, sub.verify_token) - self.assertEquals(self.secret, sub.secret) - self.verify_retry_task(sub.eta, - Subscription.STATE_VERIFIED, - verify_token=self.verify_token, - secret=self.secret) - - def testSubscribeConflict(self): - """Tests when confirmation 
hits a conflict and archives the subscription.""" - self.assertTrue(Subscription.get_by_key_name(self.sub_key) is None) - Subscription.request_insert( - self.callback, self.topic, self.verify_token, self.secret) - urlfetch_test_stub.instance.expect('get', - self.verify_callback_querystring_template % 'subscribe', 404, '') - self.handle('post', ('subscription_key_name', self.sub_key), - ('verify_token', self.verify_token), - ('secret', self.secret), - ('next_state', Subscription.STATE_VERIFIED)) - sub = Subscription.get_by_key_name(self.sub_key) - self.assertEquals(Subscription.STATE_TO_DELETE, sub.subscription_state) - testutil.get_tasks(main.SUBSCRIPTION_QUEUE, expected_count=1) - - def testSubscribeBadChallengeResponse(self): - """Tests when the subscriber responds with a bad challenge.""" - self.assertTrue(Subscription.get_by_key_name(self.sub_key) is None) - Subscription.request_insert( - self.callback, self.topic, self.verify_token, self.secret) - urlfetch_test_stub.instance.expect('get', - self.verify_callback_querystring_template % 'subscribe', 200, 'bad') - self.handle('post', ('subscription_key_name', self.sub_key), - ('verify_token', self.verify_token), - ('secret', self.secret), - ('next_state', Subscription.STATE_VERIFIED)) - sub = Subscription.get_by_key_name(self.sub_key) - self.assertEquals(Subscription.STATE_NOT_VERIFIED, sub.subscription_state) - self.assertEquals(1, sub.confirm_failures) - self.verify_retry_task(sub.eta, Subscription.STATE_VERIFIED) - - def testUnsubscribeSuccessful(self): - """Tests when an unsubscription request is successful.""" - self.assertTrue(Subscription.get_by_key_name(self.sub_key) is None) - Subscription.insert( - self.callback, self.topic, self.verify_token, self.secret) - Subscription.request_remove(self.callback, self.topic, self.verify_token) - urlfetch_test_stub.instance.expect( - 'get', self.verify_callback_querystring_template % 'unsubscribe', 200, - self.challenge) - self.handle('post', ('subscription_key_name', self.sub_key), - ('verify_token', self.verify_token), - ('next_state', Subscription.STATE_TO_DELETE)) - self.verify_task(Subscription.STATE_TO_DELETE) - self.assertTrue(Subscription.get_by_key_name(self.sub_key) is None) - - def testUnsubscribeFailed(self): - """Tests when an unsubscription task fails.""" - self.assertTrue(Subscription.get_by_key_name(self.sub_key) is None) - Subscription.insert( - self.callback, self.topic, self.verify_token, self.secret) - Subscription.request_remove(self.callback, self.topic, self.verify_token) - urlfetch_test_stub.instance.expect('get', - self.verify_callback_querystring_template % 'unsubscribe', 500, '') - self.handle('post', ('subscription_key_name', self.sub_key), - ('verify_token', self.verify_token), - ('next_state', Subscription.STATE_TO_DELETE), - ('secret', self.secret)) - sub = Subscription.get_by_key_name(self.sub_key) - self.assertEquals(1, sub.confirm_failures) - self.verify_retry_task(sub.eta, Subscription.STATE_TO_DELETE) - - def testUnsubscribeGivesUp(self): - """Tests when an unsubscription task completely gives up.""" - self.assertTrue(Subscription.get_by_key_name(self.sub_key) is None) - Subscription.insert( - self.callback, self.topic, self.verify_token, self.secret) - Subscription.request_remove(self.callback, self.topic, self.verify_token) - sub = Subscription.get_by_key_name(self.sub_key) - sub.confirm_failures = 100 - sub.put() - urlfetch_test_stub.instance.expect('get', - self.verify_callback_querystring_template % 'unsubscribe', 500, '') - self.handle('post', 
('subscription_key_name', self.sub_key), - ('verify_token', self.verify_token), - ('next_state', Subscription.STATE_TO_DELETE)) - sub = Subscription.get_by_key_name(self.sub_key) - self.assertEquals(100, sub.confirm_failures) - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - self.verify_task(Subscription.STATE_TO_DELETE) - - def testSubscribeOverwrite(self): - """Tests that subscriptions can be overwritten with new parameters.""" - Subscription.insert( - self.callback, self.topic, self.verify_token, self.secret) - second_token = 'second_verify_token' - second_secret = 'second secret' - new_template = self.verify_callback_querystring_template.replace( - self.verify_token, second_token) - urlfetch_test_stub.instance.expect( - 'get', new_template % 'subscribe', 200, self.challenge) - self.handle('post', ('subscription_key_name', self.sub_key), - ('verify_token', second_token), - ('secret', second_secret), - ('next_state', Subscription.STATE_VERIFIED)) - sub = Subscription.get_by_key_name(self.sub_key) - self.assertEquals(Subscription.STATE_VERIFIED, sub.subscription_state) - self.assertEquals(second_token, sub.verify_token) - self.assertEquals(second_secret, sub.secret) - self.verify_no_record_task() - - def testConfirmError(self): - """Tests when an exception is raised while confirming a subscription. - - This will just propagate up in the stack and cause the task to retry - via the normal task queue retries. - """ - called = [False] - Subscription.request_insert( - self.callback, self.topic, self.verify_token, self.secret) - # All exceptions should just fall through. - def new_confirm(*args, **kwargs): - called[0] = True - raise db.Error() - try: - main.hooks.override_for_test(main.confirm_subscription, new_confirm) - try: - self.handle('post', ('subscription_key_name', self.sub_key)) - except db.Error: - pass - else: - self.fail() - finally: - main.hooks.reset_for_test(main.confirm_subscription) - self.assertTrue(called[0]) - self.verify_task(Subscription.STATE_VERIFIED) - - def testRenewNack(self): - """Tests when an auto-subscription-renewal returns a 404.""" - self.assertTrue(Subscription.get_by_key_name(self.sub_key) is None) - Subscription.insert( - self.callback, self.topic, self.verify_token, self.secret) - urlfetch_test_stub.instance.expect('get', - self.verify_callback_querystring_template % 'subscribe', 404, '') - self.handle('post', ('subscription_key_name', self.sub_key), - ('verify_token', self.verify_token), - ('secret', self.secret), - ('next_state', Subscription.STATE_VERIFIED), - ('auto_reconfirm', 'True')) - sub = Subscription.get_by_key_name(self.sub_key) - self.assertEquals(Subscription.STATE_TO_DELETE, sub.subscription_state) - testutil.get_tasks(main.SUBSCRIPTION_QUEUE, expected_count=0) - - def testRenewErrorFailure(self): - """Tests when an auto-subscription-renewal returns errors repeatedly. - - In this case, since it's auto-renewal, the subscription should be dropped. 
- """ - self.assertTrue(Subscription.get_by_key_name(self.sub_key) is None) - Subscription.insert( - self.callback, self.topic, self.verify_token, self.secret) - sub = Subscription.get_by_key_name(self.sub_key) - sub.confirm_failures = 100 - sub.put() - urlfetch_test_stub.instance.expect('get', - self.verify_callback_querystring_template % 'subscribe', 500, '') - self.handle('post', ('subscription_key_name', self.sub_key), - ('verify_token', self.verify_token), - ('next_state', Subscription.STATE_VERIFIED), - ('auto_reconfirm', 'True')) - sub = Subscription.get_by_key_name(self.sub_key) - self.assertEquals(Subscription.STATE_TO_DELETE, sub.subscription_state) - testutil.get_tasks(main.SUBSCRIPTION_QUEUE, expected_count=0) - - -class SubscriptionReconfirmHandlerTest(testutil.HandlerTestBase): - """Tests for the periodic subscription reconfirming worker.""" - - def testFullFlow(self): - """Tests a full flow through the reconfirm worker.""" - self.now = time.time() - self.called = False - def start_map(*args, **kwargs): - self.assertEquals({ - 'name': 'Reconfirm expiring subscriptions', - 'reader_spec': 'mapreduce.input_readers.DatastoreInputReader', - 'queue_name': 'polling', - 'handler_spec': 'offline_jobs.SubscriptionReconfirmMapper.run', - 'shard_count': 4, - 'mapper_parameters': { - 'entity_kind': 'main.Subscription', - 'processing_rate': 100000, - 'threshold_timestamp': - int(self.now + main.SUBSCRIPTION_CHECK_BUFFER_SECONDS), - }, - 'mapreduce_parameters': { - 'done_callback': '/work/cleanup_mapper', - 'done_callback_queue': 'polling', - }, - }, kwargs) - self.called = True - - def create_handler(): - return main.SubscriptionReconfirmHandler( - now=lambda: self.now, - start_map=start_map) - self.handler_class = create_handler - - os.environ['HTTP_X_APPENGINE_QUEUENAME'] = main.POLLING_QUEUE - try: - self.handle('get') - task = testutil.get_tasks(main.POLLING_QUEUE, index=0, expected_count=1) - self.handle('post') - finally: - del os.environ['HTTP_X_APPENGINE_QUEUENAME'] - - self.assertTrue(self.called) - - -class SubscriptionCleanupHandlerTest(testutil.HandlerTestBase): - """Tests fo the SubscriptionCleanupHandler.""" - - handler_class = main.SubscriptionCleanupHandler - - def testEmpty(self): - """Tests cleaning up empty subscriptions.""" - self.handle('get') - - def testCleanup(self): - """Tests cleaning up a few deleted subscriptions.""" - callback = 'http://example.com/callback/%d' - topic = 'http://example.com/mytopic' - self.assertTrue(Subscription.insert(callback % 1, topic, '', '')) - self.assertTrue(Subscription.insert(callback % 2, topic, '', '')) - self.assertTrue(Subscription.insert(callback % 3, topic, '', '')) - self.assertEquals(3 * [Subscription.STATE_VERIFIED], - [s.subscription_state for s in Subscription.all()]) - - Subscription.archive(callback % 1, topic) - self.handle('get') - self.assertEquals(2 * [Subscription.STATE_VERIFIED], - [s.subscription_state for s in Subscription.all()]) - - -class CleanupMapperHandlerTest(testutil.HandlerTestBase): - """Tests for the CleanupMapperHandler.""" - - handler_class = main.CleanupMapperHandler - - def testMissing(self): - """Tests cleaning up a mapreduce that's not present.""" - self.assertEquals([], list(mapreduce.model.MapreduceState.all())) - os.environ['HTTP_MAPREDUCE_ID'] = '12345' - try: - self.handle('post') - finally: - del os.environ['HTTP_MAPREDUCE_ID'] - self.assertEquals([], list(mapreduce.model.MapreduceState.all())) - - def testPresent(self): - """Tests cleaning up a mapreduce that's present.""" - mapreduce_id = 
mapreduce.control.start_map( - name='Reconfirm expiring subscriptions', - handler_spec='offline_jobs.SubscriptionReconfirmMapper.run', - reader_spec='mapreduce.input_readers.DatastoreInputReader', - mapper_parameters=dict( - processing_rate=100000, - entity_kind='main.Subscription')) - - self.assertEquals(1, len(list(mapreduce.model.MapreduceState.all()))) - os.environ['HTTP_MAPREDUCE_ID'] = mapreduce_id - try: - self.handle('post') - finally: - del os.environ['HTTP_MAPREDUCE_ID'] - self.assertEquals([], list(mapreduce.model.MapreduceState.all())) - -################################################################################ - -PollingMarker = main.PollingMarker - - -class TakePollingActionTest(unittest.TestCase): - """Tests for the take_polling_action function.""" - - def setUp(self): - """Sets up the test harness.""" - testutil.setup_for_testing() - - def testFailure(self): - """Tests when inserting a new feed to fetch raises an exception.""" - called = [False] - topics = ['one', 'two', 'three'] - @classmethod - def new_insert(cls, topic_list, memory_only=True): - called[0] = True - self.assertFalse(memory_only) - self.assertEquals(topic_list, topics) - raise db.Error('Mock DB error') - - old_insert = main.FeedToFetch.insert - main.FeedToFetch.insert = new_insert - try: - main.take_polling_action(['one', 'two', 'three'], '') - finally: - main.FeedToFetch.insert = old_insert - - self.assertTrue(called[0]) - - -class PollBootstrapHandlerTest(testutil.HandlerTestBase): - - handler_class = main.PollBootstrapHandler - - def setUp(self): - """Sets up the test harness.""" - testutil.HandlerTestBase.setUp(self) - self.original_chunk_size = main.BOOSTRAP_FEED_CHUNK_SIZE - main.BOOSTRAP_FEED_CHUNK_SIZE = 2 - os.environ['HTTP_X_APPENGINE_QUEUENAME'] = main.POLLING_QUEUE - - def tearDown(self): - """Tears down the test harness.""" - testutil.HandlerTestBase.tearDown(self) - main.BOOSTRAP_FEED_CHUNK_SIZE = self.original_chunk_size - del os.environ['HTTP_X_APPENGINE_QUEUENAME'] - - def testFullFlow(self): - """Tests a full flow through multiple chunks.""" - topic = 'http://example.com/feed1' - topic2 = 'http://example.com/feed2' - topic3 = 'http://example.com/feed3-124' # alphabetical on the hash of this - db.put([KnownFeed.create(topic), KnownFeed.create(topic2), - KnownFeed.create(topic3)]) - self.assertTrue(FeedToFetch.get_by_topic(topic) is None) - self.assertTrue(FeedToFetch.get_by_topic(topic2) is None) - self.assertTrue(FeedToFetch.get_by_topic(topic3) is None) - - # This will repeatedly insert the initial task to start the polling process. - self.handle('get') - self.handle('get') - self.handle('get') - task = testutil.get_tasks(main.POLLING_QUEUE, index=0, expected_count=1) - sequence = task['params']['sequence'] - self.assertEquals('bootstrap', task['params']['poll_type']) - - # Now run the post handler with the params from this first task. It will - # enqueue another task that starts *after* the last one in the chunk. - self.handle('post', *task['params'].items()) - self.assertTrue(FeedToFetch.get_by_topic(topic) is not None) - self.assertTrue(FeedToFetch.get_by_topic(topic2) is not None) - self.assertTrue(FeedToFetch.get_by_topic(topic3) is None) - - # Running this handler again will overwrite the FeedToFetch instances, - # but it will not duplicate the polling queue Task in the chain of - # iterating through all KnownFeed entries or the fork-join queue task that - # will do the actual fetching. 
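# A note on the de-duplication above: each continuation task is given a name
# derived from the bootstrap sequence ID, and the task queue API refuses to
# enqueue a second task with a name it has already seen. A minimal sketch of
# that idiom (worker URL, queue, and naming scheme are illustrative, not the
# hub's exact values):
from google.appengine.api import taskqueue

def enqueue_continuation(sequence, chunk_number, params):
  try:
    taskqueue.Task(
        url='/work/poll_bootstrap',               # hypothetical worker path
        name='%s-%d' % (sequence, chunk_number),  # stable name => de-duped
        params=params,
    ).add('polling')
  except (taskqueue.TaskAlreadyExistsError, taskqueue.TombstonedTaskError):
    pass  # this continuation was already enqueued (or already ran) this cycle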
- self.handle('post', *task['params'].items()) - task = testutil.get_tasks(main.POLLING_QUEUE, index=1, expected_count=2) - self.assertEquals(sequence, task['params']['sequence']) - self.assertEquals('bootstrap', task['params']['poll_type']) - self.assertEquals(str(KnownFeed.create_key(topic2)), - task['params']['current_key']) - self.assertTrue(task['name'].startswith(sequence)) - - # Now running another post handler will handle the rest of the feeds. - self.handle('post', *task['params'].items()) - self.assertTrue(FeedToFetch.get_by_topic(topic) is not None) - self.assertTrue(FeedToFetch.get_by_topic(topic2) is not None) - self.assertTrue(FeedToFetch.get_by_topic(topic3) is not None) - - # Running this post handler again will do nothing because we de-dupe on - # the continuation task to prevent doing any more work in the current cycle. - self.handle('post', *task['params'].items()) - - task_list = testutil.get_tasks(main.POLLING_QUEUE, expected_count=3) - - # Deal with a stupid race condition - task = task_list[2] - if 'params' not in task: - task = task_list[3] - - self.assertEquals(sequence, task['params']['sequence']) - self.assertEquals('bootstrap', task['params']['poll_type']) - self.assertEquals(str(KnownFeed.create_key(topic3)), - task['params']['current_key']) - self.assertTrue(task['name'].startswith(sequence)) - - # Starting the cycle again will do nothing. - self.handle('get') - testutil.get_tasks(main.POLLING_QUEUE, expected_count=3) - - # Resetting the next start time to before the present time will - # cause the iteration to start again. - the_mark = PollingMarker.get() - the_mark.next_start = \ - datetime.datetime.utcnow() - datetime.timedelta(seconds=120) - db.put(the_mark) - self.handle('get') - - task_list = testutil.get_tasks(main.POLLING_QUEUE, expected_count=4) - task = task_list[3] - self.assertNotEquals(sequence, task['params']['sequence']) - - def testRecord(self): - """Tests when the parameter "poll_type=record" is specified.""" - topic = 'http://example.com/feed1' - topic2 = 'http://example.com/feed2' - topic3 = 'http://example.com/feed3-124' # alphabetical on the hash of this - db.put([KnownFeed.create(topic), KnownFeed.create(topic2), - KnownFeed.create(topic3)]) - self.assertTrue(FeedToFetch.get_by_topic(topic) is None) - self.assertTrue(FeedToFetch.get_by_topic(topic2) is None) - self.assertTrue(FeedToFetch.get_by_topic(topic3) is None) - - # This will insert the initial task to start the polling process. - self.handle('get', ('poll_type', 'record')) - task = testutil.get_tasks(main.POLLING_QUEUE, index=0, expected_count=1) - sequence = task['params']['sequence'] - self.assertEquals('record', task['params']['poll_type']) - - # Now run the post handler with the params from this first task. It will - # enqueue another task that starts *after* the last one in the chunk. - self.handle('post', *task['params'].items()) - task = testutil.get_tasks(main.POLLING_QUEUE, index=1, expected_count=2) - self.assertEquals('record', task['params']['poll_type']) - - # Now running another post handler will handle the rest of the feeds. - self.handle('post', *task['params'].items()) - - # And there will be tasks in the MAPPINGS_QUEUE to update all of the - # KnownFeeds that we have found. 
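# In "record" mode the walk over KnownFeed entities does not schedule any
# fetches; instead it enqueues one task per discovered topic on the mappings
# queue, carrying the topic as a parameter (which is what the assertions below
# check). Roughly, as a sketch with a hypothetical worker path:
from google.appengine.api import taskqueue

def record_topics(topics):
  for topic in topics:
    taskqueue.add(queue_name='mappings',
                  url='/work/record_feeds',  # hypothetical handler path
                  params={'topic': topic})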
- task = testutil.get_tasks(main.MAPPINGS_QUEUE, index=0, expected_count=3) - self.assertEquals(topic, task['params']['topic']) - task = testutil.get_tasks(main.MAPPINGS_QUEUE, index=1, expected_count=3) - self.assertEquals(topic2, task['params']['topic']) - task = testutil.get_tasks(main.MAPPINGS_QUEUE, index=2, expected_count=3) - self.assertEquals(topic3, task['params']['topic']) - -################################################################################ - -KnownFeedIdentity = main.KnownFeedIdentity - - -class RecordFeedHandlerTest(testutil.HandlerTestBase): - """Tests for the RecordFeedHandler flow.""" - - def setUp(self): - """Sets up the test harness.""" - self.now = [datetime.datetime.utcnow()] - self.handler_class = lambda: main.RecordFeedHandler(now=lambda: self.now[0]) - testutil.HandlerTestBase.setUp(self) - - self.old_identify = main.feed_identifier.identify - self.expected_calls = [] - self.expected_results = [] - def new_identify(content, feed_type): - self.assertEquals(self.expected_calls.pop(0), (content, feed_type)) - result = self.expected_results.pop(0) - if isinstance(result, Exception): - raise result - else: - return result - - main.feed_identifier.identify = new_identify - self.topic = 'http://www.example.com/meepa' - self.feed_id = 'my_feed_id' - self.content = 'my_atom_content' - - def tearDown(self): - """Tears down the test harness.""" - main.feed_identifier.identify = self.old_identify - testutil.HandlerTestBase.tearDown(self) - urlfetch_test_stub.instance.verify_and_reset() - - def verify_update(self): - """Verifies the feed_id has been added for the topic.""" - feed_id = KnownFeedIdentity.get(KnownFeedIdentity.create_key(self.feed_id)) - feed = KnownFeed.get(KnownFeed.create_key(self.topic)) - self.assertEquals([self.topic], feed_id.topics) - self.assertEquals(feed.feed_id, self.feed_id) - self.assertEquals(feed.feed_id, feed_id.feed_id) - - def testNewFeed(self): - """Tests recording details for a known feed.""" - urlfetch_test_stub.instance.expect('GET', self.topic, 200, self.content) - self.expected_calls.append((self.content, 'atom')) - self.expected_results.append(self.feed_id) - self.handle('post', ('topic', self.topic)) - self.verify_update() - - def testNewFeedFetchFailure(self): - """Tests when fetching a feed to record returns a non-200 response.""" - urlfetch_test_stub.instance.expect('GET', self.topic, 404, '') - self.handle('post', ('topic', self.topic)) - feed = KnownFeed.get(KnownFeed.create_key(self.topic)) - self.assertTrue(feed.feed_id is None) - - def testNewFeedFetchException(self): - """Tests when fetching a feed to record returns an exception.""" - urlfetch_test_stub.instance.expect('GET', self.topic, 200, '', - urlfetch_error=True) - self.handle('post', ('topic', self.topic)) - feed = KnownFeed.get(KnownFeed.create_key(self.topic)) - self.assertTrue(feed.feed_id is None) - - def testParseRetry(self): - """Tests when parsing as Atom fails, but RSS is successful.""" - urlfetch_test_stub.instance.expect('GET', self.topic, 200, self.content) - self.expected_calls.append((self.content, 'atom')) - self.expected_results.append(xml.sax.SAXException('Mock error')) - self.expected_calls.append((self.content, 'rss')) - self.expected_results.append(self.feed_id) - self.handle('post', ('topic', self.topic)) - self.verify_update() - - def testParseFails(self): - """Tests when parsing completely fails.""" - urlfetch_test_stub.instance.expect('GET', self.topic, 200, self.content) - self.expected_calls.append((self.content, 'atom')) - 
self.expected_results.append(xml.sax.SAXException('Mock error')) - self.expected_calls.append((self.content, 'rss')) - self.expected_results.append(xml.sax.SAXException('Mock error 2')) - self.handle('post', ('topic', self.topic)) - feed = KnownFeed.get(KnownFeed.create_key(self.topic)) - self.assertTrue(feed.feed_id is None) - - def testParseFindsNoIds(self): - """Tests when no SAX exception is raised but no feed ID is found.""" - urlfetch_test_stub.instance.expect('GET', self.topic, 200, self.content) - self.expected_calls.append((self.content, 'atom')) - self.expected_results.append(None) - self.expected_calls.append((self.content, 'rss')) - self.expected_results.append(None) - self.handle('post', ('topic', self.topic)) - feed = KnownFeed.get(KnownFeed.create_key(self.topic)) - self.assertTrue(feed.feed_id is None) - - def testParseFindsEmptyId(self): - """Tests when no SAX exception is raised but the feed ID is empty.""" - urlfetch_test_stub.instance.expect('GET', self.topic, 200, self.content) - self.expected_calls.append((self.content, 'atom')) - self.expected_results.append('') - self.handle('post', ('topic', self.topic)) - feed = KnownFeed.get(KnownFeed.create_key(self.topic)) - self.assertTrue(feed.feed_id is None) - - def testExistingFeedNeedsRefresh(self): - """Tests recording details for an existing feed that needs a refresh.""" - KnownFeed.create(self.topic).put() - self.now[0] += datetime.timedelta( - seconds=main.FEED_IDENTITY_UPDATE_PERIOD + 1) - - urlfetch_test_stub.instance.expect('GET', self.topic, 200, self.content) - self.expected_calls.append((self.content, 'atom')) - self.expected_results.append(self.feed_id) - self.handle('post', ('topic', self.topic)) - self.verify_update() - - def testExistingFeedNoRefresh(self): - """Tests recording details when the feed does not need a refresh.""" - feed = KnownFeed.create(self.topic) - feed.feed_id = 'meep' - feed.put() - self.handle('post', ('topic', self.topic)) - # Confirmed by no calls to urlfetch or feed_identifier. - - def testExistingFeedNoIdRefresh(self): - """Tests that a KnownFeed with no ID will be refreshed.""" - feed = KnownFeed.create(self.topic) - urlfetch_test_stub.instance.expect('GET', self.topic, 200, self.content) - self.expected_calls.append((self.content, 'atom')) - self.expected_results.append(self.feed_id) - self.handle('post', ('topic', self.topic)) - self.verify_update() - - def testNewFeedRelation(self): - """Tests when the feed ID relation changes for a topic.""" - KnownFeedIdentity.update(self.feed_id, self.topic) - feed = KnownFeed.create(self.topic) - feed.feed_id = self.feed_id - feed.put() - self.now[0] += datetime.timedelta( - seconds=main.FEED_IDENTITY_UPDATE_PERIOD + 1) - - new_feed_id = 'other_feed_id' - urlfetch_test_stub.instance.expect('GET', self.topic, 200, self.content) - self.expected_calls.append((self.content, 'atom')) - self.expected_results.append(new_feed_id) - self.handle('post', ('topic', self.topic)) - - feed_id = KnownFeedIdentity.get(KnownFeedIdentity.create_key(new_feed_id)) - feed = KnownFeed.get(feed.key()) - self.assertEquals([self.topic], feed_id.topics) - self.assertEquals(feed.feed_id, new_feed_id) - self.assertEquals(feed.feed_id, feed_id.feed_id) - - # Old KnownFeedIdentity should have been deleted. 
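# Sketch of the bookkeeping this test expects (illustrative only; the real
# model code may differ): when a topic's feed ID changes, the topic is mapped
# to the new KnownFeedIdentity, removed from the old one, and the old identity
# is deleted once it maps no topics at all.
def remap_feed_identity(topic, old_feed_id, new_feed_id):
  KnownFeedIdentity.update(new_feed_id, topic)
  old_identity = KnownFeedIdentity.get(
      KnownFeedIdentity.create_key(old_feed_id))
  if old_identity is not None:
    old_identity.topics = [t for t in old_identity.topics if t != topic]
    if old_identity.topics:
      old_identity.put()
    else:
      old_identity.delete()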
- self.assertTrue(KnownFeedIdentity.get( - KnownFeedIdentity.create_key(self.feed_id)) is None) - - -class RecordFeedHandlerWithParsingTest(testutil.HandlerTestBase): - """Tests for the RecordFeedHandler that excercise parsing.""" - - handler_class = main.RecordFeedHandler - - def testAtomParsing(self): - """Tests parsing an Atom feed.""" - topic = 'http://example.com/atom' - feed_id = 'my-id' - data = ('' - 'my-id') - urlfetch_test_stub.instance.expect('GET', topic, 200, data) - self.handle('post', ('topic', topic)) - - known_id = KnownFeedIdentity.get(KnownFeedIdentity.create_key(feed_id)) - feed = KnownFeed.get(KnownFeed.create_key(topic)) - self.assertEquals([topic], known_id.topics) - self.assertEquals(feed.feed_id, feed_id) - self.assertEquals(feed.feed_id, known_id.feed_id) - - def testRssParsing(self): - """Tests parsing an Atom feed.""" - topic = 'http://example.com/rss' - feed_id = 'http://example.com/blah' - data = ('' - 'http://example.com/blah') - urlfetch_test_stub.instance.expect('GET', topic, 200, data) - self.handle('post', ('topic', topic)) - - known_id = KnownFeedIdentity.get(KnownFeedIdentity.create_key(feed_id)) - feed = KnownFeed.get(KnownFeed.create_key(topic)) - self.assertEquals([topic], known_id.topics) - self.assertEquals(feed.feed_id, feed_id) - self.assertEquals(feed.feed_id, known_id.feed_id) - -################################################################################ - -class HookManagerTest(unittest.TestCase): - """Tests for the HookManager and Hook classes.""" - - def setUp(self): - """Sets up the test harness.""" - self.hooks_directory = tempfile.mkdtemp() - if not os.path.exists(self.hooks_directory): - os.makedirs(self.hooks_directory) - self.valueA = object() - self.valueB = object() - self.valueC = object() - self.funcA = lambda *a, **k: self.valueA - self.funcB = lambda *a, **k: self.valueB - self.funcC = lambda *a, **k: self.valueC - self.globals_dict = { - 'funcA': self.funcA, - 'funcB': self.funcB, - 'funcC': self.funcC, - } - self.manager = main.HookManager() - self.manager.declare(self.funcA) - self.manager.declare(self.funcB) - self.manager.declare(self.funcC) - - def tearDown(self): - """Tears down the test harness.""" - shutil.rmtree(self.hooks_directory, True) - - def write_hook(self, filename, content): - """Writes a test hook to the hooks directory. - - Args: - filename: The relative filename the hook should have. - content: The Python code that should go in the hook module. 
- """ - hook_file = open(os.path.join(self.hooks_directory, filename), 'w') - try: - hook_file.write('#!/usr/bin/env python\n') - hook_file.write(content) - finally: - hook_file.close() - - def load_hooks(self): - """Causes the hooks to load.""" - self.manager.load(hooks_path=self.hooks_directory, - globals_dict=self.globals_dict) - - def testNoHooksDir(self): - """Tests when there is no hooks directory present at all.""" - hooks_path = tempfile.mktemp() - self.assertFalse(os.path.exists(hooks_path)) - self.manager.load(hooks_path=hooks_path, - globals_dict=self.globals_dict) - for entry, hooks in self.manager._mapping.iteritems(): - self.assertEquals(0, len(hooks)) - - def testNoHooks(self): - """Tests loading a directory with no hooks modules.""" - self.load_hooks() - self.assertEquals(self.valueA, self.manager.execute(self.funcA)) - self.assertEquals(self.valueB, self.manager.execute(self.funcB)) - self.assertEquals(self.valueC, self.manager.execute(self.funcC)) - - def testOneGoodHook(self): - """Tests a single good hook.""" - self.write_hook('my_hook.py',""" -class MyHook(Hook): - def inspect(self, args, kwargs): - return True - def __call__(self, *args, **kwargs): - return 'fancy string' -register(funcA, MyHook()) -""") - self.load_hooks() - self.assertEquals('fancy string', self.manager.execute(self.funcA)) - - def testDifferentHooksInOneModule(self): - """Tests different hook methods in a single hook module.""" - self.write_hook('my_hook.py',""" -class MyHook(Hook): - def __init__(self, value): - self.value = value - def inspect(self, args, kwargs): - return True - def __call__(self, *args, **kwargs): - return self.value -register(funcA, MyHook('fancy A')) -register(funcB, MyHook('fancy B')) -register(funcC, MyHook('fancy C')) -""") - self.load_hooks() - self.assertEquals('fancy A', self.manager.execute(self.funcA)) - self.assertEquals('fancy B', self.manager.execute(self.funcB)) - self.assertEquals('fancy C', self.manager.execute(self.funcC)) - - def testBadHookModule(self): - """Tests a hook module that's bad and throws exception on load.""" - self.write_hook('my_hook.py',"""raise Exception('Doh')""") - self.assertRaises( - Exception, - self.load_hooks) - - def testIncompleteHook(self): - """Tests that an incomplete hook implementation will die on execute.""" - self.write_hook('my_hook1.py',""" -class MyHook(Hook): - def inspect(self, args, kwargs): - return True -register(funcA, MyHook()) -""") - self.load_hooks() - self.assertRaises( - AssertionError, - self.manager.execute, - self.funcA) - - def testHookModuleOrdering(self): - """Tests that hook modules are loaded and applied in order.""" - self.write_hook('my_hook1.py',""" -class MyHook(Hook): - def inspect(self, args, kwargs): - args[0].append(1) - return False -register(funcA, MyHook()) -""") - self.write_hook('my_hook2.py',""" -class MyHook(Hook): - def inspect(self, args, kwargs): - args[0].append(2) - return False -register(funcA, MyHook()) -""") - self.write_hook('my_hook3.py',""" -class MyHook(Hook): - def inspect(self, args, kwargs): - return True - def __call__(self, *args, **kwargs): - return 'peanuts' -register(funcA, MyHook()) -""") - self.load_hooks() - value_list = [5] - self.assertEquals('peanuts', self.manager.execute(self.funcA, value_list)) - self.assertEquals([5, 1, 2], value_list) - - def testHookBadRegistration(self): - """Tests when registering a hook for an unknown callable.""" - self.write_hook('my_hook1.py',""" -class MyHook(Hook): - def inspect(self, args, kwargs): - return False -register(lambda: 
None, MyHook()) -""") - self.assertRaises( - main.InvalidHookError, - self.load_hooks) - - def testMultipleRegistration(self): - """Tests that the first hook is called when two are registered.""" - self.write_hook('my_hook.py',""" -class MyHook(Hook): - def __init__(self, value): - self.value = value - def inspect(self, args, kwargs): - args[0].append(self.value) - return True - def __call__(self, *args, **kwargs): - return self.value -register(funcA, MyHook('fancy first')) -register(funcA, MyHook('fancy second')) -""") - self.load_hooks() - value_list = ['hello'] - self.assertEquals('fancy first', - self.manager.execute(self.funcA, value_list)) - self.assertEquals(['hello', 'fancy first', 'fancy second'], value_list) - -################################################################################ - -if __name__ == '__main__': - dos.DISABLE_FOR_TESTING = True - unittest.main() diff --git a/hub/mapreduce.yaml b/hub/mapreduce.yaml deleted file mode 100644 index b2b86f9..0000000 --- a/hub/mapreduce.yaml +++ /dev/null @@ -1,44 +0,0 @@ -mapreduce: -- name: Cleanup old EventToDeliver instances - mapper: - input_reader: mapreduce.input_readers.DatastoreInputReader - handler: offline_jobs.CleanupOldEventToDeliver.run - params: - - name: entity_kind - default: main.EventToDeliver - - name: shard_count - default: 32 - - name: processing_rate - default: 100000 - - name: age_days - default: 14 - params_validator: offline_jobs.CleanupOldEventToDeliver.validate_params -- name: Reconfirm expiring subscriptions - mapper: - input_reader: mapreduce.input_readers.DatastoreInputReader - handler: offline_jobs.SubscriptionReconfirmMapper.run - params: - - name: entity_kind - default: main.Subscription - - name: shard_count - default: 32 - - name: processing_rate - default: 100000 - - name: threshold_timestamp - params_validator: offline_jobs.SubscriptionReconfirmMapper.validate_params -- name: Count subscribers by topic and callback pattern - mapper: - input_reader: mapreduce.input_readers.DatastoreInputReader - handler: offline_jobs.CountSubscribers.run - params: - - name: entity_kind - default: main.Subscription - - name: shard_count - default: 128 - - name: processing_rate - default: 1000000 - - name: topic_pattern - default: http(s)?://.* - - name: callback_pattern - default: http(?:s)?://(?:[^\\.]+\\.)*([^\\./]+\.[^\\./]+)(?:/.*)? - params_validator: offline_jobs.CountSubscribers.validate_params diff --git a/hub/offline_jobs.py b/hub/offline_jobs.py deleted file mode 100644 index ab0610d..0000000 --- a/hub/offline_jobs.py +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2010 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Offline cleanup and analysis jobs used with the hub.""" - -import datetime -import logging -import math -import re -import time - -from google.appengine.ext import db - -import main - -from mapreduce import context -from mapreduce import input_readers -from mapreduce import mapreduce_pipeline -from mapreduce import operation as op -from mapreduce import util - - -class CleanupOldEventToDeliver(object): - """Removes EventToDeliver instances older than a certain value.""" - - @staticmethod - def validate_params(params): - assert 'age_days' in params - params['oldest_last_modified'] = ( - time.time() - (86400 * int(params['age_days']))) - - def __init__(self): - self.oldest_last_modified = None - - def run(self, event): - if not self.oldest_last_modified: - params = context.get().mapreduce_spec.mapper.params - self.oldest_last_modified = datetime.datetime.utcfromtimestamp( - params['oldest_last_modified']) - - if event.last_modified < self.oldest_last_modified: - yield op.db.Delete(event) - - -class CountSubscribers(object): - """Mapper counts active subscribers to a feed pattern by domain. - - Args: - topic_pattern: Fully-matching regular expression pattern for topics to - include in the count. - callback_pattern: Full-matching regular expression pattern for callback - URLs, where the first group is used as the aggregation key for counters. - """ - - @staticmethod - def validate_params(params): - topic_pattern = params['topic_pattern'] - assert topic_pattern and re.compile(topic_pattern) - callback_pattern = params['callback_pattern'] - assert callback_pattern and re.compile(callback_pattern) - - def __init__(self): - self.topic_pattern = None - self.callback_pattern = None - - def run(self, subscription): - if self.topic_pattern is None: - params = context.get().mapreduce_spec.mapper.params - self.topic_pattern = re.compile(params['topic_pattern']) - self.callback_pattern = re.compile(params['callback_pattern']) - - if self.topic_pattern.match(subscription.topic): - the_match = self.callback_pattern.match(subscription.callback) - if (the_match and - subscription.subscription_state == main.Subscription.STATE_VERIFIED): - yield op.counters.Increment(the_match.group(1)) - elif the_match: - yield op.counters.Increment('matched but inactive') - - -class SubscriptionReconfirmMapper(object): - """For reconfirming subscriptions that are nearing expiration.""" - - @staticmethod - def validate_params(params): - assert 'threshold_timestamp' in params - - def __init__(self): - self.threshold_timestamp = None - - def run(self, sub): - if sub.subscription_state != main.Subscription.STATE_VERIFIED: - return - - if self.threshold_timestamp is None: - params = context.get().mapreduce_spec.mapper.params - self.threshold_timestamp = datetime.datetime.utcfromtimestamp( - float(params['threshold_timestamp'])) - - if sub.expiration_time < self.threshold_timestamp: - sub.request_insert(sub.callback, sub.topic, sub.verify_token, - sub.secret, auto_reconfirm=True) - - -def count_subscriptions_for_topic(subscription): - """Counts a Subscription instance if it's still active.""" - print subscription.subscription_state - if subscription.subscription_state == main.Subscription.STATE_VERIFIED: - yield (subscription.topic_hash, '1') - - -def save_subscription_counts_for_topic(topic_hash, counts): - """Sums subscriptions to a topic and saves a corresponding KnownFeedStat.""" - total_count = len(counts) - entity = main.KnownFeedStats( - key=main.KnownFeedStats.create_key(topic_hash=topic_hash), - 
subscriber_count=total_count) - yield op.db.Put(entity) - - -def start_count_subscriptions(): - """Kicks off the MapReduce for determining and saving subscription counts.""" - job = mapreduce_pipeline.MapreducePipeline( - 'Count subscriptions', - 'offline_jobs.count_subscriptions_for_topic', - 'offline_jobs.save_subscription_counts_for_topic', - 'mapreduce.input_readers.DatastoreInputReader', - mapper_params=dict(entity_kind='main.Subscription'), - shards=4) - # TODO(bslatkin): Pass through the queue name to run the job on. This is - # a limitation in the mapper library. - job.start() - return job.pipeline_id diff --git a/hub/offline_jobs_test.py b/hub/offline_jobs_test.py deleted file mode 100755 index 98916a8..0000000 --- a/hub/offline_jobs_test.py +++ /dev/null @@ -1,287 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2010 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for the offline_jobs module.""" - -import datetime -import logging -logging.basicConfig(format='%(levelname)-8s %(filename)s] %(message)s') -import re -import time -import unittest - -import testutil -testutil.fix_path() - -from google.appengine.ext import db - -from mapreduce import context -from mapreduce.lib import key_range - -import main -import offline_jobs - -################################################################################ - -Subscription = main.Subscription - - -class SubscriptionReconfirmMapperTest(unittest.TestCase): - """Tests for the SubscriptionReconfirmMapper.""" - - def setUp(self): - """Sets up the test harness.""" - testutil.setup_for_testing() - self.mapper = offline_jobs.SubscriptionReconfirmMapper() - self.callback = 'http://example.com/my-callback-url' - self.topic = 'http://example.com/my-topic-url' - self.token = 'token' - self.secret = 'my secrat' - - self.now = datetime.datetime.utcnow() - self.threshold_seconds = 1000 - self.threshold_timestamp = ( - time.mktime(self.now.utctimetuple()) + self.threshold_seconds) - self.getnow = lambda: self.now - - class FakeMapper(object): - params = {'threshold_timestamp': str(self.threshold_timestamp)} - class FakeSpec(object): - mapreduce_id = '1234' - mapper = FakeMapper() - self.context = context.Context(FakeSpec(), None) - context.Context._set(self.context) - - def get_subscription(self): - """Returns the Subscription used for testing.""" - return Subscription.get_by_key_name( - Subscription.create_key_name(self.callback, self.topic)) - - def testValidateParams(self): - """Tests the validate_params static method.""" - self.assertRaises( - AssertionError, - offline_jobs.SubscriptionReconfirmMapper.validate_params, - {}) - offline_jobs.SubscriptionReconfirmMapper.validate_params( - {'threshold_timestamp': 123}) - - def testIgnoreUnverified(self): - """Tests that unverified subscriptions are skipped.""" - self.assertTrue(Subscription.request_insert( - self.callback, self.topic, self.token, self.secret, - now=self.getnow)) - sub = self.get_subscription() - self.mapper.run(sub) - testutil.get_tasks(main.POLLING_QUEUE, 
expected_count=0) - - def testAfterThreshold(self): - """Tests when a subscription is not yet ready for reconfirmation.""" - self.assertTrue(Subscription.insert( - self.callback, self.topic, self.token, self.secret, - now=self.getnow, lease_seconds=self.threshold_seconds)) - sub = self.get_subscription() - self.mapper.run(sub) - testutil.get_tasks(main.POLLING_QUEUE, expected_count=0) - - def testBeforeThreshold(self): - """Tests when a subscription is ready for reconfirmation.""" - self.assertTrue(Subscription.insert( - self.callback, self.topic, self.token, self.secret, - now=self.getnow, lease_seconds=self.threshold_seconds-1)) - sub = self.get_subscription() - self.mapper.run(sub) - task = testutil.get_tasks(main.POLLING_QUEUE, index=0, expected_count=1) - self.assertEquals('polling', task['headers']['X-AppEngine-QueueName']) - -################################################################################ - -class CountSubscribersTest(unittest.TestCase): - """Tests for the CountSubscribers job.""" - - def setUp(self): - """Sets up the test harness.""" - testutil.setup_for_testing() - self.mapper = offline_jobs.CountSubscribers() - self.callback = 'http://foo.callback-example.com/my-callback-url' - self.topic = 'http://example.com/my-topic-url' - self.token = 'token' - self.secret = 'my secrat' - # Do not make these raw strings on purpose, since they will get - # passed through escaped in the mapreduce.yaml. - self.topic_pattern = '^http://example\\.com/.*$' - self.callback_pattern = ( - 'http(?:s)?://(?:[^\\.]+\\.)*([^\\./]+\.[^\\./]+)(?:/.*)?') - - class FakeMapper(object): - params = { - 'topic_pattern': self.topic_pattern, - 'callback_pattern': self.callback_pattern, - } - class FakeSpec(object): - mapreduce_id = '1234' - mapper = FakeMapper() - self.context = context.Context(FakeSpec(), None) - context.Context._set(self.context) - - def get_subscription(self): - """Returns the Subscription used for testing.""" - self.assertTrue(Subscription.insert( - self.callback, self.topic, self.token, self.secret)) - return Subscription.get_by_key_name( - Subscription.create_key_name(self.callback, self.topic)) - - def testExpressions(self): - """Tests the default expressions we're going to use for callbacks.""" - callback_re = re.compile(self.callback_pattern) - self.assertEquals( - 'blah.com', - callback_re.match('http://foo.blah.com/stuff').group(1)) - self.assertEquals( - 'blah.com', - callback_re.match('http://blah.com/stuff').group(1)) - self.assertEquals( - 'blah.com', - callback_re.match('http://one.two.three.blah.com/stuff').group(1)) - self.assertEquals( - 'blah.com', - callback_re.match('http://no-ending.blah.com').group(1)) - self.assertEquals( - 'example.com', - callback_re.match('https://fun.with.https.example.com/').group(1)) - - def testValidateParams(self): - """Tests the validate_params function.""" - self.assertRaises( - KeyError, - offline_jobs.CountSubscribers.validate_params, - {}) - self.assertRaises( - AssertionError, - offline_jobs.CountSubscribers.validate_params, - {'topic_pattern': ''}) - self.assertRaises( - re.error, - offline_jobs.CountSubscribers.validate_params, - {'topic_pattern': 'this is bad('}) - self.assertRaises( - KeyError, - offline_jobs.CountSubscribers.validate_params, - {'topic_pattern': 'okay'}) - self.assertRaises( - AssertionError, - offline_jobs.CountSubscribers.validate_params, - {'topic_pattern': 'okay', 'callback_pattern': ''}) - self.assertRaises( - re.error, - offline_jobs.CountSubscribers.validate_params, - {'topic_pattern': 'okay', 
'callback_pattern': 'this is bad('}) - offline_jobs.CountSubscribers.validate_params( - {'topic_pattern': 'okay', 'callback_pattern': 'and okay'}) - - def testTopicMatch_CallbackMatch(self): - """Tests when the topic and callbacks match.""" - sub = self.get_subscription() - gen = self.mapper.run(sub) - counter = gen.next() - self.assertEquals('callback-example.com', counter.counter_name) - self.assertEquals(1, counter.delta) - self.assertRaises(StopIteration, gen.next) - - def testTopicMatch_CallbackMatch_Inactive(self): - """Tests when the subscription matches but is inactive.""" - sub = self.get_subscription() - sub.subscription_state = Subscription.STATE_NOT_VERIFIED - sub.put() - gen = self.mapper.run(sub) - counter = gen.next() - self.assertEquals('matched but inactive', counter.counter_name) - self.assertEquals(1, counter.delta) - self.assertRaises(StopIteration, gen.next) - - def testTopicMatch_CallbackNoMatch(self): - """Tests when the topic matches but the callback does not.""" - self.callback = 'some garbage' - sub = self.get_subscription() - gen = self.mapper.run(sub) - self.assertRaises(StopIteration, gen.next) - - def testTopicNoMatch(self): - """Tests when the topic does not match.""" - self.topic = 'http://does-not-match.com' - sub = self.get_subscription() - gen = self.mapper.run(sub) - self.assertRaises(StopIteration, gen.next) - -################################################################################ - -class SaveSubscriptionCountsTest(unittest.TestCase): - """Tests for the MapReduce that saves subscription counts.""" - - def setUp(self): - """Sets up the test harness.""" - testutil.setup_for_testing() - self.callback = 'http://foo.callback-example.com/my-callback-url' - self.topic = 'http://example.com/my-topic-url' - self.token = 'token' - self.secret = 'my secrat' - - def testMap(self): - """Tests the mapper function.""" - self.assertTrue(Subscription.insert( - self.callback, self.topic, self.token, self.secret)) - sub = Subscription.get_by_key_name( - Subscription.create_key_name(self.callback, self.topic)) - - # Active subscription. 
- it = offline_jobs.count_subscriptions_for_topic(sub) - self.assertEquals(('95ff66c343530c88a750cbc7fd1e0bbd8cc7bce2', '1'), - it.next()) - self.assertRaises(StopIteration, it.next) - - # Not active - Subscription.archive(self.callback, self.topic) - sub = db.get(sub.key()) - it = offline_jobs.count_subscriptions_for_topic(sub) - self.assertRaises(StopIteration, it.next) - - def testReduce(self): - """Tests the reducer function.""" - self.assertEquals(0, len(list(main.KnownFeedStats.all()))) - it = offline_jobs.save_subscription_counts_for_topic( - '95ff66c343530c88a750cbc7fd1e0bbd8cc7bce2', - ['1'] * 321) - op = it.next() - self.assertEquals( - db.Key.from_path( - 'KnownFeed', '95ff66c343530c88a750cbc7fd1e0bbd8cc7bce2', - 'KnownFeedStats', 'overall'), - op.entity.key()) - self.assertEquals(321, op.entity.subscriber_count) - self.assertRaises(StopIteration, it.next) - - def testStart(self): - """Tests starting the mapreduce job.""" - job_id = offline_jobs.start_count_subscriptions() - self.assertTrue(job_id is not None) - task = testutil.get_tasks('default', expected_count=1, index=0) - self.assertEquals('/mapreduce/pipeline/run', task['url']) - -################################################################################ - -if __name__ == '__main__': - unittest.main() diff --git a/hub/publish_debug.html b/hub/publish_debug.html deleted file mode 100644 index 6f2b3cf..0000000 --- a/hub/publish_debug.html +++ /dev/null @@ -1,38 +0,0 @@ - - - Hub - Publisher debug - - - - - -

Publish

- -
-
-Note: submission will result in a HTTP 204 response to acknowledge; in browsers this looks like a no-op - -
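For reference, the publish form above sends the standard PubSubHubbub content ping, which can equally be done from code. A minimal sketch in the Python 2 style of this codebase (the hub endpoint and topic URL are placeholders):

import urllib
import urllib2

body = urllib.urlencode({
    'hub.mode': 'publish',
    'hub.url': 'http://publisher.example.com/feed',  # the topic that changed
})
try:
    response = urllib2.urlopen('https://hub.example.com/publish', data=body)
    print response.code  # the hub acknowledges the ping with 204 No Content
except urllib2.HTTPError, e:
    print e.code, e.read()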

Publisher Diagnostics

- -Retrieve details about a topic URL. -
-

-
-Note: This hub ignores publisher notifications about feeds with no -subscribers. If you are trying to test that your publisher works correctly, -you may want to use the Subscription page to subscribe -the example subscriber -app to your feed. - - - diff --git a/hub/queue.yaml b/hub/queue.yaml deleted file mode 100644 index 6f29ad5..0000000 --- a/hub/queue.yaml +++ /dev/null @@ -1,19 +0,0 @@ -queue: -- name: subscriptions - rate: 1/s -- name: polling - rate: 1/s -- name: feed-pulls - rate: 5/s -- name: feed-pulls-retries - rate: 1/s -- name: event-delivery - rate: 5/s -- name: event-delivery-retries - rate: 1/s -- name: mappings - rate: 1/s -- name: mapreduce - rate: 2/s -- name: default - rate: 0/s diff --git a/hub/remote_shell.py b/hub/remote_shell.py deleted file mode 100755 index 2e89b59..0000000 --- a/hub/remote_shell.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2009 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import testutil -testutil.fix_path() - -from main import * - -import code -import getpass -import sys - -from google.appengine.ext.remote_api import remote_api_stub -from google.appengine.ext import db - -def auth_func(): - return raw_input('Username:'), getpass.getpass('Password:') - -if len(sys.argv) > 3: - print "Usage: %s [app_id] [host]" % (sys.argv[0],) - sys.exit(1) - -app_id = 'pubsubhubbub' -if len(sys.argv) >= 2: - app_id = sys.argv[1] - -host = '%s.appspot.com' % app_id -if len(sys.argv) == 3: - host = sys.argv[2] - -remote_api_stub.ConfigureRemoteDatastore(app_id, '/remote_api', auth_func, host) - -code.interact('App Engine interactive console for %s' % app_id, None, locals()) diff --git a/hub/run_tests.sh b/hub/run_tests.sh deleted file mode 100755 index 5a0f4a3..0000000 --- a/hub/run_tests.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -for test_file in $(ls *_test.py ../nonstandard/*_test.py) -do - echo -e "========== Running $test_file" - ./$test_file - if [ "$?" -ne "0" ]; then - echo "Died in $test_file" - exit 1 - fi -done diff --git a/hub/stats_table.html b/hub/stats_table.html deleted file mode 100644 index ef280bf..0000000 --- a/hub/stats_table.html +++ /dev/null @@ -1,33 +0,0 @@ -
{{result.title}}

Sample window: {{result.time_elapsed|floatformat:"0"}} seconds
{% if show_everything %}
  Total samples: {{result.total_samples}}
  Unique keys: {{result.unique_samples}}
  Overall rate: {{result.overall_rate|floatformat:"-4"}}/sec
{% endif %}

{% if result.unique_samples %}
  {{result.key_name}} | Samples | Frequency | Min | Max | Average
  {% for sample in result.sample_objects|dictsortreversed:"frequency"|slice:":40" %}
  {{sample.key|escape}} | {{sample.count}} | {{sample.frequency|floatformat:"-2"}}/sec | {{sample.min|floatformat:"-2"}} | {{sample.max|floatformat:"-2"}} | {{sample.average|floatformat:"-2"}} {{result.value_units}}
  {% endfor %}
{% endif %}
diff --git a/hub/subscribe_debug.html b/hub/subscribe_debug.html deleted file mode 100644 index 8276396..0000000 --- a/hub/subscribe_debug.html +++ /dev/null @@ -1,68 +0,0 @@ - - - Hub - Subscription debug - - - - - -

Subscribe/Unsubscribe

Create a new subscription.

Note: submission will result in an HTTP 204 response to acknowledge; in browsers this looks like a no-op.
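For reference, a sketch of the kind of subscription request this form drives, using the 0.3-era parameter names the rest of this codebase expects (plain Python 2). The callback, topic, and verify token are placeholders, and the /subscribe path is an assumption about the hub's endpoint:

    #!/usr/bin/env python
    """Minimal subscription request -- a sketch of what the form above submits."""

    import urllib
    import urllib2

    SUBSCRIBE_URL = 'http://pubsubhubbub.appspot.com/subscribe'  # assumed hub endpoint

    params = urllib.urlencode({
        'hub.callback': 'http://subscriber.example.com/callback',  # notification target
        'hub.mode': 'subscribe',              # or 'unsubscribe'
        'hub.topic': 'http://example.com/feed.atom',
        'hub.verify': 'async',                # let the hub verify the callback out of band
        'hub.verify_token': 'opaque-token',   # echoed back to the callback during verification
    })
    try:
      # The hub acknowledges with 204 (verified synchronously) or 202 (verifying asynchronously).
      response = urllib2.urlopen(SUBSCRIBE_URL, params)
      print 'Accepted, status:', response.getcode()
    except urllib2.HTTPError, e:
      print 'Rejected, status:', e.code, e.read()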

Subscriber Diagnostics

Retrieve details about a subscription.
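A subscription only becomes active once the hub has verified the callback, so a subscriber being debugged here needs a handler that echoes hub.challenge and then accepts notification POSTs. A minimal sketch, modeled on the load-testing subscriber that appears later in this diff (Python 2, App Engine webapp):

    #!/usr/bin/env python
    """Minimal subscriber callback -- modeled on loadtest/main.py later in this diff."""

    import wsgiref.handlers
    from google.appengine.ext import webapp


    class CallbackHandler(webapp.RequestHandler):

      def get(self):
        # Verification request from the hub: echo the challenge to confirm intent.
        self.response.out.write(self.request.get('hub.challenge'))
        self.response.set_status(200)

      def post(self):
        # Content notification: the updated feed is in the request body.
        # A real subscriber would parse self.request.body here; this sketch just accepts it.
        self.response.set_status(200)


    application = webapp.WSGIApplication([(r'/callback', CallbackHandler)], debug=True)


    def main():
      wsgiref.handlers.CGIHandler().run(application)


    if __name__ == '__main__':
      main()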
- - - diff --git a/hub/testutil.py b/hub/testutil.py deleted file mode 100644 index 2d47047..0000000 --- a/hub/testutil.py +++ /dev/null @@ -1,283 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2009 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Utilities common to all tests.""" - -import StringIO -import base64 -import cgi -import logging -import os -import sys -import tempfile -import unittest -import urllib - - -TEST_APP_ID = 'my-app-id' -TEST_VERSION_ID = 'my-version.1234' - -# Assign the application ID up front here so we can create db.Key instances -# before doing any other test setup. -os.environ['APPLICATION_ID'] = TEST_APP_ID -os.environ['CURRENT_VERSION_ID'] = TEST_VERSION_ID - - -def fix_path(): - """Finds the google_appengine directory and fixes Python imports to use it.""" - all_paths = os.environ.get('PATH').split(os.pathsep) - for path_dir in all_paths: - dev_appserver_path = os.path.join(path_dir, 'dev_appserver.py') - if os.path.exists(dev_appserver_path): - google_appengine = os.path.dirname(os.path.realpath(dev_appserver_path)) - sys.path.append(google_appengine) - # Use the next import will fix up sys.path even further to bring in - # any dependent lib directories that the SDK needs. - dev_appserver = __import__('dev_appserver') - sys.path.extend(dev_appserver.EXTRA_PATHS) - return - - -def setup_for_testing(require_indexes=True): - """Sets up the stubs for testing. - - Args: - require_indexes: True if indexes should be required for all indexes. - """ - from google.appengine.api import apiproxy_stub_map - from google.appengine.api import memcache - from google.appengine.tools import dev_appserver - from google.appengine.tools import dev_appserver_index - import urlfetch_test_stub - before_level = logging.getLogger().getEffectiveLevel() - try: - logging.getLogger().setLevel(100) - root_path = os.path.realpath(os.path.dirname(__file__)) - dev_appserver.SetupStubs( - TEST_APP_ID, - root_path=root_path, - login_url='', - datastore_path=tempfile.mktemp(suffix='datastore_stub'), - history_path=tempfile.mktemp(suffix='datastore_history'), - blobstore_path=tempfile.mktemp(suffix='blobstore_stub'), - require_indexes=require_indexes, - clear_datastore=False) - dev_appserver_index.SetupIndexes(TEST_APP_ID, root_path) - apiproxy_stub_map.apiproxy._APIProxyStubMap__stub_map['urlfetch'] = \ - urlfetch_test_stub.instance - # Actually need to flush, even though we've reallocated. Maybe because the - # memcache stub's cache is at the module level, not the API stub? - memcache.flush_all() - finally: - logging.getLogger().setLevel(before_level) - - -def create_test_request(method, body, *params): - """Creates a webapp.Request object for use in testing. - - Args: - method: Method to use for the test. - body: The body to use for the request; implies that *params is empty. - *params: List of (key, value) tuples to use in the post-body or query - string of the request. - - Returns: - A new webapp.Request object for testing. 
- """ - assert not(body and params), 'Must specify body or params, not both' - from google.appengine.ext import webapp - - if body: - body = StringIO.StringIO(body) - encoded_params = '' - else: - encoded_params = urllib.urlencode(params) - body = StringIO.StringIO() - body.write(encoded_params) - body.seek(0) - - environ = os.environ.copy() - environ.update({ - 'QUERY_STRING': '', - 'wsgi.input': body, - }) - if method.lower() == 'get': - environ['REQUEST_METHOD'] = method.upper() - environ['QUERY_STRING'] = encoded_params - else: - environ['REQUEST_METHOD'] = method.upper() - environ['CONTENT_TYPE'] = 'application/x-www-form-urlencoded' - environ['CONTENT_LENGTH'] = str(len(body.getvalue())) - return webapp.Request(environ) - - -class HandlerTestBase(unittest.TestCase): - """Base-class for webapp.RequestHandler tests.""" - - # Set to the class being tested. - handler_class = None - - def setUp(self): - """Sets up the test harness.""" - setup_for_testing() - - def tearDown(self): - """Tears down the test harness.""" - pass - - def handle(self, method, *params): - """Runs a test of a webapp.RequestHandler. - - Args: - method: The method to invoke for this test. - *params: Passed to testutil.create_test_request - """ - from google.appengine.ext import webapp - before_software = os.environ.get('SERVER_SOFTWARE') - before_auth_domain = os.environ.get('AUTH_DOMAIN') - before_email = os.environ.get('USER_EMAIL') - - os.environ['wsgi.url_scheme'] = 'http' - os.environ['SERVER_NAME'] = 'example.com' - os.environ['SERVER_PORT'] = '' - try: - if not before_software: - os.environ['SERVER_SOFTWARE'] = 'Development/1.0' - if not before_auth_domain: - os.environ['AUTH_DOMAIN'] = 'example.com' - if not before_email: - os.environ['USER_EMAIL'] = '' - self.resp = webapp.Response() - self.req = create_test_request(method, None, *params) - handler = self.handler_class() - handler.initialize(self.req, self.resp) - getattr(handler, method.lower())() - logging.info('%r returned status %d: %s', self.handler_class, - self.response_code(), self.response_body()) - finally: - del os.environ['SERVER_SOFTWARE'] - del os.environ['AUTH_DOMAIN'] - del os.environ['USER_EMAIL'] - - def handle_body(self, method, body): - """Runs a test of a webapp.RequestHandler with a POST body. - - Args: - method: The HTTP method to invoke for this test. - body: The body payload bytes. 
- """ - from google.appengine.ext import webapp - before_software = os.environ.get('SERVER_SOFTWARE') - before_auth_domain = os.environ.get('AUTH_DOMAIN') - before_email = os.environ.get('USER_EMAIL') - - os.environ['wsgi.url_scheme'] = 'http' - os.environ['SERVER_NAME'] = 'example.com' - os.environ['SERVER_PORT'] = '' - try: - if not before_software: - os.environ['SERVER_SOFTWARE'] = 'Development/1.0' - if not before_auth_domain: - os.environ['AUTH_DOMAIN'] = 'example.com' - if not before_email: - os.environ['USER_EMAIL'] = '' - self.resp = webapp.Response() - self.req = create_test_request(method, body) - handler = self.handler_class() - handler.initialize(self.req, self.resp) - getattr(handler, method.lower())() - logging.info('%r returned status %d: %s', self.handler_class, - self.response_code(), self.response_body()) - finally: - del os.environ['SERVER_SOFTWARE'] - del os.environ['AUTH_DOMAIN'] - del os.environ['USER_EMAIL'] - - def response_body(self): - """Returns the response body after the request is handled.""" - return self.resp.out.getvalue() - - def response_code(self): - """Returns the response code after the request is handled.""" - return self.resp._Response__status[0] - - def response_headers(self): - """Returns the response headers after the request is handled.""" - return self.resp.headers - - -def get_tasks(queue_name, index=None, expected_count=None, usec_eta=False): - """Retrieves Tasks from the supplied named queue. - - Args: - queue_name: The queue to access. - index: Index of the task (ordered by ETA) to retrieve from the queue. - expected_count: If not None, the number of tasks expected to be in the - queue. This function will raise an AssertionError exception if there are - more or fewer tasks. - usec_eta: If ETAs should be formatted as microseconds since the UNIX epoch. - When False, the ETA will be rendered as a string. - - Returns: - List of dictionaries corresponding to each task, with the keys: 'name', - 'url', 'method', 'eta', 'body', 'headers', 'params'. The 'params' - value will only be present if the body's Content-Type header is - 'application/x-www-form-urlencoded'. - """ - from google.appengine.api import apiproxy_stub_map - stub = apiproxy_stub_map.apiproxy.GetStub('taskqueue') - - # Gross hack to modify the stub's module-level function to pass through ETAs. - if usec_eta: - stub_globals = stub.GetTasks.func_globals - old_format = stub_globals['_FormatEta'] - # TODO: Taskqueue stub should have more resolution! This will only be - # accurate to the nearest whole second. - stub_globals['_FormatEta'] = lambda x: x - try: - tasks = stub.GetTasks(queue_name) - finally: - if usec_eta: - stub_globals['_FormatEta'] = old_format - - if expected_count is not None: - assert len(tasks) == expected_count, 'found %s == %s' % ( - len(tasks), expected_count) - for task in tasks: - del task['eta_delta'] - task['body'] = base64.b64decode(task['body']) - # Convert headers list into a dictionary-- we don't care about repeats - task['headers'] = dict(task['headers']) - if ('application/x-www-form-urlencoded' in - task['headers'].get('content-type', '')): - task['params'] = dict(cgi.parse_qsl(task['body'], True)) - if index is not None: - return tasks[index] - else: - return tasks - - -def task_eta(eta): - """Converts a datetime.datetime into a taskqueue ETA. - - Args: - eta: Naive datetime.datetime of the task's ETA. - - Returns: - The ETA formatted as a string. 
- """ - return eta.strftime('%Y/%m/%d %H:%M:%S') diff --git a/hub/topic_details.html b/hub/topic_details.html deleted file mode 100644 index 2152bc1..0000000 --- a/hub/topic_details.html +++ /dev/null @@ -1,88 +0,0 @@ - - - Hub - Topic Details - {{topic_url|escape}} - - - - - -

Topic Details - {{topic_url|escape}}

- -{% if error %} -
{{error|escape}}
-{% else %} - - {% if next_fetch %} - - - - - - - - - - - - - {% endif %} - - - - - - - - - - - - - - - - - {% if subscriber_count %} - - - - - - - - - {% endif %} - - - - - - - - -
Next fetch time (UTC):{{next_fetch|date:"Y-m-d\TH:i:s\Z"}}
Current fetch attempts:{{fetch_attempts}}
All attempts failed{{totally_failed}}
Last successful fetch (UTC):{{last_successful_fetch|date:"Y-m-d\TH:i:s\Z"}}
Last Content-Type:{{last_content_type|escape}}
Last ETag:{{last_etag|escape}}
Last Modified:{{last_modified|escape}}
Active subscribers:{{subscriber_count}}
Last count (UTC):{{feed_stats_update_time|date:"Y-m-d\TH:i:s\Z"}}
Fetch from domain: - {% if fetch_blocked %} - BLOCKED - {% else %} - OK - {% endif %} -
Fetch short-term:{{fetch_errors|floatformat:"-2"}}% errors for domain
- -

Error rate statistics

-{% for result in fetch_url_error %} - {% include "stats_table.html" %} -{% endfor %} - -

Latency statistics

-{% for result in fetch_url_latency %} - {% include "stats_table.html" %} -{% endfor %} - -

Last feed envelope retrieved:

-
-{{last_header_footer|escape}}
-
-{% endif %} - - - diff --git a/hub/urlfetch_async.py b/hub/urlfetch_async.py deleted file mode 100644 index 6e83bbd..0000000 --- a/hub/urlfetch_async.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2008 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from google.appengine.api import urlfetch -from google.appengine.api import apiproxy_stub_map -from google.appengine.api import urlfetch_service_pb -from google.appengine.runtime import apiproxy_errors - - -def fetch(url, payload=None, method=urlfetch.GET, headers={}, - allow_truncated=False, follow_redirects=True, - callback=None, async_proxy=None, - deadline=5): - """Fetches the given HTTP URL, blocking until the result is returned. - - Other optional parameters are: - method: GET, POST, HEAD, PUT, or DELETE - payload: POST or PUT payload (implies method is not GET, HEAD, or DELETE) - headers: dictionary of HTTP headers to send with the request - allow_truncated: if true, truncate large responses and return them without - error. otherwise, ResponseTooLargeError will be thrown when a response is - truncated. - follow_redirects: Whether or not redirects should be followed. - callback: Callable that takes (_URLFetchResult, URLFetchException). - Exactly one of the two arguments is None. Required if async_proxy is - not None. - async_proxy: If not None, instance of AsyncAPIProxy to use for executing - asynchronous API calls. - deadline: How long to allow the request to wait, in seconds. Defaults - to 5 seconds. - - We use a HTTP/1.1 compliant proxy to fetch the result. - - The returned data structure has the following fields: - content: string containing the response from the server - status_code: HTTP status code returned by the server - headers: dictionary of headers returned by the server - - If the URL is an empty string or obviously invalid, we throw an - urlfetch.InvalidURLError. If the server cannot be contacted, we throw a - urlfetch.DownloadError. Note that HTTP errors are returned as a part - of the returned structure, so HTTP errors like 404 do not result in an - exception. - """ - request = urlfetch_service_pb.URLFetchRequest() - response = urlfetch_service_pb.URLFetchResponse() - request.set_url(url) - - if isinstance(method, basestring): - method = method.upper() - method = urlfetch._URL_STRING_MAP.get(method, method) - if method not in urlfetch._VALID_METHODS: - raise InvalidMethodError('Invalid method %s.' 
% str(method)) - if method == urlfetch.GET: - request.set_method(urlfetch_service_pb.URLFetchRequest.GET) - elif method == urlfetch.POST: - request.set_method(urlfetch_service_pb.URLFetchRequest.POST) - elif method == urlfetch.HEAD: - request.set_method(urlfetch_service_pb.URLFetchRequest.HEAD) - elif method == urlfetch.PUT: - request.set_method(urlfetch_service_pb.URLFetchRequest.PUT) - elif method == urlfetch.DELETE: - request.set_method(urlfetch_service_pb.URLFetchRequest.DELETE) - - request.set_followredirects(follow_redirects) - - if payload and (method == urlfetch.POST or method == urlfetch.PUT): - request.set_payload(payload) - - for key, value in headers.iteritems(): - header_proto = request.add_header() - header_proto.set_key(key) - header_proto.set_value(value) - - if async_proxy: - def completion_callback(response, urlfetch_exception): - result, user_exception = HandleResult(response, urlfetch_exception, - allow_truncated) - callback(result, user_exception) - async_proxy.start_call('urlfetch', 'Fetch', request, response, - completion_callback, deadline=deadline) - return - - user_exception = None - try: - apiproxy_stub_map.MakeSyncCall('urlfetch', 'Fetch', request, response) - except apiproxy_errors.ApplicationError, e: - user_exception = e - - result, user_exception = HandleResult( - response, user_exception, allow_truncated) - if user_exception: - raise user_exception - else: - return result - - -def HandleResult(response, urlfetch_exception, allow_truncated): - """Returns (result, user_exception) to return from a fetch() call.""" - result = None - user_exception = None - - if urlfetch_exception: - user_exception = urlfetch_exception - if hasattr(urlfetch_exception, 'application_error'): - if (urlfetch_exception.application_error == - urlfetch_service_pb.URLFetchServiceError.INVALID_URL): - user_exception = urlfetch.InvalidURLError(str(urlfetch_exception)) - elif (urlfetch_exception.application_error == - urlfetch_service_pb.URLFetchServiceError.UNSPECIFIED_ERROR): - user_exception = urlfetch.DownloadError(str(urlfetch_exception)) - elif (urlfetch_exception.application_error == - urlfetch_service_pb.URLFetchServiceError.FETCH_ERROR): - user_exception = urlfetch.DownloadError(str(urlfetch_exception)) - elif (urlfetch_exception.application_error == - urlfetch_service_pb.URLFetchServiceError.RESPONSE_TOO_LARGE): - user_exception = urlfetch.ResponseTooLargeError(None) - else: - result = urlfetch._URLFetchResult(response) - if not allow_truncated and response.contentwastruncated(): - user_exception = urlfetch.ResponseTooLargeError(result) - - return result, user_exception diff --git a/hub/urlfetch_test_stub.py b/hub/urlfetch_test_stub.py deleted file mode 100644 index ea04993..0000000 --- a/hub/urlfetch_test_stub.py +++ /dev/null @@ -1,139 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2009 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""URLFetchServiceStub implementation that returns mock values.""" - -import logging - -from google.appengine import runtime -from google.appengine.api import apiproxy_stub -from google.appengine.api import urlfetch_service_pb -from google.appengine.api import urlfetch_stub -from google.appengine.runtime import apiproxy_errors - - -class URLFetchServiceTestStub(urlfetch_stub.URLFetchServiceStub): - """Enables tests to mock calls to the URLFetch service and test inputs.""" - - def __init__(self): - """Initializer.""" - super(URLFetchServiceTestStub, self).__init__() - # Maps (method, url) keys to (request_payload, request_headers, - # response_code, response_data, response_headers, error_instance) - self._expectations = {} - - def clear(self): - """Clears all expectations on this stub.""" - self._expectations.clear() - - def expect(self, method, url, response_code, response_data, - response_headers=None, request_payload='', request_headers=None, - urlfetch_error=False, apiproxy_error=False, deadline_error=False, - urlfetch_size_error=False): - """Expects a certain request and response. - - Overrides any existing expectations for this stub. - - Args: - method: The expected method. - url: The expected URL to access. - response_code: The expected response code. - response_data: The expected response data. - response_headers: Headers to serve back, if any. - request_payload: The expected request payload, if any. - request_headers: Any expected request headers. - urlfetch_size_error: Set to True if this call should raise - a urlfetch_errors.ResponseTooLargeError - urlfetch_error: Set to True if this call should raise a - urlfetch_errors.Error exception when made. - apiproxy_error: Set to True if this call should raise an - apiproxy_errors.Error exception when made. - deadline_error: Set to True if this call should raise a - google.appengine.runtime.DeadlineExceededError error. - """ - error_instance = None - if urlfetch_error: - error_instance = apiproxy_errors.ApplicationError( - urlfetch_service_pb.URLFetchServiceError.FETCH_ERROR, 'mock error') - elif urlfetch_size_error: - error_instance = apiproxy_errors.ApplicationError( - urlfetch_service_pb.URLFetchServiceError.RESPONSE_TOO_LARGE, - 'mock error') - elif apiproxy_error: - error_instance = apiproxy_errors.OverQuotaError() - elif deadline_error: - error_instance = runtime.DeadlineExceededError() - - self._expectations[(method.lower(), url)] = ( - request_payload, request_headers, response_code, - response_data, response_headers, error_instance) - - def verify_and_reset(self): - """Verify that all expectations have been met and clear any remaining.""" - old_expectations = self._expectations - self._expectations = {} - if old_expectations: - assert False, '%d expectations remain: %r' % ( - len(old_expectations), old_expectations) - - def _RetrieveURL(self, url, payload, method, headers, request, - response, follow_redirects=True, deadline=None, - validate_certificate=False): - """Test implementation of retrieving a URL. - - Args: - All override super-class's parameters. 
- """ - header_dict = dict((h.key(), h.value()) for h in headers) - header_text = None - if headers: - header_text = ', '.join( - '%r="%r"' % (k, v) for (k, v) in header_dict.iteritems()) - logging.info('Received URLFetch request:\n%s %r\nHeaders: %r\nPayload: %r', - method, url, header_text, payload) - - key = (method.lower(), url) - try: - expected = self._expectations.pop(key) - except: - assert False, 'Did not expect: %s %s' % key - - (request_payload, request_headers, response_code, - response_data, response_headers, error_instance) = expected - - if request_payload: - assert payload == request_payload, ( - 'Request payload: "%s" did not match expected: "%s"' % - (request_payload, payload)) - if request_headers: - for key, expected in request_headers.iteritems(): - found = header_dict.get(key) - assert found == expected, ('Value for request header %s was ' - '"%s", expected "%s"' % (key, found, expected)) - if error_instance is not None: - raise error_instance - - response.set_statuscode(response_code) - response.set_content(response_data) - if response_headers: - for key, value in response_headers.iteritems(): - header = response.add_header() - header.set_key(key) - header.set_value(value) - - -instance = URLFetchServiceTestStub() diff --git a/hub/welcome.html b/hub/welcome.html deleted file mode 100644 index 358bb83..0000000 --- a/hub/welcome.html +++ /dev/null @@ -1,56 +0,0 @@ - - - Hub - PubSubHubbub - - - - -

Welcome to the PubSubHubbub reference Hub server!

PubSubHubbub is a simple, open, web-hook-based pubsub (publish/subscribe) protocol.

Decentralized and free. Anybody can run a PubSubHubbub server, or simply use an open one.

PubSubHubbub is just a protocol, not a service, but we're running this as an open test server for anybody to use to help bootstrap the protocol. Feel free to publish to or subscribe from this hub. You can migrate in the future when you want to run your own hub server, or you can just keep using this one.

See the project for details. This presentation has an overview:

Hub debug

From here you can:
  • Subscribe to a feed or debug your subscriber (with stats!).
  • Publish a feed or debug your published feeds.
These legal disclaimers are here because this Hub is run by Google as a service. If you don't want to agree to these terms, you should just run your own hub. The PubSubHubbub protocol is decentralized and free.

©2010 Google - Terms of Service - Privacy Policy - Powered by Google App Engine
- - - diff --git a/index.html b/index.html new file mode 100644 index 0000000..7b90f5f --- /dev/null +++ b/index.html @@ -0,0 +1,5 @@ + + +

Click here for the latest version of the PubSubHubbub spec.

diff --git a/loadtest/app.yaml b/loadtest/app.yaml deleted file mode 100644 index 3513f02..0000000 --- a/loadtest/app.yaml +++ /dev/null @@ -1,8 +0,0 @@ -application: pubsubhubbub-loadtest -version: 1 -runtime: python -api_version: 1 - -handlers: -- url: .* - script: main.py diff --git a/loadtest/atom.xml b/loadtest/atom.xml deleted file mode 100644 index d9b9328..0000000 --- a/loadtest/atom.xml +++ /dev/null @@ -1,24 +0,0 @@ - - - Random test blog - - - {% now "Y-m-d\TH:i:s\Z" %} - {{self_url}} - - Foobar - - - {% for random_id in all_ids %} - - Random item #{{random_id}} - {{random_id}} - {% now "Y-m-d\TH:i:s\Z" %} - {% now "Y-m-d\TH:i:s\Z" %} - - This is the content for random item #{{random_id}} - - - {% endfor %} - - diff --git a/loadtest/index.yaml b/loadtest/index.yaml deleted file mode 100644 index a3b9e05..0000000 --- a/loadtest/index.yaml +++ /dev/null @@ -1,11 +0,0 @@ -indexes: - -# AUTOGENERATED - -# This index.yaml is automatically updated whenever the dev_appserver -# detects that a new type of query is run. If you want to manage the -# index.yaml file manually, remove the above marker line (the line -# saying "# AUTOGENERATED"). If you want to manage some indexes -# manually, move them above the marker line. The index.yaml file is -# automatically uploaded to the admin console when you next deploy -# your application using appcfg.py. diff --git a/loadtest/main.py b/loadtest/main.py deleted file mode 100755 index 24c3319..0000000 --- a/loadtest/main.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2009 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Loadtest script that provides a dummy publisher and subscriber. - -Every time a feed is requested from this loadtest publisher, it will produce a -new, random Atom payload. The loadtest app also provides a subscriber interface -that will accept any subscription. All notification messages will be thrown -away; it only keeps a count. 
-""" - -import random -import wsgiref.handlers -from google.appengine.ext import webapp -from google.appengine.ext.webapp import template - - -class FeedHandler(webapp.RequestHandler): - - def get(self, name): - self.response.headers['Content-Type'] = 'application/xml+atom' - self.response.out.write(template.render('atom.xml', { - 'self_url': self.request.url, - 'all_ids': random.sample(xrange(10**9), 25), - })) - - -class SubscriberHandler(webapp.RequestHandler): - - def get(self, name): - self.response.out.write(self.request.get('hub.challenge')) - self.response.set_status(200) - - def post(self, name): - pass - - -application = webapp.WSGIApplication([ - (r'/feed/(.*)', FeedHandler), - (r'/subscriber/(.*)', SubscriberHandler), -], debug=True) - - -def main(): - wsgiref.handlers.CGIHandler().run(application) - - -if __name__ == '__main__': - main() diff --git a/media/light_pings_bandwidth.png b/media/light_pings_bandwidth.png deleted file mode 100644 index 75c62e4..0000000 Binary files a/media/light_pings_bandwidth.png and /dev/null differ diff --git a/media/light_pings_cache_sync.png b/media/light_pings_cache_sync.png deleted file mode 100644 index 113ef99..0000000 Binary files a/media/light_pings_cache_sync.png and /dev/null differ diff --git a/media/light_pings_combined_publisher.png b/media/light_pings_combined_publisher.png deleted file mode 100644 index 230bae9..0000000 Binary files a/media/light_pings_combined_publisher.png and /dev/null differ diff --git a/media/light_pings_cpu.png b/media/light_pings_cpu.png deleted file mode 100644 index 7485ae1..0000000 Binary files a/media/light_pings_cpu.png and /dev/null differ diff --git a/media/light_pings_naive_pings.png b/media/light_pings_naive_pings.png deleted file mode 100644 index 381fc2f..0000000 Binary files a/media/light_pings_naive_pings.png and /dev/null differ diff --git a/media/light_pings_proxy.png b/media/light_pings_proxy.png deleted file mode 100644 index fd8bc5a..0000000 Binary files a/media/light_pings_proxy.png and /dev/null differ diff --git a/media/light_pings_proxy_bandwidth.png b/media/light_pings_proxy_bandwidth.png deleted file mode 100644 index 2f226bc..0000000 Binary files a/media/light_pings_proxy_bandwidth.png and /dev/null differ diff --git a/media/light_pings_proxy_bandwidth_slow.png b/media/light_pings_proxy_bandwidth_slow.png deleted file mode 100644 index 18e4afd..0000000 Binary files a/media/light_pings_proxy_bandwidth_slow.png and /dev/null differ diff --git a/media/light_pings_proxy_cpu.png b/media/light_pings_proxy_cpu.png deleted file mode 100644 index 03e9ff7..0000000 Binary files a/media/light_pings_proxy_cpu.png and /dev/null differ diff --git a/media/light_pings_proxy_cpu_slow.png b/media/light_pings_proxy_cpu_slow.png deleted file mode 100644 index ecfc831..0000000 Binary files a/media/light_pings_proxy_cpu_slow.png and /dev/null differ diff --git a/media/pshb_bandwidth.png b/media/pshb_bandwidth.png deleted file mode 100644 index b0e9bcd..0000000 Binary files a/media/pshb_bandwidth.png and /dev/null differ diff --git a/media/pshb_cache_sync.png b/media/pshb_cache_sync.png deleted file mode 100644 index 91b28fb..0000000 Binary files a/media/pshb_cache_sync.png and /dev/null differ diff --git a/media/pshb_combined_publisher.png b/media/pshb_combined_publisher.png deleted file mode 100644 index 14dbb3c..0000000 Binary files a/media/pshb_combined_publisher.png and /dev/null differ diff --git a/media/pshb_naive_cpu.png b/media/pshb_naive_cpu.png deleted file mode 100644 index 0551418..0000000 
Binary files a/media/pshb_naive_cpu.png and /dev/null differ diff --git a/media/pshb_naive_pings.png b/media/pshb_naive_pings.png deleted file mode 100644 index eec5fb1..0000000 Binary files a/media/pshb_naive_pings.png and /dev/null differ diff --git a/nonstandard/fat_publish.py b/nonstandard/fat_publish.py deleted file mode 100644 index e623fdc..0000000 --- a/nonstandard/fat_publish.py +++ /dev/null @@ -1,142 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2009 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Fat publish receiver for working around caching issues with the demo hub. - -This isn't part of the PubSubHubbub spec. We have no intentions of making it to -be part of the spec. This extension hook is useful for publishers who want to -try out the PubSubHubbub protocol without building or running their own hub at -first. Receiving fat publish events directly from publishers allows the Hub to -work around any replication/caching delays that are described in this wiki entry -(see the section on multiple datacenters): - - http://code.google.com/p/pubsubhubbub/wiki/PublisherEfficiency - -To enable this module, symlink this 'fat_publish.py' file into the 'hooks' -directory of the hub application; then put your shared secret into the -'fat_publish_secret.txt' file in the 'hooks' directory. - -To use this module as a client, send POST requests with the parameters -'topic', 'content', and 'signature'. They should look like this: - - POST /fatping HTTP/1.1 - Content-Type: application/x-www-form-urlencoded - Content-Length: ... - - topic=http%3A%2F%2Fexample.com%2Fmytopic&\ - content=&\ - signature= -""" - -import logging - -from google.appengine.ext import webapp - - -SECRET_FILE = 'hooks/fat_publish_secret.txt' - - -# Define the Hook class for testing. -if 'register' not in globals(): - class Hook(object): - pass - - -class FatPublishHandler(webapp.RequestHandler): - """Request handler for receiving fat publishes. - - Returns 204 on success, 403 on auth errors, 400 on bad feeds, 500 on - any other type of error. - """ - - secret = None - - def post(self): - topic = self.request.get('topic') - content = self.request.get('content') - signature = self.request.get('signature') - - if not (topic and content and signature): - error_message = ( - 'Fat publish must have required parameters in urlencoded format: ' - '"topic", "content", "signature"') - logging.error(error_message) - self.response.set_status(400) - self.response.out.write(error_message) - return - - logging.debug('Fat publish for topic=%s, signature=%s, size=%s', - topic, signature, len(content)) - - if not Subscription.has_subscribers(topic): - logging.debug('Ignoring fat publish because there are no subscribers.') - self.response.set_status(204) - return - - logging.info('Subscribers found. Accepting fat publish event.') - expected_signature = sha1_hmac(self.secret, content + topic) - if expected_signature != signature: - error_message = ( - 'Received fat publish with invalid signature. 
' - 'expected=%s, found=%s' % (expected_signature, signature)) - logging.error(error_message) - self.response.set_status(403) - self.response.out.write(error_message) - return - - feed_record = FeedRecord.get_or_create(topic) - if parse_feed(feed_record, self.request.headers, content): - self.response.set_status(204) - else: - self.response.out.write('Could not parse or save feed updates.') - self.response.set_status(400) - - -def create_handler(shared_secret): - """Creates a FatPublishHandler sub-class with a particular shared secret. - - Args: - shared_secret: Used to verify the authenticity of fat publishes. - """ - class SpecificFatPublishHandler(FatPublishHandler): - secret = shared_secret - return SpecificFatPublishHandler - - -class FatPublishHook(Hook): - """Hook for accepting fat publishes from publishers.""" - - def __init__(self, handler): - """Initializer. - - Args: - handler: FatPublishHandler class to add for fatpinging. - """ - self.handler = handler - - def inspect(self, args, kwargs): - """Adds the FatPublishHandler to the list of request handlers.""" - args[0].append((r'/fatping', self.handler)) - return False - - -if 'register' in globals(): - # You can re-register this same hook here with different shared secrets if - # you would like to allow other publishing endpoints to do the same thing - # with separate access controls. - register(modify_handlers, FatPublishHook( - create_handler(open(SECRET_FILE).read()))) diff --git a/nonstandard/fat_publish_test.py b/nonstandard/fat_publish_test.py deleted file mode 100755 index 220455a..0000000 --- a/nonstandard/fat_publish_test.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2009 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for the fat_publish module.""" - -import logging -logging.basicConfig(format='%(levelname)-8s %(filename)s] %(message)s') -import os -import sys -import unittest - -# Run these tests from the 'hub' directory. -sys.path.insert(0, os.getcwd()) -os.chdir('../hub') -sys.path.insert(0, os.getcwd()) - -import testutil -testutil.fix_path() - -from google.appengine.ext import webapp - -import main -import fat_publish - -################################################################################ - -# Do aliasing that would happen anyways during Hook module loading. -fat_publish.sha1_hmac = main.sha1_hmac -fat_publish.FeedRecord = main.FeedRecord -fat_publish.Subscription = main.Subscription - - -class FatPingHandlerTest(testutil.HandlerTestBase): - """Tests for the FatPingHandler class.""" - - secret = 'thisismysecret' - handler_class = fat_publish.create_handler(secret) - - def setUp(self): - """Sets up the test harness.""" - testutil.HandlerTestBase.setUp(self) - self.expected_headers = { - 'Content-Length': '-1', # Used by WebOb for urlencoded POSTs. 
- 'Content-Type': 'application/x-www-form-urlencoded', - } - self.topic = 'http://example.com/mytopic' - self.fakefeed = 'my fake feed' - self.fakefeed_signature = '5f9418a2e221ced6a0bc1263aaebcce297438740' - self.success = False - - self.feed_record = main.FeedRecord.get_or_create(self.topic) - main.Subscription.insert('callback', self.topic, 'token', 'secret') - - def parse_feed_mock(record, headers, body): - self.assertEquals(self.feed_record.topic, record.topic) - self.assertEquals(self.expected_headers, headers) - self.assertEquals(self.fakefeed, body) - return self.success - - fat_publish.parse_feed = parse_feed_mock - - def testNoSubscribers(self): - """Tests when there are no subscribers.""" - self.success = False - main.Subscription.remove('callback', self.topic) - self.handle('post', - ('topic', self.topic), - ('content', self.fakefeed), - ('signature', self.fakefeed_signature)) - self.assertEquals(204, self.response_code()) - - def testSuccessfulParsing(self): - """Tests when parsing is successful.""" - self.success = True - self.handle('post', - ('topic', self.topic), - ('content', self.fakefeed), - ('signature', self.fakefeed_signature)) - self.assertEquals(204, self.response_code()) - - def testHeaders(self): - """Tests that request headers are preserved.""" - HEADER = 'HTTP_MY_HEADER' - self.expected_headers['My-Header'] = os.environ[HEADER] = 'cheese' - try: - self.success = True - self.expected_headers - self.handle('post', - ('topic', self.topic), - ('content', self.fakefeed), - ('signature', self.fakefeed_signature)) - self.assertEquals(204, self.response_code()) - finally: - del os.environ[HEADER] - - def testParseFails(self): - """Tests when parsing fails.""" - self.success = False - self.handle('post', - ('topic', self.topic), - ('content', self.fakefeed), - ('signature', self.fakefeed_signature)) - self.assertEquals(400, self.response_code()) - - def testBadSignature(self): - """Tests when the signature is present but invalid.""" - self.success = True - self.handle('post', - ('topic', self.topic), - ('content', self.fakefeed), - ('signature', 'bad')) - self.assertEquals(403, self.response_code()) - - def testMissingParams(self): - """Tests when parameters are missing.""" - self.success = True - # No signature. - self.handle('post', - ('topic', self.topic), - ('content', self.fakefeed)) - self.assertEquals(400, self.response_code()) - - # No topic. - self.handle('post', - ('content', self.fakefeed), - ('signature', self.fakefeed_signature)) - self.assertEquals(400, self.response_code()) - - # No content. - self.handle('post', - ('topic', self.topic), - ('signature', self.fakefeed_signature)) - self.assertEquals(400, self.response_code()) - - -class FatPublishHookTest(unittest.TestCase): - """Tests for the FatPublishHook class.""" - - def testCreateHook(self): - """Tests creating a hook.""" - fat_handler = object() - hook = fat_publish.FatPublishHook(fat_handler) - handlers = [object(), object()] - original_handlers = list(handlers) - self.assertFalse(hook.inspect((handlers,), {})) - self.assertEquals(original_handlers + [(r'/fatping', fat_handler)], - handlers) - -################################################################################ - -if __name__ == '__main__': - unittest.main() diff --git a/nonstandard/virtual_feed.py b/nonstandard/virtual_feed.py deleted file mode 100644 index fc085ea..0000000 --- a/nonstandard/virtual_feed.py +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2010 Google Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Virtual feed receiver for publishing aggregate feeds using fat pings. - -This isn't part of the PubSubHubbub spec. We have no intentions of making it to -be part of the spec. This extension hook is useful for creating a virtual -feed (e.g., a firehose feed) using an aggregation of other feeds that are -fat pinged to the hub. - -It's up to your code to connect your fat-ping request handler to the -'inject_virtual_feed' method (see fat_publish.py for one way to do it). That -function will parse your feed, extract the entries, and enqueue the virtual -feed update. Multiple fat pings delivered in this manner will be collated -together by their virtual feed topics into a combined payload, which is then -injected into the reference hub's event delivery pipeline. This collation is -useful because it controls how many HTTP requests will be sent to subscribers -to this virtual feed, and lets you make the tradeoff between delivery latency -and request overhead. -""" - -import logging - -from google.appengine.ext import db -from google.appengine.ext import webapp - -import fork_join_queue - -################################################################################ -# Constants - -VIRTUAL_FEED_QUEUE = 'virtual-feeds' - -################################################################################ - -# Define these symbols for testing. -if 'register' not in globals(): - class Hook(object): - pass - def work_queue_only(func): - return func - sha1_hash = None - - -class FeedFragment(db.Model): - """Represents a fragment of a virtual feed that will be collated. - - The Key and key_name are not used. - - Fields: - topic: The topic of the virtual feed being collated. - header_footer: The feed envelope. - entries: The ... text segments that were parsed from the - source feeds, already joined together with newlines. - format: 'rss' or 'atom'. - """ - topic = db.TextProperty() - header_footer = db.TextProperty() - entries = db.TextProperty() - format = db.TextProperty() - - -VIRTUAL_FEED_QUEUE = fork_join_queue.MemcacheForkJoinQueue( - FeedFragment, - None, - '/work/virtual_feeds', - VIRTUAL_FEED_QUEUE, - batch_size=20, - batch_period_ms=1000, - lock_timeout_ms=1000, - sync_timeout_ms=250, - stall_timeout_ms=30000, - acquire_timeout_ms=10, - acquire_attempts=50, - shard_count=1, - expiration_seconds=60) # Give up on fragments after 60 seconds. - - -def inject_virtual_feed(topic, format, header_footer, entries_map): - """Injects a virtual feed update to be collated and then delievered. - - Args: - topic: The topic URL for the virtual feed. - format: The format of the virtual feed ('rss' or 'atom'). - header_footer: The feed envelope to use for the whole virtual feed. - entries_map: Dictionary mapping feed entry IDs to strings containing - full entry payloads (e.g., from to including the tags). - - Raises: - MemcacheError if the virtual feed could not be injected. 
- """ - fragment = FeedFragment( - key=db.Key.from_path(FeedFragment.kind(), 'unused'), - topic=topic, - header_footer=header_footer, - entries='\n'.join(entries_map.values()), - format=format) - - # Update the name of the queue to include a hash of the topic URL. This - # allows us to use a single VIRTUAL_FEED_QUEUE instance to represent a - # different logical queue for each virtual feed topic we would like to - # collate. - VIRTUAL_FEED_QUEUE.name = 'fjq-%s-%s-' % ( - FeedFragment.kind(), sha1_hash(topic)) - work_index = VIRTUAL_FEED_QUEUE.next_index() - try: - VIRTUAL_FEED_QUEUE.put(work_index, [fragment]) - finally: - VIRTUAL_FEED_QUEUE.add(work_index) - - -class CollateFeedHandler(webapp.RequestHandler): - """Worker handler that collates virtual feed updates and enqueues them.""" - - @work_queue_only - def post(self): - # Restore the pseudo-name of the queue so we can properly pop tasks - # from it for the specific virtual feed this task is targetting. - task_name = self.request.headers['X-AppEngine-TaskName'] - VIRTUAL_FEED_QUEUE.name, rest = task_name.split('--') - VIRTUAL_FEED_QUEUE.name += '-' - - fragment_list = VIRTUAL_FEED_QUEUE.pop_request(self.request) - if not fragment_list: - logging.warning('Pop of virtual feed task %r found no fragments.', - task_name) - return - - fragment = fragment_list[0] - entry_payloads = [f.entries for f in fragment_list] - - def txn(): - event_to_deliver = EventToDeliver.create_event_for_topic( - fragment.topic, - fragment.format, - self.request.headers.get('Content-Type', 'application/atom+xml'), - fragment.header_footer, - entry_payloads, - set_parent=False, - max_failures=1) - db.put(event_to_deliver) - event_to_deliver.enqueue() - - db.run_in_transaction(txn) - logging.debug('Injected %d fragments for virtual topic %r', - len(fragment_list), fragment.topic) - - -class VirtualFeedHook(Hook): - """Adds the CollateFeedHandler to the list of request handlers.""" - - def inspect(self, args, kwargs): - args[0].append((r'/work/virtual_feeds', CollateFeedHandler)) - return False - - -if 'register' in globals(): - register(modify_handlers, VirtualFeedHook()) diff --git a/nonstandard/virtual_feed_test.py b/nonstandard/virtual_feed_test.py deleted file mode 100755 index 14f560e..0000000 --- a/nonstandard/virtual_feed_test.py +++ /dev/null @@ -1,216 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2010 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for the virtual_feed module.""" - -import logging -logging.basicConfig(format='%(levelname)-8s %(filename)s] %(message)s') -import os -import sys -import unittest - -# Run these tests from the 'hub' directory. 
-sys.path.insert(0, os.getcwd()) -os.chdir('../hub') -sys.path.insert(0, os.getcwd()) - -import testutil -testutil.fix_path() - -from google.appengine.ext import webapp - -import main -import virtual_feed - -################################################################################ - -# Do aliasing that would happen anyways during Hook module loading.= -virtual_feed.sha1_hash = main.sha1_hash -virtual_feed.EventToDeliver = main.EventToDeliver - - -class InjectVirtualFeedTest(testutil.HandlerTestBase): - """Tests for the inject_virtual_feed function.""" - - def setUp(self): - """Sets up the test harness.""" - testutil.setup_for_testing() - self.topic = 'http://example.com/my-topic/1' - self.topic2 = 'http://example.com/my-topic/2' - self.format = 'atom' - self.header_footer = 'tag:my-id\n' - self.entries_map = { - 'one': 'first data', - 'two': 'second data', - 'three': 'third data', - } - os.environ['CURRENT_VERSION_ID'] = 'my-version.1234' - virtual_feed.VIRTUAL_FEED_QUEUE.queue_name = 'default' - - def testInsertOneFragment(self): - """Tests inserting one new fragment.""" - virtual_feed.inject_virtual_feed( - self.topic, self.format, self.header_footer, self.entries_map) - task = testutil.get_tasks('default', index=0, expected_count=1) - self.assertTrue(task['name'].startswith( - 'fjq-FeedFragment-54124f41c1ea6e67e4beacac85b9f015e6830d41--' - 'my-version-')) - results = virtual_feed.VIRTUAL_FEED_QUEUE.pop(task['name']) - self.assertEquals(1, len(results)) - fragment = results[0] - self.assertEquals(self.topic, fragment.topic) - self.assertEquals(self.header_footer, fragment.header_footer) - self.assertEquals(self.format, fragment.format) - self.assertEquals( - 'third data\n' # Hash order - 'second data\n' - 'first data', - fragment.entries) - - def testInsertMultipleFragments(self): - """Tests inserting multiple fragments on different virtual topics.""" - virtual_feed.inject_virtual_feed( - self.topic, self.format, self.header_footer, self.entries_map) - virtual_feed.inject_virtual_feed( - self.topic2, self.format, self.header_footer, self.entries_map) - - task1, task2 = testutil.get_tasks('default', expected_count=2) - self.assertTrue(task1['name'].startswith( - 'fjq-FeedFragment-54124f41c1ea6e67e4beacac85b9f015e6830d41--' - 'my-version-')) - self.assertTrue(task2['name'].startswith( - 'fjq-FeedFragment-0449375bf584a7a5d3a09b344a726dead30c3927--' - 'my-version-')) - - virtual_feed.VIRTUAL_FEED_QUEUE.name = \ - 'fjq-FeedFragment-54124f41c1ea6e67e4beacac85b9f015e6830d41-' - fragment1 = virtual_feed.VIRTUAL_FEED_QUEUE.pop(task1['name'])[0] - self.assertEquals(self.topic, fragment1.topic) - - virtual_feed.VIRTUAL_FEED_QUEUE.name = \ - 'fjq-FeedFragment-0449375bf584a7a5d3a09b344a726dead30c3927-' - fragment2 = virtual_feed.VIRTUAL_FEED_QUEUE.pop(task2['name'])[0] - self.assertEquals(self.topic2, fragment2.topic) - - -class CollateFeedHandlerTest(testutil.HandlerTestBase): - """Tests for the CollateFeedHandler class.""" - - handler_class = virtual_feed.CollateFeedHandler - - def setUp(self): - """Sets up the test harness.""" - testutil.HandlerTestBase.setUp(self) - self.topic = 'http://example.com/my-topic/1' - self.topic2 = 'http://example.com/my-topic/2' - self.format = 'atom' - self.header_footer = 'tag:my-id\n' - self.entries_map = { - 'one': 'first data', - 'two': 'second data', - 'three': 'third data', - } - os.environ['CURRENT_VERSION_ID'] = 'my-version.1234' - virtual_feed.VIRTUAL_FEED_QUEUE.queue_name = 'default' - - def testNoWork(self): - """Tests when the queue is empty.""" - 
os.environ['HTTP_X_APPENGINE_TASKNAME'] = ( - 'fjq-FeedFragment-54124f41c1ea6e67e4beacac85b9f01abb6830d41--' - 'my-version-42630240-2654435761-0') - self.handle('post') - - def testOneFragment(self): - """Tests when there is one fragment in the queue.""" - virtual_feed.inject_virtual_feed( - self.topic, self.format, self.header_footer, self.entries_map) - task = testutil.get_tasks('default', index=0, expected_count=1) - os.environ['HTTP_X_APPENGINE_TASKNAME'] = task['name'] - self.handle('post') - - event_list = list(main.EventToDeliver.all()) - self.assertEquals(1, len(event_list)) - event = event_list[0] - - # No parent to ensure it's not rate limited by an entity group. - self.assertEquals(None, event.key().parent()) - - self.assertEquals(self.topic, event.topic) - self.assertEquals( - '\n' - 'tag:my-id\n\n' - 'third data\n' - 'second data\n' - 'first data\n' - '', - event.payload) - self.assertEquals('application/atom+xml', event.content_type) - self.assertEquals(1, event.max_failures) - - task = testutil.get_tasks('event-delivery', index=0, expected_count=1) - self.assertEquals(str(event.key()), task['params']['event_key']) - - def testMultipleFragments(self): - """Tests when there is more than one fragment in the queue.""" - virtual_feed.inject_virtual_feed( - self.topic, self.format, self.header_footer, self.entries_map) - virtual_feed.inject_virtual_feed( - self.topic, self.format, self.header_footer, self.entries_map) - task = testutil.get_tasks('default', index=0, expected_count=1) - os.environ['HTTP_X_APPENGINE_TASKNAME'] = task['name'] - self.handle('post') - - event_list = list(main.EventToDeliver.all()) - self.assertEquals(1, len(event_list)) - event = event_list[0] - - self.assertEquals(self.topic, event.topic) - self.assertEquals( - '\n' - 'tag:my-id\n\n' - 'third data\n' - 'second data\n' - 'first data\n' - 'third data\n' - 'second data\n' - 'first data\n' - '', - event.payload) - - def testMultipleQueues(self): - """Tests multiple virtual feeds and queues.""" - virtual_feed.inject_virtual_feed( - self.topic, self.format, self.header_footer, self.entries_map) - virtual_feed.inject_virtual_feed( - self.topic2, self.format, self.header_footer, self.entries_map) - task1, task2 = testutil.get_tasks('default', expected_count=2) - - os.environ['HTTP_X_APPENGINE_TASKNAME'] = task1['name'] - self.handle('post') - - os.environ['HTTP_X_APPENGINE_TASKNAME'] = task2['name'] - self.handle('post') - - event_list = list(main.EventToDeliver.all()) - self.assertEquals(2, len(event_list)) - self.assertEquals(self.topic, event_list[0].topic) - self.assertEquals(self.topic2, event_list[1].topic) - -################################################################################ - -if __name__ == '__main__': - unittest.main() diff --git a/presentation_gadget.xml b/presentation_gadget.xml deleted file mode 100644 index b07e102..0000000 --- a/presentation_gadget.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/publisher/app.yaml b/publisher/app.yaml deleted file mode 100644 index 15b746d..0000000 --- a/publisher/app.yaml +++ /dev/null @@ -1,8 +0,0 @@ -application: pubsubhubbub-publisher1 -version: 1 -runtime: python -api_version: 1 - -handlers: -- url: .* - script: main.py diff --git a/publisher/atom.xml b/publisher/atom.xml deleted file mode 100644 index 76f1b66..0000000 --- a/publisher/atom.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - Publisher example - - - {{first_message.get_zulu_time}} - {{source}} - nobody - - {% for message in messages %} - - {{message.title}} - 
{{source}}/{{message.key.id_or_name}} - {{message.get_zulu_time}} - {{message.content|escape}} - - {% endfor %} - diff --git a/publisher/index.yaml b/publisher/index.yaml deleted file mode 100644 index 200e0b5..0000000 --- a/publisher/index.yaml +++ /dev/null @@ -1,17 +0,0 @@ -indexes: - -# AUTOGENERATED - -# This index.yaml is automatically updated whenever the dev_appserver -# detects that a new type of query is run. If you want to manage the -# index.yaml file manually, remove the above marker line (the line -# saying "# AUTOGENERATED"). If you want to manage some indexes -# manually, move them above the marker line. The index.yaml file is -# automatically uploaded to the admin console when you next deploy -# your application using appcfg.py. - -# Unused in query history -- copied from input. -- kind: Message - properties: - - name: when - direction: desc diff --git a/publisher/input.html b/publisher/input.html deleted file mode 100644 index fd010a0..0000000 --- a/publisher/input.html +++ /dev/null @@ -1,40 +0,0 @@ - - - Publisher - - - - - -

Publisher page

Title:
Message:

Previous messages

{% for message in messages %}
  {{message.title}}
  Published at {{message.get_zulu_time}}, ID: {{message.key.id_or_name}}
  {{message.content}}
-{% endfor %} - - - diff --git a/publisher/main.py b/publisher/main.py deleted file mode 100755 index c48f5f8..0000000 --- a/publisher/main.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2007 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Simple publisher example that pings the hub after publishing.""" - -import logging -import urllib -import wsgiref.handlers - -from google.appengine.api import urlfetch -from google.appengine.ext import webapp -from google.appengine.ext.webapp import template -from google.appengine.ext import db - - -class Message(db.Model): - """A message to publish.""" - title = db.TextProperty(default='') - content = db.TextProperty(default='') - when = db.DateTimeProperty(auto_now_add=True) - - def get_zulu_time(self): - return self.when.strftime("%Y-%m-%dT%H:%M:%SZ") - - -class MainHandler(webapp.RequestHandler): - """Allows users to publish new entries.""" - - def get(self): - context = dict(messages=Message.gql('ORDER BY when DESC').fetch(20)) - self.response.out.write(template.render('input.html', context)) - - def post(self): - hub_url = self.request.get('hub') - message = Message(title=self.request.get('title'), - content=self.request.get('content')) - message.put() - - headers = {'content-type': 'application/x-www-form-urlencoded'} - post_params = { - 'hub.mode': 'publish', - 'hub.url': self.request.host_url + '/feed', - } - payload = urllib.urlencode(post_params) - try: - response = urlfetch.fetch(hub_url, method='POST', payload=payload) - except urlfetch.Error: - logging.exception('Failed to deliver publishing message to %s', hub_url) - else: - logging.info('URL fetch status_code=%d, content="%s"', - response.status_code, response.content) - self.redirect('/') - - -class FeedHandler(webapp.RequestHandler): - """Renders an Atom feed of published entries.""" - - def get(self): - messages = Message.gql('ORDER BY when DESC').fetch(20) - context = { - 'messages': messages, - 'source': self.request.host_url + '/feed', - } - if messages: - context['first_message'] = messages[0] - self.response.headers['content-type'] = 'application/xml' - self.response.out.write(template.render('atom.xml', context)) - - -def main(): - application = webapp.WSGIApplication([('/', MainHandler), - ('/feed', FeedHandler)], - debug=True) - wsgiref.handlers.CGIHandler().run(application) - - -if __name__ == '__main__': - main() diff --git a/publisher_clients/README.txt b/publisher_clients/README.txt deleted file mode 100644 index 62d5c12..0000000 --- a/publisher_clients/README.txt +++ /dev/null @@ -1,3 +0,0 @@ -This directory is for client libraries that do the publishing -("pinging") part of the pubsubhubbub protocol. 
- diff --git a/publisher_clients/perl-schwartz/.shipit b/publisher_clients/perl-schwartz/.shipit deleted file mode 100644 index 59b4fae..0000000 --- a/publisher_clients/perl-schwartz/.shipit +++ /dev/null @@ -1,6 +0,0 @@ -steps = FindVersion, ChangeVersion, CheckChangeLog, DistTest, Commit, Tag, MakeDist, UploadCPAN -svn.tagpattern = TheSchwartz-Worker-PubSubHubbubPublish-perl-%v - - - - diff --git a/publisher_clients/perl-schwartz/CHANGES b/publisher_clients/perl-schwartz/CHANGES deleted file mode 100644 index 501d8b6..0000000 --- a/publisher_clients/perl-schwartz/CHANGES +++ /dev/null @@ -1,6 +0,0 @@ -1.00 (2009-04-26) - - * first release. works, including batching. - - - diff --git a/publisher_clients/perl-schwartz/MANIFEST b/publisher_clients/perl-schwartz/MANIFEST deleted file mode 100644 index 391d384..0000000 --- a/publisher_clients/perl-schwartz/MANIFEST +++ /dev/null @@ -1,7 +0,0 @@ -CHANGES -demo-worker.pl -demo-insert-job.pl -lib/TheSchwartz/Worker/PubSubHubbubPublish.pm -Makefile.PL -MANIFEST This list of files -t/00-use.t diff --git a/publisher_clients/perl-schwartz/Makefile.PL b/publisher_clients/perl-schwartz/Makefile.PL deleted file mode 100644 index 380e304..0000000 --- a/publisher_clients/perl-schwartz/Makefile.PL +++ /dev/null @@ -1,13 +0,0 @@ -use ExtUtils::MakeMaker; -WriteMakefile( 'NAME' => 'TheSchwartz::Worker::PubSubHubbubPublish', - 'VERSION_FROM' => 'lib/TheSchwartz/Worker/PubSubHubbubPublish.pm', - 'PREREQ_PM' => { - 'TheSchwartz::Worker' => 0, - 'Net::PubSubHubbub::Publisher' => 0.91, - }, - ABSTRACT_FROM => 'lib/TheSchwartz/Worker/PubSubHubbubPublish.pm', - AUTHOR => 'Brad Fitzpatrick ', - ); - - - diff --git a/publisher_clients/perl-schwartz/demo-insert-job.pl b/publisher_clients/perl-schwartz/demo-insert-job.pl deleted file mode 100755 index 2241a4e..0000000 --- a/publisher_clients/perl-schwartz/demo-insert-job.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/perl - -use strict; -use TheSchwartz; -use Getopt::Long; - -my $n_jobs = 1; -my $hub_url = "http://pubsubhubbub.appspot.com/"; - -GetOptions("n=i" => \$n_jobs, - "hub=s" => \$hub_url, - ) or die "Unknown options."; - -my $client = TheSchwartz->new(databases => [{ - user => "root", - dsn => "dbi:mysql:theschwartz", -}]); - -for (1..$n_jobs) { - my $topic = "http://publisher.example.com/topic/" . rand() . 
".atom"; - print "Submitting dummy topic $topic ...\n"; - - my $job = TheSchwartz::Job->new(funcname => 'TheSchwartz::Worker::PubSubHubbubPublish', - arg => { - hub => $hub_url, - topic_url => $topic, - }, - coalesce => $hub_url, - ); - - my $handle = $client->insert($job); - print " job handle: $handle\n"; -} - diff --git a/publisher_clients/perl-schwartz/demo-worker.pl b/publisher_clients/perl-schwartz/demo-worker.pl deleted file mode 100755 index c8638a8..0000000 --- a/publisher_clients/perl-schwartz/demo-worker.pl +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/perl - -use strict; -use lib 'lib'; -use TheSchwartz; -use TheSchwartz::Worker::PubSubHubbubPublish; - -my $client = TheSchwartz->new(databases => [{ - user => "root", - dsn => "dbi:mysql:theschwartz", -}]); - -$client->can_do("TheSchwartz::Worker::PubSubHubbubPublish"); -$client->work; diff --git a/publisher_clients/perl-schwartz/lib/TheSchwartz/Worker/PubSubHubbubPublish.pm b/publisher_clients/perl-schwartz/lib/TheSchwartz/Worker/PubSubHubbubPublish.pm deleted file mode 100644 index 516e998..0000000 --- a/publisher_clients/perl-schwartz/lib/TheSchwartz/Worker/PubSubHubbubPublish.pm +++ /dev/null @@ -1,132 +0,0 @@ -=head1 NAME - -TheSchwartz::Worker::PubSubHubbubPublish - ping pubsubhubbub hub servers - -=head1 SYNOPSIS - - use TheSchwartz; - use TheSchwartz::Worker::PubSubHubbubPublish; - my $sclient = TheSchwartz->new(databases => \@Conf::YOUR_DBS); - $sclient->can_do("TheSchwartz::Worker::PubSubHubbubPublish"); - $sclient->work; # main loop of program; goes forever, pinging as needed - -=head1 DESCRIPTION - -This is a worker class for sending pings to PubSubHubbub hub servers. -See L and L for more -information. - -=head1 JOB ARGUMENTS - -When constructing a job using L's insert_job -method, construct your L instance with its -'argument' of the following form: - - { - hub => $hub_url, # the hub's endpoint URL - topic_url => $url, # Atom URL that was updated - } - -Also, if you set your L's C property to be -the hub URL, this worker will do batch pings instead, vastly reducing -the number of HTTP requests it does. - -=cut - -package TheSchwartz::Worker::PubSubHubbubPublish; -use strict; -use base 'TheSchwartz::Worker'; -use Storable; -use Net::PubSubHubbub::Publisher 0.91; - -our $VERSION = '1.00'; - -our $MAX_BATCH_SIZE = 50; - -my $keep_exit_status_for = 0; -sub set_keep_exit_status_for { $keep_exit_status_for = shift; } - -my %publisher; # $hub -> Net::PubSubHubbub::Publisher - -sub work { - my ($class, $job) = @_; - my $client = $job->handle->client; - my $hub = $job->arg->{hub}; - unless ($hub && $hub =~ m!^https?://\S+$!) { - $job->permanent_failure("Bogus hub $hub. Ignoring job."); - return; - } - - my @jobs; - my @topics; - - my $add_job = sub { - my $j = shift; - my $args = $j->arg; - unless ($args->{hub} eq $hub) { - # Each job must share the same hub. - warn "WARNING: coalesced job had different hub in its args. 
Skipping."; - return; - } - - push @jobs, $j; - push @topics, $args->{topic_url}; - }; - $add_job->($job); - - my $publisher = $publisher{$hub} ||= - Net::PubSubHubbub::Publisher->new(hub => $hub); - - while (@topics < $MAX_BATCH_SIZE) { - my $j = $client->find_job_with_coalescing_value(__PACKAGE__, $hub); - last unless $j; - $add_job->($j); - } - - if ($publisher->publish_update(@topics)) { - warn "Pinged $hub about topic(s): @topics.\n"; - foreach my $j (@jobs) { - $j->completed; - } - return; - } - - my $failure_reason = $publisher->last_response->status_line; - warn "Failed to ping $hub about @topics: $failure_reason\n"; - $job->failed($failure_reason); -} - -sub keep_exit_status_for { - return 0 unless $keep_exit_status_for; - return $keep_exit_status_for->() if ref $keep_exit_status_for eq "CODE"; - return $keep_exit_status_for; -} - -sub grab_for { 30 } -sub max_retries { 10 } -sub retry_delay { - my ($class, $fails) = @_; - return 30 * $fails; -} - -=head1 AUTHOR - -Brad Fitzpatrick -- brad@danga.com - -=head1 COPYRIGHT, LICENSE, and WARRANTY - -Copyright 2009, Brad Fitzpatrick. - -License to use under the same terms as Perl itself. - -This software comes with no warranty of any kind. - -=head1 SEE ALSO - -L - -L - -=cut - -1; diff --git a/publisher_clients/perl-schwartz/t/00-use.t b/publisher_clients/perl-schwartz/t/00-use.t deleted file mode 100644 index 174ff43..0000000 --- a/publisher_clients/perl-schwartz/t/00-use.t +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/perl -use Test::More tests => 1; -BEGIN { - use_ok( 'TheSchwartz::Worker::PubSubHubbubPublish' ); -} diff --git a/publisher_clients/perl/.gitignore b/publisher_clients/perl/.gitignore deleted file mode 100644 index 960abec..0000000 --- a/publisher_clients/perl/.gitignore +++ /dev/null @@ -1 +0,0 @@ -svn-commit.tmp diff --git a/publisher_clients/perl/.shipit b/publisher_clients/perl/.shipit deleted file mode 100644 index d1a416d..0000000 --- a/publisher_clients/perl/.shipit +++ /dev/null @@ -1,5 +0,0 @@ -steps = FindVersion, ChangeVersion, CheckChangeLog, DistTest, Commit, Tag, MakeDist, UploadCPAN -svn.tagpattern = Net-PubSubHubbub-Publisher-perl-%v - - - diff --git a/publisher_clients/perl/ChangeLog b/publisher_clients/perl/ChangeLog deleted file mode 100644 index dcb7570..0000000 --- a/publisher_clients/perl/ChangeLog +++ /dev/null @@ -1,7 +0,0 @@ -0.91: 2009-04-26 - - * Multi-topic support; better docs. - -0.90: 2009-04-26 - - * Initial release. 
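The TheSchwartz worker above cuts HTTP traffic by coalescing queued jobs that share a hub URL and pinging once per batch of up to $MAX_BATCH_SIZE topics; the protocol allows this because the hub.url parameter may be repeated within a single publish request. A rough Python sketch of the same batching idea follows; the function name, the 50-topic limit, and the example URLs are illustrative only.

```python
# Sketch of a batched publish ping: one POST repeats the hub.url parameter,
# chunked so no single request carries more than MAX_BATCH_SIZE topics.
import urllib.parse
import urllib.request

MAX_BATCH_SIZE = 50  # mirrors $MAX_BATCH_SIZE in the Perl worker; not mandated by the spec


def ping_hub_batched(hub_url, topic_urls):
    """Ping hub_url for every topic in topic_urls, MAX_BATCH_SIZE topics per request."""
    topic_urls = list(topic_urls)
    for start in range(0, len(topic_urls), MAX_BATCH_SIZE):
        chunk = topic_urls[start:start + MAX_BATCH_SIZE]
        payload = urllib.parse.urlencode(
            [('hub.mode', 'publish')] + [('hub.url', url) for url in chunk]
        ).encode('utf-8')
        # urlopen raises HTTPError on a non-success status; callers may want to retry.
        urllib.request.urlopen(hub_url, data=payload)


# Example (placeholder URLs):
# ping_hub_batched('http://pubsubhubbub.appspot.com/',
#                  ['http://publisher.example.com/a.atom',
#                   'http://publisher.example.com/b.atom'])
```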
diff --git a/publisher_clients/perl/MANIFEST b/publisher_clients/perl/MANIFEST deleted file mode 100644 index 32e29e7..0000000 --- a/publisher_clients/perl/MANIFEST +++ /dev/null @@ -1,6 +0,0 @@ -ChangeLog -pubsubhubbub-publish -lib/Net/PubSubHubbub/Publisher.pm -Makefile.PL -MANIFEST This list of files -t/00-use.t diff --git a/publisher_clients/perl/Makefile.PL b/publisher_clients/perl/Makefile.PL deleted file mode 100644 index 607f4c7..0000000 --- a/publisher_clients/perl/Makefile.PL +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/perl -use strict; -use ExtUtils::MakeMaker; - -WriteMakefile( NAME => 'Net::PubSubHubbub::Publisher', - VERSION_FROM => 'lib/Net/PubSubHubbub/Publisher.pm', - EXE_FILES => [ 'pubsubhubbub-publish' ], - PREREQ_PM => { - 'LWP::UserAgent' => 0, - }, - ABSTRACT_FROM => 'lib/Net/PubSubHubbub/Publisher.pm', - AUTHOR => 'Brad Fitzpatrick ', - ); - - - - diff --git a/publisher_clients/perl/lib/Net/PubSubHubbub/Publisher.pm b/publisher_clients/perl/lib/Net/PubSubHubbub/Publisher.pm deleted file mode 100644 index 6429ac2..0000000 --- a/publisher_clients/perl/lib/Net/PubSubHubbub/Publisher.pm +++ /dev/null @@ -1,123 +0,0 @@ -package Net::PubSubHubbub::Publisher; -use strict; -use LWP::UserAgent; -use HTTP::Request::Common; -use Carp qw(croak); - -=head1 NAME - -Net::PubSubHubbub::Publisher - client library to ping a PubSubHubbub hub - -=head1 OVERVIEW - - my $pub = Net::PubSubHubbub::Publisher->new(hub => $hub); - $pub->publish_update($atom_topic_url) or - die "Ping failed: " . $pub->last_response->status_line; - -=cut - -our $VERSION = "0.91"; - -=head1 CONSTRUCTOR - -=over 4 - -=item C(hub => $hub[, ua => $ua]) - -Takes a required hub URL, and an optional L instance. - -=back - -=cut - -sub new { - my ($class, %opts) = @_; - my $ua = delete $opts{ua}; - my $hub = delete $opts{hub}; - unless ($hub) { - croak("Required option 'hub' not set."); - } - unless ($hub =~ m!^https?://!) { - croak("Bogus hub URL of $hub"); - } - if (%opts) { - die "Unknown options: " . join(", ", sort keys %opts); - } - unless ($ua) { - $ua = LWP::UserAgent->new( - keep_alive => 1, - agent => "Net-PubSubHubbub-Publisher-perl/$VERSION", - ); - } - return bless { - ua => $ua, - hub => $hub, - }, $class; -} - -=head1 METHODS - -=over 4 - -=item C($topic_url) - -=item C(@topic_urls) - -Sends a ping that the provided Topic URL(s) has/have been updated. - -Returns true on success. If false, see C to figure out -why it failed. - -=cut - -sub publish_update { - my ($self, @urls) = @_; - croak "No URL(s) provided" unless @urls; - foreach my $url (@urls) { - croak("Bogus URL: $url") unless $url =~ m!^https?://!; - } - my @args = ("hub.mode" => "publish"); - push @args, map { ("hub.url" => $_) } @urls; - my $req = POST $self->{hub}, \@args; - my $res = $self->{last_res} = $self->{ua}->request($req); - return 1 if $res->is_success; - return 0; -} - -=item C() - -Returns the last L. Use this when C -fails to discover why it failed. - -=cut - -sub last_response { - my $self = shift; - return $self->{last_res}; -} - -1; - -=back - -=head1 COPYRIGHT & LICENSE - -This module is Copyright (c) 2009 Brad Fitzpatrick. -All rights reserved. - -You may distribute under the terms of either the GNU General Public -License or the Artistic License, as specified in the Perl README file. - -=head1 WARRANTY - -This is free software. IT COMES WITHOUT WARRANTY OF ANY KIND. 
- -=head1 AUTHOR - -Brad Fitzpatrick - -=head1 SEE ALSO - -L -- PubSubHubbub home - -=cut diff --git a/publisher_clients/perl/pubsubhubbub-publish b/publisher_clients/perl/pubsubhubbub-publish deleted file mode 100755 index fbb96d4..0000000 --- a/publisher_clients/perl/pubsubhubbub-publish +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/perl -# -*-perl-*- - -use strict; -use lib 'lib'; -use Net::PubSubHubbub::Publisher; -use Getopt::Long; -use LWP::Simple (); - -sub usage { - my $err = shift; - if ($err) { - warn "ERROR: $err\n\n"; - } - print STDERR < [ ...] - -Pings the hub, notifying the hub that the provided 'topic_url' has -been updated. - -Options: ---hub= Which hub endpoint to ping. Defaults - to the open, reference hub, but you need to use whatever hub - that your Topic URL references. - -END - - exit(1); -} - -my $hub = "http://pubsubhubbub.appspot.com/"; -GetOptions("hub=s" => \$hub) - or usage(); - -my @topic_urls = @ARGV or usage("topic_url required."); -foreach my $url (@topic_urls) { - usage("Bogus topic URL: $url") - unless $url =~ m!^https?://\S+$!; -} - -usage("No hub provided.") - unless $hub && $hub =~ m!^https?://\S+$!; - -my $publisher = Net::PubSubHubbub::Publisher->new(hub => $hub); -unless ($publisher->publish_update(@topic_urls)) { - warn "Error pinging hub: " . $publisher->last_response->status_line; - exit(1); -} diff --git a/publisher_clients/perl/t/00-use.t b/publisher_clients/perl/t/00-use.t deleted file mode 100644 index 897cd1e..0000000 --- a/publisher_clients/perl/t/00-use.t +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/perl -# -*-perl -*- - -use strict; -use Test::More tests => 1; - -use_ok("Net::PubSubHubbub::Publisher"); - diff --git a/publisher_clients/php/README.txt b/publisher_clients/php/README.txt deleted file mode 100644 index 3d27c40..0000000 --- a/publisher_clients/php/README.txt +++ /dev/null @@ -1,21 +0,0 @@ -This PHP library for PubSubHubbub was written by Josh Fraser (joshfraser.com) and is released under the Apache 2.0 License - -Usage: -// specify which hub you want to use. in this case we'll use the demo hub on app engine. -$hub_url = "http://pubsubhubbub.appspot.com/"; - -// create a new pubsubhubbub publisher -$p = new Publisher($hub_url); - -// specify the feed that has been updated -$topic_url = "http://www.onlineaspect.com"; - -// notify the hub that the specified topic_url (ATOM feed) has been updated -// alternatively, publish_update() also accepts an array of topic urls -if ($p->publish_update($topic_url)) { -    echo "$topic_url was successfully published to $hub_url"; -} else { -    echo "Ooops..."; -    print_r($p->last_response()); -} - \ No newline at end of file diff --git a/publisher_clients/php/examples/publisher_example.php b/publisher_clients/php/examples/publisher_example.php deleted file mode 100644 index 5b5ec8a..0000000 --- a/publisher_clients/php/examples/publisher_example.php +++ /dev/null @@ -1,51 +0,0 @@ -"; - -// process form -if ($_POST['sub']) { - - $hub_url = $_POST['hub_url']; - $topic_url = $_POST['topic_url']; - - // check that a hub url is specified - if (!$hub_url) { - echo "Please specify a hub url.

back"; - exit(); - } - // check that a topic url is specified - if (!$topic_url) { - echo "Please specify a topic url to publish.

back"; - exit(); - } - - // $hub_url = "http://pubsubhubbub.appspot.com/publish"; - $p = new Publisher($hub_url); - if ($p->publish_update($topic_url)) { - echo "$topic_url was successfully published to $hub_url

back"; - } else { - echo "ooops..."; - print_r($p->last_response()); - } - -} else { - - // display a primitive form for testing - echo "
"; - echo "hub url:
"; - echo "topic url:
"; - echo "
"; - echo "
"; - -} - -echo ""; - -?> \ No newline at end of file diff --git a/publisher_clients/php/library/publisher.php b/publisher_clients/php/library/publisher.php deleted file mode 100644 index f176a9b..0000000 --- a/publisher_clients/php/library/publisher.php +++ /dev/null @@ -1,86 +0,0 @@ -hub_url = $hub_url; - } - - // accepts either a single url or an array of urls - public function publish_update($topic_urls, $http_function = false) { - if (!isset($topic_urls)) - throw new Exception('Please specify a topic url'); - - // check that we're working with an array - if (!is_array($topic_urls)) { - $topic_urls = array($topic_urls); - } - - // set the mode to publish - $post_string = "hub.mode=publish"; - // loop through each topic url - foreach ($topic_urls as $topic_url) { - - // lightweight check that we're actually working w/ a valid url - if (!preg_match("|^https?://|i",$topic_url)) - throw new Exception('The specified topic url does not appear to be valid: '.$topic_url); - - // append the topic url parameters - $post_string .= "&hub.url=".urlencode($topic_url); - } - - // make the http post request and return true/false - // easy to over-write to use your own http function - if ($http_function) - return $http_function($this->hub_url,$post_string); - else - return $this->http_post($this->hub_url,$post_string); - } - - // returns any error message from the latest request - public function last_response() { - return $this->last_response; - } - - // default http function that uses curl to post to the hub endpoint - private function http_post($url, $post_string) { - - // add any additional curl options here - $options = array(CURLOPT_URL => $url, - CURLOPT_POST => true, - CURLOPT_POSTFIELDS => $post_string, - CURLOPT_USERAGENT => "PubSubHubbub-Publisher-PHP/1.0"); - - $ch = curl_init(); - curl_setopt_array($ch, $options); - - $response = curl_exec($ch); - $this->last_response = $response; - $info = curl_getinfo($ch); - - curl_close($ch); - - // all good - if ($info['http_code'] == 204) - return true; - return false; - } -} - -?> \ No newline at end of file diff --git a/publisher_clients/python/CHANGES b/publisher_clients/python/CHANGES deleted file mode 100644 index b7f18d1..0000000 --- a/publisher_clients/python/CHANGES +++ /dev/null @@ -1,2 +0,0 @@ -1.00 (2009-05-17) - * First release. diff --git a/publisher_clients/python/pubsubhubbub_publish.py b/publisher_clients/python/pubsubhubbub_publish.py deleted file mode 100644 index 9ae6e66..0000000 --- a/publisher_clients/python/pubsubhubbub_publish.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2009 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Simple Publisher client for PubSubHubbub. - -Example usage: - - from pubsubhubbub_publish import * - try: - publish('http://pubsubhubbub.appspot.com', - 'http://example.com/feed1/atom.xml', - 'http://example.com/feed2/atom.xml', - 'http://example.com/feed3/atom.xml') - except PublishError, e: - # handle exception... 
- -Set the 'http_proxy' environment variable on *nix or Windows to use an -HTTP proxy. -""" - -__author__ = 'bslatkin@gmail.com (Brett Slatkin)' - -import urllib -import urllib2 - - -class PublishError(Exception): - """An error occurred while trying to publish to the hub.""" - - -URL_BATCH_SIZE = 100 - - -def publish(hub, *urls): - """Publishes an event to a hub. - - Args: - hub: The hub to publish the event to. - **urls: One or more URLs to publish to. If only a single URL argument is - passed and that item is an iterable that is not a string, the contents of - that iterable will be used to produce the list of published URLs. If - more than URL_BATCH_SIZE URLs are supplied, this function will batch them - into chunks across multiple requests. - - Raises: - PublishError if anything went wrong during publishing. - """ - if len(urls) == 1 and not isinstance(urls[0], basestring): - urls = list(urls[0]) - - for i in xrange(0, len(urls), URL_BATCH_SIZE): - chunk = urls[i:i+URL_BATCH_SIZE] - data = urllib.urlencode( - {'hub.url': chunk, 'hub.mode': 'publish'}, doseq=True) - try: - response = urllib2.urlopen(hub, data) - except (IOError, urllib2.HTTPError), e: - if hasattr(e, 'code') and e.code == 204: - continue - error = '' - if hasattr(e, 'read'): - error = e.read() - raise PublishError('%s, Response: "%s"' % (e, error)) diff --git a/publisher_clients/python/pubsubhubbub_publish_test.py b/publisher_clients/python/pubsubhubbub_publish_test.py deleted file mode 100755 index 09a185b..0000000 --- a/publisher_clients/python/pubsubhubbub_publish_test.py +++ /dev/null @@ -1,139 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2009 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for the pubsubhubbub_publish module.""" - -__author__ = 'bslatkin@gmail.com (Brett Slatkin)' - -import BaseHTTPServer -import urllib -import unittest -import threading - -import pubsubhubbub_publish - - -REQUESTS = 0 - - -class RequestHandler(BaseHTTPServer.BaseHTTPRequestHandler): - def do_POST(self): - global REQUESTS - print 'Accessed', self.path - REQUESTS += 1 - - length = int(self.headers.get('content-length', 0)) - if not length: - return self.send_error(500) - body = self.rfile.read(length) - - if self.path == '/single': - if body != urllib.urlencode( - {'hub.url': 'http://example.com/feed', 'hub.mode': 'publish'}): - self.send_error(500) - self.wfile.write('Bad body. Found:') - self.wfile.write(body) - else: - self.send_response(204) - elif self.path == '/multiple': - if body != urllib.urlencode( - {'hub.url': ['http://example.com/feed', - 'http://example.com/feed2', - 'http://example.com/feed3'], - 'hub.mode': 'publish'}, doseq=True): - self.send_error(500) - self.wfile.write('Bad body. 
Found:') - self.wfile.write(body) - else: - self.send_response(204) - elif self.path == '/batch': - self.send_response(204) - elif self.path == '/fail': - self.send_error(400) - self.wfile.write('bad argument') - else: - self.send_error(404) - - -class PublishTest(unittest.TestCase): - - def setUp(self): - global REQUESTS - REQUESTS = 0 - self.server = BaseHTTPServer.HTTPServer(('', 0), RequestHandler) - t = threading.Thread(target=self.server.serve_forever) - t.setDaemon(True) - t.start() - self.hub = 'http://%s:%d' % ( - self.server.server_name, self.server.server_port) - self.feed = 'http://example.com/feed' - self.feed2 = 'http://example.com/feed2' - self.feed3 = 'http://example.com/feed3' - - def testSingle(self): - pubsubhubbub_publish.publish(self.hub + '/single', self.feed) - self.assertEquals(1, REQUESTS) - - def testMultiple(self): - pubsubhubbub_publish.publish(self.hub + '/multiple', - self.feed, self.feed2, self.feed3) - - def testList(self): - pubsubhubbub_publish.publish(self.hub + '/multiple', - [self.feed, self.feed2, self.feed3]) - - def testIterable(self): - pubsubhubbub_publish.publish(self.hub + '/multiple', - iter([self.feed, self.feed2, self.feed3])) - - def testBatchSizeLimit(self): - old = pubsubhubbub_publish.URL_BATCH_SIZE - try: - pubsubhubbub_publish.URL_BATCH_SIZE = 2 - pubsubhubbub_publish.publish(self.hub + '/batch', - [self.feed, self.feed2, self.feed3]) - finally: - pubsubhubbub_publish.URL_BATCH_SIZE = old - self.assertEquals(2, REQUESTS) - - def testBadHubHostname(self): - self.assertRaises( - pubsubhubbub_publish.PublishError, - pubsubhubbub_publish.publish, - 'http://asdf.does.not.resolve', self.feed) - - def testBadArgument(self): - self.assertRaises( - pubsubhubbub_publish.PublishError, - pubsubhubbub_publish.publish, - self.hub + '/fail', self.feed) - - def testBadHubUrl(self): - self.assertRaises( - pubsubhubbub_publish.PublishError, - pubsubhubbub_publish.publish, - 'not://a.url.is.this', self.feed) - - def testNotFound(self): - self.assertRaises( - pubsubhubbub_publish.PublishError, - pubsubhubbub_publish.publish, - self.hub + '/unknown', self.feed) - - -if __name__ == '__main__': - unittest.main() diff --git a/publisher_clients/python/setup.py b/publisher_clients/python/setup.py deleted file mode 100644 index 756f967..0000000 --- a/publisher_clients/python/setup.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2009 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from distutils.core import setup - - -LONG_DESC = ( - 'A simple, open, server-to-server web-hook-based pubsub ' - '(publish/subscribe) protocol as a simple extension to Atom. 
' - 'Parties (servers) speaking the PubSubHubbub protocol can get ' - 'near-instant notifications (via webhook callbacks) when a topic ' - '(Atom URL) they\'re interested in is updated.') - -setup(name='PubSubHubbub_Publisher', - version='1.0', - description='Publisher client for PubSubHubbub', - long_description=LONG_DESC, - author='Brett Slatkin', - author_email='bslatkin@gmail.com', - url='http://code.google.com/p/pubsubhubbub/', - py_modules=['pubsubhubbub_publish'], - license="Apache 2.0") diff --git a/publisher_clients/wordpress/1.0/publisher.php b/publisher_clients/wordpress/1.0/publisher.php deleted file mode 100644 index f176a9b..0000000 --- a/publisher_clients/wordpress/1.0/publisher.php +++ /dev/null @@ -1,86 +0,0 @@ -hub_url = $hub_url; - } - - // accepts either a single url or an array of urls - public function publish_update($topic_urls, $http_function = false) { - if (!isset($topic_urls)) - throw new Exception('Please specify a topic url'); - - // check that we're working with an array - if (!is_array($topic_urls)) { - $topic_urls = array($topic_urls); - } - - // set the mode to publish - $post_string = "hub.mode=publish"; - // loop through each topic url - foreach ($topic_urls as $topic_url) { - - // lightweight check that we're actually working w/ a valid url - if (!preg_match("|^https?://|i",$topic_url)) - throw new Exception('The specified topic url does not appear to be valid: '.$topic_url); - - // append the topic url parameters - $post_string .= "&hub.url=".urlencode($topic_url); - } - - // make the http post request and return true/false - // easy to over-write to use your own http function - if ($http_function) - return $http_function($this->hub_url,$post_string); - else - return $this->http_post($this->hub_url,$post_string); - } - - // returns any error message from the latest request - public function last_response() { - return $this->last_response; - } - - // default http function that uses curl to post to the hub endpoint - private function http_post($url, $post_string) { - - // add any additional curl options here - $options = array(CURLOPT_URL => $url, - CURLOPT_POST => true, - CURLOPT_POSTFIELDS => $post_string, - CURLOPT_USERAGENT => "PubSubHubbub-Publisher-PHP/1.0"); - - $ch = curl_init(); - curl_setopt_array($ch, $options); - - $response = curl_exec($ch); - $this->last_response = $response; - $info = curl_getinfo($ch); - - curl_close($ch); - - // all good - if ($info['http_code'] == 204) - return true; - return false; - } -} - -?> \ No newline at end of file diff --git a/publisher_clients/wordpress/1.0/pubsubhubbub.php b/publisher_clients/wordpress/1.0/pubsubhubbub.php deleted file mode 100755 index 7c2c668..0000000 --- a/publisher_clients/wordpress/1.0/pubsubhubbub.php +++ /dev/null @@ -1,179 +0,0 @@ -PubSubHubbub settings page -Version: 1.1 -Author: Josh Fraser -Author Email: josh@eventvue.com -Author URI: http://www.joshfraser.com -*/ - -include("publisher.php"); - -// function that is called whenever a new post is published -function publish_to_hub($post_id) { - - // we want to notify the hub for every feed - $feed_urls = array(); - $feed_urls[] = get_bloginfo('atom_url'); - $feed_urls[] = get_bloginfo('rss_url'); - $feed_urls[] = get_bloginfo('rdf_url'); - $feed_urls[] = get_bloginfo('rss2_url'); - // remove dups (ie. 
they all point to feedburner) - $feed_urls = array_unique($feed_urls); - // get the address of the publish endpoint on the hub - $hub_url = get_pubsub_endpoint(); - $p = new Publisher($hub_url); - // need better error handling - if (!$p->publish_update($feed_urls, "http_post_wp")) { - print_r($p->last_response()); - } - return $post_id; -} - -function add_atom_link_tag() { - $sub_url = get_pubsub_endpoint(); - echo ''; -} - -function add_rss_link_tag() { - $sub_url = get_pubsub_endpoint(); - echo ''; -} - -function add_rdf_ns_link() { - echo 'xmlns:atom="http://www.w3.org/2005/Atom"'; -} - -// hack to add the atom definition to the RSS feed -// start capturing the feed output. this is run at priority 9 (before output) -function start_rss_link_tag() { - ob_start(); -} - -// this is run at priority 11 (after output) -// add in the xmlns atom definition link -function end_rss_link_tag() { - $feed = ob_get_clean(); - $pattern = '//i'; - $replacement = ''; - // change to - echo preg_replace($pattern, $replacement, $feed); -} - -// add a link to our settings page in the WP menu -function add_plugin_menu() { - add_options_page('PubSubHubbub Settings', 'PubSubHubbub', 8, __FILE__, 'add_settings_page'); -} - -// get the endpoints from the wordpress options table -// valid parameters are "publish" or "subscribe" -function get_pubsub_endpoint() { - $endpoint = get_option('pubsub_endpoint'); - - // if no values have been set, revert to the defaults (pubsubhubbub on app engine) - if (!$endpoint) { - $endpoint = "http://pubsubhubbub.appspot.com"; - } - return $endpoint; -} - -// write the content for our settings page that allows you to define your endpoints -function add_settings_page() { ?> -
-

Define a custom endpoint

- -
- - - - - - - - - - - -
Endpoint URL:
- - - - -

- -

- -
- -

- Thanks for using PubSubHubbub. Learn more about PubSubHubbub and author of this plugin: - - -
- -agent = "(PubSubHubbub-Publisher-WP/1.0)"; - $snoopy->submit($url,$post_vars); - $response = $snoopy->results; - // TODO: store the last_response. requires a litle refactoring work. - $response_code = $snoopy->response_code; - if ($response_code == 204) - return true; - return false; -} - - -// attach the handler that gets called every time you publish a post -add_action('publish_post', 'publish_to_hub'); -// add the link to our settings page in the WP menu structure -add_action('admin_menu', 'add_plugin_menu'); - -// add the link tag that points to the hub in the header of our template... - -// to our atom feed -add_action('atom_head', 'add_atom_link_tag'); -// to our RSS 0.92 feed (requires a bit of a hack to include the ATOM namespace definition) -add_action('do_feed_rss', 'start_rss_link_tag', 9); // run before output -add_action('do_feed_rss', 'end_rss_link_tag', 11); // run after output -add_action('rss_head', 'add_rss_link_tag'); -// to our RDF / RSS 1 feed -add_action('rdf_ns', 'add_rdf_ns_link'); -add_action('rdf_header', 'add_rss_link_tag'); -// to our RSS 2 feed -add_action('rss2_head', 'add_rss_link_tag'); -// to our main HTML header -- not sure if we want to include this long-term or not. -add_action('wp_head', 'add_atom_link_tag'); - -?> \ No newline at end of file diff --git a/publisher_clients/wordpress/1.0/readme.txt b/publisher_clients/wordpress/1.0/readme.txt deleted file mode 100644 index 767ac25..0000000 --- a/publisher_clients/wordpress/1.0/readme.txt +++ /dev/null @@ -1,43 +0,0 @@ -=== Plugin Name === -Contributors: joshfraz -Donate link: https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=5426516 -Tags: pubsubhubbub -Requires at least: 2.5 -Tested up to: 2.7 -Stable tag: /trunk/ - -A better way to tell the world when your blog is updated. - -== Description == - -This plugin that implements [the PubSubHubbub protocol](http://pubsubhubbub.googlecode.com/svn/trunk/pubsubhubbub-core-0.1.html "the PubSubHubbub protocol"). [PubSubHubbub](http://code.google.com/p/pubsubhubbub/ "PubSubHubbub") is a simple, open, server-to-server web-hook-based pubsub (publish/subscribe) protocol as a simple extension to Atom and RSS. - -Parties (servers) speaking the PubSubHubbub protocol can get near-instant notifications (via webhook callbacks) when a topic (feed URL) they're interested in is updated. - -This plugin: - -* Notifies your specified hub each time you publish a new post -* Announces your specified hub by adding `` to your template header and ATOM feed -* Adds `` to your RSS feeds along with the necessary XMLNS declaration for RSS 0.92/1.0 - -The PubSubHubbub protocol is decentralized and free. No company is at the center of this controlling it. Anybody can run a hub, or anybody can ping (publish) or subscribe using open hubs. If no custom hub is specified, this plugin will use the demonstration hub that is running on Google App Engine. - -== Installation == - -1. Upload the `pubsubhubbub` directory to your `/wp-content/plugins/` directory -2. Activate the plugin through the 'Plugins' menu in WordPress -3. Select a custom hub under your PubSubHubbub Settings (optional) - -== Frequently Asked Questions == - -= Where can I learn more about the PubSubHubbub protocol? = - -You can visit [PubSubHubbb on Google Code](http://code.google.com/p/pubsubhubbub/ "PubSubHubbb on Google Code") - -= Where can I learn more about the author of this plugin? 
= - -You can learn more about [Josh Fraser](http://www.joshfraser.com "Josh Fraser") at [Online Aspect](http://www.onlineaspect.com "Online Aspect") - -== Screenshots == - -1. The PubSubHubbub Settings page allows you to define custom endpoints for your chosen hub diff --git a/publisher_clients/wordpress/1.0/screenshot-1.png b/publisher_clients/wordpress/1.0/screenshot-1.png deleted file mode 100644 index 91d84af..0000000 Binary files a/publisher_clients/wordpress/1.0/screenshot-1.png and /dev/null differ diff --git a/publisher_clients/wordpress/1.1/publisher.php b/publisher_clients/wordpress/1.1/publisher.php deleted file mode 100644 index f176a9b..0000000 --- a/publisher_clients/wordpress/1.1/publisher.php +++ /dev/null @@ -1,86 +0,0 @@ -hub_url = $hub_url; - } - - // accepts either a single url or an array of urls - public function publish_update($topic_urls, $http_function = false) { - if (!isset($topic_urls)) - throw new Exception('Please specify a topic url'); - - // check that we're working with an array - if (!is_array($topic_urls)) { - $topic_urls = array($topic_urls); - } - - // set the mode to publish - $post_string = "hub.mode=publish"; - // loop through each topic url - foreach ($topic_urls as $topic_url) { - - // lightweight check that we're actually working w/ a valid url - if (!preg_match("|^https?://|i",$topic_url)) - throw new Exception('The specified topic url does not appear to be valid: '.$topic_url); - - // append the topic url parameters - $post_string .= "&hub.url=".urlencode($topic_url); - } - - // make the http post request and return true/false - // easy to over-write to use your own http function - if ($http_function) - return $http_function($this->hub_url,$post_string); - else - return $this->http_post($this->hub_url,$post_string); - } - - // returns any error message from the latest request - public function last_response() { - return $this->last_response; - } - - // default http function that uses curl to post to the hub endpoint - private function http_post($url, $post_string) { - - // add any additional curl options here - $options = array(CURLOPT_URL => $url, - CURLOPT_POST => true, - CURLOPT_POSTFIELDS => $post_string, - CURLOPT_USERAGENT => "PubSubHubbub-Publisher-PHP/1.0"); - - $ch = curl_init(); - curl_setopt_array($ch, $options); - - $response = curl_exec($ch); - $this->last_response = $response; - $info = curl_getinfo($ch); - - curl_close($ch); - - // all good - if ($info['http_code'] == 204) - return true; - return false; - } -} - -?> \ No newline at end of file diff --git a/publisher_clients/wordpress/1.1/pubsubhubbub.php b/publisher_clients/wordpress/1.1/pubsubhubbub.php deleted file mode 100755 index 7c2c668..0000000 --- a/publisher_clients/wordpress/1.1/pubsubhubbub.php +++ /dev/null @@ -1,179 +0,0 @@ -PubSubHubbub settings page -Version: 1.1 -Author: Josh Fraser -Author Email: josh@eventvue.com -Author URI: http://www.joshfraser.com -*/ - -include("publisher.php"); - -// function that is called whenever a new post is published -function publish_to_hub($post_id) { - - // we want to notify the hub for every feed - $feed_urls = array(); - $feed_urls[] = get_bloginfo('atom_url'); - $feed_urls[] = get_bloginfo('rss_url'); - $feed_urls[] = get_bloginfo('rdf_url'); - $feed_urls[] = get_bloginfo('rss2_url'); - // remove dups (ie. 
they all point to feedburner) - $feed_urls = array_unique($feed_urls); - // get the address of the publish endpoint on the hub - $hub_url = get_pubsub_endpoint(); - $p = new Publisher($hub_url); - // need better error handling - if (!$p->publish_update($feed_urls, "http_post_wp")) { - print_r($p->last_response()); - } - return $post_id; -} - -function add_atom_link_tag() { - $sub_url = get_pubsub_endpoint(); - echo ''; -} - -function add_rss_link_tag() { - $sub_url = get_pubsub_endpoint(); - echo ''; -} - -function add_rdf_ns_link() { - echo 'xmlns:atom="http://www.w3.org/2005/Atom"'; -} - -// hack to add the atom definition to the RSS feed -// start capturing the feed output. this is run at priority 9 (before output) -function start_rss_link_tag() { - ob_start(); -} - -// this is run at priority 11 (after output) -// add in the xmlns atom definition link -function end_rss_link_tag() { - $feed = ob_get_clean(); - $pattern = '//i'; - $replacement = ''; - // change to - echo preg_replace($pattern, $replacement, $feed); -} - -// add a link to our settings page in the WP menu -function add_plugin_menu() { - add_options_page('PubSubHubbub Settings', 'PubSubHubbub', 8, __FILE__, 'add_settings_page'); -} - -// get the endpoints from the wordpress options table -// valid parameters are "publish" or "subscribe" -function get_pubsub_endpoint() { - $endpoint = get_option('pubsub_endpoint'); - - // if no values have been set, revert to the defaults (pubsubhubbub on app engine) - if (!$endpoint) { - $endpoint = "http://pubsubhubbub.appspot.com"; - } - return $endpoint; -} - -// write the content for our settings page that allows you to define your endpoints -function add_settings_page() { ?> -
-

Define a custom endpoint

- -
- - - - - - - - - - - -
Endpoint URL:
- - - - -

- -

- -
- -

- Thanks for using PubSubHubbub. Learn more about PubSubHubbub and author of this plugin: - - -
- -agent = "(PubSubHubbub-Publisher-WP/1.0)"; - $snoopy->submit($url,$post_vars); - $response = $snoopy->results; - // TODO: store the last_response. requires a litle refactoring work. - $response_code = $snoopy->response_code; - if ($response_code == 204) - return true; - return false; -} - - -// attach the handler that gets called every time you publish a post -add_action('publish_post', 'publish_to_hub'); -// add the link to our settings page in the WP menu structure -add_action('admin_menu', 'add_plugin_menu'); - -// add the link tag that points to the hub in the header of our template... - -// to our atom feed -add_action('atom_head', 'add_atom_link_tag'); -// to our RSS 0.92 feed (requires a bit of a hack to include the ATOM namespace definition) -add_action('do_feed_rss', 'start_rss_link_tag', 9); // run before output -add_action('do_feed_rss', 'end_rss_link_tag', 11); // run after output -add_action('rss_head', 'add_rss_link_tag'); -// to our RDF / RSS 1 feed -add_action('rdf_ns', 'add_rdf_ns_link'); -add_action('rdf_header', 'add_rss_link_tag'); -// to our RSS 2 feed -add_action('rss2_head', 'add_rss_link_tag'); -// to our main HTML header -- not sure if we want to include this long-term or not. -add_action('wp_head', 'add_atom_link_tag'); - -?> \ No newline at end of file diff --git a/publisher_clients/wordpress/1.1/readme.txt b/publisher_clients/wordpress/1.1/readme.txt deleted file mode 100644 index 767ac25..0000000 --- a/publisher_clients/wordpress/1.1/readme.txt +++ /dev/null @@ -1,43 +0,0 @@ -=== Plugin Name === -Contributors: joshfraz -Donate link: https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=5426516 -Tags: pubsubhubbub -Requires at least: 2.5 -Tested up to: 2.7 -Stable tag: /trunk/ - -A better way to tell the world when your blog is updated. - -== Description == - -This plugin that implements [the PubSubHubbub protocol](http://pubsubhubbub.googlecode.com/svn/trunk/pubsubhubbub-core-0.1.html "the PubSubHubbub protocol"). [PubSubHubbub](http://code.google.com/p/pubsubhubbub/ "PubSubHubbub") is a simple, open, server-to-server web-hook-based pubsub (publish/subscribe) protocol as a simple extension to Atom and RSS. - -Parties (servers) speaking the PubSubHubbub protocol can get near-instant notifications (via webhook callbacks) when a topic (feed URL) they're interested in is updated. - -This plugin: - -* Notifies your specified hub each time you publish a new post -* Announces your specified hub by adding `` to your template header and ATOM feed -* Adds `` to your RSS feeds along with the necessary XMLNS declaration for RSS 0.92/1.0 - -The PubSubHubbub protocol is decentralized and free. No company is at the center of this controlling it. Anybody can run a hub, or anybody can ping (publish) or subscribe using open hubs. If no custom hub is specified, this plugin will use the demonstration hub that is running on Google App Engine. - -== Installation == - -1. Upload the `pubsubhubbub` directory to your `/wp-content/plugins/` directory -2. Activate the plugin through the 'Plugins' menu in WordPress -3. Select a custom hub under your PubSubHubbub Settings (optional) - -== Frequently Asked Questions == - -= Where can I learn more about the PubSubHubbub protocol? = - -You can visit [PubSubHubbb on Google Code](http://code.google.com/p/pubsubhubbub/ "PubSubHubbb on Google Code") - -= Where can I learn more about the author of this plugin? 
= - -You can learn more about [Josh Fraser](http://www.joshfraser.com "Josh Fraser") at [Online Aspect](http://www.onlineaspect.com "Online Aspect") - -== Screenshots == - -1. The PubSubHubbub Settings page allows you to define custom endpoints for your chosen hub diff --git a/publisher_clients/wordpress/1.1/screenshot-1.png b/publisher_clients/wordpress/1.1/screenshot-1.png deleted file mode 100644 index 91d84af..0000000 Binary files a/publisher_clients/wordpress/1.1/screenshot-1.png and /dev/null differ diff --git a/publisher_clients/wordpress/1.2/publisher.php b/publisher_clients/wordpress/1.2/publisher.php deleted file mode 100644 index f176a9b..0000000 --- a/publisher_clients/wordpress/1.2/publisher.php +++ /dev/null @@ -1,86 +0,0 @@ -hub_url = $hub_url; - } - - // accepts either a single url or an array of urls - public function publish_update($topic_urls, $http_function = false) { - if (!isset($topic_urls)) - throw new Exception('Please specify a topic url'); - - // check that we're working with an array - if (!is_array($topic_urls)) { - $topic_urls = array($topic_urls); - } - - // set the mode to publish - $post_string = "hub.mode=publish"; - // loop through each topic url - foreach ($topic_urls as $topic_url) { - - // lightweight check that we're actually working w/ a valid url - if (!preg_match("|^https?://|i",$topic_url)) - throw new Exception('The specified topic url does not appear to be valid: '.$topic_url); - - // append the topic url parameters - $post_string .= "&hub.url=".urlencode($topic_url); - } - - // make the http post request and return true/false - // easy to over-write to use your own http function - if ($http_function) - return $http_function($this->hub_url,$post_string); - else - return $this->http_post($this->hub_url,$post_string); - } - - // returns any error message from the latest request - public function last_response() { - return $this->last_response; - } - - // default http function that uses curl to post to the hub endpoint - private function http_post($url, $post_string) { - - // add any additional curl options here - $options = array(CURLOPT_URL => $url, - CURLOPT_POST => true, - CURLOPT_POSTFIELDS => $post_string, - CURLOPT_USERAGENT => "PubSubHubbub-Publisher-PHP/1.0"); - - $ch = curl_init(); - curl_setopt_array($ch, $options); - - $response = curl_exec($ch); - $this->last_response = $response; - $info = curl_getinfo($ch); - - curl_close($ch); - - // all good - if ($info['http_code'] == 204) - return true; - return false; - } -} - -?> \ No newline at end of file diff --git a/publisher_clients/wordpress/1.2/pubsubhubbub.php b/publisher_clients/wordpress/1.2/pubsubhubbub.php deleted file mode 100755 index cc42c1e..0000000 --- a/publisher_clients/wordpress/1.2/pubsubhubbub.php +++ /dev/null @@ -1,211 +0,0 @@ -publish_update($feed_urls, "http_post_wp")) { - // TODO: add better error handling here - } - } - return $post_id; -} - -function add_atom_link_tag() { - $hub_urls = get_pubsub_endpoints(); - foreach ($hub_urls as $hub_url) { - echo ''; - } -} - -function add_rss_link_tag() { - $hub_urls = get_pubsub_endpoints(); - foreach ($hub_urls as $hub_url) { - echo ''; - } -} - -function add_rdf_ns_link() { - echo 'xmlns:atom="http://www.w3.org/2005/Atom"'; -} - -// hack to add the atom definition to the RSS feed -// start capturing the feed output. 
this is run at priority 9 (before output) -function start_rss_link_tag() { - ob_start(); -} - -// this is run at priority 11 (after output) -// add in the xmlns atom definition link -function end_rss_link_tag() { - $feed = ob_get_clean(); - $pattern = '//i'; - $replacement = ''; - // change to - echo preg_replace($pattern, $replacement, $feed); -} - -// add a link to our settings page in the WP menu -function add_plugin_menu() { - add_options_page('PubSubHubbub Settings', 'PubSubHubbub', 8, __FILE__, 'add_settings_page'); -} - -// get the endpoints from the wordpress options table -// valid parameters are "publish" or "subscribe" -function get_pubsub_endpoints() { - $endpoints = get_option('pubsub_endpoints'); - $hub_urls = explode("\n",$endpoints); - - // if no values have been set, revert to the defaults (pubsubhubbub on app engine & superfeedr) - if (!$endpoints) { - $hub_urls[] = "http://pubsubhubbub.appspot.com"; - $hub_urls[] = "http://superfeedr.com/hubbub"; - } - - // clean out any blank values - foreach ($hub_urls as $key => $value) { - if (is_null($value) || $value=="") { - unset($hub_urls[$key]); - } else { - $hub_urls[$key] = trim($hub_urls[$key]); - } - } - - return $hub_urls; -} - -// write the content for our settings page that allows you to define your endpoints -function add_settings_page() { ?> -
-

Define custom hubs

- -
- - - - - - - - - - - -
Hubs (one per line)
- - - - -

- -

- -
- -

-
- Thanks for using PubSubHubbub!
- Visit these links to learn more about PubSubHubbub and the author of this plugin:
- -
- -
- -agent = "(PubSubHubbub-Publisher-WP/1.0)"; - $snoopy->submit($url,$post_vars); - $response = $snoopy->results; - // TODO: store the last_response. requires a litle refactoring work. - $response_code = $snoopy->response_code; - if ($response_code == 204) - return true; - return false; -} - -// add a settings link next to deactive / edit -function add_settings_link( $links, $file ) { - if( $file == 'pubsubhubbub/pubsubhubbub.php' && function_exists( "admin_url" ) ) { - $settings_link = '' . __('Settings') . ''; - array_unshift( $links, $settings_link ); // before other links - } - return $links; -} - -// attach the handler that gets called every time you publish a post -add_action('publish_post', 'publish_to_hub'); -// add the link to our settings page in the WP menu structure -add_action('admin_menu', 'add_plugin_menu'); - -// add the link tag that points to the hub in the header of our template... - -// to our atom feed -add_action('atom_head', 'add_atom_link_tag'); -// to our RSS 0.92 feed (requires a bit of a hack to include the ATOM namespace definition) -add_action('do_feed_rss', 'start_rss_link_tag', 9); // run before output -add_action('do_feed_rss', 'end_rss_link_tag', 11); // run after output -add_action('rss_head', 'add_rss_link_tag'); -// to our RDF / RSS 1 feed -add_action('rdf_ns', 'add_rdf_ns_link'); -add_action('rdf_header', 'add_rss_link_tag'); -// to our RSS 2 feed -add_action('rss2_head', 'add_rss_link_tag'); -// to our main HTML header -- not sure if we want to include this long-term or not. -add_action('wp_head', 'add_atom_link_tag'); - -add_filter('plugin_action_links', 'add_settings_link', 10, 2); - -?> \ No newline at end of file diff --git a/publisher_clients/wordpress/1.2/readme.txt b/publisher_clients/wordpress/1.2/readme.txt deleted file mode 100644 index b8c3457..0000000 --- a/publisher_clients/wordpress/1.2/readme.txt +++ /dev/null @@ -1,46 +0,0 @@ -=== Plugin Name === -Contributors: joshfraz -Donate link: https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=5426516 -Tags: pubsubhubbub -Requires at least: 2.5 -Tested up to: 2.8.4 -Stable tag: /trunk/ - -A better way to tell the world when your blog is updated. - -== Description == - -This [PubSubHubbub](http://code.google.com/p/pubsubhubbub/ "PubSubHubbub") plugin is a simple way to let people know in real-time when your blog is updated. PubSubHubbub is quickly gaining adoption and is already being used by Google Reader, Google Alerts, FriendFeed and more. - -This plugin: - -* Now supports multiple hubs! -* Supports all of the feed formats used by WordPress, not just ATOM and RSS2 -* Announces which hubs you are using by adding `` declarations to your template header and ATOM feed -* Adds `` to your RSS feeds along with the necessary XMLNS declaration for RSS 0.92/1.0 - -By default this plugin will ping the following hubs: -* [Demo hub on Google App Engine](http://pubsubhubbub.appspot.com "Demo hub on Google App Engine") -* [SuperFeedr](http://superfeedr.com/hubbub "SuperFeedr") - -Please contact me if you operate a hub that you would like to be included. - -== Installation == - -1. Upload the `pubsubhubbub` directory to your `/wp-content/plugins/` directory -2. Activate the plugin through the 'Plugins' menu in WordPress -3. Select custom hubs under your PubSubHubbub Settings (optional) - -== Frequently Asked Questions == - -= Where can I learn more about the PubSubHubbub protocol? 
= - -You can visit [PubSubHubbb on Google Code](http://code.google.com/p/pubsubhubbub/ "PubSubHubbb on Google Code") - -= Where can I learn more about the author of this plugin? = - -You can learn more about [Josh Fraser](http://www.joshfraser.com "Josh Fraser") at [Online Aspect](http://www.onlineaspect.com "Online Aspect") - -== Screenshots == - -1. The PubSubHubbub Settings page allows you to define which hubs you want to use diff --git a/publisher_clients/wordpress/1.2/screenshot-1.png b/publisher_clients/wordpress/1.2/screenshot-1.png deleted file mode 100644 index 8bbeacd..0000000 Binary files a/publisher_clients/wordpress/1.2/screenshot-1.png and /dev/null differ diff --git a/publisher_clients/wordpress/1.3/publisher.php b/publisher_clients/wordpress/1.3/publisher.php deleted file mode 100644 index f176a9b..0000000 --- a/publisher_clients/wordpress/1.3/publisher.php +++ /dev/null @@ -1,86 +0,0 @@ -hub_url = $hub_url; - } - - // accepts either a single url or an array of urls - public function publish_update($topic_urls, $http_function = false) { - if (!isset($topic_urls)) - throw new Exception('Please specify a topic url'); - - // check that we're working with an array - if (!is_array($topic_urls)) { - $topic_urls = array($topic_urls); - } - - // set the mode to publish - $post_string = "hub.mode=publish"; - // loop through each topic url - foreach ($topic_urls as $topic_url) { - - // lightweight check that we're actually working w/ a valid url - if (!preg_match("|^https?://|i",$topic_url)) - throw new Exception('The specified topic url does not appear to be valid: '.$topic_url); - - // append the topic url parameters - $post_string .= "&hub.url=".urlencode($topic_url); - } - - // make the http post request and return true/false - // easy to over-write to use your own http function - if ($http_function) - return $http_function($this->hub_url,$post_string); - else - return $this->http_post($this->hub_url,$post_string); - } - - // returns any error message from the latest request - public function last_response() { - return $this->last_response; - } - - // default http function that uses curl to post to the hub endpoint - private function http_post($url, $post_string) { - - // add any additional curl options here - $options = array(CURLOPT_URL => $url, - CURLOPT_POST => true, - CURLOPT_POSTFIELDS => $post_string, - CURLOPT_USERAGENT => "PubSubHubbub-Publisher-PHP/1.0"); - - $ch = curl_init(); - curl_setopt_array($ch, $options); - - $response = curl_exec($ch); - $this->last_response = $response; - $info = curl_getinfo($ch); - - curl_close($ch); - - // all good - if ($info['http_code'] == 204) - return true; - return false; - } -} - -?> \ No newline at end of file diff --git a/publisher_clients/wordpress/1.3/pubsubhubbub.php b/publisher_clients/wordpress/1.3/pubsubhubbub.php deleted file mode 100755 index f0f5f7f..0000000 --- a/publisher_clients/wordpress/1.3/pubsubhubbub.php +++ /dev/null @@ -1,221 +0,0 @@ -publish_update($feed_urls, "http_post_wp")) { - // TODO: add better error handling here - } - } - return $post_id; -} - -function add_atom_link_tag() { - $hub_urls = get_pubsub_endpoints(); - foreach ($hub_urls as $hub_url) { - echo ''; - } -} - -function add_rss_link_tag() { - $hub_urls = get_pubsub_endpoints(); - foreach ($hub_urls as $hub_url) { - echo ''; - } -} - -function add_rdf_ns_link() { - echo 'xmlns:atom="http://www.w3.org/2005/Atom"'; -} - -// hack to add the atom definition to the RSS feed -// start capturing the feed output. 
this is run at priority 9 (before output) -function start_rss_link_tag() { - ob_start(); -} - -// this is run at priority 11 (after output) -// add in the xmlns atom definition link -function end_rss_link_tag() { - $feed = ob_get_clean(); - $pattern = '//i'; - $replacement = ''; - // change to - echo preg_replace($pattern, $replacement, $feed); -} - -// add a link to our settings page in the WP menu -function add_plugin_menu() { - add_options_page('PubSubHubbub Settings', 'PubSubHubbub', 8, __FILE__, 'add_settings_page'); -} - -// get the endpoints from the wordpress options table -// valid parameters are "publish" or "subscribe" -function get_pubsub_endpoints() { - $endpoints = get_option('pubsub_endpoints'); - $hub_urls = explode("\n",$endpoints); - - // if no values have been set, revert to the defaults (pubsubhubbub on app engine & superfeedr) - if (!$endpoints) { - $hub_urls[] = "http://pubsubhubbub.appspot.com"; - $hub_urls[] = "http://superfeedr.com/hubbub"; - } - - // clean out any blank values - foreach ($hub_urls as $key => $value) { - if (is_null($value) || $value=="") { - unset($hub_urls[$key]); - } else { - $hub_urls[$key] = trim($hub_urls[$key]); - } - } - - return $hub_urls; -} - -// write the content for our settings page that allows you to define your endpoints -function add_settings_page() { ?> -
-

Define custom hubs

- -
- - - - - - - - - - - - - - - -
Hubs (one per line)
- - - - -

- -

- -
- -

-
- Thanks for using PubSubHubbub!
- Visit these links to learn more about PubSubHubbub and the author of this plugin:
- -
- -
- -agent = "(PubSubHubbub-Publisher-WP/1.0)"; - $snoopy->submit($url,$post_vars); - $response = $snoopy->results; - // TODO: store the last_response. requires a litle refactoring work. - $response_code = $snoopy->response_code; - if ($response_code == 204) - return true; - return false; -} - -// add a settings link next to deactive / edit -function add_settings_link( $links, $file ) { - if( $file == 'pubsubhubbub/pubsubhubbub.php' && function_exists( "admin_url" ) ) { - $settings_link = '' . __('Settings') . ''; - array_unshift( $links, $settings_link ); // before other links - } - return $links; -} - -// attach the handler that gets called every time you publish a post -add_action('publish_post', 'publish_to_hub'); -// add the link to our settings page in the WP menu structure -add_action('admin_menu', 'add_plugin_menu'); - -// keep WPMU happy -add_action('admin_init', 'register_my_settings'); -function register_my_settings() { - register_setting('my_settings_group','pubsub_endpoints'); -} - -// add the link tag that points to the hub in the header of our template... - -// to our atom feed -add_action('atom_head', 'add_atom_link_tag'); -// to our RSS 0.92 feed (requires a bit of a hack to include the ATOM namespace definition) -add_action('do_feed_rss', 'start_rss_link_tag', 9); // run before output -add_action('do_feed_rss', 'end_rss_link_tag', 11); // run after output -add_action('rss_head', 'add_rss_link_tag'); -// to our RDF / RSS 1 feed -add_action('rdf_ns', 'add_rdf_ns_link'); -add_action('rdf_header', 'add_rss_link_tag'); -// to our RSS 2 feed -add_action('rss2_head', 'add_rss_link_tag'); -// to our main HTML header -- not sure if we want to include this long-term or not. -add_action('wp_head', 'add_atom_link_tag'); - -add_filter('plugin_action_links', 'add_settings_link', 10, 2); - -?> \ No newline at end of file diff --git a/publisher_clients/wordpress/1.3/readme.txt b/publisher_clients/wordpress/1.3/readme.txt deleted file mode 100644 index bbc5ac1..0000000 --- a/publisher_clients/wordpress/1.3/readme.txt +++ /dev/null @@ -1,54 +0,0 @@ -=== Plugin Name === -Contributors: joshfraz -Donate link: https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=5426516 -Tags: pubsubhubbub -Requires at least: 2.5 -Tested up to: 2.9.1 -Stable tag: /trunk/ - -A better way to tell the world when your blog is updated. - -== Description == - -This [PubSubHubbub](http://code.google.com/p/pubsubhubbub/ "PubSubHubbub") plugin is a simple way to let people know in real-time when your blog is updated. PubSubHubbub is quickly gaining adoption and is already being used by Google Reader, Google Alerts, FriendFeed and more. - -This plugin: - -* Now supports multi-user installations! -* Supports multiple hubs! -* Supports all of the feed formats used by WordPress, not just ATOM and RSS2 -* Announces which hubs you are using by adding `` declarations to your template header and ATOM feed -* Adds `` to your RSS feeds along with the necessary XMLNS declaration for RSS 0.92/1.0 - -By default this plugin will ping the following hubs: - -* [Demo hub on Google App Engine](http://pubsubhubbub.appspot.com "Demo hub on Google App Engine") -* [SuperFeedr](http://superfeedr.com/hubbub "SuperFeedr") - -Please contact me if you operate a hub that you would like to be included as a default option. - -== Installation == - -1. Upload the `pubsubhubbub` directory to your `/wp-content/plugins/` directory -2. Activate the plugin through the 'Plugins' menu in WordPress -3. 
Select custom hubs under your PubSubHubbub Settings (optional) - -Note: PHP 5.0 or better is required. - -== Frequently Asked Questions == - -= Where can I learn more about the PubSubHubbub protocol? = - -You can visit [PubSubHubbb on Google Code](http://code.google.com/p/pubsubhubbub/ "PubSubHubbb on Google Code") - -= Where can I learn more about the author of this plugin? = - -You can learn more about [Josh Fraser](http://www.joshfraser.com "Josh Fraser") at [Online Aspect](http://www.onlineaspect.com "Online Aspect") - -= Does this plugin work with MU? = - -Multi-user support was added in version 1.3 - -== Screenshots == - -1. The PubSubHubbub Settings page allows you to define which hubs you want to use diff --git a/publisher_clients/wordpress/1.3/screenshot-1.png b/publisher_clients/wordpress/1.3/screenshot-1.png deleted file mode 100644 index 8bbeacd..0000000 Binary files a/publisher_clients/wordpress/1.3/screenshot-1.png and /dev/null differ diff --git a/publisher_clients/wordpress/README.txt b/publisher_clients/wordpress/README.txt deleted file mode 100644 index 0e3b7bc..0000000 --- a/publisher_clients/wordpress/README.txt +++ /dev/null @@ -1,13 +0,0 @@ -This Wordpress pluginy for PubSubHubbub was written by Josh Fraser (joshfraser.com) and is released under the GPL License - -More information about this plugin can be found at: -http://wordpress.org/extend/plugins/pubsubhubbub/ - -Please note that this code is included here for reference, but does not necessarily reflect the latest changes to this plugin. For the latest version, please visit http://wordpress.org/extend/plugins/pubsubhubbub/ or download the code at http://svn.wp-plugins.org/pubsubhubbub - -Thanks! -Josh - josh@eventvue.com - - - - diff --git a/pubsubhubbub-core-0.3.html b/pubsubhubbub-core-0.3.html index 3928375..b26296b 100644 --- a/pubsubhubbub-core-0.3.html +++ b/pubsubhubbub-core-0.3.html @@ -154,7 +154,7 @@


PubSubHubbub Core 0.3 -- Working Draft

Abstract

An open, simple, web-scale pubsub protocol, along with an open source - reference implentation targetting Google App Engine. Notably, however, + reference implentation targeting Google App Engine. Notably, however, nothing in the protocol is centralized, or Google- or App Engine-specific. Anybody can play.

@@ -345,7 +345,7 @@

Table of Contents

to know whether or not they'd missed any recent items-- like TCP SACK) MAY be provided as context. Some examples of Atom feed entries follow.

<?xml version="1.0"?>
-<atom:feed>
+<feed xmlns="http://www.w3.org/2005/Atom">
   <!-- Normally here would be source, title, author, id, etc ... -->
 
   <link rel="hub" href="http://myhub.example.com/endpoint" />
@@ -392,7 +392,7 @@ 

Table of Contents

<updated>2008-07-10T12:28:13Z</updated> </entry> -</atom:feed> +</feed>


 TOC 
@@ -410,7 +410,7 @@

Table of Contents

Example:

<?xml version="1.0"?>
-<atom:feed>
+<feed xmlns="http://www.w3.org/2005/Atom">
   <!-- Normally here would be source, title, author, id, etc ... -->
   <link rel="hub" href="https://myhub.example.com/endpoint" />
   <link rel="self" href="http://publisher.example.com/topic.xml" />
@@ -421,11 +421,11 @@ 

Table of Contents

<entry> .... </entry> -</atom:feed> +</feed>

Hubs MUST use the same URL for both the publishing and subscribing interfaces, which is why only a single atom:link - element is required to declare a hub. Publishers SHOULD use HTTPS (Fielding, R., Gettys, J., Mogul, J., Frystyk, H., Masinter, L., Leach, P., and T. Berners-Lee, “Hypertext Transfer Protocol -- HTTP/1.1,” .) [RFC2616] in their hubs' discovery URLs. However, + element is required to declare a hub. Publishers SHOULD use HTTPS (Fielding, R., Gettys, J., Mogul, J., Frystyk, H., Masinter, L., Leach, P., and T. Berners-Lee, “Hypertext Transfer Protocol -- HTTP/1.1,” .) [RFC2616] in their hubs' discovery URLs. However, subscribers that do not support HTTPS (Rescorla, E., “HTTP Over TLS,” May 2000.) [RFC2818] MAY try to fallback to HTTP (Fielding, R., Gettys, J., Mogul, J., Frystyk, H., Masinter, L., Leach, P., and T. Berners-Lee, “Hypertext Transfer Protocol -- HTTP/1.1,” .) [RFC2616], which MAY work depending on the hub's policy. @@ -875,7 +875,7 @@

Table of Contents

This allows the hub to generate a single X-Hub-Signature header to sign the entire payload. Hubs MUST return an error response (4xx, 5xx) for subscription requests with overlapping callback URLs and different secret values. - +

With an aggregated set of feeds, the hub SHOULD reproduce all of the elements from the source feed inside the @@ -892,7 +892,7 @@

Table of Contents

Example aggregated feed:

<?xml version="1.0"?>
-<atom:feed>
+<feed xmlns="http://www.w3.org/2005/Atom">
   <title>Aggregated feed</title>
   <updated>2008-08-11T02:17:44Z</updated>
   <id>http://myhub.example.com/aggregated?1232427842-39823</id>
@@ -931,7 +931,7 @@ 

Table of Contents

</content> </entry> -</atom:feed> +</feed>


 TOC 
@@ -1012,7 +1012,7 @@

9. References

FAQ and other documentation. - +



 TOC 
diff --git a/pubsubhubbub-core-0.4.html b/pubsubhubbub-core-0.4.html new file mode 100644 index 0000000..6f5f641 --- /dev/null +++ b/pubsubhubbub-core-0.4.html @@ -0,0 +1,771 @@ + + + + + + + PubSubHubbub Core 0.4 -- Working Draft + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Network Working Group                                         B. Fitzpatrick
Internet-Draft                                                    B. Slatkin
Intended status: Informational                                   Google, Inc
Expires: August 8, 2014                                            M. Atkins
Six Apart Ltd.
J. Genestoux
Notifixious Inc.
February 4, 2014
+ +

PubSubHubbub Core 0.4 -- Working Draft
+ pubsubhubbub-core-0.4.xml

+ +

+ Abstract +

+

An open, simple, web-scale and decentralized pubsub protocol. Anybody can play.

+

As opposed to more developed (and more complex) pubsub specs like Jabber Publish-Subscribe [XEP-0060] this spec's base profile (the barrier-to-entry to speak it) is dead simple. The fancy bits required for high-volume publishers and subscribers are optional. The base profile is HTTP-based, as opposed to XMPP (see more on this below).

+

To dramatically simplify this spec in several places where we had to choose between supporting A or B, we took it upon ourselves to say "only A", rather than making it an implementation decision.

+

We offer this spec in hopes that it fills a need or at least advances the state of the discussion in the pubsub space. Polling sucks. We think a decentralized pubsub layer is a fundamental, missing layer in the Internet architecture today and its existence, more than just enabling the obvious lower latency feed readers, would enable many cool applications, most of which we can't even imagine. But we're looking forward to decentralized social networking.

+

+ Status of This Memo +

+

This Internet-Draft is submitted in full conformance with the provisions of BCP 78 and BCP 79.

+

Internet-Drafts are working documents of the Internet Engineering Task Force (IETF). Note that other groups may also distribute working documents as Internet-Drafts. The list of current Internet-Drafts is at http://datatracker.ietf.org/drafts/current/.

+

Internet-Drafts are draft documents valid for a maximum of six months and may be updated, replaced, or obsoleted by other documents at any time. It is inappropriate to use Internet-Drafts as reference material or to cite them other than as "work in progress."

+

This Internet-Draft will expire on August 8, 2014.

+

+ Copyright Notice +

+

Copyright (c) 2014 IETF Trust and the persons identified as the document authors. All rights reserved.

+

This document is subject to BCP 78 and the IETF Trust's Legal Provisions Relating to IETF Documents (http://trustee.ietf.org/license-info) in effect on the date of publication of this document. Please review these documents carefully, as they describe your rights and restrictions with respect to this document. Code Components extracted from this document must include Simplified BSD License text as described in Section 4.e of the Trust Legal Provisions and are provided without warranty as described in the Simplified BSD License.

+ + +
+

Table of Contents

+ + +

1. Notation and Conventions

+

The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in [RFC2119]. Domain name examples use [RFC2606].

+

2. Definitions

+

+ +

+
Topic:
+
An HTTP [RFC2616] resource URL. The unit to which one can subscribe to changes.
+
Hub ("the hub"):
+
The server (URL [RFC3986]) which implements both sides of this protocol. Any hub MAY implement its own policies on who can use it.
+
Publisher:
+
An owner of a topic. Notifies the hub when the topic feed has been updated. As in almost all pubsub systems, the publisher is unaware of the subscribers, if any. Other pubsub systems might call the publisher the "source".
+
Subscriber:
+
An entity (person or program) that wants to be notified of changes on a topic. The subscriber must be directly network-accessible and is identified by its Subscriber Callback URL.
+
Subscription:
+
A unique relation to a topic by a subscriber that indicates it should receive updates for that topic. A subscription's unique key is the tuple (Topic URL, Subscriber Callback URL). Subscriptions may (at the hub's decision) have expiration times akin to DHCP leases which must be periodically renewed.
+
Subscriber Callback URL:
+
The URL [RFC3986] at which a subscriber wishes to receive notifications.
+
Event:
+
An event that causes updates to multiple topics. For each event that happens (e.g. "Brad posted to the Linux Community."), multiple topics could be affected (e.g. "Brad posted." and "Linux community has new post"). Publisher events cause topics to be updated and the hub looks up all subscriptions for affected topics, sending out notifications to subscribers.
+
Notification:
+
A payload describing how a topic's contents have changed, or the full updated content. Depending on the topic's content type, the difference (or "delta") may be computed by the hub and sent to all subscribers.
+
+

3. High-level protocol flow

+

(This section is non-normative.)

+

+ +

    +
  • Publishers notify their hub(s) URLs when their topic(s) change.
  • +
  • Subscribers POST to one or more of the advertised hubs for a topic they're interested in. Alternatively, some hubs may offer auto-polling capability, to let {their,any} subscribers subscribe to topics which don't advertise a hub.
  • +
  • The hub caches minimal metadata (id, data, entry digest) about each topic's previous state. When the hub re-fetches a topic feed (on its own initiative or as a result of a publisher's ping) and finds a delta, it enqueues a notification to all registered subscribers.
  • +
+

4. Discovery

+

A potential subscriber initiates discovery by retrieving (GET or HEAD request) the topic to which it wants to subscribe. The HTTP [RFC2616] response from the publisher MUST include at least one Link Header [RFC5988] with rel=hub (a hub link header) as well as exactly one Link Header [RFC5988] with rel=self (the self link header). The former MUST indicate the exact URL of a PubSubHubbub hub designated by the publisher. If more than one URL is specified, it is expected that the publisher pings each of these URLs, so the subscriber may subscribe to one or more of these. The latter will point to the permanent URL for the resource being polled.

+

In the absence of HTTP [RFC2616] Link headers, subscribers MAY fall back to other methods to discover the hub(s) and the canonical URI of the topic. If the topic is an XML based feed, it MAY use embedded link elements as described in Appendix B of Web Linking [RFC5988]. Similarly, for HTML pages, it MAY use embedded link elements as described in Appendix A of Web Linking [RFC5988]. Finally, publishers MAY also use the Well-Known Uniform Resource Identifiers [RFC5785] .host-meta to include the <Link> element with rel="hub".
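As a non-normative illustration, discovery can be implemented with any HTTP client that exposes Link headers. The sketch below assumes the third-party Python requests package and placeholder URLs; it covers only the Link-header case and would need the feed/HTML fallbacks described above to be complete.

import requests

def discover(topic_url):
    # Fetch the topic (a HEAD request also works) and read its Link headers.
    response = requests.get(topic_url, timeout=10)
    response.raise_for_status()
    links = response.links  # requests parses Link headers into {rel: {"url": ...}}
    # Note: if several hubs are advertised, this simple lookup keeps only one of them.
    hub_url = links.get("hub", {}).get("url")
    self_url = links.get("self", {}).get("url", topic_url)
    if hub_url is None:
        raise ValueError("topic does not advertise a hub via Link headers")
    return hub_url, self_url

hub_url, canonical_topic = discover("http://publisher.example.com/topic.xml")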

+

5. Subscribing and Unsubscribing

+

Subscribing to a topic URL consists of four parts that may occur immediately in sequence or have a delay.

+

+ +

    +
  • Requesting a subscription using the hub
  • +
  • Validating the subscription with the publisher (OPTIONAL)
  • +
  • Confirming the subscription was actually desired by the subscriber
  • +
  • Periodically reconfirming the subscription is still active (OPTIONAL)
  • +
+

Unsubscribing works in the same way, except with a single parameter changed to indicate the desire to unsubscribe. Also, the Hub will not validate unsubscription requests with the publisher.

+

5.1. Subscriber Sends Subscription Request

+

Subscription is initiated by the subscriber making an HTTPS [RFC2616] or HTTP [RFC2616] POST request to the hub URL. This request has a Content-Type of application/x-www-form-urlencoded (described in Section 17.13.4 of [W3C.REC-html401-19991224]) and the following parameters in its body:

+

+ +

+
hub.callback
+
REQUIRED. The subscriber's callback URL where notifications should be delivered. It is considered good practice to use a unique callback URL for each subscription.
+
hub.mode
+
REQUIRED. The literal string "subscribe" or "unsubscribe", depending on the goal of the request.
+
hub.topic
+
REQUIRED. The topic URL that the subscriber wishes to subscribe to or unsubscribe from.
+
hub.lease_seconds
+
OPTIONAL. Number of seconds for which the subscriber would like to have the subscription active. Hubs MAY choose to respect this value or not, depending on their own policies. This parameter MAY be present for unsubscription requests and MUST be ignored by the hub in that case.
+
hub.secret
+
OPTIONAL. A subscriber-provided secret string that will be used to compute an HMAC digest for authorized content distribution [authednotify]. If not supplied, the HMAC digest will not be present for content distribution requests. This parameter SHOULD only be specified when the request was made over HTTPS [RFC2818]. This parameter MUST be less than 200 bytes in length.
+
+

Subscribers MAY also include additional HTTP [RFC2616] request parameters, as well as HTTP [RFC2616] Headers if they are required by the hub. In the context of social web applications, it is considered good practice to include a From HTTP [RFC2616] header (as described in section 14.22 of Hypertext Transfer Protocol [RFC2616]) to indicate on behalf of which user the subscription is being performed.

+

Hubs MUST ignore additional request parameters they do not understand.

+

Hubs MUST allow subscribers to re-request subscriptions that are already activated. Each subsequent request to a hub to subscribe or unsubscribe MUST override the previous subscription state for a specific topic URL and callback URL combination once the action is verified. Any failures to confirm the subscription action MUST leave the subscription state unchanged. This is required so subscribers can renew their subscriptions before the lease seconds period is over without any interruption.
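A minimal, non-normative sketch of such a subscription request, again assuming the requests package; the hub, topic, and callback URLs are placeholders.

import requests

def subscribe(hub_url, topic_url, callback_url, lease_seconds=864000, secret=None):
    # Form-encoded POST (Content-Type: application/x-www-form-urlencoded).
    data = {
        "hub.mode": "subscribe",
        "hub.topic": topic_url,
        "hub.callback": callback_url,
        "hub.lease_seconds": str(lease_seconds),
    }
    if secret is not None:
        # Only send a secret over HTTPS, and keep it under 200 bytes.
        data["hub.secret"] = secret
    response = requests.post(hub_url, data=data, timeout=10)
    # 202 Accepted means the hub will now verify and validate the request.
    return response.status_code == 202

subscribe("https://myhub.example.com/endpoint",
          "http://publisher.example.com/topic.xml",
          "https://subscriber.example.com/callback/topic-1")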

+

5.1.1. Subscription Parameter Details

+

The topic and callback URLs MAY use HTTP [RFC2616] or HTTPS [RFC2818] schemes. The topic URL MUST be the one advertised by the publisher in a Self Link Header during the discovery phase. (See Section 4). Hubs MAY refuse subscriptions if the topic URL does not correspond to the one advertised by the publisher. The topic URL can otherwise be free-form following the URI spec [RFC3986]. Hubs MUST always decode non-reserved characters for these URL parameters; see section 2.4 on "When to Encode or Decode" in the URI spec [RFC3986].

+

The callback URL MAY contain arbitrary query string parameters (e.g., ?foo=bar&red=fish). Hubs MUST preserve the query string during subscription verification by appending new parameters to the end of the list using the & (ampersand) character to join. Existing parameters with names that overlap with those used by verification requests will not be overwritten. For event notification, the callback URL will be POSTed to including any query-string parameters in the URL portion of the request, not as POST body parameters.
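For illustration, a hub honouring this rule when building its verification request can simply join with & whenever the callback already carries a query string; a standard-library sketch with made-up parameter values:

from urllib.parse import urlencode

def build_verification_url(callback_url, hub_params):
    # Append hub.* parameters while preserving the subscriber's own query string.
    separator = "&" if "?" in callback_url else "?"
    return callback_url + separator + urlencode(hub_params)

print(build_verification_url(
    "http://subscriber.example.com/cb?foo=bar&red=fish",
    {"hub.mode": "subscribe",
     "hub.topic": "http://publisher.example.com/topic.xml",
     "hub.challenge": "venok7",
     "hub.lease_seconds": "86400"}))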

+

5.1.2. Subscription Response Details

+

The hub MUST respond to a subscription request with an HTTP [RFC2616] 202 "Accepted" response to indicate that the request was received and will now be verified (Section 5.3) and validated (Section 5.2) by the hub. The hub SHOULD perform the verification and validation of intent as soon as possible.

+

If a hub finds any errors in the subscription request, an appropriate HTTP [RFC2616] error response code (4xx or 5xx) MUST be returned. In the event of an error, hubs SHOULD return a description of the error in the response body as plain text. Hubs MAY decide to reject some callback URLs or topic URLs based on their own policies (e.g., domain authorization, topic URL port numbers).

+

5.2. Subscription Validation

+

Subscriptions MAY be validated by hubs, which may require more details to accept or refuse a subscription. The hub MAY also check with the publisher whether the subscription should be accepted.

+

If (and when) the subscription is accepted, the hub MUST perform the verification of intent [verifysub] of the subscriber.

+

If (and when) the subscription is denied, the hub MUST inform the subscriber by sending an HTTP [RFC2616] GET request to the subscriber's callback URL as given in the subscription request. This request has the following query string arguments appended (format described in Section 17.13.4 of [W3C.REC-html401-19991224]):

+

+ +

+
hub.mode
+
REQUIRED. The literal string "denied".
+
hub.topic
+
REQUIRED. The topic URL given in the corresponding subscription request.
+
hub.reason
+
OPTIONAL. The hub may include a reason for which the subscription has been denied.
+
+

Hubs may provide an additional HTTP [RFC2616] Location header (as described in section 14.30 of Hypertext Transfer Protocol [RFC2616]) to indicate that the subscriber may retry subscribing to a different hub.topic. This allows for limited distribution to specific groups or users in the context of social web applications.

+

The subscription MAY be denied by the hub at any point (even if it was previously accepted). The subscriber SHOULD then consider that the subscription is no longer possible.

+

5.3. Hub Verifies Intent of the Subscriber

+

In order to prevent an attacker from creating unwanted subscriptions on behalf of a subscriber (or unsubscribing desired ones), a hub MUST ensure that the subscriber did indeed send the subscription request.

+

The hub verifies a subscription request by sending an HTTP [RFC2616] GET request to the subscriber's callback URL as given in the subscription request. This request has the following query string arguments appended (format described in Section 17.13.4 of [W3C.REC-html401-19991224]):

+

+ +

+
hub.mode
+
REQUIRED. The literal string "subscribe" or "unsubscribe", which matches the original request to the hub from the subscriber.
+
hub.topic
+
REQUIRED. The topic URL given in the corresponding subscription request.
+
hub.challenge
+
REQUIRED. A hub-generated, random string that MUST be echoed by the subscriber to verify the subscription.
+
hub.lease_seconds
+
REQUIRED/OPTIONAL. The hub-determined number of seconds that the subscription will stay active before expiring, measured from the time the verification request was made from the hub to the subscriber. Hubs MUST supply this parameter for subscription requests. This parameter MAY be present for unsubscribe requests and MUST be ignored by subscribers during unsubscription.
+
+

5.3.1. Verification Details

+

The subscriber MUST confirm that the hub.topic corresponds to a pending subscription or unsubscription that it wishes to carry out. If so, the subscriber MUST respond with an HTTP success (2xx) code with a response body equal to the hub.challenge parameter. If the subscriber does not agree with the action, the subscriber MUST respond with a 404 "Not Found" response.

+

The hub MUST consider other server response codes (3xx, 4xx, 5xx) to mean that the verification request has failed. If the subscriber returns an HTTP [RFC2616] success (2xx) but the content body does not match the hub.challenge parameter, the hub MUST also consider verification to have failed.

+

Hubs MAY make the hub.lease_seconds equal to the value the subscriber passed in their subscription request but MAY change the value depending on the hub's policies. To sustain a subscription, the subscriber MUST re-request the subscription on the hub before hub.lease_seconds seconds has elapsed.
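Sections 5.2 and 5.3 together amount to a small GET handler on the subscriber's callback URL: echo hub.challenge for expected (un)subscriptions, return 404 otherwise, and record denial notices. A non-normative, standard-library sketch; the pending set, topic URL, and port are placeholders.

from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib.parse import urlparse, parse_qs

# Hypothetical in-memory store of (mode, topic) pairs this subscriber actually requested.
pending = {("subscribe", "http://publisher.example.com/topic.xml")}

class CallbackHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        query = parse_qs(urlparse(self.path).query)
        mode = query.get("hub.mode", [""])[0]
        topic = query.get("hub.topic", [""])[0]
        if mode == "denied":
            # Section 5.2: the hub refused (or later revoked) the subscription.
            pending.discard(("subscribe", topic))
            self.send_response(200)
            self.end_headers()
            return
        if (mode, topic) in pending:
            # Section 5.3.1: confirm intent by echoing hub.challenge verbatim.
            body = query.get("hub.challenge", [""])[0].encode("utf-8")
            self.send_response(200)
            self.send_header("Content-Type", "text/plain")
            self.send_header("Content-Length", str(len(body)))
            self.end_headers()
            self.wfile.write(body)
        else:
            self.send_response(404)
            self.end_headers()

HTTPServer(("", 8080), CallbackHandler).serve_forever()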

+

6. Publishing

+

The publisher MUST inform the hubs it previously designated when a topic has been updated. The hub and the publisher can agree on any mechanism, as long as the hub is eventually able to send the updated payload to the subscribers.
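The notification mechanism is deliberately left open here; as one example, a widely deployed convention inherited from version 0.3 of this protocol is a form-encoded ping naming the updated topic, to which conforming hubs reply 204 No Content. A sketch assuming the requests package and placeholder URLs:

import requests

def ping_hub(hub_url, topic_url):
    # 0.3-style ping: tell the hub which topic URL changed; the hub re-fetches it.
    response = requests.post(hub_url,
                             data={"hub.mode": "publish", "hub.url": topic_url},
                             timeout=10)
    return response.status_code == 204

ping_hub("https://myhub.example.com/endpoint",
         "http://publisher.example.com/topic.xml")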

+

7. Content Distribution

+

A content distribution request is an HTTP [RFC2616] POST request from the hub to the subscriber's callback URL with the payload of the notification. This request MUST have a Content-Type corresponding to the type of the topic. The hub MAY reduce the payload to a diff between two consecutive versions if its format allows it.

+

The request MUST include a Link Header [RFC5988] with rel=hub pointing to the Hub as well as a Link Header [RFC5988] with rel=self set to the topic that's being updated. The Hub SHOULD combine both headers into a single Link Header [RFC5988].

+

The successful response from the subscriber's callback URL MUST be an HTTP [RFC2616] success (2xx) code. The hub MUST consider all other subscriber response codes as failures; that means subscribers MUST NOT use HTTP redirects for moving subscriptions. The response body from the subscriber MUST be ignored by the hub. Hubs SHOULD retry notifications repeatedly until successful (up to some reasonable maximum over a reasonable time period). Subscribers SHOULD respond to notifications as quickly as possible; their success response code SHOULD only indicate receipt of the message, not acknowledgment that it was successfully processed by the subscriber.

+

8. Authenticated Content Distribution

+

If the subscriber supplied a value for hub.secret in their subscription request, the hub MUST generate an HMAC signature of the payload and include that signature in the request headers of the content distribution request. The X-Hub-Signature header's value MUST be in the form sha1=signature where signature is a 40-byte, hexadecimal representation of a SHA1 signature [RFC3174]. The signature MUST be computed using the HMAC algorithm [RFC2104] with the request body as the data and the hub.secret as the key.

+

When subscribers receive a content distribution request with the X-Hub-Signature header specified, they SHOULD recompute the SHA1 signature with the shared secret using the same method as the hub. If the signature does not match, subscribers MUST still return a 2xx success response to acknowledge receipt, but locally ignore the message as invalid. Using this technique along with HTTPS [RFC2818] for subscription requests enables simple subscribers to receive authenticated notifications from hubs without the need for subscribers to run an HTTPS [RFC2818] server.

+

Please note however that this signature only ensures that the payload was not forged. Since the notification also includes headers, these should not be considered as safe by the subscriber, unless of course the subscriber uses HTTPS [RFC2818] callbacks.
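Sections 7 and 8 combined, from the subscriber's side: a non-normative, standard-library sketch of a callback POST handler that recomputes the X-Hub-Signature over the raw body with the shared hub.secret, acknowledges receipt either way, and hands valid payloads to a placeholder processing hook. The secret value and port are made up.

import hashlib
import hmac
from http.server import BaseHTTPRequestHandler, HTTPServer

SECRET = b"change-me"  # the hub.secret supplied with the subscription request

def signature_matches(secret, body, header_value):
    # X-Hub-Signature is "sha1=" followed by 40 hex characters of HMAC-SHA1(secret, body).
    expected = "sha1=" + hmac.new(secret, body, hashlib.sha1).hexdigest()
    return hmac.compare_digest(expected, header_value or "")

def process_notification(content_type, link_header, body):
    # Placeholder: real subscribers would queue the payload for later processing.
    print("update received:", content_type, len(body), "bytes")

class NotificationHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        length = int(self.headers.get("Content-Length", 0))
        body = self.rfile.read(length)
        if signature_matches(SECRET, body, self.headers.get("X-Hub-Signature")):
            process_notification(self.headers.get("Content-Type"),
                                 self.headers.get("Link"),  # carries rel=hub and rel=self
                                 body)
        # Acknowledge receipt with 2xx even if the signature check failed (Section 8);
        # respond quickly and do the heavy lifting elsewhere (Section 7).
        self.send_response(200)
        self.end_headers()

HTTPServer(("", 8080), NotificationHandler).serve_forever()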

+

9. References

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ [RFC2104] + Krawczyk, H., Bellare, M. and R. Canetti, "HMAC: Keyed-Hashing for Message Authentication", RFC 2104, Feb 1997.
+ [RFC2119] + Bradner, S., "Key words for use in RFCs to Indicate Requirement Levels", RFC 2119, Mar 1997.
+ [RFC2606] + Eastlake, D. and A. Panitz, "Reserved Top Level DNS Names", RFC 2606, Jun 1999.
+ [RFC2616] + Fielding, R., Gettys, J., Mogul, J., Frystyk, H., Masinter, L., Leach, P. and T. Berners-Lee, "Hypertext Transfer Protocol -- HTTP/1.1", RFC 2616, Jun 1999.
+ [RFC2818] + Rescorla, E., "HTTP Over TLS", RFC 2818, May 2000.
+ [RFC3174] + Eastlake, D. and P. Jones, "US Secure Hash Algorithm 1 (SHA1)", RFC 3174, September 2001.
+ [RFC3986] + Berners-Lee, T., "Uniform Resource Identifiers (URI): Generic Syntax", RFC 3986, Jan 2005.
+ [RFC5785] + Nottingham, M. and E. Hammer-Lahav, "Defining Well-Known Uniform Resource Identifiers (URIs)", RFC 5785, Apr 2010.
+ [RFC5988] + Nottingham, M., "Web Linking", RFC 5988, October 2010.
+ [W3C.REC-html401-19991224] + Raggett, D., Hors, A. and I. Jacobs, "HTML 4.01 Specification", World Wide Web Consortium Recommendation REC-html401-19991224, December 1999.
+ [XEP-0060] + Millard, P., Saint-Andre, P. and R. Meijer, "Publish-Subscribe", XSF XEP 0060, Jul 2010.
+

Appendix A. Specification Feedback

+

Feedback on this specification is welcomed via the PubSubHubbub W3C Community Group. For more information, see the W3C PubSubHubbub Community Group.

+

+ Authors' Addresses +

+
+
+ + Brad Fitzpatrick + + + Google, Inc + + + + + + + + + + EMail: brad@danga.com + +
+
+
+ + Brett Slatkin + + + Google, Inc + + + + + + + + + + EMail: bslatkin@gmail.com + +
+
+
+ + Martin Atkins + + + Six Apart Ltd. + + + + + + + + + + EMail: mart@degeneration.co.uk + +
+
+
+ + Julien Genestoux + + + Notifixious Inc. + + + + + + + + + + EMail: julien@superfeedr.com + +
+
+ + + diff --git a/pubsubhubbub-core-0.4.xml b/pubsubhubbub-core-0.4.xml new file mode 100644 index 0000000..590123a --- /dev/null +++ b/pubsubhubbub-core-0.4.xml @@ -0,0 +1,338 @@ + + + + + + + + + + + + + PubSubHubbub Core 0.4 -- Working Draft + + Google, Inc +
+ brad@danga.com +
+
+ + Google, Inc +
+ bslatkin@gmail.com +
+
+ + Six Apart Ltd. +
+ mart@degeneration.co.uk +
+
+ + Notifixious Inc. +
+ julien@superfeedr.com +
+
+ + + An open, simple, web-scale and decentralized pubsub protocol. Anybody can play. + As opposed to more developed (and more complex) pubsub specs like Jabber Publish-Subscribe this spec's base profile (the barrier-to-entry to speak it) is dead simple. The fancy bits required for high-volume publishers and subscribers are optional. The base profile is HTTP-based, as opposed to XMPP (see more on this below). + To dramatically simplify this spec in several places where we had to choose between supporting A or B, we took it upon ourselves to say "only A", rather than making it an implementation decision. + We offer this spec in hopes that it fills a need or at least advances the state of the discussion in the pubsub space. Polling sucks. We think a decentralized pubsub layer is a fundamental, missing layer in the Internet architecture today and its existence, more than just enabling the obvious lower latency feed readers, would enable many cool applications, most of which we can't even imagine. But we're looking forward to decentralized social networking. + +
+ +
+ The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in . Domain name examples use . +
+
+ + + An HTTP resource URL. The unit to which one can subscribe to changes. + The server (URL) which implements both sides of this protocol. Any hub MAY implement its own policies on who can use it. + An owner of a topic. Notifies the hub when the topic feed has been updated. As in almost all pubsub systems, the publisher is unaware of the subscribers, if any. Other pubsub systems might call the publisher the "source". + An entity (person or program) that wants to be notified of changes on a topic. The subscriber must be directly network-accessible and is identified by its Subscriber Callback URL. + A unique relation to a topic by a subscriber that indicates it should receive updates for that topic. A subscription's unique key is the tuple (Topic URL, Subscriber Callback URL). Subscriptions may (at the hub's decision) have expiration times akin to DHCP leases which must be periodically renewed. + The URL at which a subscriber wishes to receive notifications. + An event that causes updates to multiple topics. For each event that happens (e.g. "Brad posted to the Linux Community."), multiple topics could be affected (e.g. "Brad posted." and "Linux community has new post"). Publisher events cause topics to be updated and the hub looks up all subscriptions for affected topics, sending out notifications to subscribers. + A payload describing how a topic's contents have changed, or the full updated content. Depending on the topic's content type, the difference (or "delta") may be computed by the hub and sent to all subscribers. + + +
+
+ (This section is non-normative.) + + + Publishers notify their hub(s) URLs when their topic(s) change. + Subscribers POST to one or more of the advertised hubs for a topic they're interested in. Alternatively, some hubs may offer auto-polling capability, to let {their,any} subscribers subscribe to topics which don't advertise a hub. + The hub caches minimal metadata (id, data, entry digest) about each topic's previous state. When the hub re-fetches a topic feed (on its own initiative or as a result of a publisher's ping) and finds a delta, it enqueues a notification to all registered subscribers. + + +
+
+ A potential subscriber initiates discovery by retrieving (GET or HEAD request) the topic to which it wants to subscribe. The HTTP response from the publisher MUST include at least one Link Header with rel=hub (a hub link header) as well as exactly one Link Header with rel=self (the self link header). The former MUST indicate the exact URL of a PubSubHubbub hub designated by the publisher. If more than one URL is specified, it is expected that the publisher pings each of these URLs, so the subscriber may subscribe to one or more of these. The latter will point to the permanent URL for the resource being polled. + In the absence of HTTP Link headers, subscribers MAY fall back to other methods to discover the hub(s) and the canonical URI of the topic. If the topic is an XML based feed, it MAY use embedded link elements as described in Appendix B of Web Linking. Similarly, for HTML pages, it MAY use embedded link elements as described in Appendix A of Web Linking. Finally, publishers MAY also use the Well-Known Uniform Resource Identifiers .host-meta to include the <Link> element with rel="hub". +
+
+ Subscribing to a topic URL consists of four parts that may occur immediately in sequence or have a delay. + + + Requesting a subscription using the hub + Validating the subscription with the publisher (OPTIONAL) + Confirming the subscription was actually desired by the subscriber + Periodically reconfirming the subscription is still active (OPTIONAL) + + + Unsubscribing works in the same way, except with a single parameter changed to indicate the desire to unsubscribe. Also, the Hub will not validate unsubscription requests with the publisher. +
+ Subscription is initiated by the subscriber making an HTTPS or HTTP POST request to the hub URL. This request has a Content-Type of application/x-www-form-urlencoded (described in Section 17.13.4 of ) and the following parameters in its body: + + + REQUIRED. The subscriber's callback URL where notifications should be delivered. It is considered good practice to use a unique callback URL for each subscription. + REQUIRED. The literal string "subscribe" or "unsubscribe", depending on the goal of the request. + REQUIRED. The topic URL that the subscriber wishes to subscribe to or unsubscribe from. + OPTIONAL. Number of seconds for which the subscriber would like to have the subscription active. Hubs MAY choose to respect this value or not, depending on their own policies. This parameter MAY be present for unsubscription requests and MUST be ignored by the hub in that case. + OPTIONAL. A subscriber-provided secret string that will be used to compute an HMAC digest for authorized content distribution. If not supplied, the HMAC digest will not be present for content distribution requests. This parameter SHOULD only be specified when the request was made over HTTPS. This parameter MUST be less than 200 bytes in length. + + + Subscribers MAY also include additional HTTP request parameters, as well as HTTP Headers if they are required by the hub. + Hubs MUST ignore additional request parameters they do not understand. + Hubs MUST allow subscribers to re-request subscriptions that are already activated. Each subsequent request to a hub to subscribe or unsubscribe MUST override the previous subscription state for a specific topic URL and callback URL combination once the action is verified. Any failures to confirm the subscription action MUST leave the subscription state unchanged. This is required so subscribers can renew their subscriptions before the lease seconds period is over without any interruption. +
+ The topic and callback URLs MAY use HTTP or HTTPS schemes. The topic URL MUST be the one advertised by the publisher during the discovery phase. (See ). Hubs MAY refuse subscriptions if the topic URL does not correspond to the one advertised by the publisher. The topic URL can otherwise be free-form following the URI spec. Hubs MUST always decode non-reserved characters for these URL parameters; see section 2.4 on "When to Encode or Decode" in the URI spec. + The callback URL MAY contain arbitrary query string parameters (e.g., ?foo=bar&red=fish). Hubs MUST preserve the query string during subscription verification by appending new parameters to the end of the list using the & (ampersand) character to join. Existing parameters with names that overlap with those used by verification requests will not be overwritten. For event notification, the callback URL will be POSTed to including any query-string parameters in the URL portion of the request, not as POST body parameters. +
+
+ The hub MUST respond to a subscription request with an HTTP 202 "Accepted" response to indicate that the request was received and will now be verified () and validated () by the hub. The hub SHOULD perform the verification and validation of intent as soon as possible. + If a hub finds any errors in the subscription request, an appropriate HTTP error response code (4xx or 5xx) MUST be returned. In the event of an error, hubs SHOULD return a description of the error in the response body as plain text. Hubs MAY decide to reject some callback URLs or topic URLs based on their own policies (e.g., domain authorization, topic URL port numbers). +
+
+
+ Subscriptions MAY be validated by the Hubs who may require more details to accept or refuse a subscription. The Hub MAY also check with the publisher whether the subscription should be accepted. + If (and when), the subscription is accepted, the hub MUST perform the verification of intent of the subscriber. + If (and when), the subscription is denied, the hub MUST inform the subscriber by sending an HTTP GET request to the subscriber's callback URL as given in the subscription request. This request has the following query string arguments appended (format described in Section 17.13.4 of ): + + + REQUIRED. The literal string "denied". + REQUIRED. The topic URL given in the corresponding subscription request. + OPTIONAL. The hub may include a reason for which the subscription has been denied. + + + Hubs may provide an additional HTTP Location header (as described in section 14.30 of Hypertext Transfer Protocol) to indicate that the subscriber may retry subscribing to a different hub.topic. This allows for limited distribution to specific groups or users in the context of social web applications. + The subscription MAY be denied by the hub at any point (even if it was previously accepted). The Subscriber SHOULD then consider that the subscription is not possible anymore. +
+
+ In order to prevent an attacker from creating unwanted subscriptions on behalf of a subscriber (or unsubscribing desired ones), a hub MUST ensure that the subscriber did indeed send the subscription request. + The hub verifies a subscription request by sending an HTTP GET request to the subscriber's callback URL as given in the subscription request. This request has the following query string arguments appended (format described in Section 17.13.4 of ): + + + REQUIRED. The literal string "subscribe" or "unsubscribe", which matches the original request to the hub from the subscriber. + REQUIRED. The topic URL given in the corresponding subscription request. + REQUIRED. A hub-generated, random string that MUST be echoed by the subscriber to verify the subscription. + REQUIRED/OPTIONAL. The hub-determined number of seconds that the subscription will stay active before expiring, measured from the time the verification request was made from the hub to the subscriber. Hubs MUST supply this parameter for subscription requests. This parameter MAY be present for unsubscribe requests and MUST be ignored by subscribers during unsubscription. + + +
+ The subscriber MUST confirm that the hub.topic corresponds to a pending subscription or unsubscription that it wishes to carry out. If so, the subscriber MUST respond with an HTTP success (2xx) code with a response body equal to the hub.challenge parameter. If the subscriber does not agree with the action, the subscriber MUST respond with a 404 "Not Found" response. + The hub MUST consider other server response codes (3xx, 4xx, 5xx) to mean that the verification request has failed. If the subscriber returns an HTTP success (2xx) but the content body does not match the hub.challenge parameter, the hub MUST also consider verification to have failed. + Hubs MAY make the hub.lease_seconds equal to the value the subscriber passed in their subscription request but MAY change the value depending on the hub's policies. To sustain a subscription, the subscriber MUST re-request the subscription on the hub before hub.lease_seconds seconds has elapsed. +
+
+
+
+ The publisher MUST inform the hubs it previously designated when a topic has been updated. The hub and the publisher can agree on any mechanism, as long as the hub is eventually able to send the updated payload to the subscribers.
+
+ A content distribution request is an HTTP POST request from hub to the subscriber's callback URL with the payload of the notification. This request MUST have a Content-Type corresponding to the type of the topic. The hub MAY reduce the payload to a diff between two consecutive versions if its format allows it. + The request MUST include a Link Header with rel=hub pointing to the Hub as well as a Link Header with rel=self set to the topic that's being updated. The Hub SHOULD combine both headers into a single Link Header. + The successful response from the subscriber's callback URL MUST be an HTTP success (2xx) code. The hub MUST consider all other subscriber response codes as failures; that means subscribers MUST NOT use HTTP redirects for moving subscriptions. The response body from the subscriber MUST be ignored by the hub. Hubs SHOULD retry notifications repeatedly until successful (up to some reasonable maximum over a reasonable time period). Subscribers SHOULD respond to notifications as quickly as possible; their success response code SHOULD only indicate receipt of the message, not acknowledgment that it was successfully processed by the subscriber. +
+
+ If the subscriber supplied a value for hub.secret in their subscription request, the hub MUST generate an HMAC signature of the payload and include that signature in the request headers of the content distribution request. The X-Hub-Signature header's value MUST be in the form sha1=signature where signature is a 40-byte, hexadecimal representation of a SHA1 signature. The signature MUST be computed using the HMAC algorithm with the request body as the data and the hub.secret as the key. + When subscribers receive a content distribution request with the X-Hub-Signature header specified, they SHOULD recompute the SHA1 signature with the shared secret using the same method as the hub. If the signature does not match, subscribers MUST still return a 2xx success response to acknowledge receipt, but locally ignore the message as invalid. Using this technique along with HTTPS for subscription requests enables simple subscribers to receive authenticated notifications from hubs without the need for subscribers to run an HTTPS server. + Please note however that this signature only ensures that the payload was not forged. Since the notification also includes headers, these should not be considered as safe by the subscriber, unless of course the subscriber uses HTTPS callbacks. +
+
+ + + + + HMAC: Keyed-Hashing for Message Authentication + + IBM, T.J. Watson Research Center + + + University of California at San Diego, Dept of Computer Science and Engineering + + + IBM T.J. Watson Research Center + + + + + + + + Reserved Top Level DNS Names + + + + + + + + + + + + + Key words for use in RFCs to Indicate Requirement Levels + + Alis Technologies + + + + + + + + Hypertext Transfer Protocol -- HTTP/1.1 + + UC Irvine + + + Compaq/W3C + + + Compaq + + + W3C/MIT + + + Xerox + + + Microsoft + + + W3C/MIT + + + + + + + + HTTP Over TLS + + + + + + This memo describes how to use Transport Layer Security (TLS) to secure Hypertext Transfer Protocol (HTTP) connections over the Internet.This memo provides information for the Internet community. + + + + + + + + Web Linking + + + + + + This document specifies relation types for Web links, and defines a registry for them.It also defines the use of such links in HTTP headers with the Link header field. + + + + + + + + US Secure Hash Algorithm 1 (SHA1) + + + + + + + + + The purpose of this document is to make the SHA-1 (Secure Hash Algorithm 1) hash algorithm conveniently available to the Internet community.This memo provides information for the Internet community. + + + + + + + + Uniform Resource Identifiers (URI): Generic Syntax + + + + + + + + + + Defining Well-Known Uniform Resource Identifiers (URIs) + + + + + + + + + + + + + HTML 4.01 Specification + + + + + + + + + + + + + + + + + Publish-Subscribe + + + + + + + + + + + + + + + +
+ Feedback on this specification is welcomed via the PubSubHubbub W3C Community Group. For more information, see the W3C PubSubHubbub Community Group.
+
+
diff --git a/pubsubhubbub-core.xml b/pubsubhubbub-core.xml index 0d27bb7..4b440f3 100644 --- a/pubsubhubbub-core.xml +++ b/pubsubhubbub-core.xml @@ -2,7 +2,7 @@ - pass - - def handle_pi(self, text): - # called for each processing instruction, e.g. - pass - - def handle_decl(self, text): - pass - - def parse_declaration(self, i): - # override internal declaration handler to handle CDATA blocks - if _debug: sys.stderr.write('entering parse_declaration\n') - if self.rawdata[i:i+9] == '', i) - if k == -1: k = len(self.rawdata) - self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0) - return k+3 - else: - k = self.rawdata.find('>', i) - return k+1 - - def mapContentType(self, contentType): - contentType = contentType.lower() - if contentType == 'text': - contentType = 'text/plain' - elif contentType == 'html': - contentType = 'text/html' - elif contentType == 'xhtml': - contentType = 'application/xhtml+xml' - return contentType - - def trackNamespace(self, prefix, uri): - loweruri = uri.lower() - if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version: - self.version = 'rss090' - if loweruri == 'http://purl.org/rss/1.0/' and not self.version: - self.version = 'rss10' - if loweruri == 'http://www.w3.org/2005/atom' and not self.version: - self.version = 'atom10' - if loweruri.find('backend.userland.com/rss') <> -1: - # match any backend.userland.com namespace - uri = 'http://backend.userland.com/rss' - loweruri = uri - if self._matchnamespaces.has_key(loweruri): - self.namespacemap[prefix] = self._matchnamespaces[loweruri] - self.namespacesInUse[self._matchnamespaces[loweruri]] = uri - else: - self.namespacesInUse[prefix or ''] = uri - - def resolveURI(self, uri): - return _urljoin(self.baseuri or '', uri) - - def decodeEntities(self, element, data): - return data - - def push(self, element, expectingText): - self.elementstack.append([element, expectingText, []]) - - def pop(self, element, stripWhitespace=1): - if not self.elementstack: return - if self.elementstack[-1][0] != element: return - - element, expectingText, pieces = self.elementstack.pop() - output = ''.join(pieces) - if stripWhitespace: - output = output.strip() - if not expectingText: return output - - # decode base64 content - if base64 and self.contentparams.get('base64', 0): - try: - output = base64.decodestring(output) - except binascii.Error: - pass - except binascii.Incomplete: - pass - - # resolve relative URIs - if (element in self.can_be_relative_uri) and output: - output = self.resolveURI(output) - - # decode entities within embedded markup - if not self.contentparams.get('base64', 0): - output = self.decodeEntities(element, output) - - # remove temporary cruft from contentparams - try: - del self.contentparams['mode'] - except KeyError: - pass - try: - del self.contentparams['base64'] - except KeyError: - pass - - # resolve relative URIs within embedded markup - if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types: - if element in self.can_contain_relative_uris: - output = _resolveRelativeURIs(output, self.baseuri, self.encoding) - - # sanitize embedded markup - if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types: - if element in self.can_contain_dangerous_markup: - output = _sanitizeHTML(output, self.encoding) - - if self.encoding and type(output) != type(u''): - try: - output = unicode(output, self.encoding) - except: - pass - - # categories/tags/keywords/whatever are handled in _end_category - if element == 
'category': - return output - - # store output in appropriate place(s) - if self.inentry and not self.insource: - if element == 'content': - self.entries[-1].setdefault(element, []) - contentparams = copy.deepcopy(self.contentparams) - contentparams['value'] = output - self.entries[-1][element].append(contentparams) - elif element == 'link': - self.entries[-1][element] = output - if output: - self.entries[-1]['links'][-1]['href'] = output - else: - if element == 'description': - element = 'summary' - self.entries[-1][element] = output - if self.incontent: - contentparams = copy.deepcopy(self.contentparams) - contentparams['value'] = output - self.entries[-1][element + '_detail'] = contentparams - elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage): - context = self._getContext() - if element == 'description': - element = 'subtitle' - context[element] = output - if element == 'link': - context['links'][-1]['href'] = output - elif self.incontent: - contentparams = copy.deepcopy(self.contentparams) - contentparams['value'] = output - context[element + '_detail'] = contentparams - return output - - def pushContent(self, tag, attrsD, defaultContentType, expectingText): - self.incontent += 1 - self.contentparams = FeedParserDict({ - 'type': self.mapContentType(attrsD.get('type', defaultContentType)), - 'language': self.lang, - 'base': self.baseuri}) - self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams) - self.push(tag, expectingText) - - def popContent(self, tag): - value = self.pop(tag) - self.incontent -= 1 - self.contentparams.clear() - return value - - def _mapToStandardPrefix(self, name): - colonpos = name.find(':') - if colonpos <> -1: - prefix = name[:colonpos] - suffix = name[colonpos+1:] - prefix = self.namespacemap.get(prefix, prefix) - name = prefix + ':' + suffix - return name - - def _getAttribute(self, attrsD, name): - return attrsD.get(self._mapToStandardPrefix(name)) - - def _isBase64(self, attrsD, contentparams): - if attrsD.get('mode', '') == 'base64': - return 1 - if self.contentparams['type'].startswith('text/'): - return 0 - if self.contentparams['type'].endswith('+xml'): - return 0 - if self.contentparams['type'].endswith('/xml'): - return 0 - return 1 - - def _itsAnHrefDamnIt(self, attrsD): - href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None))) - if href: - try: - del attrsD['url'] - except KeyError: - pass - try: - del attrsD['uri'] - except KeyError: - pass - attrsD['href'] = href - return attrsD - - def _save(self, key, value): - context = self._getContext() - context.setdefault(key, value) - - def _start_rss(self, attrsD): - versionmap = {'0.91': 'rss091u', - '0.92': 'rss092', - '0.93': 'rss093', - '0.94': 'rss094'} - if not self.version: - attr_version = attrsD.get('version', '') - version = versionmap.get(attr_version) - if version: - self.version = version - elif attr_version.startswith('2.'): - self.version = 'rss20' - else: - self.version = 'rss' - - def _start_dlhottitles(self, attrsD): - self.version = 'hotrss' - - def _start_channel(self, attrsD): - self.infeed = 1 - self._cdf_common(attrsD) - _start_feedinfo = _start_channel - - def _cdf_common(self, attrsD): - if attrsD.has_key('lastmod'): - self._start_modified({}) - self.elementstack[-1][-1] = attrsD['lastmod'] - self._end_modified() - if attrsD.has_key('href'): - self._start_link({}) - self.elementstack[-1][-1] = attrsD['href'] - self._end_link() - - def _start_feed(self, attrsD): - self.infeed = 1 - versionmap = {'0.1': 'atom01', - 
'0.2': 'atom02', - '0.3': 'atom03'} - if not self.version: - attr_version = attrsD.get('version') - version = versionmap.get(attr_version) - if version: - self.version = version - else: - self.version = 'atom' - - def _end_channel(self): - self.infeed = 0 - _end_feed = _end_channel - - def _start_image(self, attrsD): - self.inimage = 1 - self.push('image', 0) - context = self._getContext() - context.setdefault('image', FeedParserDict()) - - def _end_image(self): - self.pop('image') - self.inimage = 0 - - def _start_textinput(self, attrsD): - self.intextinput = 1 - self.push('textinput', 0) - context = self._getContext() - context.setdefault('textinput', FeedParserDict()) - _start_textInput = _start_textinput - - def _end_textinput(self): - self.pop('textinput') - self.intextinput = 0 - _end_textInput = _end_textinput - - def _start_author(self, attrsD): - self.inauthor = 1 - self.push('author', 1) - _start_managingeditor = _start_author - _start_dc_author = _start_author - _start_dc_creator = _start_author - _start_itunes_author = _start_author - - def _end_author(self): - self.pop('author') - self.inauthor = 0 - self._sync_author_detail() - _end_managingeditor = _end_author - _end_dc_author = _end_author - _end_dc_creator = _end_author - _end_itunes_author = _end_author - - def _start_itunes_owner(self, attrsD): - self.inpublisher = 1 - self.push('publisher', 0) - - def _end_itunes_owner(self): - self.pop('publisher') - self.inpublisher = 0 - self._sync_author_detail('publisher') - - def _start_contributor(self, attrsD): - self.incontributor = 1 - context = self._getContext() - context.setdefault('contributors', []) - context['contributors'].append(FeedParserDict()) - self.push('contributor', 0) - - def _end_contributor(self): - self.pop('contributor') - self.incontributor = 0 - - def _start_dc_contributor(self, attrsD): - self.incontributor = 1 - context = self._getContext() - context.setdefault('contributors', []) - context['contributors'].append(FeedParserDict()) - self.push('name', 0) - - def _end_dc_contributor(self): - self._end_name() - self.incontributor = 0 - - def _start_name(self, attrsD): - self.push('name', 0) - _start_itunes_name = _start_name - - def _end_name(self): - value = self.pop('name') - if self.inpublisher: - self._save_author('name', value, 'publisher') - elif self.inauthor: - self._save_author('name', value) - elif self.incontributor: - self._save_contributor('name', value) - elif self.intextinput: - context = self._getContext() - context['textinput']['name'] = value - _end_itunes_name = _end_name - - def _start_width(self, attrsD): - self.push('width', 0) - - def _end_width(self): - value = self.pop('width') - try: - value = int(value) - except: - value = 0 - if self.inimage: - context = self._getContext() - context['image']['width'] = value - - def _start_height(self, attrsD): - self.push('height', 0) - - def _end_height(self): - value = self.pop('height') - try: - value = int(value) - except: - value = 0 - if self.inimage: - context = self._getContext() - context['image']['height'] = value - - def _start_url(self, attrsD): - self.push('href', 1) - _start_homepage = _start_url - _start_uri = _start_url - - def _end_url(self): - value = self.pop('href') - if self.inauthor: - self._save_author('href', value) - elif self.incontributor: - self._save_contributor('href', value) - elif self.inimage: - context = self._getContext() - context['image']['href'] = value - elif self.intextinput: - context = self._getContext() - context['textinput']['link'] = value - 
_end_homepage = _end_url - _end_uri = _end_url - - def _start_email(self, attrsD): - self.push('email', 0) - _start_itunes_email = _start_email - - def _end_email(self): - value = self.pop('email') - if self.inpublisher: - self._save_author('email', value, 'publisher') - elif self.inauthor: - self._save_author('email', value) - elif self.incontributor: - self._save_contributor('email', value) - _end_itunes_email = _end_email - - def _getContext(self): - if self.insource: - context = self.sourcedata - elif self.inentry: - context = self.entries[-1] - else: - context = self.feeddata - return context - - def _save_author(self, key, value, prefix='author'): - context = self._getContext() - context.setdefault(prefix + '_detail', FeedParserDict()) - context[prefix + '_detail'][key] = value - self._sync_author_detail() - - def _save_contributor(self, key, value): - context = self._getContext() - context.setdefault('contributors', [FeedParserDict()]) - context['contributors'][-1][key] = value - - def _sync_author_detail(self, key='author'): - context = self._getContext() - detail = context.get('%s_detail' % key) - if detail: - name = detail.get('name') - email = detail.get('email') - if name and email: - context[key] = '%s (%s)' % (name, email) - elif name: - context[key] = name - elif email: - context[key] = email - else: - author = context.get(key) - if not author: return - emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author) - if not emailmatch: return - email = emailmatch.group(0) - # probably a better way to do the following, but it passes all the tests - author = author.replace(email, '') - author = author.replace('()', '') - author = author.strip() - if author and (author[0] == '('): - author = author[1:] - if author and (author[-1] == ')'): - author = author[:-1] - author = author.strip() - context.setdefault('%s_detail' % key, FeedParserDict()) - context['%s_detail' % key]['name'] = author - context['%s_detail' % key]['email'] = email - - def _start_subtitle(self, attrsD): - self.pushContent('subtitle', attrsD, 'text/plain', 1) - _start_tagline = _start_subtitle - _start_itunes_subtitle = _start_subtitle - - def _end_subtitle(self): - self.popContent('subtitle') - _end_tagline = _end_subtitle - _end_itunes_subtitle = _end_subtitle - - def _start_rights(self, attrsD): - self.pushContent('rights', attrsD, 'text/plain', 1) - _start_dc_rights = _start_rights - _start_copyright = _start_rights - - def _end_rights(self): - self.popContent('rights') - _end_dc_rights = _end_rights - _end_copyright = _end_rights - - def _start_item(self, attrsD): - self.entries.append(FeedParserDict()) - self.push('item', 0) - self.inentry = 1 - self.guidislink = 0 - id = self._getAttribute(attrsD, 'rdf:about') - if id: - context = self._getContext() - context['id'] = id - self._cdf_common(attrsD) - _start_entry = _start_item - _start_product = _start_item - - def _end_item(self): - self.pop('item') - self.inentry = 0 - _end_entry = _end_item - - def _start_dc_language(self, attrsD): - self.push('language', 1) - _start_language = _start_dc_language - - def _end_dc_language(self): - self.lang = self.pop('language') - _end_language = _end_dc_language - - def _start_dc_publisher(self, attrsD): - self.push('publisher', 1) - _start_webmaster = _start_dc_publisher - - def _end_dc_publisher(self): - self.pop('publisher') - self._sync_author_detail('publisher') - _end_webmaster = _end_dc_publisher - - def 
_start_published(self, attrsD): - self.push('published', 1) - _start_dcterms_issued = _start_published - _start_issued = _start_published - - def _end_published(self): - value = self.pop('published') - self._save('published_parsed', _parse_date(value)) - _end_dcterms_issued = _end_published - _end_issued = _end_published - - def _start_updated(self, attrsD): - self.push('updated', 1) - _start_modified = _start_updated - _start_dcterms_modified = _start_updated - _start_pubdate = _start_updated - _start_dc_date = _start_updated - - def _end_updated(self): - value = self.pop('updated') - parsed_value = _parse_date(value) - self._save('updated_parsed', parsed_value) - _end_modified = _end_updated - _end_dcterms_modified = _end_updated - _end_pubdate = _end_updated - _end_dc_date = _end_updated - - def _start_created(self, attrsD): - self.push('created', 1) - _start_dcterms_created = _start_created - - def _end_created(self): - value = self.pop('created') - self._save('created_parsed', _parse_date(value)) - _end_dcterms_created = _end_created - - def _start_expirationdate(self, attrsD): - self.push('expired', 1) - - def _end_expirationdate(self): - self._save('expired_parsed', _parse_date(self.pop('expired'))) - - def _start_cc_license(self, attrsD): - self.push('license', 1) - value = self._getAttribute(attrsD, 'rdf:resource') - if value: - self.elementstack[-1][2].append(value) - self.pop('license') - - def _start_creativecommons_license(self, attrsD): - self.push('license', 1) - - def _end_creativecommons_license(self): - self.pop('license') - - def _addTag(self, term, scheme, label): - context = self._getContext() - tags = context.setdefault('tags', []) - if (not term) and (not scheme) and (not label): return - value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label}) - if value not in tags: - tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label})) - - def _start_category(self, attrsD): - if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD)) - term = attrsD.get('term') - scheme = attrsD.get('scheme', attrsD.get('domain')) - label = attrsD.get('label') - self._addTag(term, scheme, label) - self.push('category', 1) - _start_dc_subject = _start_category - _start_keywords = _start_category - - def _end_itunes_keywords(self): - for term in self.pop('itunes_keywords').split(): - self._addTag(term, 'http://www.itunes.com/', None) - - def _start_itunes_category(self, attrsD): - self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None) - self.push('category', 1) - - def _end_category(self): - value = self.pop('category') - if not value: return - context = self._getContext() - tags = context['tags'] - if value and len(tags) and not tags[-1]['term']: - tags[-1]['term'] = value - else: - self._addTag(value, None, None) - _end_dc_subject = _end_category - _end_keywords = _end_category - _end_itunes_category = _end_category - - def _start_cloud(self, attrsD): - self._getContext()['cloud'] = FeedParserDict(attrsD) - - def _start_link(self, attrsD): - attrsD.setdefault('rel', 'alternate') - attrsD.setdefault('type', 'text/html') - attrsD = self._itsAnHrefDamnIt(attrsD) - if attrsD.has_key('href'): - attrsD['href'] = self.resolveURI(attrsD['href']) - expectingText = self.infeed or self.inentry or self.insource - context = self._getContext() - context.setdefault('links', []) - context['links'].append(FeedParserDict(attrsD)) - if attrsD['rel'] == 'enclosure': - self._start_enclosure(attrsD) - if attrsD.has_key('href'): - expectingText = 0 
- if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): - context['link'] = attrsD['href'] - else: - self.push('link', expectingText) - _start_producturl = _start_link - - def _end_link(self): - value = self.pop('link') - context = self._getContext() - if self.intextinput: - context['textinput']['link'] = value - if self.inimage: - context['image']['link'] = value - _end_producturl = _end_link - - def _start_guid(self, attrsD): - self.guidislink = (attrsD.get('ispermalink', 'true') == 'true') - self.push('id', 1) - - def _end_guid(self): - value = self.pop('id') - self._save('guidislink', self.guidislink and not self._getContext().has_key('link')) - if self.guidislink: - # guid acts as link, but only if 'ispermalink' is not present or is 'true', - # and only if the item doesn't already have a link element - self._save('link', value) - - def _start_title(self, attrsD): - self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) - _start_dc_title = _start_title - _start_media_title = _start_title - - def _end_title(self): - value = self.popContent('title') - context = self._getContext() - if self.intextinput: - context['textinput']['title'] = value - elif self.inimage: - context['image']['title'] = value - _end_dc_title = _end_title - _end_media_title = _end_title - - def _start_description(self, attrsD): - context = self._getContext() - if context.has_key('summary'): - self._summaryKey = 'content' - self._start_content(attrsD) - else: - self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource) - - def _start_abstract(self, attrsD): - self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) - - def _end_description(self): - if self._summaryKey == 'content': - self._end_content() - else: - value = self.popContent('description') - context = self._getContext() - if self.intextinput: - context['textinput']['description'] = value - elif self.inimage: - context['image']['description'] = value - self._summaryKey = None - _end_abstract = _end_description - - def _start_info(self, attrsD): - self.pushContent('info', attrsD, 'text/plain', 1) - _start_feedburner_browserfriendly = _start_info - - def _end_info(self): - self.popContent('info') - _end_feedburner_browserfriendly = _end_info - - def _start_generator(self, attrsD): - if attrsD: - attrsD = self._itsAnHrefDamnIt(attrsD) - if attrsD.has_key('href'): - attrsD['href'] = self.resolveURI(attrsD['href']) - self._getContext()['generator_detail'] = FeedParserDict(attrsD) - self.push('generator', 1) - - def _end_generator(self): - value = self.pop('generator') - context = self._getContext() - if context.has_key('generator_detail'): - context['generator_detail']['name'] = value - - def _start_admin_generatoragent(self, attrsD): - self.push('generator', 1) - value = self._getAttribute(attrsD, 'rdf:resource') - if value: - self.elementstack[-1][2].append(value) - self.pop('generator') - self._getContext()['generator_detail'] = FeedParserDict({'href': value}) - - def _start_admin_errorreportsto(self, attrsD): - self.push('errorreportsto', 1) - value = self._getAttribute(attrsD, 'rdf:resource') - if value: - self.elementstack[-1][2].append(value) - self.pop('errorreportsto') - - def _start_summary(self, attrsD): - context = self._getContext() - if context.has_key('summary'): - self._summaryKey = 'content' - self._start_content(attrsD) - else: - self._summaryKey = 'summary' - 
self.pushContent(self._summaryKey, attrsD, 'text/plain', 1) - _start_itunes_summary = _start_summary - - def _end_summary(self): - if self._summaryKey == 'content': - self._end_content() - else: - self.popContent(self._summaryKey or 'summary') - self._summaryKey = None - _end_itunes_summary = _end_summary - - def _start_enclosure(self, attrsD): - attrsD = self._itsAnHrefDamnIt(attrsD) - self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD)) - href = attrsD.get('href') - if href: - context = self._getContext() - if not context.get('id'): - context['id'] = href - - def _start_source(self, attrsD): - self.insource = 1 - - def _end_source(self): - self.insource = 0 - self._getContext()['source'] = copy.deepcopy(self.sourcedata) - self.sourcedata.clear() - - def _start_content(self, attrsD): - self.pushContent('content', attrsD, 'text/plain', 1) - src = attrsD.get('src') - if src: - self.contentparams['src'] = src - self.push('content', 1) - - def _start_prodlink(self, attrsD): - self.pushContent('content', attrsD, 'text/html', 1) - - def _start_body(self, attrsD): - self.pushContent('content', attrsD, 'application/xhtml+xml', 1) - _start_xhtml_body = _start_body - - def _start_content_encoded(self, attrsD): - self.pushContent('content', attrsD, 'text/html', 1) - _start_fullitem = _start_content_encoded - - def _end_content(self): - copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types) - value = self.popContent('content') - if copyToDescription: - self._save('description', value) - _end_body = _end_content - _end_xhtml_body = _end_content - _end_content_encoded = _end_content - _end_fullitem = _end_content - _end_prodlink = _end_content - - def _start_itunes_image(self, attrsD): - self.push('itunes_image', 0) - self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) - _start_itunes_link = _start_itunes_image - - def _end_itunes_block(self): - value = self.pop('itunes_block', 0) - self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0 - - def _end_itunes_explicit(self): - value = self.pop('itunes_explicit', 0) - self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0 - -if _XML_AVAILABLE: - class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): - def __init__(self, baseuri, baselang, encoding): - if _debug: sys.stderr.write('trying StrictFeedParser\n') - xml.sax.handler.ContentHandler.__init__(self) - _FeedParserMixin.__init__(self, baseuri, baselang, encoding) - self.bozo = 0 - self.exc = None - - def startPrefixMapping(self, prefix, uri): - self.trackNamespace(prefix, uri) - - def startElementNS(self, name, qname, attrs): - namespace, localname = name - lowernamespace = str(namespace or '').lower() - if lowernamespace.find('backend.userland.com/rss') <> -1: - # match any backend.userland.com namespace - namespace = 'http://backend.userland.com/rss' - lowernamespace = namespace - if qname and qname.find(':') > 0: - givenprefix = qname.split(':')[0] - else: - givenprefix = None - prefix = self._matchnamespaces.get(lowernamespace, givenprefix) - if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix): - raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix - if prefix: - localname = prefix + ':' + localname - localname = str(localname).lower() - if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = 
%s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname)) - - # qname implementation is horribly broken in Python 2.1 (it - # doesn't report any), and slightly broken in Python 2.2 (it - # doesn't report the xml: namespace). So we match up namespaces - # with a known list first, and then possibly override them with - # the qnames the SAX parser gives us (if indeed it gives us any - # at all). Thanks to MatejC for helping me test this and - # tirelessly telling me that it didn't work yet. - attrsD = {} - for (namespace, attrlocalname), attrvalue in attrs._attrs.items(): - lowernamespace = (namespace or '').lower() - prefix = self._matchnamespaces.get(lowernamespace, '') - if prefix: - attrlocalname = prefix + ':' + attrlocalname - attrsD[str(attrlocalname).lower()] = attrvalue - for qname in attrs.getQNames(): - attrsD[str(qname).lower()] = attrs.getValueByQName(qname) - self.unknown_starttag(localname, attrsD.items()) - - def characters(self, text): - self.handle_data(text) - - def endElementNS(self, name, qname): - namespace, localname = name - lowernamespace = str(namespace or '').lower() - if qname and qname.find(':') > 0: - givenprefix = qname.split(':')[0] - else: - givenprefix = '' - prefix = self._matchnamespaces.get(lowernamespace, givenprefix) - if prefix: - localname = prefix + ':' + localname - localname = str(localname).lower() - self.unknown_endtag(localname) - - def error(self, exc): - self.bozo = 1 - self.exc = exc - - def fatalError(self, exc): - self.error(exc) - raise exc - -class _BaseHTMLProcessor(sgmllib.SGMLParser): - elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', - 'img', 'input', 'isindex', 'link', 'meta', 'param'] - - def __init__(self, encoding): - self.encoding = encoding - if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding) - sgmllib.SGMLParser.__init__(self) - - def reset(self): - self.pieces = [] - sgmllib.SGMLParser.reset(self) - - def _shorttag_replace(self, match): - tag = match.group(1) - if tag in self.elements_no_end_tag: - return '<' + tag + ' />' - else: - return '<' + tag + '>' - - def feed(self, data): - data = re.compile(r'', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace - data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data) - data = data.replace(''', "'") - data = data.replace('"', '"') - if self.encoding and type(data) == type(u''): - data = data.encode(self.encoding) - sgmllib.SGMLParser.feed(self, data) - - def normalize_attrs(self, attrs): - # utility method to be called by descendants - attrs = [(k.lower(), v) for k, v in attrs] - attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] - return attrs - - def unknown_starttag(self, tag, attrs): - # called for each start tag - # attrs is a list of (attr, value) tuples - # e.g. for
<pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
-        uattrs = []
-        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
-        for key, value in attrs:
-            if type(value) != type(u''):
-                value = unicode(value, self.encoding)
-            uattrs.append((unicode(key, self.encoding), value))
-        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
-        if tag in self.elements_no_end_tag:
-            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
-        else:
-            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
-
-    def unknown_endtag(self, tag):
-        # called for each end tag, e.g. for </pre>
, tag will be 'pre' - # Reconstruct the original end tag. - if tag not in self.elements_no_end_tag: - self.pieces.append("" % locals()) - - def handle_charref(self, ref): - # called for each character reference, e.g. for ' ', ref will be '160' - # Reconstruct the original character reference. - self.pieces.append('&#%(ref)s;' % locals()) - - def handle_entityref(self, ref): - # called for each entity reference, e.g. for '©', ref will be 'copy' - # Reconstruct the original entity reference. - self.pieces.append('&%(ref)s;' % locals()) - - def handle_data(self, text): - # called for each block of plain text, i.e. outside of any tag and - # not containing any character or entity references - # Store the original text verbatim. - if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text) - self.pieces.append(text) - - def handle_comment(self, text): - # called for each HTML comment, e.g. - # Reconstruct the original comment. - self.pieces.append('' % locals()) - - def handle_pi(self, text): - # called for each processing instruction, e.g. - # Reconstruct original processing instruction. - self.pieces.append('' % locals()) - - def handle_decl(self, text): - # called for the DOCTYPE, if present, e.g. - # - # Reconstruct original DOCTYPE - self.pieces.append('' % locals()) - - _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match - def _scan_name(self, i, declstartpos): - rawdata = self.rawdata - n = len(rawdata) - if i == n: - return None, -1 - m = self._new_declname_match(rawdata, i) - if m: - s = m.group() - name = s.strip() - if (i + len(s)) == n: - return None, -1 # end of buffer - return name.lower(), m.end() - else: - self.handle_data(rawdata) -# self.updatepos(declstartpos, i) - return None, -1 - - def output(self): - '''Return processed HTML as a single string''' - return ''.join([str(p) for p in self.pieces]) - -class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor): - def __init__(self, baseuri, baselang, encoding): - sgmllib.SGMLParser.__init__(self) - _FeedParserMixin.__init__(self, baseuri, baselang, encoding) - - def decodeEntities(self, element, data): - data = data.replace('<', '<') - data = data.replace('<', '<') - data = data.replace('>', '>') - data = data.replace('>', '>') - data = data.replace('&', '&') - data = data.replace('&', '&') - data = data.replace('"', '"') - data = data.replace('"', '"') - data = data.replace(''', ''') - data = data.replace(''', ''') - if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): - data = data.replace('<', '<') - data = data.replace('>', '>') - data = data.replace('&', '&') - data = data.replace('"', '"') - data = data.replace(''', "'") - return data - -class _RelativeURIResolver(_BaseHTMLProcessor): - relative_uris = [('a', 'href'), - ('applet', 'codebase'), - ('area', 'href'), - ('blockquote', 'cite'), - ('body', 'background'), - ('del', 'cite'), - ('form', 'action'), - ('frame', 'longdesc'), - ('frame', 'src'), - ('iframe', 'longdesc'), - ('iframe', 'src'), - ('head', 'profile'), - ('img', 'longdesc'), - ('img', 'src'), - ('img', 'usemap'), - ('input', 'src'), - ('input', 'usemap'), - ('ins', 'cite'), - ('link', 'href'), - ('object', 'classid'), - ('object', 'codebase'), - ('object', 'data'), - ('object', 'usemap'), - ('q', 'cite'), - ('script', 'src')] - - def __init__(self, baseuri, encoding): - _BaseHTMLProcessor.__init__(self, encoding) - self.baseuri = baseuri - - def resolveURI(self, uri): - return _urljoin(self.baseuri, uri) - - def 
unknown_starttag(self, tag, attrs): - attrs = self.normalize_attrs(attrs) - attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs] - _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) - -def _resolveRelativeURIs(htmlSource, baseURI, encoding): - if _debug: sys.stderr.write('entering _resolveRelativeURIs\n') - p = _RelativeURIResolver(baseURI, encoding) - p.feed(htmlSource) - return p.output() - -class _HTMLSanitizer(_BaseHTMLProcessor): - acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big', - 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col', - 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset', - 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', - 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup', - 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike', - 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', - 'thead', 'tr', 'tt', 'u', 'ul', 'var'] - - acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', - 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', - 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols', - 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', - 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', - 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method', - 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', - 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', - 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type', - 'usemap', 'valign', 'value', 'vspace', 'width'] - - unacceptable_elements_with_end_tag = ['script', 'applet'] - - def reset(self): - _BaseHTMLProcessor.reset(self) - self.unacceptablestack = 0 - - def unknown_starttag(self, tag, attrs): - if not tag in self.acceptable_elements: - if tag in self.unacceptable_elements_with_end_tag: - self.unacceptablestack += 1 - return - attrs = self.normalize_attrs(attrs) - attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes] - _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) - - def unknown_endtag(self, tag): - if not tag in self.acceptable_elements: - if tag in self.unacceptable_elements_with_end_tag: - self.unacceptablestack -= 1 - return - _BaseHTMLProcessor.unknown_endtag(self, tag) - - def handle_pi(self, text): - pass - - def handle_decl(self, text): - pass - - def handle_data(self, text): - if not self.unacceptablestack: - _BaseHTMLProcessor.handle_data(self, text) - -def _sanitizeHTML(htmlSource, encoding): - p = _HTMLSanitizer(encoding) - p.feed(htmlSource) - data = p.output() - if TIDY_MARKUP: - # loop through list of preferred Tidy interfaces looking for one that's installed, - # then set up a common _tidy function to wrap the interface-specific API. 
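        # (At this point `data` is already sanitized: elements outside
        #  acceptable_elements -- e.g. <script>, including its text content -- and
        #  attributes outside acceptable_attributes -- e.g. onclick -- were dropped
        #  by the _HTMLSanitizer pass above. The optional Tidy pass below only
        #  cleans up well-formedness of the surviving markup.)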
- _tidy = None - for tidy_interface in PREFERRED_TIDY_INTERFACES: - try: - if tidy_interface == "uTidy": - from tidy import parseString as _utidy - def _tidy(data, **kwargs): - return str(_utidy(data, **kwargs)) - break - elif tidy_interface == "mxTidy": - from mx.Tidy import Tidy as _mxtidy - def _tidy(data, **kwargs): - nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs) - return data - break - except: - pass - if _tidy: - utf8 = type(data) == type(u'') - if utf8: - data = data.encode('utf-8') - data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8") - if utf8: - data = unicode(data, 'utf-8') - if data.count(''): - data = data.split('>', 1)[1] - if data.count('= '2.3.3' - assert base64 != None - user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':') - realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0] - self.add_password(realm, host, user, passw) - retry = self.http_error_auth_reqed('www-authenticate', host, req, headers) - self.reset_retry_count() - return retry - except: - return self.http_error_default(req, fp, code, msg, headers) - -def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers): - """URL, filename, or string --> stream - - This function lets you define parsers that take any input source - (URL, pathname to local or network file, or actual data as a string) - and deal with it in a uniform manner. Returned object is guaranteed - to have all the basic stdio read methods (read, readline, readlines). - Just .close() the object when you're done with it. - - If the etag argument is supplied, it will be used as the value of an - If-None-Match request header. - - If the modified argument is supplied, it must be a tuple of 9 integers - as returned by gmtime() in the standard Python time module. This MUST - be in GMT (Greenwich Mean Time). The formatted date/time will be used - as the value of an If-Modified-Since request header. - - If the agent argument is supplied, it will be used as the value of a - User-Agent request header. - - If the referrer argument is supplied, it will be used as the value of a - Referer[sic] request header. - - If handlers is supplied, it is a list of handlers used to build a - urllib2 opener. - """ - - if hasattr(url_file_stream_or_string, 'read'): - return url_file_stream_or_string - - if url_file_stream_or_string == '-': - return sys.stdin - - if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'): - if not agent: - agent = USER_AGENT - # test for inline user:password for basic auth - auth = None - if base64: - urltype, rest = urllib.splittype(url_file_stream_or_string) - realhost, rest = urllib.splithost(rest) - if realhost: - user_passwd, realhost = urllib.splituser(realhost) - if user_passwd: - url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest) - auth = base64.encodestring(user_passwd).strip() - # try to open with urllib2 (to use optional headers) - request = urllib2.Request(url_file_stream_or_string) - request.add_header('User-Agent', agent) - if etag: - request.add_header('If-None-Match', etag) - if modified: - # format into an RFC 1123-compliant timestamp. We can't use - # time.strftime() since the %a and %b directives can be affected - # by the current locale, but RFC 2616 states that dates must be - # in English. 
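            # (Worked example: modified = (2004, 2, 17, 22, 5, 30, 1, 48, 0), i.e.
            #  gmtime() for 2004-02-17 22:05:30, yields the header value
            #  'Tue, 17 Feb 2004 22:05:30 GMT'.)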
- short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] - months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] - request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) - if referrer: - request.add_header('Referer', referrer) - if gzip and zlib: - request.add_header('Accept-encoding', 'gzip, deflate') - elif gzip: - request.add_header('Accept-encoding', 'gzip') - elif zlib: - request.add_header('Accept-encoding', 'deflate') - else: - request.add_header('Accept-encoding', '') - if auth: - request.add_header('Authorization', 'Basic %s' % auth) - if ACCEPT_HEADER: - request.add_header('Accept', ACCEPT_HEADER) - request.add_header('A-IM', 'feed') # RFC 3229 support - opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers)) - opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent - try: - return opener.open(request) - finally: - opener.close() # JohnD - - # try to open with native open function (if url_file_stream_or_string is a filename) - try: - return open(url_file_stream_or_string) - except: - pass - - # treat url_file_stream_or_string as string - return _StringIO(str(url_file_stream_or_string)) - -_date_handlers = [] -def registerDateHandler(func): - '''Register a date handler function (takes string, returns 9-tuple date in GMT)''' - _date_handlers.insert(0, func) - -# ISO-8601 date parsing routines written by Fazal Majid. -# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601 -# parser is beyond the scope of feedparser and would be a worthwhile addition -# to the Python library. -# A single regular expression cannot parse ISO 8601 date formats into groups -# as the standard is highly irregular (for instance is 030104 2003-01-04 or -# 0301-04-01), so we use templates instead. -# Please note the order in templates is significant because we need a -# greedy match. -_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO', - 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', - '-YY-?MM', '-OOO', '-YY', - '--MM-?DD', '--MM', - '---DD', - 'CC', ''] -_iso8601_re = [ - tmpl.replace( - 'YYYY', r'(?P\d{4})').replace( - 'YY', r'(?P\d\d)').replace( - 'MM', r'(?P[01]\d)').replace( - 'DD', r'(?P[0123]\d)').replace( - 'OOO', r'(?P[0123]\d\d)').replace( - 'CC', r'(?P\d\d$)') - + r'(T?(?P\d{2}):(?P\d{2})' - + r'(:(?P\d{2}))?' - + r'(?P[+-](?P\d{2})(:(?P\d{2}))?|Z)?)?' - for tmpl in _iso8601_tmpl] -del tmpl -_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] -del regex -def _parse_date_iso8601(dateString): - '''Parse a variety of ISO-8601-compatible formats like 20040105''' - m = None - for _iso8601_match in _iso8601_matches: - m = _iso8601_match(dateString) - if m: break - if not m: return - if m.span() == (0, 0): return - params = m.groupdict() - ordinal = params.get('ordinal', 0) - if ordinal: - ordinal = int(ordinal) - else: - ordinal = 0 - year = params.get('year', '--') - if not year or year == '--': - year = time.gmtime()[0] - elif len(year) == 2: - # ISO 8601 assumes current century, i.e. 
93 -> 2093, NOT 1993 - year = 100 * int(time.gmtime()[0] / 100) + int(year) - else: - year = int(year) - month = params.get('month', '-') - if not month or month == '-': - # ordinals are NOT normalized by mktime, we simulate them - # by setting month=1, day=ordinal - if ordinal: - month = 1 - else: - month = time.gmtime()[1] - month = int(month) - day = params.get('day', 0) - if not day: - # see above - if ordinal: - day = ordinal - elif params.get('century', 0) or \ - params.get('year', 0) or params.get('month', 0): - day = 1 - else: - day = time.gmtime()[2] - else: - day = int(day) - # special case of the century - is the first year of the 21st century - # 2000 or 2001 ? The debate goes on... - if 'century' in params.keys(): - year = (int(params['century']) - 1) * 100 + 1 - # in ISO 8601 most fields are optional - for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']: - if not params.get(field, None): - params[field] = 0 - hour = int(params.get('hour', 0)) - minute = int(params.get('minute', 0)) - second = int(params.get('second', 0)) - # weekday is normalized by mktime(), we can ignore it - weekday = 0 - # daylight savings is complex, but not needed for feedparser's purposes - # as time zones, if specified, include mention of whether it is active - # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and - # and most implementations have DST bugs - daylight_savings_flag = 0 - tm = [year, month, day, hour, minute, second, weekday, - ordinal, daylight_savings_flag] - # ISO 8601 time zone adjustments - tz = params.get('tz') - if tz and tz != 'Z': - if tz[0] == '-': - tm[3] += int(params.get('tzhour', 0)) - tm[4] += int(params.get('tzmin', 0)) - elif tz[0] == '+': - tm[3] -= int(params.get('tzhour', 0)) - tm[4] -= int(params.get('tzmin', 0)) - else: - return None - # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) - # which is guaranteed to normalize d/m/y/h/m/s. - # Many implementations have bugs, but we'll pretend they don't. - return time.localtime(time.mktime(tm)) -registerDateHandler(_parse_date_iso8601) - -# 8-bit date handling routines written by ytrewq1. 
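# A minimal sketch (not part of the original module) of how a caller can plug an
# extra date format into the registry defined above; the regex and function name
# are hypothetical, but the contract is the documented one: take a date string,
# return a 9-tuple in GMT (or None to let the next handler try).
_example_us_date_re = re.compile(r'(\d{2})/(\d{2})/(\d{4})$')   # e.g. '01/05/2004'
def _parse_date_us_slashes(dateString):
    '''Parse MM/DD/YYYY into a 9-tuple (midnight GMT; weekday/yearday left as 0)'''
    m = _example_us_date_re.match(dateString)
    if not m: return
    month, day, year = int(m.group(1)), int(m.group(2)), int(m.group(3))
    return (year, month, day, 0, 0, 0, 0, 0, 0)
registerDateHandler(_parse_date_us_slashes)   # handlers are tried newest-first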
-_korean_year = u'\ub144' # b3e2 in euc-kr -_korean_month = u'\uc6d4' # bff9 in euc-kr -_korean_day = u'\uc77c' # c0cf in euc-kr -_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr -_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr - -_korean_onblog_date_re = \ - re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \ - (_korean_year, _korean_month, _korean_day)) -_korean_nate_date_re = \ - re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \ - (_korean_am, _korean_pm)) -def _parse_date_onblog(dateString): - '''Parse a string according to the OnBlog 8-bit date format''' - m = _korean_onblog_date_re.match(dateString) - if not m: return - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ - {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ - 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ - 'zonediff': '+09:00'} - if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate) - return _parse_date_w3dtf(w3dtfdate) -registerDateHandler(_parse_date_onblog) - -def _parse_date_nate(dateString): - '''Parse a string according to the Nate 8-bit date format''' - m = _korean_nate_date_re.match(dateString) - if not m: return - hour = int(m.group(5)) - ampm = m.group(4) - if (ampm == _korean_pm): - hour += 12 - hour = str(hour) - if len(hour) == 1: - hour = '0' + hour - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ - {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ - 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\ - 'zonediff': '+09:00'} - if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate) - return _parse_date_w3dtf(w3dtfdate) -registerDateHandler(_parse_date_nate) - -_mssql_date_re = \ - re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?') -def _parse_date_mssql(dateString): - '''Parse a string according to the MS SQL date format''' - m = _mssql_date_re.match(dateString) - if not m: return - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ - {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ - 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ - 'zonediff': '+09:00'} - if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate) - return _parse_date_w3dtf(w3dtfdate) -registerDateHandler(_parse_date_mssql) - -# Unicode strings for Greek date strings -_greek_months = \ - { \ - u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7 - u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7 - u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7 - u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7 - u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7 - u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7 - u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7 - u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7 - u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7 - u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7 - u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7 - u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7 - u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7 - u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7 - u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7 - u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7 - u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7 - u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7 - u'\u0394\u03b5\u03ba': 
u'Dec', # c4e5ea in iso-8859-7 - } - -_greek_wdays = \ - { \ - u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7 - u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7 - u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7 - u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7 - u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7 - u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7 - u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7 - } - -_greek_date_format_re = \ - re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') - -def _parse_date_greek(dateString): - '''Parse a string according to a Greek 8-bit date format.''' - m = _greek_date_format_re.match(dateString) - if not m: return - try: - wday = _greek_wdays[m.group(1)] - month = _greek_months[m.group(3)] - except: - return - rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \ - {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\ - 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\ - 'zonediff': m.group(8)} - if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date) - return _parse_date_rfc822(rfc822date) -registerDateHandler(_parse_date_greek) - -# Unicode strings for Hungarian date strings -_hungarian_months = \ - { \ - u'janu\u00e1r': u'01', # e1 in iso-8859-2 - u'febru\u00e1ri': u'02', # e1 in iso-8859-2 - u'm\u00e1rcius': u'03', # e1 in iso-8859-2 - u'\u00e1prilis': u'04', # e1 in iso-8859-2 - u'm\u00e1ujus': u'05', # e1 in iso-8859-2 - u'j\u00fanius': u'06', # fa in iso-8859-2 - u'j\u00falius': u'07', # fa in iso-8859-2 - u'augusztus': u'08', - u'szeptember': u'09', - u'okt\u00f3ber': u'10', # f3 in iso-8859-2 - u'november': u'11', - u'december': u'12', - } - -_hungarian_date_format_re = \ - re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))') - -def _parse_date_hungarian(dateString): - '''Parse a string according to a Hungarian 8-bit date format.''' - m = _hungarian_date_format_re.match(dateString) - if not m: return - try: - month = _hungarian_months[m.group(2)] - day = m.group(3) - if len(day) == 1: - day = '0' + day - hour = m.group(4) - if len(hour) == 1: - hour = '0' + hour - except: - return - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \ - {'year': m.group(1), 'month': month, 'day': day,\ - 'hour': hour, 'minute': m.group(5),\ - 'zonediff': m.group(6)} - if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate) - return _parse_date_w3dtf(w3dtfdate) -registerDateHandler(_parse_date_hungarian) - -# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by -# Drake and licensed under the Python license. 
Removed all range checking -# for month, day, hour, minute, and second, since mktime will normalize -# these later -def _parse_date_w3dtf(dateString): - def __extract_date(m): - year = int(m.group('year')) - if year < 100: - year = 100 * int(time.gmtime()[0] / 100) + int(year) - if year < 1000: - return 0, 0, 0 - julian = m.group('julian') - if julian: - julian = int(julian) - month = julian / 30 + 1 - day = julian % 30 + 1 - jday = None - while jday != julian: - t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0)) - jday = time.gmtime(t)[-2] - diff = abs(jday - julian) - if jday > julian: - if diff < day: - day = day - diff - else: - month = month - 1 - day = 31 - elif jday < julian: - if day + diff < 28: - day = day + diff - else: - month = month + 1 - return year, month, day - month = m.group('month') - day = 1 - if month is None: - month = 1 - else: - month = int(month) - day = m.group('day') - if day: - day = int(day) - else: - day = 1 - return year, month, day - - def __extract_time(m): - if not m: - return 0, 0, 0 - hours = m.group('hours') - if not hours: - return 0, 0, 0 - hours = int(hours) - minutes = int(m.group('minutes')) - seconds = m.group('seconds') - if seconds: - seconds = int(seconds) - else: - seconds = 0 - return hours, minutes, seconds - - def __extract_tzd(m): - '''Return the Time Zone Designator as an offset in seconds from UTC.''' - if not m: - return 0 - tzd = m.group('tzd') - if not tzd: - return 0 - if tzd == 'Z': - return 0 - hours = int(m.group('tzdhours')) - minutes = m.group('tzdminutes') - if minutes: - minutes = int(minutes) - else: - minutes = 0 - offset = (hours*60 + minutes) * 60 - if tzd[0] == '+': - return -offset - return offset - - __date_re = ('(?P\d\d\d\d)' - '(?:(?P-|)' - '(?:(?P\d\d\d)' - '|(?P\d\d)(?:(?P=dsep)(?P\d\d))?))?') - __tzd_re = '(?P[-+](?P\d\d)(?::?(?P\d\d))|Z)' - __tzd_rx = re.compile(__tzd_re) - __time_re = ('(?P\d\d)(?P:|)(?P\d\d)' - '(?:(?P=tsep)(?P\d\d(?:[.,]\d+)?))?' - + __tzd_re) - __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re) - __datetime_rx = re.compile(__datetime_re) - m = __datetime_rx.match(dateString) - if (m is None) or (m.group() != dateString): return - gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0) - if gmt[0] == 0: return - return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone) -registerDateHandler(_parse_date_w3dtf) - -def _parse_date_rfc822(dateString): - '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date''' - data = dateString.split() - if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames: - del data[0] - if len(data) == 4: - s = data[3] - i = s.find('+') - if i > 0: - data[3:] = [s[:i], s[i+1:]] - else: - data.append('') - dateString = " ".join(data) - if len(data) < 5: - dateString += ' 00:00:00 GMT' - tm = rfc822.parsedate_tz(dateString) - if tm: - return time.gmtime(rfc822.mktime_tz(tm)) -# rfc822.py defines several time zones, but we define some extra ones. -# 'ET' is equivalent to 'EST', etc. 
-_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800} -rfc822._timezones.update(_additional_timezones) -registerDateHandler(_parse_date_rfc822) - -def _parse_date(dateString): - '''Parses a variety of date formats into a 9-tuple in GMT''' - for handler in _date_handlers: - try: - date9tuple = handler(dateString) - if not date9tuple: continue - if len(date9tuple) != 9: - if _debug: sys.stderr.write('date handler function must return 9-tuple\n') - raise ValueError - map(int, date9tuple) - return date9tuple - except Exception, e: - if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e))) - pass - return None - -def _getCharacterEncoding(http_headers, xml_data): - '''Get the character encoding of the XML document - - http_headers is a dictionary - xml_data is a raw string (not Unicode) - - This is so much trickier than it sounds, it's not even funny. - According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type - is application/xml, application/*+xml, - application/xml-external-parsed-entity, or application/xml-dtd, - the encoding given in the charset parameter of the HTTP Content-Type - takes precedence over the encoding given in the XML prefix within the - document, and defaults to 'utf-8' if neither are specified. But, if - the HTTP Content-Type is text/xml, text/*+xml, or - text/xml-external-parsed-entity, the encoding given in the XML prefix - within the document is ALWAYS IGNORED and only the encoding given in - the charset parameter of the HTTP Content-Type header should be - respected, and it defaults to 'us-ascii' if not specified. - - Furthermore, discussion on the atom-syntax mailing list with the - author of RFC 3023 leads me to the conclusion that any document - served with a Content-Type of text/* and no charset parameter - must be treated as us-ascii. (We now do this.) And also that it - must always be flagged as non-well-formed. (We now do this too.) - - If Content-Type is unspecified (input was local file or non-HTTP source) - or unrecognized (server just got it totally wrong), then go by the - encoding given in the XML prefix of the document and default to - 'iso-8859-1' as per the HTTP specification (RFC 2616). - - Then, assuming we didn't find a character encoding in the HTTP headers - (and the HTTP Content-type allowed us to look in the body), we need - to sniff the first few bytes of the XML data and try to determine - whether the encoding is ASCII-compatible. Section F of the XML - specification shows the way here: - http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info - - If the sniffed encoding is not ASCII-compatible, we need to make it - ASCII compatible so that we can sniff further into the XML declaration - to find the encoding attribute, which will tell us the true encoding. - - Of course, none of this guarantees that we will be able to parse the - feed in the declared character encoding (assuming it was declared - correctly, which many are not). CJKCodecs and iconv_codec help a lot; - you should definitely install them if you can. 
- http://cjkpython.i18n.org/ - ''' - - def _parseHTTPContentType(content_type): - '''takes HTTP Content-Type header and returns (content type, charset) - - If no charset is specified, returns (content type, '') - If no content type is specified, returns ('', '') - Both return parameters are guaranteed to be lowercase strings - ''' - content_type = content_type or '' - content_type, params = cgi.parse_header(content_type) - return content_type, params.get('charset', '').replace("'", '') - - sniffed_xml_encoding = '' - xml_encoding = '' - true_encoding = '' - http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type')) - # Must sniff for non-ASCII-compatible character encodings before - # searching for XML declaration. This heuristic is defined in - # section F of the XML specification: - # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info - try: - if xml_data[:4] == '\x4c\x6f\xa7\x94': - # EBCDIC - xml_data = _ebcdic_to_ascii(xml_data) - elif xml_data[:4] == '\x00\x3c\x00\x3f': - # UTF-16BE - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'): - # UTF-16BE with BOM - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x3f\x00': - # UTF-16LE - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'): - # UTF-16LE with BOM - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\x00\x3c': - # UTF-32BE - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x00\x00': - # UTF-32LE - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\xfe\xff': - # UTF-32BE with BOM - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') - elif xml_data[:4] == '\xff\xfe\x00\x00': - # UTF-32LE with BOM - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') - elif xml_data[:3] == '\xef\xbb\xbf': - # UTF-8 with BOM - sniffed_xml_encoding = 'utf-8' - xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') - else: - # ASCII-compatible - pass - xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) - except: - xml_encoding_match = None - if xml_encoding_match: - xml_encoding = xml_encoding_match.groups()[0].lower() - if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')): - xml_encoding = sniffed_xml_encoding - acceptable_content_type = 0 - application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity') - text_content_types = ('text/xml', 'text/xml-external-parsed-entity') - if (http_content_type in application_content_types) or \ - (http_content_type.startswith('application/') and http_content_type.endswith('+xml')): - acceptable_content_type = 1 - true_encoding = http_encoding or xml_encoding or 'utf-8' - elif (http_content_type in text_content_types) or \ - (http_content_type.startswith('text/')) and 
http_content_type.endswith('+xml'): - acceptable_content_type = 1 - true_encoding = http_encoding or 'us-ascii' - elif http_content_type.startswith('text/'): - true_encoding = http_encoding or 'us-ascii' - elif http_headers and (not http_headers.has_key('content-type')): - true_encoding = xml_encoding or 'iso-8859-1' - else: - true_encoding = xml_encoding or 'utf-8' - return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type - -def _toUTF8(data, encoding): - '''Changes an XML data stream on the fly to specify a new encoding - - data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already - encoding is a string recognized by encodings.aliases - ''' - if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding) - # strip Byte Order Mark (if present) - if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'): - if _debug: - sys.stderr.write('stripping BOM\n') - if encoding != 'utf-16be': - sys.stderr.write('trying utf-16be instead\n') - encoding = 'utf-16be' - data = data[2:] - elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'): - if _debug: - sys.stderr.write('stripping BOM\n') - if encoding != 'utf-16le': - sys.stderr.write('trying utf-16le instead\n') - encoding = 'utf-16le' - data = data[2:] - elif data[:3] == '\xef\xbb\xbf': - if _debug: - sys.stderr.write('stripping BOM\n') - if encoding != 'utf-8': - sys.stderr.write('trying utf-8 instead\n') - encoding = 'utf-8' - data = data[3:] - elif data[:4] == '\x00\x00\xfe\xff': - if _debug: - sys.stderr.write('stripping BOM\n') - if encoding != 'utf-32be': - sys.stderr.write('trying utf-32be instead\n') - encoding = 'utf-32be' - data = data[4:] - elif data[:4] == '\xff\xfe\x00\x00': - if _debug: - sys.stderr.write('stripping BOM\n') - if encoding != 'utf-32le': - sys.stderr.write('trying utf-32le instead\n') - encoding = 'utf-32le' - data = data[4:] - newdata = unicode(data, encoding) - if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding) - declmatch = re.compile('^<\?xml[^>]*?>') - newdecl = '''''' - if declmatch.search(newdata): - newdata = declmatch.sub(newdecl, newdata) - else: - newdata = newdecl + u'\n' + newdata - return newdata.encode('utf-8') - -def _stripDoctype(data): - '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data) - - rss_version may be 'rss091n' or None - stripped_data is the same XML document, minus the DOCTYPE - ''' - entity_pattern = re.compile(r']*?)>', re.MULTILINE) - data = entity_pattern.sub('', data) - doctype_pattern = re.compile(r']*?)>', re.MULTILINE) - doctype_results = doctype_pattern.findall(data) - doctype = doctype_results and doctype_results[0] or '' - if doctype.lower().count('netscape'): - version = 'rss091n' - else: - version = None - data = doctype_pattern.sub('', data) - return version, data - -def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]): - '''Parse a feed from a URL, file, stream, or string''' - result = FeedParserDict() - result['feed'] = FeedParserDict() - result['entries'] = [] - if _XML_AVAILABLE: - result['bozo'] = 0 - if type(handlers) == types.InstanceType: - handlers = [handlers] - try: - f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers) - data = f.read() - except Exception, e: - result['bozo'] = 1 - result['bozo_exception'] = e - data = '' - f = None - - # if feed is gzip-compressed, decompress it - if f 
and data and hasattr(f, 'headers'): - if gzip and f.headers.get('content-encoding', '') == 'gzip': - try: - data = gzip.GzipFile(fileobj=_StringIO(data)).read() - except Exception, e: - # Some feeds claim to be gzipped but they're not, so - # we get garbage. Ideally, we should re-request the - # feed without the 'Accept-encoding: gzip' header, - # but we don't. - result['bozo'] = 1 - result['bozo_exception'] = e - data = '' - elif zlib and f.headers.get('content-encoding', '') == 'deflate': - try: - data = zlib.decompress(data, -zlib.MAX_WBITS) - except Exception, e: - result['bozo'] = 1 - result['bozo_exception'] = e - data = '' - - # save HTTP headers - if hasattr(f, 'info'): - info = f.info() - result['etag'] = info.getheader('ETag') - last_modified = info.getheader('Last-Modified') - if last_modified: - result['modified'] = _parse_date(last_modified) - if hasattr(f, 'url'): - result['href'] = f.url - result['status'] = 200 - if hasattr(f, 'status'): - result['status'] = f.status - if hasattr(f, 'headers'): - result['headers'] = f.headers.dict - if hasattr(f, 'close'): - f.close() - - # there are four encodings to keep track of: - # - http_encoding is the encoding declared in the Content-Type HTTP header - # - xml_encoding is the encoding declared in the ; changed -# project name -#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree); -# removed unnecessary urllib code -- urllib2 should always be available anyway; -# return actual url, status, and full HTTP headers (as result['url'], -# result['status'], and result['headers']) if parsing a remote feed over HTTP -- -# this should pass all the HTTP tests at ; -# added the latest namespace-of-the-week for RSS 2.0 -#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom -# User-Agent (otherwise urllib2 sends two, which confuses some servers) -#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for -# inline and as used in some RSS 2.0 feeds -#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or -# textInput, and also to return the character encoding (if specified) -#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking -# nested divs within content (JohnD); fixed missing sys import (JohanS); -# fixed regular expression to capture XML character encoding (Andrei); -# added support for Atom 0.3-style links; fixed bug with textInput tracking; -# added support for cloud (MartijnP); added support for multiple -# category/dc:subject (MartijnP); normalize content model: 'description' gets -# description (which can come from description, summary, or full content if no -# description), 'content' gets dict of base/language/type/value (which can come -# from content:encoded, xhtml:body, content, or fullitem); -# fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang -# tracking; fixed bug tracking unknown tags; fixed bug tracking content when -# element is not in default namespace (like Pocketsoap feed); -# resolve relative URLs in link, guid, docs, url, comments, wfw:comment, -# wfw:commentRSS; resolve relative URLs within embedded HTML markup in -# description, xhtml:body, content, content:encoded, title, subtitle, -# summary, info, tagline, and copyright; added support for pingback and -# trackback namespaces -#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback -# namespaces, as opposed to 2.6 when I said I did but didn't really; -# sanitize HTML markup within some elements; added mxTidy support (if -# 
installed) to tidy HTML markup within some elements; fixed indentation -# bug in _parse_date (FazalM); use socket.setdefaulttimeout if available -# (FazalM); universal date parsing and normalization (FazalM): 'created', modified', -# 'issued' are parsed into 9-tuple date format and stored in 'created_parsed', -# 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified' -# and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa -#2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '. fixed memory -# leak not closing url opener (JohnD); added dc:publisher support (MarekK); -# added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK) -#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed
tags in -# encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL); -# fixed relative URI processing for guid (skadz); added ICBM support; added -# base64 support -#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many -# blogspot.com sites); added _debug variable -#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing -#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available); -# added several new supported namespaces; fixed bug tracking naked markup in -# description; added support for enclosure; added support for source; re-added -# support for cloud which got dropped somehow; added support for expirationDate -#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking -# xml:base URI, one for documents that don't define one explicitly and one for -# documents that define an outer and an inner xml:base that goes out of scope -# before the end of the document -#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level -#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version'] -# will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized; -# added support for creativeCommons:license and cc:license; added support for -# full Atom content model in title, tagline, info, copyright, summary; fixed bug -# with gzip encoding (not always telling server we support it when we do) -#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail -# (dictionary of 'name', 'url', 'email'); map author to author_detail if author -# contains name + email address -#3.0b8 - 1/28/2004 - MAP - added support for contributor -#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added -# support for summary -#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from -# xml.util.iso8601 -#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain -# dangerous markup; fiddled with decodeEntities (not right); liberalized -# date parsing even further -#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right); -# added support to Atom 0.2 subtitle; added support for Atom content model -# in copyright; better sanitizing of dangerous HTML elements with end tags -# (script, frameset) -#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img, -# etc.) in embedded markup, in either HTML or XHTML form (
<br>, <br/>, <br />
) -#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under -# Python 2.1 -#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS; -# fixed bug capturing author and contributor URL; fixed bug resolving relative -# links in author and contributor URL; fixed bug resolvin relative links in -# generator URL; added support for recognizing RSS 1.0; passed Simon Fell's -# namespace tests, and included them permanently in the test suite with his -# permission; fixed namespace handling under Python 2.1 -#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15) -#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023 -#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei); -# use libxml2 (if available) -#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author -# name was in parentheses; removed ultra-problematic mxTidy support; patch to -# workaround crash in PyXML/expat when encountering invalid entities -# (MarkMoraes); support for textinput/textInput -#3.0b20 - 4/7/2004 - MAP - added CDF support -#3.0b21 - 4/14/2004 - MAP - added Hot RSS support -#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in -# results dict; changed results dict to allow getting values with results.key -# as well as results[key]; work around embedded illformed HTML with half -# a DOCTYPE; work around malformed Content-Type header; if character encoding -# is wrong, try several common ones before falling back to regexes (if this -# works, bozo_exception is set to CharacterEncodingOverride); fixed character -# encoding issues in BaseHTMLProcessor by tracking encoding and converting -# from Unicode to raw strings before feeding data to sgmllib.SGMLParser; -# convert each value in results to Unicode (if possible), even if using -# regex-based parsing -#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain -# high-bit characters in attributes in embedded HTML in description (thanks -# Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in -# FeedParserDict; tweaked FeedParserDict.has_key to return True if asking -# about a mapped key -#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and -# results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could -# cause the same encoding to be tried twice (even if it failed the first time); -# fixed DOCTYPE stripping when DOCTYPE contained entity declarations; -# better textinput and image tracking in illformed RSS 1.0 feeds -#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed -# my blink tag tests -#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that -# failed to parse utf-16 encoded feeds; made source into a FeedParserDict; -# duplicate admin:generatorAgent/@rdf:resource in generator_detail.url; -# added support for image; refactored parse() fallback logic to try other -# encodings if SAX parsing fails (previously it would only try other encodings -# if re-encoding failed); remove unichr madness in normalize_attrs now that -# we're properly tracking encoding in and out of BaseHTMLProcessor; set -# feed.language from root-level xml:lang; set entry.id from rdf:about; -# send Accept header -#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between -# iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are -# windows-1252); fixed regression that could cause the same encoding to be -# tried twice (even 
if it failed the first time) -#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types; -# recover from malformed content-type header parameter with no equals sign -# ('text/xml; charset:iso-8859-1') -#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities -# to Unicode equivalents in illformed feeds (aaronsw); added and -# passed tests for converting character entities to Unicode equivalents -# in illformed feeds (aaronsw); test for valid parsers when setting -# XML_AVAILABLE; make version and encoding available when server returns -# a 304; add handlers parameter to pass arbitrary urllib2 handlers (like -# digest auth or proxy support); add code to parse username/password -# out of url and send as basic authentication; expose downloading-related -# exceptions in bozo_exception (aaronsw); added __contains__ method to -# FeedParserDict (aaronsw); added publisher_detail (aaronsw) -#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always -# convert feed to UTF-8 before passing to XML parser; completely revamped -# logic for determining character encoding and attempting XML parsing -# (much faster); increased default timeout to 20 seconds; test for presence -# of Location header on redirects; added tests for many alternate character -# encodings; support various EBCDIC encodings; support UTF-16BE and -# UTF16-LE with or without a BOM; support UTF-8 with a BOM; support -# UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no -# XML parsers are available; added support for 'Content-encoding: deflate'; -# send blank 'Accept-encoding: ' header if neither gzip nor zlib modules -# are available -#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure -# problem tracking xml:base and xml:lang if element declares it, child -# doesn't, first grandchild redeclares it, and second grandchild doesn't; -# refactored date parsing; defined public registerDateHandler so callers -# can add support for additional date formats at runtime; added support -# for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added -# zopeCompatibilityHack() which turns FeedParserDict into a regular -# dictionary, required for Zope compatibility, and also makes command- -# line debugging easier because pprint module formats real dictionaries -# better than dictionary-like objects; added NonXMLContentType exception, -# which is stored in bozo_exception when a feed is served with a non-XML -# media type such as 'text/plain'; respect Content-Language as default -# language if not xml:lang is present; cloud dict is now FeedParserDict; -# generator dict is now FeedParserDict; better tracking of xml:lang, -# including support for xml:lang='' to unset the current language; -# recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default -# namespace; don't overwrite final status on redirects (scenarios: -# redirecting to a URL that returns 304, redirecting to a URL that -# redirects to another URL with a different type of redirect); add -# support for HTTP 303 redirects -#4.0 - MAP - support for relative URIs in xml:base attribute; fixed -# encoding issue with mxTidy (phopkins); preliminary support for RFC 3229; -# support for Atom 1.0; support for iTunes extensions; new 'tags' for -# categories/keywords/etc. 
as array of dict -# {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0 -# terminology; parse RFC 822-style dates with no time; lots of other -# bug fixes -#4.1 - MAP - removed socket timeout; added support for chardet library diff --git a/subscriber/index.yaml b/subscriber/index.yaml deleted file mode 100644 index 045e8e4..0000000 --- a/subscriber/index.yaml +++ /dev/null @@ -1,17 +0,0 @@ -indexes: - -# AUTOGENERATED - -# This index.yaml is automatically updated whenever the dev_appserver -# detects that a new type of query is run. If you want to manage the -# index.yaml file manually, remove the above marker line (the line -# saying "# AUTOGENERATED"). If you want to manage some indexes -# manually, move them above the marker line. The index.yaml file is -# automatically uploaded to the admin console when you next deploy -# your application using appcfg.py. - -# Unused in query history -- copied from input. -- kind: SomeUpdate - properties: - - name: updated - direction: desc diff --git a/subscriber/main.py b/subscriber/main.py deleted file mode 100755 index 1e16157..0000000 --- a/subscriber/main.py +++ /dev/null @@ -1,150 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2008 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Simple subscriber that aggregates all feeds together.""" - -import hashlib -import logging -import random -import wsgiref.handlers -from google.appengine.ext import db -from google.appengine.ext import webapp -from google.appengine.ext.webapp import template - -import feedparser -import simplejson - - -class SomeUpdate(db.Model): - """Some topic update. - - Key name will be a hash of the feed source and item ID. - """ - title = db.TextProperty() - content = db.TextProperty() - updated = db.DateTimeProperty(auto_now_add=True) - link = db.TextProperty() - - -class InputHandler(webapp.RequestHandler): - """Handles feed input and subscription""" - - def get(self): - # Just subscribe to everything. - self.response.out.write(self.request.get('hub.challenge')) - self.response.set_status(200) - - def post(self): - body = self.request.body.decode('utf-8') - logging.info('Post body is %d characters', len(body)) - - data = feedparser.parse(self.request.body) - if data.bozo: - logging.error('Bozo feed data. %s: %r', - data.bozo_exception.__class__.__name__, - data.bozo_exception) - if (hasattr(data.bozo_exception, 'getLineNumber') and - hasattr(data.bozo_exception, 'getMessage')): - line = data.bozo_exception.getLineNumber() - logging.error('Line %d: %s', line, data.bozo_exception.getMessage()) - segment = self.request.body.split('\n')[line-1] - logging.info('Body segment with error: %r', segment.decode('utf-8')) - return self.response.set_status(500) - - update_list = [] - logging.info('Found %d entries', len(data.entries)) - for entry in data.entries: - if hasattr(entry, 'content'): - # This is Atom. 
- entry_id = entry.id - content = entry.content[0].value - link = entry.get('link', '') - title = entry.get('title', '') - else: - content = entry.get('description', '') - title = entry.get('title', '') - link = entry.get('link', '') - entry_id = (entry.get('id', '') or link or title or content) - - logging.info('Found entry with title = "%s", id = "%s", ' - 'link = "%s", content = "%s"', - title, entry_id, link, content) - update_list.append(SomeUpdate( - key_name='key_' + hashlib.sha1(link + '\n' + entry_id).hexdigest(), - title=title, - content=content, - link=link)) - db.put(update_list) - self.response.set_status(200) - self.response.out.write("Aight. Saved."); - - -class DebugHandler(webapp.RequestHandler): - """Debug handler for simulating events.""" - def get(self): - self.response.out.write(""" - - -
-
Simulate feed:
- -
-
- - -""") - - -class ViewHandler(webapp.RequestHandler): - """Shows the items to anyone as HTML.""" - - def get(self): - context = dict(entries=SomeUpdate.gql('ORDER BY updated DESC').fetch(50)) - self.response.out.write(template.render('subscriber.html', context)) - - -class ItemsHandler(webapp.RequestHandler): - """Gets the items.""" - - def get(self): - encoder = simplejson.JSONEncoder() - stuff = [] - for update in SomeUpdate.gql('ORDER BY updated DESC').fetch(10): - stuff.append({'time': str(update.updated), - 'title': update.title, - 'content': update.content, - 'source': update.link}) - self.response.out.write(encoder.encode(stuff)) - - -application = webapp.WSGIApplication( - [ - (r'/items', ItemsHandler), - (r'/debug', DebugHandler), - # Wildcard below so we can test multiple subscribers in a single app. - (r'/subscriber.*', InputHandler), - (r'/', ViewHandler), - ], - debug=True) - - -def main(): - wsgiref.handlers.CGIHandler().run(application) - - -if __name__ == '__main__': - main() diff --git a/subscriber/simplejson/__init__.py b/subscriber/simplejson/__init__.py deleted file mode 100644 index 4683a28..0000000 --- a/subscriber/simplejson/__init__.py +++ /dev/null @@ -1,378 +0,0 @@ -r""" -A simple, fast, extensible JSON encoder and decoder - -JSON (JavaScript Object Notation) is a subset of -JavaScript syntax (ECMA-262 3rd edition) used as a lightweight data -interchange format. - -simplejson exposes an API familiar to uses of the standard library -marshal and pickle modules. - -Encoding basic Python object hierarchies:: - - >>> import simplejson - >>> simplejson.dumps(['foo', {'bar': ('baz', None, 1.0, 2)}]) - '["foo", {"bar": ["baz", null, 1.0, 2]}]' - >>> print simplejson.dumps("\"foo\bar") - "\"foo\bar" - >>> print simplejson.dumps(u'\u1234') - "\u1234" - >>> print simplejson.dumps('\\') - "\\" - >>> print simplejson.dumps({"c": 0, "b": 0, "a": 0}, sort_keys=True) - {"a": 0, "b": 0, "c": 0} - >>> from StringIO import StringIO - >>> io = StringIO() - >>> simplejson.dump(['streaming API'], io) - >>> io.getvalue() - '["streaming API"]' - -Compact encoding:: - - >>> import simplejson - >>> compact = simplejson.dumps([1,2,3,{'4': 5, '6': 7}], separators=(',',':')) - >>> # Can't assume dict ordering - >>> compact in ('[1,2,3,{"4":5,"6":7}]', '[1,2,3,{"6":7,"4":5}]') - True - -Pretty printing:: - - >>> import simplejson - >>> print simplejson.dumps({'4': 5, '6': 7}, sort_keys=True, indent=4) - { - "4": 5, - "6": 7 - } - -Decoding JSON:: - - >>> import simplejson - >>> simplejson.loads('["foo", {"bar":["baz", null, 1.0, 2]}]') == ["foo", {"bar":["baz", None, 1.0, 2]}] - True - >>> simplejson.loads('"\\"foo\\bar"') == '"foo\x08ar' - True - >>> from StringIO import StringIO - >>> io = StringIO('["streaming API"]') - >>> simplejson.load(io) == ["streaming API"] - True - -Specializing JSON object decoding:: - - >>> import simplejson - >>> def as_complex(dct): - ... if '__complex__' in dct: - ... return complex(dct['real'], dct['imag']) - ... return dct - ... - >>> simplejson.loads('{"__complex__": true, "real": 1, "imag": 2}', - ... object_hook=as_complex) - (1+2j) - >>> import decimal - >>> simplejson.loads('1.1', parse_float=decimal.Decimal) - Decimal("1.1") - -Extending JSONEncoder:: - - >>> import simplejson - >>> class ComplexEncoder(simplejson.JSONEncoder): - ... def default(self, obj): - ... if isinstance(obj, complex): - ... return [obj.real, obj.imag] - ... return simplejson.JSONEncoder.default(self, obj) - ... 
- >>> dumps(2 + 1j, cls=ComplexEncoder) - '[2.0, 1.0]' - >>> ComplexEncoder().encode(2 + 1j) - '[2.0, 1.0]' - >>> ''.join(ComplexEncoder().iterencode(2 + 1j)) - '[2.0, 1.0]' - - -Using simplejson from the shell to validate and -pretty-print:: - - $ echo '{"json":"obj"}' | python -msimplejson.tool - { - "json": "obj" - } - $ echo '{ 1.2:3.4}' | python -msimplejson.tool - Expecting property name: line 1 column 2 (char 2) - -Note that the JSON produced by this module's default settings -is a subset of YAML, so it may be used as a serializer for that as well. -""" -__version__ = '2.0.1' -__all__ = [ - 'dump', 'dumps', 'load', 'loads', - 'JSONDecoder', 'JSONEncoder', -] - -if __name__ == '__main__': - import warnings - warnings.warn('python -msimplejson is deprecated, use python -msiplejson.tool', DeprecationWarning) - from simplejson.decoder import JSONDecoder - from simplejson.encoder import JSONEncoder -else: - from decoder import JSONDecoder - from encoder import JSONEncoder - -_default_encoder = JSONEncoder( - skipkeys=False, - ensure_ascii=True, - check_circular=True, - allow_nan=True, - indent=None, - separators=None, - encoding='utf-8', - default=None, -) - -def dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True, - allow_nan=True, cls=None, indent=None, separators=None, - encoding='utf-8', default=None, **kw): - """ - Serialize ``obj`` as a JSON formatted stream to ``fp`` (a - ``.write()``-supporting file-like object). - - If ``skipkeys`` is ``True`` then ``dict`` keys that are not basic types - (``str``, ``unicode``, ``int``, ``long``, ``float``, ``bool``, ``None``) - will be skipped instead of raising a ``TypeError``. - - If ``ensure_ascii`` is ``False``, then the some chunks written to ``fp`` - may be ``unicode`` instances, subject to normal Python ``str`` to - ``unicode`` coercion rules. Unless ``fp.write()`` explicitly - understands ``unicode`` (as in ``codecs.getwriter()``) this is likely - to cause an error. - - If ``check_circular`` is ``False``, then the circular reference check - for container types will be skipped and a circular reference will - result in an ``OverflowError`` (or worse). - - If ``allow_nan`` is ``False``, then it will be a ``ValueError`` to - serialize out of range ``float`` values (``nan``, ``inf``, ``-inf``) - in strict compliance of the JSON specification, instead of using the - JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``). - - If ``indent`` is a non-negative integer, then JSON array elements and object - members will be pretty-printed with that indent level. An indent level - of 0 will only insert newlines. ``None`` is the most compact representation. - - If ``separators`` is an ``(item_separator, dict_separator)`` tuple - then it will be used instead of the default ``(', ', ': ')`` separators. - ``(',', ':')`` is the most compact JSON representation. - - ``encoding`` is the character encoding for str instances, default is UTF-8. - - ``default(obj)`` is a function that should return a serializable version - of obj or raise TypeError. The default simply raises TypeError. - - To use a custom ``JSONEncoder`` subclass (e.g. one that overrides the - ``.default()`` method to serialize additional types), specify it with - the ``cls`` kwarg. 
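    For illustration only, a sketch of the ``default`` hook described above, shown
    with ``dumps`` for brevity (the ``encode_datetime`` helper and the sample value
    are hypothetical, not part of this module)::

        >>> import datetime
        >>> import simplejson
        >>> def encode_datetime(obj):
        ...     # Called only for objects the encoder cannot serialize natively.
        ...     if isinstance(obj, datetime.datetime):
        ...         return obj.isoformat()
        ...     raise TypeError(repr(obj) + ' is not JSON serializable')
        ...
        >>> simplejson.dumps({'now': datetime.datetime(2009, 1, 1, 12, 0)},
        ...                  default=encode_datetime)
        '{"now": "2009-01-01T12:00:00"}'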
- """ - # cached encoder - if (skipkeys is False and ensure_ascii is True and - check_circular is True and allow_nan is True and - cls is None and indent is None and separators is None and - encoding == 'utf-8' and default is None and not kw): - iterable = _default_encoder.iterencode(obj) - else: - if cls is None: - cls = JSONEncoder - iterable = cls(skipkeys=skipkeys, ensure_ascii=ensure_ascii, - check_circular=check_circular, allow_nan=allow_nan, indent=indent, - separators=separators, encoding=encoding, - default=default, **kw).iterencode(obj) - # could accelerate with writelines in some versions of Python, at - # a debuggability cost - for chunk in iterable: - fp.write(chunk) - - -def dumps(obj, skipkeys=False, ensure_ascii=True, check_circular=True, - allow_nan=True, cls=None, indent=None, separators=None, - encoding='utf-8', default=None, **kw): - """ - Serialize ``obj`` to a JSON formatted ``str``. - - If ``skipkeys`` is ``True`` then ``dict`` keys that are not basic types - (``str``, ``unicode``, ``int``, ``long``, ``float``, ``bool``, ``None``) - will be skipped instead of raising a ``TypeError``. - - If ``ensure_ascii`` is ``False``, then the return value will be a - ``unicode`` instance subject to normal Python ``str`` to ``unicode`` - coercion rules instead of being escaped to an ASCII ``str``. - - If ``check_circular`` is ``False``, then the circular reference check - for container types will be skipped and a circular reference will - result in an ``OverflowError`` (or worse). - - If ``allow_nan`` is ``False``, then it will be a ``ValueError`` to - serialize out of range ``float`` values (``nan``, ``inf``, ``-inf``) in - strict compliance of the JSON specification, instead of using the - JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``). - - If ``indent`` is a non-negative integer, then JSON array elements and - object members will be pretty-printed with that indent level. An indent - level of 0 will only insert newlines. ``None`` is the most compact - representation. - - If ``separators`` is an ``(item_separator, dict_separator)`` tuple - then it will be used instead of the default ``(', ', ': ')`` separators. - ``(',', ':')`` is the most compact JSON representation. - - ``encoding`` is the character encoding for str instances, default is UTF-8. - - ``default(obj)`` is a function that should return a serializable version - of obj or raise TypeError. The default simply raises TypeError. - - To use a custom ``JSONEncoder`` subclass (e.g. one that overrides the - ``.default()`` method to serialize additional types), specify it with - the ``cls`` kwarg. - """ - # cached encoder - if (skipkeys is False and ensure_ascii is True and - check_circular is True and allow_nan is True and - cls is None and indent is None and separators is None and - encoding == 'utf-8' and default is None and not kw): - return _default_encoder.encode(obj) - if cls is None: - cls = JSONEncoder - return cls( - skipkeys=skipkeys, ensure_ascii=ensure_ascii, - check_circular=check_circular, allow_nan=allow_nan, indent=indent, - separators=separators, encoding=encoding, default=default, - **kw).encode(obj) - - -_default_decoder = JSONDecoder(encoding=None, object_hook=None) - - -def load(fp, encoding=None, cls=None, object_hook=None, parse_float=None, - parse_int=None, parse_constant=None, **kw): - """ - Deserialize ``fp`` (a ``.read()``-supporting file-like object containing - a JSON document) to a Python object. 
- - If the contents of ``fp`` is encoded with an ASCII based encoding other - than utf-8 (e.g. latin-1), then an appropriate ``encoding`` name must - be specified. Encodings that are not ASCII based (such as UCS-2) are - not allowed, and should be wrapped with - ``codecs.getreader(fp)(encoding)``, or simply decoded to a ``unicode`` - object and passed to ``loads()`` - - ``object_hook`` is an optional function that will be called with the - result of any object literal decode (a ``dict``). The return value of - ``object_hook`` will be used instead of the ``dict``. This feature - can be used to implement custom decoders (e.g. JSON-RPC class hinting). - - To use a custom ``JSONDecoder`` subclass, specify it with the ``cls`` - kwarg. - """ - return loads(fp.read(), - encoding=encoding, cls=cls, object_hook=object_hook, - parse_float=parse_float, parse_int=parse_int, - parse_constant=parse_constant, **kw) - - -def loads(s, encoding=None, cls=None, object_hook=None, parse_float=None, - parse_int=None, parse_constant=None, **kw): - """ - Deserialize ``s`` (a ``str`` or ``unicode`` instance containing a JSON - document) to a Python object. - - If ``s`` is a ``str`` instance and is encoded with an ASCII based encoding - other than utf-8 (e.g. latin-1) then an appropriate ``encoding`` name - must be specified. Encodings that are not ASCII based (such as UCS-2) - are not allowed and should be decoded to ``unicode`` first. - - ``object_hook`` is an optional function that will be called with the - result of any object literal decode (a ``dict``). The return value of - ``object_hook`` will be used instead of the ``dict``. This feature - can be used to implement custom decoders (e.g. JSON-RPC class hinting). - - ``parse_float``, if specified, will be called with the string - of every JSON float to be decoded. By default this is equivalent to - float(num_str). This can be used to use another datatype or parser - for JSON floats (e.g. decimal.Decimal). - - ``parse_int``, if specified, will be called with the string - of every JSON int to be decoded. By default this is equivalent to - int(num_str). This can be used to use another datatype or parser - for JSON integers (e.g. float). - - ``parse_constant``, if specified, will be called with one of the - following strings: -Infinity, Infinity, NaN, null, true, false. - This can be used to raise an exception if invalid JSON numbers - are encountered. - - To use a custom ``JSONDecoder`` subclass, specify it with the ``cls`` - kwarg. - """ - if (cls is None and encoding is None and object_hook is None and - parse_int is None and parse_float is None and - parse_constant is None and not kw): - return _default_decoder.decode(s) - if cls is None: - cls = JSONDecoder - if object_hook is not None: - kw['object_hook'] = object_hook - if parse_float is not None: - kw['parse_float'] = parse_float - if parse_int is not None: - kw['parse_int'] = parse_int - if parse_constant is not None: - kw['parse_constant'] = parse_constant - return cls(encoding=encoding, **kw).decode(s) - - -# -# Compatibility cruft from other libraries -# - - -def decode(s): - """ - demjson, python-cjson API compatibility hook. Use loads(s) instead. - """ - import warnings - warnings.warn("simplejson.loads(s) should be used instead of decode(s)", - DeprecationWarning) - return loads(s) - - -def encode(obj): - """ - demjson, python-cjson compatibility hook. Use dumps(s) instead. 
- """ - import warnings - warnings.warn("simplejson.dumps(s) should be used instead of encode(s)", - DeprecationWarning) - return dumps(obj) - - -def read(s): - """ - jsonlib, JsonUtils, python-json, json-py API compatibility hook. - Use loads(s) instead. - """ - import warnings - warnings.warn("simplejson.loads(s) should be used instead of read(s)", - DeprecationWarning) - return loads(s) - - -def write(obj): - """ - jsonlib, JsonUtils, python-json, json-py API compatibility hook. - Use dumps(s) instead. - """ - import warnings - warnings.warn("simplejson.dumps(s) should be used instead of write(s)", - DeprecationWarning) - return dumps(obj) - - -if __name__ == '__main__': - import simplejson.tool - simplejson.tool.main() diff --git a/subscriber/simplejson/_speedups.c b/subscriber/simplejson/_speedups.c deleted file mode 100644 index 620c916..0000000 --- a/subscriber/simplejson/_speedups.c +++ /dev/null @@ -1,2094 +0,0 @@ -#include "Python.h" -#include "structmember.h" -#if PY_VERSION_HEX < 0x02060000 && !defined(Py_TYPE) -#define Py_TYPE(ob) (((PyObject*)(ob))->ob_type) -#endif -#if PY_VERSION_HEX < 0x02050000 && !defined(PY_SSIZE_T_MIN) -typedef int Py_ssize_t; -#define PY_SSIZE_T_MAX INT_MAX -#define PY_SSIZE_T_MIN INT_MIN -#define PyInt_FromSsize_t PyInt_FromLong -#define PyInt_AsSsize_t PyInt_AsLong -#endif - -#ifdef __GNUC__ -#define UNUSED __attribute__((__unused__)) -#else -#define UNUSED -#endif - -#define DEFAULT_ENCODING "utf-8" - -#define PyScanner_Check(op) PyObject_TypeCheck(op, &PyScannerType) -#define PyScanner_CheckExact(op) (Py_TYPE(op) == &PyScannerType) -#define PyEncoder_Check(op) PyObject_TypeCheck(op, &PyEncoderType) -#define PyEncoder_CheckExact(op) (Py_TYPE(op) == &PyEncoderType) - -static PyTypeObject PyScannerType; -static PyTypeObject PyEncoderType; - -typedef struct _PyScannerObject { - PyObject_HEAD - PyObject *encoding; - PyObject *strict; - PyObject *object_hook; - PyObject *parse_float; - PyObject *parse_int; - PyObject *parse_constant; -} PyScannerObject; - -static PyMemberDef scanner_members[] = { - {"encoding", T_OBJECT, offsetof(PyScannerObject, encoding), READONLY, "encoding"}, - {"strict", T_OBJECT, offsetof(PyScannerObject, strict), READONLY, "strict"}, - {"object_hook", T_OBJECT, offsetof(PyScannerObject, object_hook), READONLY, "object_hook"}, - {"parse_float", T_OBJECT, offsetof(PyScannerObject, parse_float), READONLY, "parse_float"}, - {"parse_int", T_OBJECT, offsetof(PyScannerObject, parse_int), READONLY, "parse_int"}, - {"parse_constant", T_OBJECT, offsetof(PyScannerObject, parse_constant), READONLY, "parse_constant"}, - {NULL} -}; - -typedef struct _PyEncoderObject { - PyObject_HEAD - PyObject *markers; - PyObject *defaultfn; - PyObject *encoder; - PyObject *indent; - PyObject *key_separator; - PyObject *item_separator; - PyObject *sort_keys; - PyObject *skipkeys; - int fast_encode; - int allow_nan; -} PyEncoderObject; - -static PyMemberDef encoder_members[] = { - {"markers", T_OBJECT, offsetof(PyEncoderObject, markers), READONLY, "markers"}, - {"default", T_OBJECT, offsetof(PyEncoderObject, defaultfn), READONLY, "default"}, - {"encoder", T_OBJECT, offsetof(PyEncoderObject, encoder), READONLY, "encoder"}, - {"indent", T_OBJECT, offsetof(PyEncoderObject, indent), READONLY, "indent"}, - {"key_separator", T_OBJECT, offsetof(PyEncoderObject, key_separator), READONLY, "key_separator"}, - {"item_separator", T_OBJECT, offsetof(PyEncoderObject, item_separator), READONLY, "item_separator"}, - {"sort_keys", T_OBJECT, offsetof(PyEncoderObject, 
sort_keys), READONLY, "sort_keys"}, - {"skipkeys", T_OBJECT, offsetof(PyEncoderObject, skipkeys), READONLY, "skipkeys"}, - {NULL} -}; - -static Py_ssize_t -ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars); -static PyObject * -ascii_escape_unicode(PyObject *pystr); -static PyObject * -ascii_escape_str(PyObject *pystr); -static PyObject * -py_encode_basestring_ascii(PyObject* self UNUSED, PyObject *pystr); -void init_speedups(void); -static PyObject * -scan_once_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx); -static PyObject * -scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx); -static PyObject * -_build_rval_index_tuple(PyObject *rval, Py_ssize_t idx); -static int -scanner_init(PyObject *self, PyObject *args, PyObject *kwds); -static void -scanner_dealloc(PyObject *self); -static int -encoder_init(PyObject *self, PyObject *args, PyObject *kwds); -static void -encoder_dealloc(PyObject *self); -static int -encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level); -static int -encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level); -static int -encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level); -static PyObject * -_encoded_const(PyObject *const); -static void -raise_errmsg(char *msg, PyObject *s, Py_ssize_t end); -static PyObject * -encoder_encode_string(PyEncoderObject *s, PyObject *obj); -static int -_convertPyInt_AsSsize_t(PyObject *o, Py_ssize_t *size_ptr); -static PyObject * -_convertPyInt_FromSsize_t(Py_ssize_t *size_ptr); -static PyObject * -encoder_encode_float(PyEncoderObject *s, PyObject *obj); - -#define S_CHAR(c) (c >= ' ' && c <= '~' && c != '\\' && c != '"') -#define IS_WHITESPACE(c) (((c) == ' ') || ((c) == '\t') || ((c) == '\n') || ((c) == '\r')) - -#define MIN_EXPANSION 6 -#ifdef Py_UNICODE_WIDE -#define MAX_EXPANSION (2 * MIN_EXPANSION) -#else -#define MAX_EXPANSION MIN_EXPANSION -#endif - -static int -_convertPyInt_AsSsize_t(PyObject *o, Py_ssize_t *size_ptr) -{ - *size_ptr = PyInt_AsSsize_t(o); - if (*size_ptr == -1 && PyErr_Occurred()); - return 1; - return 0; -} - -static PyObject * -_convertPyInt_FromSsize_t(Py_ssize_t *size_ptr) -{ - return PyInt_FromSsize_t(*size_ptr); -} - -static Py_ssize_t -ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars) -{ - output[chars++] = '\\'; - switch (c) { - case '\\': output[chars++] = (char)c; break; - case '"': output[chars++] = (char)c; break; - case '\b': output[chars++] = 'b'; break; - case '\f': output[chars++] = 'f'; break; - case '\n': output[chars++] = 'n'; break; - case '\r': output[chars++] = 'r'; break; - case '\t': output[chars++] = 't'; break; - default: -#ifdef Py_UNICODE_WIDE - if (c >= 0x10000) { - /* UTF-16 surrogate pair */ - Py_UNICODE v = c - 0x10000; - c = 0xd800 | ((v >> 10) & 0x3ff); - output[chars++] = 'u'; - output[chars++] = "0123456789abcdef"[(c >> 12) & 0xf]; - output[chars++] = "0123456789abcdef"[(c >> 8) & 0xf]; - output[chars++] = "0123456789abcdef"[(c >> 4) & 0xf]; - output[chars++] = "0123456789abcdef"[(c ) & 0xf]; - c = 0xdc00 | (v & 0x3ff); - output[chars++] = '\\'; - } -#endif - output[chars++] = 'u'; - output[chars++] = "0123456789abcdef"[(c >> 12) & 0xf]; - output[chars++] = "0123456789abcdef"[(c >> 8) & 0xf]; - output[chars++] = "0123456789abcdef"[(c >> 4) & 0xf]; - output[chars++] = "0123456789abcdef"[(c ) & 0xf]; - } - return chars; -} - -static PyObject * -ascii_escape_unicode(PyObject *pystr) -{ - 
Py_ssize_t i; - Py_ssize_t input_chars; - Py_ssize_t output_size; - Py_ssize_t chars; - PyObject *rval; - char *output; - Py_UNICODE *input_unicode; - - input_chars = PyUnicode_GET_SIZE(pystr); - input_unicode = PyUnicode_AS_UNICODE(pystr); - - /* One char input can be up to 6 chars output, estimate 4 of these */ - output_size = 2 + (MIN_EXPANSION * 4) + input_chars; - rval = PyString_FromStringAndSize(NULL, output_size); - if (rval == NULL) { - return NULL; - } - output = PyString_AS_STRING(rval); - chars = 0; - output[chars++] = '"'; - for (i = 0; i < input_chars; i++) { - Py_UNICODE c = input_unicode[i]; - if (S_CHAR(c)) { - output[chars++] = (char)c; - } - else { - chars = ascii_escape_char(c, output, chars); - } - if (output_size - chars < (1 + MAX_EXPANSION)) { - /* There's more than four, so let's resize by a lot */ - output_size *= 2; - /* This is an upper bound */ - if (output_size > 2 + (input_chars * MAX_EXPANSION)) { - output_size = 2 + (input_chars * MAX_EXPANSION); - } - if (_PyString_Resize(&rval, output_size) == -1) { - return NULL; - } - output = PyString_AS_STRING(rval); - } - } - output[chars++] = '"'; - if (_PyString_Resize(&rval, chars) == -1) { - return NULL; - } - return rval; -} - -static PyObject * -ascii_escape_str(PyObject *pystr) -{ - Py_ssize_t i; - Py_ssize_t input_chars; - Py_ssize_t output_size; - Py_ssize_t chars; - PyObject *rval; - char *output; - char *input_str; - - input_chars = PyString_GET_SIZE(pystr); - input_str = PyString_AS_STRING(pystr); - - /* Fast path for a string that's already ASCII */ - for (i = 0; i < input_chars; i++) { - Py_UNICODE c = (Py_UNICODE)(unsigned char)input_str[i]; - if (!S_CHAR(c)) { - /* If we have to escape something, scan the string for unicode */ - Py_ssize_t j; - for (j = i; j < input_chars; j++) { - c = (Py_UNICODE)(unsigned char)input_str[j]; - if (c > 0x7f) { - /* We hit a non-ASCII character, bail to unicode mode */ - PyObject *uni; - uni = PyUnicode_DecodeUTF8(input_str, input_chars, "strict"); - if (uni == NULL) { - return NULL; - } - rval = ascii_escape_unicode(uni); - Py_DECREF(uni); - return rval; - } - } - break; - } - } - - if (i == input_chars) { - /* Input is already ASCII */ - output_size = 2 + input_chars; - } - else { - /* One char input can be up to 6 chars output, estimate 4 of these */ - output_size = 2 + (MIN_EXPANSION * 4) + input_chars; - } - rval = PyString_FromStringAndSize(NULL, output_size); - if (rval == NULL) { - return NULL; - } - output = PyString_AS_STRING(rval); - output[0] = '"'; - - /* We know that everything up to i is ASCII already */ - chars = i + 1; - memcpy(&output[1], input_str, i); - - for (; i < input_chars; i++) { - Py_UNICODE c = (Py_UNICODE)(unsigned char)input_str[i]; - if (S_CHAR(c)) { - output[chars++] = (char)c; - } - else { - chars = ascii_escape_char(c, output, chars); - } - /* An ASCII char can't possibly expand to a surrogate! 
*/ - if (output_size - chars < (1 + MIN_EXPANSION)) { - /* There's more than four, so let's resize by a lot */ - output_size *= 2; - if (output_size > 2 + (input_chars * MIN_EXPANSION)) { - output_size = 2 + (input_chars * MIN_EXPANSION); - } - if (_PyString_Resize(&rval, output_size) == -1) { - return NULL; - } - output = PyString_AS_STRING(rval); - } - } - output[chars++] = '"'; - if (_PyString_Resize(&rval, chars) == -1) { - return NULL; - } - return rval; -} - -static void -raise_errmsg(char *msg, PyObject *s, Py_ssize_t end) -{ - static PyObject *errmsg_fn = NULL; - PyObject *pymsg; - if (errmsg_fn == NULL) { - PyObject *decoder = PyImport_ImportModule("simplejson.decoder"); - if (decoder == NULL) - return; - errmsg_fn = PyObject_GetAttrString(decoder, "errmsg"); - Py_DECREF(decoder); - if (errmsg_fn == NULL) - return; - } - pymsg = PyObject_CallFunction(errmsg_fn, "(zOO&)", msg, s, _convertPyInt_FromSsize_t, &end); - if (pymsg) { - PyErr_SetObject(PyExc_ValueError, pymsg); - Py_DECREF(pymsg); - } -} - -static PyObject * -join_list_string(PyObject *lst) -{ - static PyObject *joinfn = NULL; - if (joinfn == NULL) { - PyObject *ustr = PyString_FromStringAndSize(NULL, 0); - if (ustr == NULL) - return NULL; - - joinfn = PyObject_GetAttrString(ustr, "join"); - Py_DECREF(ustr); - if (joinfn == NULL) - return NULL; - } - return PyObject_CallFunctionObjArgs(joinfn, lst, NULL); -} - -static PyObject * -_build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) { - /* - steal a reference to rval, returns (rval, idx) - */ - if (rval == NULL) { - return NULL; - } - PyObject *tpl; - PyObject *pyidx = PyInt_FromSsize_t(idx); - if (pyidx == NULL) { - Py_DECREF(rval); - return NULL; - } - tpl = PyTuple_Pack(2, rval, pyidx); - Py_DECREF(pyidx); - Py_DECREF(rval); - return tpl; -} - -static PyObject * -scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_ssize_t *next_end_ptr) -{ - PyObject *rval; - Py_ssize_t len = PyString_GET_SIZE(pystr); - Py_ssize_t begin = end - 1; - Py_ssize_t next = begin; - int has_unicode = 0; - char *buf = PyString_AS_STRING(pystr); - PyObject *chunks = PyList_New(0); - if (chunks == NULL) { - goto bail; - } - if (end < 0 || len <= end) { - PyErr_SetString(PyExc_ValueError, "end is out of bounds"); - goto bail; - } - while (1) { - /* Find the end of the string or the next escape */ - Py_UNICODE c = 0; - PyObject *chunk = NULL; - for (next = end; next < len; next++) { - c = (unsigned char)buf[next]; - if (c == '"' || c == '\\') { - break; - } - else if (strict && c <= 0x1f) { - raise_errmsg("Invalid control character at", pystr, next); - goto bail; - } - else if (c > 0x7f) { - has_unicode = 1; - } - } - if (!(c == '"' || c == '\\')) { - raise_errmsg("Unterminated string starting at", pystr, begin); - goto bail; - } - /* Pick up this chunk if it's not zero length */ - if (next != end) { - PyObject *strchunk = PyString_FromStringAndSize(&buf[end], next - end); - if (strchunk == NULL) { - goto bail; - } - if (has_unicode) { - chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL); - Py_DECREF(strchunk); - if (chunk == NULL) { - goto bail; - } - } - else { - chunk = strchunk; - } - if (PyList_Append(chunks, chunk)) { - goto bail; - } - Py_DECREF(chunk); - } - next++; - if (c == '"') { - end = next; - break; - } - if (next == len) { - raise_errmsg("Unterminated string starting at", pystr, begin); - goto bail; - } - c = buf[next]; - if (c != 'u') { - /* Non-unicode backslash escapes */ - end = next + 1; - switch (c) { - case '"': break; - case '\\': 
break; - case '/': break; - case 'b': c = '\b'; break; - case 'f': c = '\f'; break; - case 'n': c = '\n'; break; - case 'r': c = '\r'; break; - case 't': c = '\t'; break; - default: c = 0; - } - if (c == 0) { - raise_errmsg("Invalid \\escape", pystr, end - 2); - goto bail; - } - } - else { - c = 0; - next++; - end = next + 4; - if (end >= len) { - raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1); - goto bail; - } - /* Decode 4 hex digits */ - for (; next < end; next++) { - c <<= 4; - Py_UNICODE digit = buf[next]; - switch (digit) { - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - c |= (digit - '0'); break; - case 'a': case 'b': case 'c': case 'd': case 'e': - case 'f': - c |= (digit - 'a' + 10); break; - case 'A': case 'B': case 'C': case 'D': case 'E': - case 'F': - c |= (digit - 'A' + 10); break; - default: - raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); - goto bail; - } - } -#ifdef Py_UNICODE_WIDE - /* Surrogate pair */ - if ((c & 0xfc00) == 0xd800) { - Py_UNICODE c2 = 0; - if (end + 6 >= len) { - raise_errmsg("Unpaired high surrogate", pystr, end - 5); - goto bail; - } - if (buf[next++] != '\\' || buf[next++] != 'u') { - raise_errmsg("Unpaired high surrogate", pystr, end - 5); - goto bail; - } - end += 6; - /* Decode 4 hex digits */ - for (; next < end; next++) { - c2 <<= 4; - Py_UNICODE digit = buf[next]; - switch (digit) { - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - c2 |= (digit - '0'); break; - case 'a': case 'b': case 'c': case 'd': case 'e': - case 'f': - c2 |= (digit - 'a' + 10); break; - case 'A': case 'B': case 'C': case 'D': case 'E': - case 'F': - c2 |= (digit - 'A' + 10); break; - default: - raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); - goto bail; - } - } - if ((c2 & 0xfc00) != 0xdc00) { - raise_errmsg("Unpaired high surrogate", pystr, end - 5); - goto bail; - } - c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); - } - else if ((c & 0xfc00) == 0xdc00) { - raise_errmsg("Unpaired low surrogate", pystr, end - 5); - goto bail; - } -#endif - } - if (c > 0x7f) { - has_unicode = 1; - } - if (has_unicode) { - chunk = PyUnicode_FromUnicode(&c, 1); - if (chunk == NULL) { - goto bail; - } - } - else { - char c_char = Py_CHARMASK(c); - chunk = PyString_FromStringAndSize(&c_char, 1); - if (chunk == NULL) { - goto bail; - } - } - if (PyList_Append(chunks, chunk)) { - goto bail; - } - Py_DECREF(chunk); - } - - rval = join_list_string(chunks); - if (rval == NULL) { - goto bail; - } - Py_DECREF(chunks); - chunks = NULL; - *next_end_ptr = end; - return rval; -bail: - *next_end_ptr = -1; - Py_XDECREF(chunks); - return NULL; -} - - -static PyObject * -scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr) -{ - PyObject *rval; - Py_ssize_t len = PyUnicode_GET_SIZE(pystr); - Py_ssize_t begin = end - 1; - Py_ssize_t next = begin; - const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr); - PyObject *chunks = PyList_New(0); - if (chunks == NULL) { - goto bail; - } - if (end < 0 || len <= end) { - PyErr_SetString(PyExc_ValueError, "end is out of bounds"); - goto bail; - } - while (1) { - /* Find the end of the string or the next escape */ - Py_UNICODE c = 0; - PyObject *chunk = NULL; - for (next = end; next < len; next++) { - c = buf[next]; - if (c == '"' || c == '\\') { - break; - } - else if (strict && c <= 0x1f) { - raise_errmsg("Invalid control character at", pystr, next); - goto bail; - } - } - if (!(c == '"' || c 
== '\\')) { - raise_errmsg("Unterminated string starting at", pystr, begin); - goto bail; - } - /* Pick up this chunk if it's not zero length */ - if (next != end) { - chunk = PyUnicode_FromUnicode(&buf[end], next - end); - if (chunk == NULL) { - goto bail; - } - if (PyList_Append(chunks, chunk)) { - goto bail; - } - Py_DECREF(chunk); - } - next++; - if (c == '"') { - end = next; - break; - } - if (next == len) { - raise_errmsg("Unterminated string starting at", pystr, begin); - goto bail; - } - c = buf[next]; - if (c != 'u') { - /* Non-unicode backslash escapes */ - end = next + 1; - switch (c) { - case '"': break; - case '\\': break; - case '/': break; - case 'b': c = '\b'; break; - case 'f': c = '\f'; break; - case 'n': c = '\n'; break; - case 'r': c = '\r'; break; - case 't': c = '\t'; break; - default: c = 0; - } - if (c == 0) { - raise_errmsg("Invalid \\escape", pystr, end - 2); - goto bail; - } - } - else { - c = 0; - next++; - end = next + 4; - if (end >= len) { - raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1); - goto bail; - } - /* Decode 4 hex digits */ - for (; next < end; next++) { - c <<= 4; - Py_UNICODE digit = buf[next]; - switch (digit) { - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - c |= (digit - '0'); break; - case 'a': case 'b': case 'c': case 'd': case 'e': - case 'f': - c |= (digit - 'a' + 10); break; - case 'A': case 'B': case 'C': case 'D': case 'E': - case 'F': - c |= (digit - 'A' + 10); break; - default: - raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); - goto bail; - } - } -#ifdef Py_UNICODE_WIDE - /* Surrogate pair */ - if ((c & 0xfc00) == 0xd800) { - Py_UNICODE c2 = 0; - if (end + 6 >= len) { - raise_errmsg("Unpaired high surrogate", pystr, end - 5); - goto bail; - } - if (buf[next++] != '\\' || buf[next++] != 'u') { - raise_errmsg("Unpaired high surrogate", pystr, end - 5); - goto bail; - } - end += 6; - /* Decode 4 hex digits */ - for (; next < end; next++) { - c2 <<= 4; - Py_UNICODE digit = buf[next]; - switch (digit) { - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - c2 |= (digit - '0'); break; - case 'a': case 'b': case 'c': case 'd': case 'e': - case 'f': - c2 |= (digit - 'a' + 10); break; - case 'A': case 'B': case 'C': case 'D': case 'E': - case 'F': - c2 |= (digit - 'A' + 10); break; - default: - raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); - goto bail; - } - } - if ((c2 & 0xfc00) != 0xdc00) { - raise_errmsg("Unpaired high surrogate", pystr, end - 5); - goto bail; - } - c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); - } - else if ((c & 0xfc00) == 0xdc00) { - raise_errmsg("Unpaired low surrogate", pystr, end - 5); - goto bail; - } -#endif - } - chunk = PyUnicode_FromUnicode(&c, 1); - if (chunk == NULL) { - goto bail; - } - if (PyList_Append(chunks, chunk)) { - goto bail; - } - Py_DECREF(chunk); - } - - rval = join_list_string(chunks); - if (rval == NULL) { - goto bail; - } - Py_DECREF(chunks); - *next_end_ptr = end; - return rval; -bail: - *next_end_ptr = -1; - Py_XDECREF(chunks); - return NULL; -} - -PyDoc_STRVAR(pydoc_scanstring, - "scanstring(basestring, end, encoding) -> (str, end)\n" - "\n" - "..." 
-); - -static PyObject * -py_scanstring(PyObject* self UNUSED, PyObject *args) -{ - PyObject *pystr; - PyObject *rval; - Py_ssize_t end; - Py_ssize_t next_end = -1; - char *encoding = NULL; - int strict = 0; - if (!PyArg_ParseTuple(args, "OO&|zi:scanstring", &pystr, _convertPyInt_AsSsize_t, &end, &encoding, &strict)) { - return NULL; - } - if (encoding == NULL) { - encoding = DEFAULT_ENCODING; - } - if (PyString_Check(pystr)) { - rval = scanstring_str(pystr, end, encoding, strict, &next_end); - } - else if (PyUnicode_Check(pystr)) { - rval = scanstring_unicode(pystr, end, strict, &next_end); - } - else { - PyErr_Format(PyExc_TypeError, - "first argument must be a string, not %.80s", - Py_TYPE(pystr)->tp_name); - return NULL; - } - return _build_rval_index_tuple(rval, next_end); -} - -PyDoc_STRVAR(pydoc_encode_basestring_ascii, - "encode_basestring_ascii(basestring) -> str\n" - "\n" - "..." -); - -static PyObject * -py_encode_basestring_ascii(PyObject* self UNUSED, PyObject *pystr) -{ - /* METH_O */ - if (PyString_Check(pystr)) { - return ascii_escape_str(pystr); - } - else if (PyUnicode_Check(pystr)) { - return ascii_escape_unicode(pystr); - } - else { - PyErr_Format(PyExc_TypeError, - "first argument must be a string, not %.80s", - Py_TYPE(pystr)->tp_name); - return NULL; - } -} - -static void -scanner_dealloc(PyObject *self) -{ - assert(PyScanner_Check(self)); - PyScannerObject *s = (PyScannerObject *)self; - Py_XDECREF(s->encoding); - Py_XDECREF(s->strict); - Py_XDECREF(s->object_hook); - Py_XDECREF(s->parse_float); - Py_XDECREF(s->parse_int); - Py_XDECREF(s->parse_constant); - s->encoding = NULL; - s->strict = NULL; - s->object_hook = NULL; - s->parse_float = NULL; - s->parse_int = NULL; - s->parse_constant = NULL; - self->ob_type->tp_free(self); -} - -static PyObject * -_parse_object_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx) { - char *str = PyString_AS_STRING(pystr); - Py_ssize_t end_idx = PyString_GET_SIZE(pystr) - 1; - PyObject *tpl = NULL; - PyObject *rval = PyDict_New(); - PyObject *key = NULL; - char *encoding = PyString_AS_STRING(s->encoding); - int strict = PyObject_IsTrue(s->strict); - Py_ssize_t next_idx; - if (rval == NULL) - return NULL; - - /* skip whitespace after { */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; - - /* only loop if the object is non-empty */ - if (idx <= end_idx && str[idx] != '}') { - while (idx <= end_idx) { - /* read key */ - if (str[idx] != '"') { - raise_errmsg("Expecting property name", pystr, idx); - goto bail; - } - key = scanstring_str(pystr, idx + 1, encoding, strict, &next_idx); - if (key == NULL) - goto bail; - Py_INCREF(key); - idx = next_idx; - - /* skip whitespace between key and : delimiter, read :, skip whitespace */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; - if (idx > end_idx || str[idx] != ':') { - raise_errmsg("Expecting : delimiter", pystr, idx); - goto bail; - } - idx++; - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; - - /* read any JSON data type and de-tuplefy the (rval, idx) */ - tpl = scan_once_str(s, pystr, idx); - if (tpl == NULL) - goto bail; - next_idx = PyInt_AsSsize_t(PyTuple_GET_ITEM(tpl, 1)); - if (next_idx == -1 && PyErr_Occurred()) - goto bail; - if (PyDict_SetItem(rval, key, PyTuple_GET_ITEM(tpl, 0)) == -1) - goto bail; - Py_DECREF(tpl); - idx = next_idx; - tpl = NULL; - - /* skip whitespace before } or , */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; - - /* bail if the object is closed or we didn't get the , delimiter */ - if (idx > end_idx) 
break; - if (str[idx] == '}') { - break; - } - else if (str[idx] != ',') { - raise_errmsg("Expecting , delimiter", pystr, idx); - goto bail; - } - idx++; - - /* skip whitespace after , delimiter */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; - } - } - /* verify that idx < end_idx, str[idx] should be '}' */ - if (idx > end_idx || str[idx] != '}') { - raise_errmsg("Expecting object", pystr, end_idx); - goto bail; - } - /* if object_hook is not None: rval = object_hook(rval) */ - if (s->object_hook != Py_None) { - tpl = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL); - if (tpl == NULL) - goto bail; - Py_DECREF(rval); - rval = tpl; - tpl = NULL; - } - return _build_rval_index_tuple(rval, idx + 1); -bail: - Py_XDECREF(key); - Py_XDECREF(tpl); - Py_DECREF(rval); - return NULL; -} - -static PyObject * -_parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx) { - Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); - Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; - PyObject *tpl = NULL; - PyObject *rval = PyDict_New(); - PyObject *key = NULL; - int strict = PyObject_IsTrue(s->strict); - Py_ssize_t next_idx; - if (rval == NULL) - return NULL; - - /* skip whitespace after { */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; - - /* only loop if the object is non-empty */ - if (idx <= end_idx && str[idx] != '}') { - while (idx <= end_idx) { - /* read key */ - if (str[idx] != '"') { - raise_errmsg("Expecting property name", pystr, idx); - goto bail; - } - key = scanstring_unicode(pystr, idx + 1, strict, &next_idx); - if (key == NULL) - goto bail; - idx = next_idx; - - /* skip whitespace between key and : delimiter, read :, skip whitespace */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; - if (idx > end_idx || str[idx] != ':') { - raise_errmsg("Expecting : delimiter", pystr, idx); - goto bail; - } - idx++; - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; - - /* read any JSON term and de-tuplefy the (rval, idx) */ - tpl = scan_once_unicode(s, pystr, idx); - if (tpl == NULL) - goto bail; - next_idx = PyInt_AsSsize_t(PyTuple_GET_ITEM(tpl, 1)); - if (next_idx == -1 && PyErr_Occurred()) - goto bail; - if (PyDict_SetItem(rval, key, PyTuple_GET_ITEM(tpl, 0)) == -1) - goto bail; - Py_DECREF(tpl); - idx = next_idx; - tpl = NULL; - - /* skip whitespace before } or , */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; - - /* bail if the object is closed or we didn't get the , delimiter */ - if (idx > end_idx) break; - if (str[idx] == '}') { - break; - } - else if (str[idx] != ',') { - raise_errmsg("Expecting , delimiter", pystr, idx); - goto bail; - } - idx++; - - /* skip whitespace after , delimiter */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; - } - } - - /* verify that idx < end_idx, str[idx] should be '}' */ - if (idx > end_idx || str[idx] != '}') { - raise_errmsg("Expecting object", pystr, end_idx); - goto bail; - } - - /* if object_hook is not None: rval = object_hook(rval) */ - if (s->object_hook != Py_None) { - tpl = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL); - if (tpl == NULL) - goto bail; - Py_DECREF(rval); - rval = tpl; - tpl = NULL; - } - return _build_rval_index_tuple(rval, idx + 1); -bail: - Py_XDECREF(key); - Py_XDECREF(tpl); - Py_DECREF(rval); - return NULL; -} - -static PyObject * -_parse_array_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx) { - char *str = PyString_AS_STRING(pystr); - Py_ssize_t end_idx = PyString_GET_SIZE(pystr) - 1; - PyObject *tpl = NULL; - PyObject 
*rval = PyList_New(0); - Py_ssize_t next_idx; - if (rval == NULL) - return NULL; - - /* skip whitespace after [ */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; - - /* only loop if the array is non-empty */ - if (idx <= end_idx && str[idx] != ']') { - while (idx <= end_idx) { - - /* read any JSON term and de-tuplefy the (rval, idx) */ - tpl = scan_once_str(s, pystr, idx); - if (tpl == NULL) - goto bail; - next_idx = PyInt_AsSsize_t(PyTuple_GET_ITEM(tpl, 1)); - if (next_idx == -1 && PyErr_Occurred()) - goto bail; - if (PyList_Append(rval, PyTuple_GET_ITEM(tpl, 0)) == -1) - goto bail; - Py_DECREF(tpl); - idx = next_idx; - tpl = NULL; - - /* skip whitespace between term and , */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; - - /* bail if the array is closed or we didn't get the , delimiter */ - if (idx > end_idx) break; - if (str[idx] == ']') { - break; - } - else if (str[idx] != ',') { - raise_errmsg("Expecting , delimiter", pystr, idx); - goto bail; - } - idx++; - - /* skip whitespace after , */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; - } - } - - /* verify that idx < end_idx, str[idx] should be ']' */ - if (idx > end_idx || str[idx] != ']') { - raise_errmsg("Expecting object", pystr, end_idx); - goto bail; - } - return _build_rval_index_tuple(rval, idx + 1); -bail: - Py_XDECREF(tpl); - Py_DECREF(rval); - return NULL; -} - -static PyObject * -_parse_array_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx) { - Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); - Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; - PyObject *tpl = NULL; - PyObject *rval = PyList_New(0); - Py_ssize_t next_idx; - if (rval == NULL) - return NULL; - - /* skip whitespace after [ */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; - - /* only loop if the array is non-empty */ - if (idx <= end_idx && str[idx] != ']') { - while (idx <= end_idx) { - - /* read any JSON term and de-tuplefy the (rval, idx) */ - tpl = scan_once_unicode(s, pystr, idx); - if (tpl == NULL) - goto bail; - next_idx = PyInt_AsSsize_t(PyTuple_GET_ITEM(tpl, 1)); - if (next_idx == -1 && PyErr_Occurred()) - goto bail; - if (PyList_Append(rval, PyTuple_GET_ITEM(tpl, 0)) == -1) - goto bail; - Py_DECREF(tpl); - idx = next_idx; - tpl = NULL; - - /* skip whitespace between term and , */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; - - /* bail if the array is closed or we didn't get the , delimiter */ - if (idx > end_idx) break; - if (str[idx] == ']') { - break; - } - else if (str[idx] != ',') { - raise_errmsg("Expecting , delimiter", pystr, idx); - goto bail; - } - idx++; - - /* skip whitespace after , */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; - } - } - - /* verify that idx < end_idx, str[idx] should be ']' */ - if (idx > end_idx || str[idx] != ']') { - raise_errmsg("Expecting object", pystr, end_idx); - goto bail; - } - return _build_rval_index_tuple(rval, idx + 1); -bail: - Py_XDECREF(tpl); - Py_DECREF(rval); - return NULL; -} - -static PyObject * -_parse_constant(PyScannerObject *s, char *constant, Py_ssize_t idx) { - PyObject *cstr; - PyObject *rval; - /* constant is "NaN", "Infinity", or "-Infinity" */ - cstr = PyString_InternFromString(constant); - if (cstr == NULL) - return NULL; - - /* rval = parse_constant(constant) */ - rval = PyObject_CallFunctionObjArgs(s->parse_constant, cstr, NULL); - idx += PyString_GET_SIZE(cstr); - Py_DECREF(cstr); - return _build_rval_index_tuple(rval, idx); -} - -static PyObject * -_match_number_str(PyScannerObject *s, 
PyObject *pystr, Py_ssize_t start) { - char *str = PyString_AS_STRING(pystr); - Py_ssize_t end_idx = PyString_GET_SIZE(pystr) - 1; - Py_ssize_t idx = start; - int is_float = 0; - PyObject *rval; - PyObject *numstr; - - /* read a sign if it's there, make sure it's not the end of the string */ - if (str[idx] == '-') { - idx++; - if (idx > end_idx) { - PyErr_SetNone(PyExc_StopIteration); - return NULL; - } - } - - /* read as many integer digits as we find as long as it doesn't start with 0 */ - if (str[idx] >= '1' && str[idx] <= '9') { - idx++; - while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; - } - /* if it starts with 0 we only expect one integer digit */ - else if (str[idx] == '0') { - idx++; - } - /* no integer digits, error */ - else { - PyErr_SetNone(PyExc_StopIteration); - return NULL; - } - - /* if the next char is '.' followed by a digit then read all float digits */ - if (idx < end_idx && str[idx] == '.' && str[idx + 1] >= '0' && str[idx + 1] <= '9') { - is_float = 1; - idx += 2; - while (idx < end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; - } - - /* if the next char is 'e' or 'E' then maybe read the exponent (or backtrack) */ - if (idx < end_idx && (str[idx] == 'e' || str[idx] == 'E')) { - - /* save the index of the 'e' or 'E' just in case we need to backtrack */ - Py_ssize_t e_start = idx; - idx++; - - /* read an exponent sign if present */ - if (idx < end_idx && (str[idx] == '-' || str[idx] == '+')) idx++; - - /* read all digits */ - while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; - - /* if we got a digit, then parse as float. if not, backtrack */ - if (str[idx - 1] >= '0' && str[idx - 1] <= '9') { - is_float = 1; - } - else { - idx = e_start; - } - } - - /* copy the section we determined to be a number */ - numstr = PyString_FromStringAndSize(&str[start], idx - start); - if (numstr == NULL) - return NULL; - if (is_float) { - /* parse as a float using a fast path if available, otherwise call user defined method */ - if (s->parse_float != (PyObject *)&PyFloat_Type) { - rval = PyObject_CallFunctionObjArgs(s->parse_float, numstr, NULL); - } - else { - rval = PyFloat_FromDouble(PyOS_ascii_atof(PyString_AS_STRING(numstr))); - } - } - else { - /* parse as an int using a fast path if available, otherwise call user defined method */ - if (s->parse_int != (PyObject *)&PyInt_Type) { - rval = PyObject_CallFunctionObjArgs(s->parse_int, numstr, NULL); - } - else { - rval = PyInt_FromString(PyString_AS_STRING(numstr), NULL, 10); - } - } - Py_DECREF(numstr); - return _build_rval_index_tuple(rval, idx); -} - -static PyObject * -_match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start) { - Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); - Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; - Py_ssize_t idx = start; - int is_float = 0; - PyObject *rval; - PyObject *numstr; - - /* read a sign if it's there, make sure it's not the end of the string */ - if (str[idx] == '-') { - idx++; - if (idx > end_idx) { - PyErr_SetNone(PyExc_StopIteration); - return NULL; - } - } - - /* read as many integer digits as we find as long as it doesn't start with 0 */ - if (str[idx] >= '1' && str[idx] <= '9') { - idx++; - while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; - } - /* if it starts with 0 we only expect one integer digit */ - else if (str[idx] == '0') { - idx++; - } - /* no integer digits, error */ - else { - PyErr_SetNone(PyExc_StopIteration); - return NULL; - } - - /* if the next char is '.' 
followed by a digit then read all float digits */ - if (idx < end_idx && str[idx] == '.' && str[idx + 1] >= '0' && str[idx + 1] <= '9') { - is_float = 1; - idx += 2; - while (idx < end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; - } - - /* if the next char is 'e' or 'E' then maybe read the exponent (or backtrack) */ - if (idx < end_idx && (str[idx] == 'e' || str[idx] == 'E')) { - Py_ssize_t e_start = idx; - idx++; - - /* read an exponent sign if present */ - if (idx < end_idx && (str[idx] == '-' || str[idx] == '+')) idx++; - - /* read all digits */ - while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; - - /* if we got a digit, then parse as float. if not, backtrack */ - if (str[idx - 1] >= '0' && str[idx - 1] <= '9') { - is_float = 1; - } - else { - idx = e_start; - } - } - - /* copy the section we determined to be a number */ - numstr = PyUnicode_FromUnicode(&str[start], idx - start); - if (numstr == NULL) - return NULL; - if (is_float) { - /* parse as a float using a fast path if available, otherwise call user defined method */ - if (s->parse_float != (PyObject *)&PyFloat_Type) { - rval = PyObject_CallFunctionObjArgs(s->parse_float, numstr, NULL); - } - else { - rval = PyFloat_FromString(numstr, NULL); - } - } - else { - /* no fast path for unicode -> int, just call */ - rval = PyObject_CallFunctionObjArgs(s->parse_int, numstr, NULL); - } - Py_DECREF(numstr); - return _build_rval_index_tuple(rval, idx); -} - -static PyObject * -scan_once_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx) -{ - char *str = PyString_AS_STRING(pystr); - Py_ssize_t length = PyString_GET_SIZE(pystr); - Py_ssize_t next_idx = -1; - PyObject *rval; - if (idx >= length) { - PyErr_SetNone(PyExc_StopIteration); - return NULL; - } - switch (str[idx]) { - case '"': - /* string */ - rval = scanstring_str(pystr, idx + 1, - PyString_AS_STRING(s->encoding), - PyObject_IsTrue(s->strict), - &next_idx); - return _build_rval_index_tuple(rval, next_idx); - case '{': - /* object */ - return _parse_object_str(s, pystr, idx + 1); - case '[': - /* array */ - return _parse_array_str(s, pystr, idx + 1); - case 'n': - /* null */ - if ((idx + 3 < length) && str[idx + 1] == 'u' && str[idx + 2] == 'l' && str[idx + 3] == 'l') { - Py_INCREF(Py_None); - return _build_rval_index_tuple(Py_None, idx + 4); - } - break; - case 't': - /* true */ - if ((idx + 3 < length) && str[idx + 1] == 'r' && str[idx + 2] == 'u' && str[idx + 3] == 'e') { - Py_INCREF(Py_True); - return _build_rval_index_tuple(Py_True, idx + 4); - } - break; - case 'f': - /* false */ - if ((idx + 4 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'l' && str[idx + 3] == 's' && str[idx + 4] == 'e') { - Py_INCREF(Py_False); - return _build_rval_index_tuple(Py_False, idx + 5); - } - break; - case 'N': - /* NaN */ - if ((idx + 2 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'N') { - return _parse_constant(s, "NaN", idx); - } - break; - case 'I': - /* Infinity */ - if ((idx + 7 < length) && str[idx + 1] == 'n' && str[idx + 2] == 'f' && str[idx + 3] == 'i' && str[idx + 4] == 'n' && str[idx + 5] == 'i' && str[idx + 6] == 't' && str[idx + 7] == 'y') { - return _parse_constant(s, "Infinity", idx); - } - break; - case '-': - /* -Infinity */ - if ((idx + 8 < length) && str[idx + 1] == 'I' && str[idx + 2] == 'n' && str[idx + 3] == 'f' && str[idx + 4] == 'i' && str[idx + 5] == 'n' && str[idx + 6] == 'i' && str[idx + 7] == 't' && str[idx + 8] == 'y') { - return _parse_constant(s, "-Infinity", idx); - } - break; - } - /* Didn't find a string, object, array, 
or named constant. Look for a number. */ - return _match_number_str(s, pystr, idx); -} - -static PyObject * -scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx) -{ - Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); - Py_ssize_t length = PyUnicode_GET_SIZE(pystr); - Py_ssize_t next_idx = -1; - PyObject *rval; - if (idx >= length) { - PyErr_SetNone(PyExc_StopIteration); - return NULL; - } - switch (str[idx]) { - case '"': - /* string */ - rval = scanstring_unicode(pystr, idx + 1, - PyObject_IsTrue(s->strict), - &next_idx); - return _build_rval_index_tuple(rval, next_idx); - case '{': - /* object */ - return _parse_object_unicode(s, pystr, idx + 1); - case '[': - /* array */ - return _parse_array_unicode(s, pystr, idx + 1); - case 'n': - /* null */ - if ((idx + 3 < length) && str[idx + 1] == 'u' && str[idx + 2] == 'l' && str[idx + 3] == 'l') { - Py_INCREF(Py_None); - return _build_rval_index_tuple(Py_None, idx + 4); - } - break; - case 't': - /* true */ - if ((idx + 3 < length) && str[idx + 1] == 'r' && str[idx + 2] == 'u' && str[idx + 3] == 'e') { - Py_INCREF(Py_True); - return _build_rval_index_tuple(Py_True, idx + 4); - } - break; - case 'f': - /* false */ - if ((idx + 4 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'l' && str[idx + 3] == 's' && str[idx + 4] == 'e') { - Py_INCREF(Py_False); - return _build_rval_index_tuple(Py_False, idx + 5); - } - break; - case 'N': - /* NaN */ - if ((idx + 2 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'N') { - return _parse_constant(s, "NaN", idx); - } - break; - case 'I': - /* Infinity */ - if ((idx + 7 < length) && str[idx + 1] == 'n' && str[idx + 2] == 'f' && str[idx + 3] == 'i' && str[idx + 4] == 'n' && str[idx + 5] == 'i' && str[idx + 6] == 't' && str[idx + 7] == 'y') { - return _parse_constant(s, "Infinity", idx); - } - break; - case '-': - /* -Infinity */ - if ((idx + 8 < length) && str[idx + 1] == 'I' && str[idx + 2] == 'n' && str[idx + 3] == 'f' && str[idx + 4] == 'i' && str[idx + 5] == 'n' && str[idx + 6] == 'i' && str[idx + 7] == 't' && str[idx + 8] == 'y') { - return _parse_constant(s, "-Infinity", idx); - } - break; - } - /* Didn't find a string, object, array, or named constant. Look for a number. 
*/ - return _match_number_unicode(s, pystr, idx); -} - -static PyObject * -scanner_call(PyObject *self, PyObject *args, PyObject *kwds) -{ - PyObject *pystr; - Py_ssize_t idx; - static char *kwlist[] = {"string", "idx", NULL}; - PyScannerObject *s = (PyScannerObject *)self; - assert(PyScanner_Check(self)); - if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO&:scan_once", kwlist, &pystr, _convertPyInt_AsSsize_t, &idx)) - return NULL; - if (PyString_Check(pystr)) { - return scan_once_str(s, pystr, idx); - } - else if (PyUnicode_Check(pystr)) { - return scan_once_unicode(s, pystr, idx); - } - else { - PyErr_Format(PyExc_TypeError, - "first argument must be a string, not %.80s", - Py_TYPE(pystr)->tp_name); - return NULL; - } -} - -static int -scanner_init(PyObject *self, PyObject *args, PyObject *kwds) -{ - PyObject *ctx; - static char *kwlist[] = {"context", NULL}; - - assert(PyScanner_Check(self)); - PyScannerObject *s = (PyScannerObject *)self; - - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:make_scanner", kwlist, &ctx)) - return -1; - - s->encoding = NULL; - s->strict = NULL; - s->object_hook = NULL; - s->parse_float = NULL; - s->parse_int = NULL; - s->parse_constant = NULL; - - /* PyString_AS_STRING is used on encoding */ - s->encoding = PyObject_GetAttrString(ctx, "encoding"); - if (s->encoding == Py_None) { - Py_DECREF(Py_None); - s->encoding = PyString_InternFromString(DEFAULT_ENCODING); - } - else if (PyUnicode_Check(s->encoding)) { - PyObject *tmp = PyUnicode_AsEncodedString(s->encoding, NULL, NULL); - Py_DECREF(s->encoding); - s->encoding = tmp; - } - if (s->encoding == NULL || !PyString_Check(s->encoding)) - goto bail; - - /* All of these will fail "gracefully" so we don't need to verify them */ - s->strict = PyObject_GetAttrString(ctx, "strict"); - if (s->strict == NULL) - goto bail; - s->object_hook = PyObject_GetAttrString(ctx, "object_hook"); - if (s->object_hook == NULL) - goto bail; - s->parse_float = PyObject_GetAttrString(ctx, "parse_float"); - if (s->parse_float == NULL) - goto bail; - s->parse_int = PyObject_GetAttrString(ctx, "parse_int"); - if (s->parse_int == NULL) - goto bail; - s->parse_constant = PyObject_GetAttrString(ctx, "parse_constant"); - if (s->parse_constant == NULL) - goto bail; - - return 0; - -bail: - Py_XDECREF(s->encoding); - Py_XDECREF(s->strict); - Py_XDECREF(s->object_hook); - Py_XDECREF(s->parse_float); - Py_XDECREF(s->parse_int); - Py_XDECREF(s->parse_constant); - s->encoding = NULL; - s->strict = NULL; - s->object_hook = NULL; - s->parse_float = NULL; - s->parse_int = NULL; - s->parse_constant = NULL; - return -1; -} - -PyDoc_STRVAR(scanner_doc, "JSON scanner object"); - -static -PyTypeObject PyScannerType = { - PyObject_HEAD_INIT(0) - 0, /* tp_internal */ - "make_scanner", /* tp_name */ - sizeof(PyScannerObject), /* tp_basicsize */ - 0, /* tp_itemsize */ - scanner_dealloc, /* tp_dealloc */ - 0, /* tp_print */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_compare */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - scanner_call, /* tp_call */ - 0, /* tp_str */ - 0,/* PyObject_GenericGetAttr, */ /* tp_getattro */ - 0,/* PyObject_GenericSetAttr, */ /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - scanner_doc, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ - 0, /* tp_methods */ - scanner_members, /* tp_members */ - 0, /* tp_getset */ - 0, 
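scanner_init above copies its configuration off the "context" object, which in practice is the JSONDecoder defined in decoder.py later in this diff, so the parse_float, parse_int and parse_constant hooks flow straight from the decoder's constructor into the C scanner. A small illustration (mirroring test_decode.py below; the record helper is made up for the example)::

    import decimal
    import simplejson as S

    # parse_float receives the raw text of every JSON real number
    assert S.loads('1.1', parse_float=decimal.Decimal) == decimal.Decimal('1.1')

    # parse_constant receives the literals 'NaN', 'Infinity' and '-Infinity'
    seen = []
    def record(name):
        seen.append(name)
        return name

    assert S.loads('[Infinity]', parse_constant=record) == ['Infinity']
    assert seen == ['Infinity']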
/* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - scanner_init, /* tp_init */ - 0,/* PyType_GenericAlloc, */ /* tp_alloc */ - 0,/* PyType_GenericNew, */ /* tp_new */ - 0,/* _PyObject_Del, */ /* tp_free */ -}; - -static int -encoder_init(PyObject *self, PyObject *args, PyObject *kwds) -{ - static char *kwlist[] = {"markers", "default", "encoder", "indent", "key_separator", "item_separator", "sort_keys", "skipkeys", "allow_nan", NULL}; - - assert(PyEncoder_Check(self)); - PyEncoderObject *s = (PyEncoderObject *)self; - PyObject *allow_nan; - - s->markers = NULL; - s->defaultfn = NULL; - s->encoder = NULL; - s->indent = NULL; - s->key_separator = NULL; - s->item_separator = NULL; - s->sort_keys = NULL; - s->skipkeys = NULL; - - if (!PyArg_ParseTupleAndKeywords(args, kwds, "OOOOOOOOO:make_encoder", kwlist, - &s->markers, &s->defaultfn, &s->encoder, &s->indent, &s->key_separator, &s->item_separator, &s->sort_keys, &s->skipkeys, &allow_nan)) - return -1; - - Py_INCREF(s->markers); - Py_INCREF(s->defaultfn); - Py_INCREF(s->encoder); - Py_INCREF(s->indent); - Py_INCREF(s->key_separator); - Py_INCREF(s->item_separator); - Py_INCREF(s->sort_keys); - Py_INCREF(s->skipkeys); - s->fast_encode = (PyCFunction_Check(s->encoder) && PyCFunction_GetFunction(s->encoder) == (PyCFunction)py_encode_basestring_ascii); - s->allow_nan = PyObject_IsTrue(allow_nan); - return 0; -} - -static PyObject * -encoder_call(PyObject *self, PyObject *args, PyObject *kwds) -{ - static char *kwlist[] = {"obj", "_current_indent_level", NULL}; - PyObject *obj; - PyObject *rval; - Py_ssize_t indent_level; - PyEncoderObject *s = (PyEncoderObject *)self; - assert(PyEncoder_Check(self)); - if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO&:_iterencode", kwlist, - &obj, _convertPyInt_AsSsize_t, &indent_level)) - return NULL; - rval = PyList_New(0); - if (rval == NULL) - return NULL; - if (encoder_listencode_obj(s, rval, obj, indent_level)) { - Py_DECREF(rval); - return NULL; - } - return rval; -} - -static PyObject * -_encoded_const(PyObject *obj) -{ - if (obj == Py_None) { - static PyObject *s_null = NULL; - if (s_null == NULL) { - s_null = PyString_InternFromString("null"); - } - return s_null; - } - else if (obj == Py_True) { - static PyObject *s_true = NULL; - if (s_true == NULL) { - s_true = PyString_InternFromString("true"); - } - return s_true; - } - else if (obj == Py_False) { - static PyObject *s_false = NULL; - if (s_false == NULL) { - s_false = PyString_InternFromString("false"); - } - return s_false; - } - else { - PyErr_SetString(PyExc_ValueError, "not a const"); - return NULL; - } -} - -static PyObject * -encoder_encode_float(PyEncoderObject *s, PyObject *obj) -{ - double i = PyFloat_AS_DOUBLE(obj); - if (!Py_IS_FINITE(i)) { - if (!s->allow_nan) { - PyErr_SetString(PyExc_ValueError, "Out of range float values are not JSON compliant"); - return NULL; - } - if (i > 0) { - return PyString_FromString("Infinity"); - } - else if (i < 0) { - return PyString_FromString("-Infinity"); - } - else { - return PyString_FromString("NaN"); - } - } - /* Use a better float format here? 
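encoder_encode_float mirrors the allow_nan option documented in encoder.py further down in this diff: out-of-range floats either serialize to the JavaScript-style names or are rejected. For example::

    import simplejson as S

    assert S.dumps(float('inf')) == 'Infinity'
    assert S.dumps(float('-inf')) == '-Infinity'
    assert S.dumps(float('nan')) == 'NaN'

    try:
        S.dumps(float('inf'), allow_nan=False)
    except ValueError:
        pass  # "Out of range float values are not JSON compliant"
    else:
        raise AssertionError('expected ValueError with allow_nan=False')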
*/ - return PyObject_Repr(obj); -} - -static PyObject * -encoder_encode_string(PyEncoderObject *s, PyObject *obj) -{ - if (s->fast_encode) - return py_encode_basestring_ascii(NULL, obj); - else - return PyObject_CallFunctionObjArgs(s->encoder, obj, NULL); -} - -static int -encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level) -{ - if (obj == Py_None || obj == Py_True || obj == Py_False) { - PyObject *cstr = _encoded_const(obj); - if (cstr == NULL) - return -1; - return PyList_Append(rval, cstr); - } - else if (PyString_Check(obj) || PyUnicode_Check(obj)) - { - PyObject *encoded = encoder_encode_string(s, obj); - if (encoded == NULL) - return -1; - return PyList_Append(rval, encoded); - } - else if (PyInt_Check(obj) || PyLong_Check(obj)) { - PyObject *encoded = PyObject_Str(obj); - if (encoded == NULL) - return -1; - return PyList_Append(rval, encoded); - } - else if (PyFloat_Check(obj)) { - PyObject *encoded = encoder_encode_float(s, obj); - if (encoded == NULL) - return -1; - return PyList_Append(rval, encoded); - } - else if (PyList_Check(obj) || PyTuple_Check(obj)) { - return encoder_listencode_list(s, rval, obj, indent_level); - } - else if (PyDict_Check(obj)) { - return encoder_listencode_dict(s, rval, obj, indent_level); - } - else { - PyObject *ident = NULL; - if (s->markers != Py_None) { - ident = PyLong_FromVoidPtr(obj); - int has_key; - if (ident == NULL) - return -1; - has_key = PyDict_Contains(s->markers, ident); - if (has_key) { - if (has_key != -1) - PyErr_SetString(PyExc_ValueError, "Circular reference detected"); - Py_DECREF(ident); - return -1; - } - if (PyDict_SetItem(s->markers, ident, obj)) { - Py_DECREF(ident); - return -1; - } - } - PyObject *newobj = PyObject_CallFunctionObjArgs(s->defaultfn, obj, NULL); - if (newobj == NULL) { - Py_DECREF(ident); - return -1; - } - int rv = encoder_listencode_obj(s, rval, newobj, indent_level); - Py_DECREF(newobj); - if (rv) { - Py_DECREF(ident); - return -1; - } - if (ident != NULL) { - if (PyDict_DelItem(s->markers, ident)) { - Py_DECREF(ident); - ident = NULL; - return -1; - } - Py_DECREF(ident); - ident = NULL; - } - return rv; - } -} - -static int -encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level) -{ - static PyObject *open_dict = NULL; - static PyObject *close_dict = NULL; - static PyObject *empty_dict = NULL; - PyObject *kstr = NULL; - if (open_dict == NULL || close_dict == NULL || empty_dict == NULL) { - open_dict = PyString_InternFromString("{"); - close_dict = PyString_InternFromString("}"); - empty_dict = PyString_InternFromString("{}"); - if (open_dict == NULL || close_dict == NULL || empty_dict == NULL) - return -1; - } - PyObject *ident = NULL; - if (PyDict_Size(dct) == 0) - return PyList_Append(rval, empty_dict); - - if (s->markers != Py_None) { - ident = PyLong_FromVoidPtr(dct); - int has_key; - if (ident == NULL) - goto bail; - has_key = PyDict_Contains(s->markers, ident); - if (has_key) { - if (has_key != -1) - PyErr_SetString(PyExc_ValueError, "Circular reference detected"); - goto bail; - } - if (PyDict_SetItem(s->markers, ident, dct)) { - goto bail; - } - } - - if (PyList_Append(rval, open_dict)) - goto bail; - - if (s->indent != Py_None) { - /* TODO: DOES NOT RUN */ - indent_level += 1; - /* - newline_indent = '\n' + (' ' * (_indent * _current_indent_level)) - separator = _item_separator + newline_indent - buf += newline_indent - */ - } - - /* TODO: C speedup not implemented for sort_keys */ - - PyObject *key, *value; 
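The markers dictionary threaded through encoder_listencode_obj (keyed by object address) exists only to detect cycles; it corresponds to the check_circular option and is exercised by test_recursion.py later in this diff. For instance::

    import simplejson as S

    x = []
    x.append(x)                  # a list that contains itself
    try:
        S.dumps(x)
    except ValueError:
        pass                     # "Circular reference detected"
    else:
        raise AssertionError('cycle should have been rejected')

    # check_circular=False skips the marker bookkeeping entirely; the caller
    # is then responsible for never passing the encoder a cyclic structure.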
- Py_ssize_t pos = 0; - int skipkeys = PyObject_IsTrue(s->skipkeys); - Py_ssize_t idx = 0; - while (PyDict_Next(dct, &pos, &key, &value)) { - if (PyString_Check(key) || PyUnicode_Check(key)) { - Py_INCREF(key); - kstr = key; - } - else if (PyFloat_Check(key)) { - kstr = encoder_encode_float(s, key); - if (kstr == NULL) - goto bail; - } - else if (PyInt_Check(key) || PyLong_Check(key)) { - kstr = PyObject_Str(key); - if (kstr == NULL) - goto bail; - } - else if (key == Py_True || key == Py_False || key == Py_None) { - kstr = _encoded_const(key); - } - else if (skipkeys) { - continue; - } - else { - /* TODO: include repr of key */ - PyErr_SetString(PyExc_ValueError, "keys must be a string"); - goto bail; - } - - if (idx) { - if (PyList_Append(rval, s->item_separator)) - goto bail; - } - - PyObject *encoded = encoder_encode_string(s, kstr); - Py_DECREF(kstr); - kstr = NULL; - if (encoded == NULL) - goto bail; - if (PyList_Append(rval, encoded)) - goto bail; - if (PyList_Append(rval, s->key_separator)) - goto bail; - if (encoder_listencode_obj(s, rval, value, indent_level)) - goto bail; - idx += 1; - } - if (ident != NULL) { - if (PyDict_DelItem(s->markers, ident)) - goto bail; - Py_DECREF(ident); - ident = NULL; - } - if (s->indent != Py_None) { - /* TODO: DOES NOT RUN */ - indent_level -= 1; - /* - yield '\n' + (' ' * (_indent * _current_indent_level)) - */ - } - if (PyList_Append(rval, close_dict)) - goto bail; - return 0; - -bail: - Py_XDECREF(kstr); - Py_XDECREF(ident); - return -1; -} - - -static int -encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level) -{ - static PyObject *open_array = NULL; - static PyObject *close_array = NULL; - static PyObject *empty_array = NULL; - if (open_array == NULL || close_array == NULL || empty_array == NULL) { - open_array = PyString_InternFromString("["); - close_array = PyString_InternFromString("]"); - empty_array = PyString_InternFromString("[]"); - if (open_array == NULL || close_array == NULL || empty_array == NULL) - return -1; - } - PyObject *ident = NULL; - PyObject *s_fast = PySequence_Fast(seq, "_iterencode_list needs a sequence"); - if (s_fast == NULL) - return -1; - Py_ssize_t num_items = PySequence_Fast_GET_SIZE(s_fast); - if (num_items == 0) { - Py_DECREF(s_fast); - return PyList_Append(rval, empty_array); - } - - if (s->markers != Py_None) { - ident = PyLong_FromVoidPtr(seq); - int has_key; - if (ident == NULL) - goto bail; - has_key = PyDict_Contains(s->markers, ident); - if (has_key) { - if (has_key != -1) - PyErr_SetString(PyExc_ValueError, "Circular reference detected"); - goto bail; - } - if (PyDict_SetItem(s->markers, ident, seq)) { - goto bail; - } - } - - PyObject **seq_items = PySequence_Fast_ITEMS(s_fast); - if (PyList_Append(rval, open_array)) - goto bail; - Py_ssize_t i; - if (s->indent != Py_None) { - /* TODO: DOES NOT RUN */ - indent_level += 1; - /* - newline_indent = '\n' + (' ' * (_indent * _current_indent_level)) - separator = _item_separator + newline_indent - buf += newline_indent - */ - } - for (i = 0; i < num_items; i++) { - PyObject *obj = seq_items[i]; - if (i) { - if (PyList_Append(rval, s->item_separator)) - goto bail; - } - if (encoder_listencode_obj(s, rval, obj, indent_level)) - goto bail; - } - if (ident != NULL) { - if (PyDict_DelItem(s->markers, ident)) - goto bail; - Py_DECREF(ident); - ident = NULL; - } - if (s->indent != Py_None) { - /* TODO: DOES NOT RUN */ - indent_level -= 1; - /* - yield '\n' + (' ' * (_indent * _current_indent_level)) - */ - } - if 
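The key loop above coerces int, long, float, bool and None keys to their JSON text form, skips anything else when skipkeys is set, and otherwise raises; note that this C path raises ValueError("keys must be a string") while the pure-Python _iterencode_dict further down raises TypeError, so portable callers should expect either. Roughly::

    import simplejson as S

    # numeric / None keys become JSON strings
    assert S.loads(S.dumps({1: 'a', None: 'b'})) == {'1': 'a', 'null': 'b'}

    # other key types are dropped with skipkeys=True ...
    assert S.dumps({(1, 2): 'ignored'}, skipkeys=True) == '{}'

    # ... and rejected otherwise (TypeError from the Python encoder,
    # ValueError from this C speedup)
    try:
        S.dumps({(1, 2): 'boom'})
    except (TypeError, ValueError):
        pass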
(PyList_Append(rval, close_array)) - goto bail; - Py_DECREF(s_fast); - return 0; - -bail: - Py_XDECREF(ident); - Py_DECREF(s_fast); - return -1; -} - -static void -encoder_dealloc(PyObject *self) -{ - assert(PyEncoder_Check(self)); - PyEncoderObject *s = (PyEncoderObject *)self; - Py_XDECREF(s->markers); - s->markers = NULL; - Py_XDECREF(s->defaultfn); - s->defaultfn = NULL; - Py_XDECREF(s->encoder); - s->encoder = NULL; - Py_XDECREF(s->indent); - s->indent = NULL; - Py_XDECREF(s->key_separator); - s->key_separator = NULL; - Py_XDECREF(s->item_separator); - s->item_separator = NULL; - Py_XDECREF(s->sort_keys); - s->sort_keys = NULL; - Py_XDECREF(s->skipkeys); - s->skipkeys = NULL; - self->ob_type->tp_free(self); -} - -PyDoc_STRVAR(encoder_doc, "_iterencode(obj, _current_indent_level) -> iterable"); - -static -PyTypeObject PyEncoderType = { - PyObject_HEAD_INIT(0) - 0, /* tp_internal */ - "make_encoder", /* tp_name */ - sizeof(PyEncoderObject), /* tp_basicsize */ - 0, /* tp_itemsize */ - encoder_dealloc, /* tp_dealloc */ - 0, /* tp_print */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_compare */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - encoder_call, /* tp_call */ - 0, /* tp_str */ - 0,/* PyObject_GenericGetAttr, */ /* tp_getattro */ - 0,/* PyObject_GenericSetAttr, */ /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - encoder_doc, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ - 0, /* tp_methods */ - encoder_members, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - encoder_init, /* tp_init */ - 0,/* PyType_GenericAlloc, */ /* tp_alloc */ - 0,/* PyType_GenericNew, */ /* tp_new */ - 0,/* _PyObject_Del, */ /* tp_free */ -}; - -static PyMethodDef speedups_methods[] = { - {"encode_basestring_ascii", - (PyCFunction)py_encode_basestring_ascii, - METH_O, - pydoc_encode_basestring_ascii}, - {"scanstring", - (PyCFunction)py_scanstring, - METH_VARARGS, - pydoc_scanstring}, - {NULL, NULL, 0, NULL} -}; - -PyDoc_STRVAR(module_doc, -"simplejson speedups\n"); - -void -init_speedups(void) -{ - PyObject *m; - PyScannerType.tp_getattro = PyObject_GenericGetAttr; - PyScannerType.tp_setattro = PyObject_GenericSetAttr; - PyScannerType.tp_alloc = PyType_GenericAlloc; - PyScannerType.tp_new = PyType_GenericNew; - PyScannerType.tp_free = _PyObject_Del; - if (PyType_Ready(&PyScannerType) < 0) - return; - PyEncoderType.tp_getattro = PyObject_GenericGetAttr; - PyEncoderType.tp_setattro = PyObject_GenericSetAttr; - PyEncoderType.tp_alloc = PyType_GenericAlloc; - PyEncoderType.tp_new = PyType_GenericNew; - PyEncoderType.tp_free = _PyObject_Del; - if (PyType_Ready(&PyEncoderType) < 0) - return; - m = Py_InitModule3("_speedups", speedups_methods, module_doc); - Py_INCREF((PyObject*)&PyScannerType); - PyModule_AddObject(m, "make_scanner", (PyObject*)&PyScannerType); - Py_INCREF((PyObject*)&PyEncoderType); - PyModule_AddObject(m, "make_encoder", (PyObject*)&PyEncoderType); -} diff --git a/subscriber/simplejson/decoder.py b/subscriber/simplejson/decoder.py deleted file mode 100644 index d5c1a6b..0000000 --- a/subscriber/simplejson/decoder.py +++ /dev/null @@ -1,336 +0,0 @@ -""" -Implementation of JSONDecoder -""" -import re -import sys -import struct - -from simplejson.scanner import make_scanner -try: - from 
simplejson._speedups import scanstring as c_scanstring -except ImportError: - c_scanstring = None - -FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL - -def _floatconstants(): - _BYTES = '7FF80000000000007FF0000000000000'.decode('hex') - if sys.byteorder != 'big': - _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1] - nan, inf = struct.unpack('dd', _BYTES) - return nan, inf, -inf - -NaN, PosInf, NegInf = _floatconstants() - - -def linecol(doc, pos): - lineno = doc.count('\n', 0, pos) + 1 - if lineno == 1: - colno = pos - else: - colno = pos - doc.rindex('\n', 0, pos) - return lineno, colno - - -def errmsg(msg, doc, pos, end=None): - # Note that this function is called from _speedups - lineno, colno = linecol(doc, pos) - if end is None: - return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos) - endlineno, endcolno = linecol(doc, end) - return '%s: line %d column %d - line %d column %d (char %d - %d)' % ( - msg, lineno, colno, endlineno, endcolno, pos, end) - - -_CONSTANTS = { - '-Infinity': NegInf, - 'Infinity': PosInf, - 'NaN': NaN, -} - -STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS) -BACKSLASH = { - '"': u'"', '\\': u'\\', '/': u'/', - 'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t', -} - -DEFAULT_ENCODING = "utf-8" - -def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match): - if encoding is None: - encoding = DEFAULT_ENCODING - chunks = [] - _append = chunks.append - begin = end - 1 - while 1: - chunk = _m(s, end) - if chunk is None: - raise ValueError( - errmsg("Unterminated string starting at", s, begin)) - end = chunk.end() - content, terminator = chunk.groups() - if content: - if not isinstance(content, unicode): - content = unicode(content, encoding) - _append(content) - if terminator == '"': - break - elif terminator != '\\': - if strict: - raise ValueError(errmsg("Invalid control character %r at", s, end)) - else: - _append(terminator) - continue - try: - esc = s[end] - except IndexError: - raise ValueError( - errmsg("Unterminated string starting at", s, begin)) - if esc != 'u': - try: - m = _b[esc] - except KeyError: - raise ValueError( - errmsg("Invalid \\escape: %r" % (esc,), s, end)) - end += 1 - else: - esc = s[end + 1:end + 5] - next_end = end + 5 - msg = "Invalid \\uXXXX escape" - try: - if len(esc) != 4: - raise ValueError - uni = int(esc, 16) - if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535: - msg = "Invalid \\uXXXX\\uXXXX surrogate pair" - if not s[end + 5:end + 7] == '\\u': - raise ValueError - esc2 = s[end + 7:end + 11] - if len(esc2) != 4: - raise ValueError - uni2 = int(esc2, 16) - uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) - next_end += 6 - m = unichr(uni) - except ValueError: - raise ValueError(errmsg(msg, s, end)) - end = next_end - _append(m) - return u''.join(chunks), end - - -# Use speedup if available -scanstring = c_scanstring or py_scanstring - -WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS) -WHITESPACE_STR = ' \t\n\r' - -def JSONObject((s, end), encoding, strict, scan_once, object_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR): - pairs = {} - nextchar = s[end:end + 1] - # Normally we expect nextchar == '"' - if nextchar != '"': - if nextchar in _ws: - end = _w(s, end).end() - nextchar = s[end:end + 1] - # Trivial empty object - if nextchar == '}': - return pairs, end + 1 - elif nextchar != '"': - raise ValueError(errmsg("Expecting property name", s, end)) - end += 1 - while True: - key, end = scanstring(s, end, encoding, strict) - - # To skip some function call overhead we 
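linecol and errmsg above are what turn a bare character index into the "line 1 column 2 (char 2)" style messages quoted in tool.py at the end of this diff; the C speedups call back into errmsg as well, per the comment. For example::

    import simplejson as S

    try:
        S.loads('{ 1.2: 3.4 }')       # property names must be quoted strings
    except ValueError as e:
        # e.g. "Expecting property name: line 1 column 2 (char 2)"
        assert 'line 1 column 2' in str(e)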
optimize the fast paths where - # the JSON key separator is ": " or just ":". - if s[end:end + 1] != ':': - end = _w(s, end).end() - if s[end:end + 1] != ':': - raise ValueError(errmsg("Expecting : delimiter", s, end)) - - end += 1 - - try: - if s[end] in _ws: - end += 1 - if s[end] in _ws: - end = _w(s, end + 1).end() - except IndexError: - pass - - try: - value, end = scan_once(s, end) - except StopIteration: - raise ValueError(errmsg("Expecting object", s, end)) - pairs[key] = value - - try: - nextchar = s[end] - if nextchar in _ws: - end = _w(s, end + 1).end() - nextchar = s[end] - except IndexError: - nextchar = '' - end += 1 - - if nextchar == '}': - break - elif nextchar != ',': - raise ValueError(errmsg("Expecting , delimiter", s, end - 1)) - - try: - nextchar = s[end] - if nextchar in _ws: - end += 1 - nextchar = s[end] - if nextchar in _ws: - end = _w(s, end + 1).end() - nextchar = s[end] - except IndexError: - nextchar = '' - - end += 1 - if nextchar != '"': - raise ValueError(errmsg("Expecting property name", s, end - 1)) - - if object_hook is not None: - pairs = object_hook(pairs) - return pairs, end - -def JSONArray((s, end), scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR): - values = [] - nextchar = s[end:end + 1] - if nextchar in _ws: - end = _w(s, end + 1).end() - nextchar = s[end:end + 1] - # Look-ahead for trivial empty array - if nextchar == ']': - return values, end + 1 - _append = values.append - while True: - try: - value, end = scan_once(s, end) - except StopIteration: - raise ValueError(errmsg("Expecting object", s, end)) - _append(value) - nextchar = s[end:end + 1] - if nextchar in _ws: - end = _w(s, end + 1).end() - nextchar = s[end:end + 1] - end += 1 - if nextchar == ']': - break - elif nextchar != ',': - raise ValueError(errmsg("Expecting , delimiter", s, end)) - - try: - if s[end] in _ws: - end += 1 - if s[end] in _ws: - end = _w(s, end + 1).end() - except IndexError: - pass - - return values, end - -class JSONDecoder(object): - """ - Simple JSON decoder - - Performs the following translations in decoding by default: - - +---------------+-------------------+ - | JSON | Python | - +===============+===================+ - | object | dict | - +---------------+-------------------+ - | array | list | - +---------------+-------------------+ - | string | unicode | - +---------------+-------------------+ - | number (int) | int, long | - +---------------+-------------------+ - | number (real) | float | - +---------------+-------------------+ - | true | True | - +---------------+-------------------+ - | false | False | - +---------------+-------------------+ - | null | None | - +---------------+-------------------+ - - It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as - their corresponding ``float`` values, which is outside the JSON spec. - """ - - __all__ = ['__init__', 'decode', 'raw_decode'] - - def __init__(self, encoding=None, object_hook=None, parse_float=None, - parse_int=None, parse_constant=None, strict=True): - """ - ``encoding`` determines the encoding used to interpret any ``str`` - objects decoded by this instance (utf-8 by default). It has no - effect when decoding ``unicode`` objects. - - Note that currently only encodings that are a superset of ASCII work, - strings of other encodings should be passed in as ``unicode``. - - ``object_hook``, if specified, will be called with the result - of every JSON object decoded and its return value will be used in - place of the given ``dict``. 
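As the docstring says, object_hook sees every decoded JSON object and its return value is used in place of the dict, which is how class-hinting schemes are typically layered on top. A small illustration (the __complex__ convention below is invented for the example, not part of simplejson)::

    import simplejson as S

    def as_complex(d):
        if d.get('__complex__'):
            return complex(d['real'], d['imag'])
        return d

    value = S.loads('{"__complex__": true, "real": 1, "imag": 2}',
                    object_hook=as_complex)
    assert value == (1 + 2j)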
This can be used to provide custom - deserializations (e.g. to support JSON-RPC class hinting). - - ``parse_float``, if specified, will be called with the string - of every JSON float to be decoded. By default this is equivalent to - float(num_str). This can be used to use another datatype or parser - for JSON floats (e.g. decimal.Decimal). - - ``parse_int``, if specified, will be called with the string - of every JSON int to be decoded. By default this is equivalent to - int(num_str). This can be used to use another datatype or parser - for JSON integers (e.g. float). - - ``parse_constant``, if specified, will be called with one of the - following strings: -Infinity, Infinity, NaN. - This can be used to raise an exception if invalid JSON numbers - are encountered. - """ - self.encoding = encoding - self.object_hook = object_hook - self.parse_float = parse_float or float - self.parse_int = parse_int or int - self.parse_constant = parse_constant or _CONSTANTS.__getitem__ - self.strict = strict - self.parse_object = JSONObject - self.parse_array = JSONArray - self.parse_string = scanstring - self.scan_once = make_scanner(self) - - def decode(self, s, _w=WHITESPACE.match): - """ - Return the Python representation of ``s`` (a ``str`` or ``unicode`` - instance containing a JSON document) - """ - obj, end = self.raw_decode(s, idx=_w(s, 0).end()) - end = _w(s, end).end() - if end != len(s): - raise ValueError(errmsg("Extra data", s, end, len(s))) - return obj - - def raw_decode(self, s, idx=0): - """ - Decode a JSON document from ``s`` (a ``str`` or ``unicode`` beginning - with a JSON document) and return a 2-tuple of the Python - representation and the index in ``s`` where the document ended. - - This can be used to decode a JSON document from a string that may - have extraneous data at the end. 
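raw_decode is the building block for input with trailing data: it stops after the first complete document and reports where, while decode (above) insists on consuming the whole string. For example::

    import simplejson as S

    decoder = S.JSONDecoder()
    obj, end = decoder.raw_decode('{"a": 1} trailing text')
    assert obj == {'a': 1} and end == 8

    try:
        decoder.decode('{"a": 1} trailing text')
    except ValueError:
        pass    # "Extra data: line 1 column ..."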
- """ - try: - obj, end = self.scan_once(s, idx) - except StopIteration: - raise ValueError("No JSON object could be decoded") - return obj, end - -__all__ = ['JSONDecoder'] diff --git a/subscriber/simplejson/encoder.py b/subscriber/simplejson/encoder.py deleted file mode 100644 index df23d22..0000000 --- a/subscriber/simplejson/encoder.py +++ /dev/null @@ -1,439 +0,0 @@ -""" -Implementation of JSONEncoder -""" -import re - -try: - from simplejson._speedups import encode_basestring_ascii as c_encode_basestring_ascii -except ImportError: - c_encode_basestring_ascii = None -try: - from simplejson._speedups import make_encoder as c_make_encoder -except ImportError: - c_make_encoder = None - -ESCAPE = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t]') -ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])') -HAS_UTF8 = re.compile(r'[\x80-\xff]') -ESCAPE_DCT = { - '\\': '\\\\', - '"': '\\"', - '\b': '\\b', - '\f': '\\f', - '\n': '\\n', - '\r': '\\r', - '\t': '\\t', -} -for i in range(0x20): - ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,)) - -# Assume this produces an infinity on all machines (probably not guaranteed) -INFINITY = float('1e66666') -FLOAT_REPR = repr - -def encode_basestring(s): - """ - Return a JSON representation of a Python string - """ - def replace(match): - return ESCAPE_DCT[match.group(0)] - return '"' + ESCAPE.sub(replace, s) + '"' - - -def py_encode_basestring_ascii(s): - if isinstance(s, str) and HAS_UTF8.search(s) is not None: - s = s.decode('utf-8') - def replace(match): - s = match.group(0) - try: - return ESCAPE_DCT[s] - except KeyError: - n = ord(s) - if n < 0x10000: - return '\\u%04x' % (n,) - else: - # surrogate pair - n -= 0x10000 - s1 = 0xd800 | ((n >> 10) & 0x3ff) - s2 = 0xdc00 | (n & 0x3ff) - return '\\u%04x\\u%04x' % (s1, s2) - return '"' + str(ESCAPE_ASCII.sub(replace, s)) + '"' - - -encode_basestring_ascii = c_encode_basestring_ascii or py_encode_basestring_ascii - -class JSONEncoder(object): - """ - Extensible JSON encoder for Python data structures. - - Supports the following objects and types by default: - - +-------------------+---------------+ - | Python | JSON | - +===================+===============+ - | dict | object | - +-------------------+---------------+ - | list, tuple | array | - +-------------------+---------------+ - | str, unicode | string | - +-------------------+---------------+ - | int, long, float | number | - +-------------------+---------------+ - | True | true | - +-------------------+---------------+ - | False | false | - +-------------------+---------------+ - | None | null | - +-------------------+---------------+ - - To extend this to recognize other objects, subclass and implement a - ``.default()`` method with another method that returns a serializable - object for ``o`` if possible, otherwise it should call the superclass - implementation (to raise ``TypeError``). - """ - __all__ = ['__init__', 'default', 'encode', 'iterencode'] - item_separator = ', ' - key_separator = ': ' - def __init__(self, skipkeys=False, ensure_ascii=True, - check_circular=True, allow_nan=True, sort_keys=False, - indent=None, separators=None, encoding='utf-8', default=None): - """ - Constructor for JSONEncoder, with sensible defaults. - - If skipkeys is False, then it is a TypeError to attempt - encoding of keys that are not str, int, long, float or None. If - skipkeys is True, such items are simply skipped. - - If ensure_ascii is True, the output is guaranteed to be str - objects with all incoming unicode characters escaped. 
If - ensure_ascii is false, the output will be unicode object. - - If check_circular is True, then lists, dicts, and custom encoded - objects will be checked for circular references during encoding to - prevent an infinite recursion (which would cause an OverflowError). - Otherwise, no such check takes place. - - If allow_nan is True, then NaN, Infinity, and -Infinity will be - encoded as such. This behavior is not JSON specification compliant, - but is consistent with most JavaScript based encoders and decoders. - Otherwise, it will be a ValueError to encode such floats. - - If sort_keys is True, then the output of dictionaries will be - sorted by key; this is useful for regression tests to ensure - that JSON serializations can be compared on a day-to-day basis. - - If indent is a non-negative integer, then JSON array - elements and object members will be pretty-printed with that - indent level. An indent level of 0 will only insert newlines. - None is the most compact representation. - - If specified, separators should be a (item_separator, key_separator) - tuple. The default is (', ', ': '). To get the most compact JSON - representation you should specify (',', ':') to eliminate whitespace. - - If specified, default is a function that gets called for objects - that can't otherwise be serialized. It should return a JSON encodable - version of the object or raise a ``TypeError``. - - If encoding is not None, then all input strings will be - transformed into unicode using that encoding prior to JSON-encoding. - The default is UTF-8. - """ - - self.skipkeys = skipkeys - self.ensure_ascii = ensure_ascii - self.check_circular = check_circular - self.allow_nan = allow_nan - self.sort_keys = sort_keys - self.indent = indent - if separators is not None: - self.item_separator, self.key_separator = separators - if default is not None: - self.default = default - self.encoding = encoding - - def default(self, o): - """ - Implement this method in a subclass such that it returns - a serializable object for ``o``, or calls the base implementation - (to raise a ``TypeError``). - - For example, to support arbitrary iterators, you could - implement default like this:: - - def default(self, o): - try: - iterable = iter(o) - except TypeError: - pass - else: - return list(iterable) - return JSONEncoder.default(self, o) - """ - raise TypeError("%r is not JSON serializable" % (o,)) - - def encode(self, o): - """ - Return a JSON string representation of a Python data structure. - - >>> JSONEncoder().encode({"foo": ["bar", "baz"]}) - '{"foo": ["bar", "baz"]}' - """ - # This is for extremely simple cases and benchmarks. - if isinstance(o, basestring): - if isinstance(o, str): - _encoding = self.encoding - if (_encoding is not None - and not (_encoding == 'utf-8')): - o = o.decode(_encoding) - if self.ensure_ascii: - return encode_basestring_ascii(o) - else: - return encode_basestring(o) - # This doesn't pass the iterator directly to ''.join() because the - # exceptions aren't as detailed. The list call should be roughly - # equivalent to the PySequence_Fast that ''.join() would do. - chunks = self.iterencode(o, _one_shot=True) - if not isinstance(chunks, (list, tuple)): - chunks = list(chunks) - return ''.join(chunks) - - def iterencode(self, o, _one_shot=False): - """ - Encode the given object and yield each string - representation as available. 
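iterencode yields its output in chunks rather than one string, so large structures can be streamed to a file or socket as the docstring sketches; a self-contained variant of that example (using cStringIO, as test_dump.py below does)::

    from cStringIO import StringIO
    import simplejson as S

    buf = StringIO()
    for chunk in S.JSONEncoder().iterencode({'foo': ['bar', 'baz']}):
        buf.write(chunk)                 # chunks arrive incrementally
    assert S.loads(buf.getvalue()) == {'foo': ['bar', 'baz']}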
- - For example:: - - for chunk in JSONEncoder().iterencode(bigobject): - mysocket.write(chunk) - """ - if self.check_circular: - markers = {} - else: - markers = None - if self.ensure_ascii: - _encoder = encode_basestring_ascii - else: - _encoder = encode_basestring - if self.encoding != 'utf-8': - def _encoder(o, _orig_encoder=_encoder, _encoding=self.encoding): - if isinstance(o, str): - o = o.decode(_encoding) - return _orig_encoder(o) - - def floatstr(o, allow_nan=self.allow_nan, _repr=FLOAT_REPR, _inf=INFINITY, _neginf=-INFINITY): - # Check for specials. Note that this type of test is processor- and/or - # platform-specific, so do tests which don't depend on the internals. - - if o != o: - text = 'NaN' - elif o == _inf: - text = 'Infinity' - elif o == _neginf: - text = '-Infinity' - else: - return _repr(o) - - if not allow_nan: - raise ValueError("Out of range float values are not JSON compliant: %r" - % (o,)) - - return text - - - if _one_shot and c_make_encoder is not None and not self.indent and not self.sort_keys: - _iterencode = c_make_encoder( - markers, self.default, _encoder, self.indent, - self.key_separator, self.item_separator, self.sort_keys, - self.skipkeys, self.allow_nan) - else: - _iterencode = _make_iterencode( - markers, self.default, _encoder, self.indent, floatstr, - self.key_separator, self.item_separator, self.sort_keys, - self.skipkeys, _one_shot) - return _iterencode(o, 0) - -def _make_iterencode(markers, _default, _encoder, _indent, _floatstr, _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot, - ## HACK: hand-optimized bytecode; turn globals into locals - False=False, - True=True, - ValueError=ValueError, - basestring=basestring, - dict=dict, - float=float, - id=id, - int=int, - isinstance=isinstance, - list=list, - long=long, - str=str, - tuple=tuple, - ): - - def _iterencode_list(lst, _current_indent_level): - if not lst: - yield '[]' - return - if markers is not None: - markerid = id(lst) - if markerid in markers: - raise ValueError("Circular reference detected") - markers[markerid] = lst - buf = '[' - if _indent is not None: - _current_indent_level += 1 - newline_indent = '\n' + (' ' * (_indent * _current_indent_level)) - separator = _item_separator + newline_indent - buf += newline_indent - else: - newline_indent = None - separator = _item_separator - first = True - for value in lst: - if first: - first = False - else: - buf = separator - if isinstance(value, basestring): - yield buf + _encoder(value) - elif value is None: - yield buf + 'null' - elif value is True: - yield buf + 'true' - elif value is False: - yield buf + 'false' - elif isinstance(value, (int, long)): - yield buf + str(value) - elif isinstance(value, float): - yield buf + _floatstr(value) - else: - yield buf - if isinstance(value, (list, tuple)): - chunks = _iterencode_list(value, _current_indent_level) - elif isinstance(value, dict): - chunks = _iterencode_dict(value, _current_indent_level) - else: - chunks = _iterencode(value, _current_indent_level) - for chunk in chunks: - yield chunk - if newline_indent is not None: - _current_indent_level -= 1 - yield '\n' + (' ' * (_indent * _current_indent_level)) - yield ']' - if markers is not None: - del markers[markerid] - - def _iterencode_dict(dct, _current_indent_level): - if not dct: - yield '{}' - return - if markers is not None: - markerid = id(dct) - if markerid in markers: - raise ValueError("Circular reference detected") - markers[markerid] = dct - yield '{' - if _indent is not None: - _current_indent_level += 1 - 
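The condition above is why pretty-printing always takes the pure-Python path: c_make_encoder is only used for the compact one-shot case with no indent and no sort_keys (the indent branches in the C code are marked "DOES NOT RUN"). With indentation the output looks like this, matching test_indent.py below::

    import simplejson as S

    data = {'b': 1, 'a': [2, 3]}
    text = S.dumps(data, indent=2, sort_keys=True, separators=(',', ': '))
    assert text == '{\n  "a": [\n    2,\n    3\n  ],\n  "b": 1\n}'
    assert S.loads(text) == {'a': [2, 3], 'b': 1}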
newline_indent = '\n' + (' ' * (_indent * _current_indent_level)) - item_separator = _item_separator + newline_indent - yield newline_indent - else: - newline_indent = None - item_separator = _item_separator - first = True - if _sort_keys: - items = dct.items() - items.sort(key=lambda kv: kv[0]) - else: - items = dct.iteritems() - for key, value in items: - if isinstance(key, basestring): - pass - # JavaScript is weakly typed for these, so it makes sense to - # also allow them. Many encoders seem to do something like this. - elif isinstance(key, float): - key = _floatstr(key) - elif isinstance(key, (int, long)): - key = str(key) - elif key is True: - key = 'true' - elif key is False: - key = 'false' - elif key is None: - key = 'null' - elif _skipkeys: - continue - else: - raise TypeError("key %r is not a string" % (key,)) - if first: - first = False - else: - yield item_separator - yield _encoder(key) - yield _key_separator - if isinstance(value, basestring): - yield _encoder(value) - elif value is None: - yield 'null' - elif value is True: - yield 'true' - elif value is False: - yield 'false' - elif isinstance(value, (int, long)): - yield str(value) - elif isinstance(value, float): - yield _floatstr(value) - else: - if isinstance(value, (list, tuple)): - chunks = _iterencode_list(value, _current_indent_level) - elif isinstance(value, dict): - chunks = _iterencode_dict(value, _current_indent_level) - else: - chunks = _iterencode(value, _current_indent_level) - for chunk in chunks: - yield chunk - if newline_indent is not None: - _current_indent_level -= 1 - yield '\n' + (' ' * (_indent * _current_indent_level)) - yield '}' - if markers is not None: - del markers[markerid] - - def _iterencode(o, _current_indent_level): - if isinstance(o, basestring): - yield _encoder(o) - elif o is None: - yield 'null' - elif o is True: - yield 'true' - elif o is False: - yield 'false' - elif isinstance(o, (int, long)): - yield str(o) - elif isinstance(o, float): - yield _floatstr(o) - elif isinstance(o, (list, tuple)): - for chunk in _iterencode_list(o, _current_indent_level): - yield chunk - elif isinstance(o, dict): - for chunk in _iterencode_dict(o, _current_indent_level): - yield chunk - else: - if markers is not None: - markerid = id(o) - if markerid in markers: - raise ValueError("Circular reference detected") - markers[markerid] = o - o = _default(o) - for chunk in _iterencode(o, _current_indent_level): - yield chunk - if markers is not None: - del markers[markerid] - - return _iterencode - - - -__all__ = ['JSONEncoder'] diff --git a/subscriber/simplejson/scanner.py b/subscriber/simplejson/scanner.py deleted file mode 100644 index 1bcbcdd..0000000 --- a/subscriber/simplejson/scanner.py +++ /dev/null @@ -1,66 +0,0 @@ -""" -JSON token scanner -""" -import re -try: - from simplejson._speedups import make_scanner as c_make_scanner -except ImportError: - c_make_scanner = None - -__all__ = ['make_scanner'] - -NUMBER_RE = re.compile( - r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?', - (re.VERBOSE | re.MULTILINE | re.DOTALL)) - -def py_make_scanner(context): - parse_object = context.parse_object - parse_array = context.parse_array - parse_string = context.parse_string - match_number = NUMBER_RE.match - encoding = context.encoding - strict = context.strict - parse_float = context.parse_float - parse_int = context.parse_int - parse_constant = context.parse_constant - object_hook = context.object_hook - - def _scan_once(string, idx): - try: - nextchar = string[idx] - except IndexError: - raise StopIteration - - 
if nextchar == '"': - return parse_string(string, idx + 1, encoding, strict) - elif nextchar == '{': - return parse_object((string, idx + 1), encoding, strict, _scan_once, object_hook) - elif nextchar == '[': - return parse_array((string, idx + 1), _scan_once) - elif nextchar == 'n' and string[idx:idx + 4] == 'null': - return None, idx + 4 - elif nextchar == 't' and string[idx:idx + 4] == 'true': - return True, idx + 4 - elif nextchar == 'f' and string[idx:idx + 5] == 'false': - return False, idx + 5 - - m = match_number(string, idx) - if m is not None: - integer, frac, exp = m.groups() - if frac or exp: - res = parse_float(integer + (frac or '') + (exp or '')) - else: - res = parse_int(integer) - return res, m.end() - elif nextchar == 'N' and string[idx:idx + 3] == 'NaN': - return parse_constant('NaN'), idx + 3 - elif nextchar == 'I' and string[idx:idx + 8] == 'Infinity': - return parse_constant('Infinity'), idx + 8 - elif nextchar == '-' and string[idx:idx + 9] == '-Infinity': - return parse_constant('-Infinity'), idx + 9 - else: - raise StopIteration - - return _scan_once - -make_scanner = c_make_scanner or py_make_scanner \ No newline at end of file diff --git a/subscriber/simplejson/tests/__init__.py b/subscriber/simplejson/tests/__init__.py deleted file mode 100644 index 17c9796..0000000 --- a/subscriber/simplejson/tests/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -import unittest -import doctest - -def additional_tests(): - import simplejson - import simplejson.encoder - import simplejson.decoder - suite = unittest.TestSuite() - for mod in (simplejson, simplejson.encoder, simplejson.decoder): - suite.addTest(doctest.DocTestSuite(mod)) - suite.addTest(doctest.DocFileSuite('../../index.rst')) - return suite - -def main(): - suite = additional_tests() - runner = unittest.TextTestRunner() - runner.run(suite) - -if __name__ == '__main__': - import os - import sys - sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) - main() diff --git a/subscriber/simplejson/tests/test_decode.py b/subscriber/simplejson/tests/test_decode.py deleted file mode 100644 index 4db5ec6..0000000 --- a/subscriber/simplejson/tests/test_decode.py +++ /dev/null @@ -1,22 +0,0 @@ -import decimal -from unittest import TestCase - -import simplejson as S - -class TestDecode(TestCase): - def test_decimal(self): - rval = S.loads('1.1', parse_float=decimal.Decimal) - self.assert_(isinstance(rval, decimal.Decimal)) - self.assertEquals(rval, decimal.Decimal('1.1')) - - def test_float(self): - rval = S.loads('1', parse_int=float) - self.assert_(isinstance(rval, float)) - self.assertEquals(rval, 1.0) - - def test_decoder_optimizations(self): - # Several optimizations were made that skip over calls to - # the whitespace regex, so this test is designed to try and - # exercise the uncommon cases. The array cases are already covered. 
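py_make_scanner above closes over the decoder's configuration once and returns a _scan_once callable that takes (string, idx) and returns (value, end_index), raising StopIteration at end of input; that tuple convention is what JSONObject, JSONArray and raw_decode all rely on. Shown here only to illustrate the convention (scan_once is an internal hook, not part of the public API)::

    import simplejson as S
    from simplejson.scanner import py_make_scanner

    scan_once = py_make_scanner(S.JSONDecoder())
    assert scan_once('[1, 2]', 0) == ([1, 2], 6)
    assert scan_once('true', 0) == (True, 4)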
- rval = S.loads('{ "key" : "value" , "k":"v" }') - self.assertEquals(rval, {"key":"value", "k":"v"}) diff --git a/subscriber/simplejson/tests/test_default.py b/subscriber/simplejson/tests/test_default.py deleted file mode 100644 index d4936e3..0000000 --- a/subscriber/simplejson/tests/test_default.py +++ /dev/null @@ -1,9 +0,0 @@ -from unittest import TestCase - -import simplejson as S - -class TestDefault(TestCase): - def test_default(self): - self.assertEquals( - S.dumps(type, default=repr), - S.dumps(repr(type))) diff --git a/subscriber/simplejson/tests/test_dump.py b/subscriber/simplejson/tests/test_dump.py deleted file mode 100644 index 04a17bf..0000000 --- a/subscriber/simplejson/tests/test_dump.py +++ /dev/null @@ -1,13 +0,0 @@ -from unittest import TestCase -from cStringIO import StringIO - -import simplejson as S - -class TestDump(TestCase): - def test_dump(self): - sio = StringIO() - S.dump({}, sio) - self.assertEquals(sio.getvalue(), '{}') - - def test_dumps(self): - self.assertEquals(S.dumps({}), '{}') diff --git a/subscriber/simplejson/tests/test_encode_basestring_ascii.py b/subscriber/simplejson/tests/test_encode_basestring_ascii.py deleted file mode 100644 index 7128495..0000000 --- a/subscriber/simplejson/tests/test_encode_basestring_ascii.py +++ /dev/null @@ -1,38 +0,0 @@ -from unittest import TestCase - -import simplejson.encoder - -CASES = [ - (u'/\\"\ucafe\ubabe\uab98\ufcde\ubcda\uef4a\x08\x0c\n\r\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?', '"/\\\\\\"\\ucafe\\ubabe\\uab98\\ufcde\\ubcda\\uef4a\\b\\f\\n\\r\\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?"'), - (u'\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'), - (u'controls', '"controls"'), - (u'\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'), - (u'{"object with 1 member":["array with 1 element"]}', '"{\\"object with 1 member\\":[\\"array with 1 element\\"]}"'), - (u' s p a c e d ', '" s p a c e d "'), - (u'\U0001d120', '"\\ud834\\udd20"'), - (u'\u03b1\u03a9', '"\\u03b1\\u03a9"'), - ('\xce\xb1\xce\xa9', '"\\u03b1\\u03a9"'), - (u'\u03b1\u03a9', '"\\u03b1\\u03a9"'), - ('\xce\xb1\xce\xa9', '"\\u03b1\\u03a9"'), - (u'\u03b1\u03a9', '"\\u03b1\\u03a9"'), - (u'\u03b1\u03a9', '"\\u03b1\\u03a9"'), - (u"`1~!@#$%^&*()_+-={':[,]}|;.?", '"`1~!@#$%^&*()_+-={\':[,]}|;.?"'), - (u'\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'), - (u'\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'), -] - -class TestEncodeBaseStringAscii(TestCase): - def test_py_encode_basestring_ascii(self): - self._test_encode_basestring_ascii(simplejson.encoder.py_encode_basestring_ascii) - - def test_c_encode_basestring_ascii(self): - if not simplejson.encoder.c_encode_basestring_ascii: - return - self._test_encode_basestring_ascii(simplejson.encoder.c_encode_basestring_ascii) - - def _test_encode_basestring_ascii(self, encode_basestring_ascii): - fname = encode_basestring_ascii.__name__ - for input_string, expect in CASES: - result = encode_basestring_ascii(input_string) - self.assertEquals(result, expect, - '%r != %r for %s(%r)' % (result, expect, fname, input_string)) diff --git a/subscriber/simplejson/tests/test_fail.py b/subscriber/simplejson/tests/test_fail.py deleted file mode 100644 index fba7449..0000000 --- a/subscriber/simplejson/tests/test_fail.py +++ /dev/null @@ -1,76 +0,0 @@ -from unittest import TestCase - -import simplejson as S - -# Fri Dec 30 18:57:26 2005 -JSONDOCS = [ - # http://json.org/JSON_checker/test/fail1.json - '"A JSON payload should be an object or array, not a string."', - # 
http://json.org/JSON_checker/test/fail2.json - '["Unclosed array"', - # http://json.org/JSON_checker/test/fail3.json - '{unquoted_key: "keys must be quoted}', - # http://json.org/JSON_checker/test/fail4.json - '["extra comma",]', - # http://json.org/JSON_checker/test/fail5.json - '["double extra comma",,]', - # http://json.org/JSON_checker/test/fail6.json - '[ , "<-- missing value"]', - # http://json.org/JSON_checker/test/fail7.json - '["Comma after the close"],', - # http://json.org/JSON_checker/test/fail8.json - '["Extra close"]]', - # http://json.org/JSON_checker/test/fail9.json - '{"Extra comma": true,}', - # http://json.org/JSON_checker/test/fail10.json - '{"Extra value after close": true} "misplaced quoted value"', - # http://json.org/JSON_checker/test/fail11.json - '{"Illegal expression": 1 + 2}', - # http://json.org/JSON_checker/test/fail12.json - '{"Illegal invocation": alert()}', - # http://json.org/JSON_checker/test/fail13.json - '{"Numbers cannot have leading zeroes": 013}', - # http://json.org/JSON_checker/test/fail14.json - '{"Numbers cannot be hex": 0x14}', - # http://json.org/JSON_checker/test/fail15.json - '["Illegal backslash escape: \\x15"]', - # http://json.org/JSON_checker/test/fail16.json - '["Illegal backslash escape: \\\'"]', - # http://json.org/JSON_checker/test/fail17.json - '["Illegal backslash escape: \\017"]', - # http://json.org/JSON_checker/test/fail18.json - '[[[[[[[[[[[[[[[[[[[["Too deep"]]]]]]]]]]]]]]]]]]]]', - # http://json.org/JSON_checker/test/fail19.json - '{"Missing colon" null}', - # http://json.org/JSON_checker/test/fail20.json - '{"Double colon":: null}', - # http://json.org/JSON_checker/test/fail21.json - '{"Comma instead of colon", null}', - # http://json.org/JSON_checker/test/fail22.json - '["Colon instead of comma": false]', - # http://json.org/JSON_checker/test/fail23.json - '["Bad value", truth]', - # http://json.org/JSON_checker/test/fail24.json - "['single quote']", - # http://code.google.com/p/simplejson/issues/detail?id=3 - u'["A\u001FZ control characters in string"]', -] - -SKIPS = { - 1: "why not have a string payload?", - 18: "spec doesn't specify any nesting limitations", -} - -class TestFail(TestCase): - def test_failures(self): - for idx, doc in enumerate(JSONDOCS): - idx = idx + 1 - if idx in SKIPS: - S.loads(doc) - continue - try: - S.loads(doc) - except ValueError: - pass - else: - self.fail("Expected failure for fail%d.json: %r" % (idx, doc)) diff --git a/subscriber/simplejson/tests/test_float.py b/subscriber/simplejson/tests/test_float.py deleted file mode 100644 index 09a1a08..0000000 --- a/subscriber/simplejson/tests/test_float.py +++ /dev/null @@ -1,14 +0,0 @@ -import math -from unittest import TestCase - -import simplejson as S - -class TestFloat(TestCase): - def test_floats(self): - for num in [1617161771.7650001, math.pi, math.pi**100, math.pi**-100]: - self.assertEquals(float(S.dumps(num)), num) - - def test_ints(self): - for num in [1, 1L, 1<<32, 1<<64]: - self.assertEquals(S.dumps(num), str(num)) - self.assertEquals(int(S.dumps(num)), num) diff --git a/subscriber/simplejson/tests/test_indent.py b/subscriber/simplejson/tests/test_indent.py deleted file mode 100644 index a430e15..0000000 --- a/subscriber/simplejson/tests/test_indent.py +++ /dev/null @@ -1,41 +0,0 @@ -from unittest import TestCase - -import simplejson as S -import textwrap - -class TestIndent(TestCase): - def test_indent(self): - h = [['blorpie'], ['whoops'], [], 'd-shtaeou', 'd-nthiouh', 'i-vhbjkhnth', - {'nifty': 87}, {'field': 'yes', 'morefield': 
False} ] - - expect = textwrap.dedent("""\ - [ - [ - "blorpie" - ], - [ - "whoops" - ], - [], - "d-shtaeou", - "d-nthiouh", - "i-vhbjkhnth", - { - "nifty": 87 - }, - { - "field": "yes", - "morefield": false - } - ]""") - - - d1 = S.dumps(h) - d2 = S.dumps(h, indent=2, sort_keys=True, separators=(',', ': ')) - - h1 = S.loads(d1) - h2 = S.loads(d2) - - self.assertEquals(h1, h) - self.assertEquals(h2, h) - self.assertEquals(d2, expect) diff --git a/subscriber/simplejson/tests/test_pass1.py b/subscriber/simplejson/tests/test_pass1.py deleted file mode 100644 index 591a18b..0000000 --- a/subscriber/simplejson/tests/test_pass1.py +++ /dev/null @@ -1,76 +0,0 @@ -from unittest import TestCase - -import simplejson as S - -# from http://json.org/JSON_checker/test/pass1.json -JSON = r''' -[ - "JSON Test Pattern pass1", - {"object with 1 member":["array with 1 element"]}, - {}, - [], - -42, - true, - false, - null, - { - "integer": 1234567890, - "real": -9876.543210, - "e": 0.123456789e-12, - "E": 1.234567890E+34, - "": 23456789012E666, - "zero": 0, - "one": 1, - "space": " ", - "quote": "\"", - "backslash": "\\", - "controls": "\b\f\n\r\t", - "slash": "/ & \/", - "alpha": "abcdefghijklmnopqrstuvwyz", - "ALPHA": "ABCDEFGHIJKLMNOPQRSTUVWYZ", - "digit": "0123456789", - "special": "`1~!@#$%^&*()_+-={':[,]}|;.?", - "hex": "\u0123\u4567\u89AB\uCDEF\uabcd\uef4A", - "true": true, - "false": false, - "null": null, - "array":[ ], - "object":{ }, - "address": "50 St. James Street", - "url": "http://www.JSON.org/", - "comment": "// /* */": " ", - " s p a c e d " :[1,2 , 3 - -, - -4 , 5 , 6 ,7 ], - "compact": [1,2,3,4,5,6,7], - "jsontext": "{\"object with 1 member\":[\"array with 1 element\"]}", - "quotes": "" \u0022 %22 0x22 034 "", - "\/\\\"\uCAFE\uBABE\uAB98\uFCDE\ubcda\uef4A\b\f\n\r\t`1~!@#$%^&*()_+-=[]{}|;:',./<>?" -: "A key can be any string" - }, - 0.5 ,98.6 -, -99.44 -, - -1066 - - -,"rosebud"] -''' - -class TestPass1(TestCase): - def test_parse(self): - # test in/out equivalence and parsing - res = S.loads(JSON) - out = S.dumps(res) - self.assertEquals(res, S.loads(out)) - try: - S.dumps(res, allow_nan=False) - except ValueError: - pass - else: - self.fail("23456789012E666 should be out of range") diff --git a/subscriber/simplejson/tests/test_pass2.py b/subscriber/simplejson/tests/test_pass2.py deleted file mode 100644 index 8cc74e8..0000000 --- a/subscriber/simplejson/tests/test_pass2.py +++ /dev/null @@ -1,14 +0,0 @@ -from unittest import TestCase -import simplejson as S - -# from http://json.org/JSON_checker/test/pass2.json -JSON = r''' -[[[[[[[[[[[[[[[[[[["Not too deep"]]]]]]]]]]]]]]]]]]] -''' - -class TestPass2(TestCase): - def test_parse(self): - # test in/out equivalence and parsing - res = S.loads(JSON) - out = S.dumps(res) - self.assertEquals(res, S.loads(out)) diff --git a/subscriber/simplejson/tests/test_pass3.py b/subscriber/simplejson/tests/test_pass3.py deleted file mode 100644 index 017ea8e..0000000 --- a/subscriber/simplejson/tests/test_pass3.py +++ /dev/null @@ -1,20 +0,0 @@ -from unittest import TestCase - -import simplejson as S - -# from http://json.org/JSON_checker/test/pass3.json -JSON = r''' -{ - "JSON Test Pattern pass3": { - "The outermost value": "must be an object or array.", - "In this test": "It is an object." 
- } -} -''' - -class TestPass3(TestCase): - def test_parse(self): - # test in/out equivalence and parsing - res = S.loads(JSON) - out = S.dumps(res) - self.assertEquals(res, S.loads(out)) diff --git a/subscriber/simplejson/tests/test_recursion.py b/subscriber/simplejson/tests/test_recursion.py deleted file mode 100644 index d8ffbd5..0000000 --- a/subscriber/simplejson/tests/test_recursion.py +++ /dev/null @@ -1,65 +0,0 @@ -from unittest import TestCase - -import simplejson as S - -class JSONTestObject: - pass - -class RecursiveJSONEncoder(S.JSONEncoder): - recurse = False - def default(self, o): - if o is JSONTestObject: - if self.recurse: - return [JSONTestObject] - else: - return 'JSONTestObject' - return S.JSONEncoder.default(o) - -class TestRecursion(TestCase): - def test_listrecursion(self): - x = [] - x.append(x) - try: - S.dumps(x) - except ValueError: - pass - else: - self.fail("didn't raise ValueError on list recursion") - x = [] - y = [x] - x.append(y) - try: - S.dumps(x) - except ValueError: - pass - else: - self.fail("didn't raise ValueError on alternating list recursion") - y = [] - x = [y, y] - # ensure that the marker is cleared - S.dumps(x) - - def test_dictrecursion(self): - x = {} - x["test"] = x - try: - S.dumps(x) - except ValueError: - pass - else: - self.fail("didn't raise ValueError on dict recursion") - x = {} - y = {"a": x, "b": x} - # ensure that the marker is cleared - S.dumps(x) - - def test_defaultrecursion(self): - enc = RecursiveJSONEncoder() - self.assertEquals(enc.encode(JSONTestObject), '"JSONTestObject"') - enc.recurse = True - try: - enc.encode(JSONTestObject) - except ValueError: - pass - else: - self.fail("didn't raise ValueError on default recursion") diff --git a/subscriber/simplejson/tests/test_scanstring.py b/subscriber/simplejson/tests/test_scanstring.py deleted file mode 100644 index 995176d..0000000 --- a/subscriber/simplejson/tests/test_scanstring.py +++ /dev/null @@ -1,104 +0,0 @@ -import sys -import decimal -from unittest import TestCase - -import simplejson.decoder - -class TestScanString(TestCase): - def test_py_scanstring(self): - self._test_scanstring(simplejson.decoder.py_scanstring) - - def test_c_scanstring(self): - if not simplejson.decoder.c_scanstring: - return - self._test_scanstring(simplejson.decoder.c_scanstring) - - def _test_scanstring(self, scanstring): - self.assertEquals( - scanstring('"z\\ud834\\udd20x"', 1, None, True), - (u'z\U0001d120x', 16)) - - if sys.maxunicode == 65535: - self.assertEquals( - scanstring(u'"z\U0001d120x"', 1, None, True), - (u'z\U0001d120x', 6)) - else: - self.assertEquals( - scanstring(u'"z\U0001d120x"', 1, None, True), - (u'z\U0001d120x', 5)) - - self.assertEquals( - scanstring('"\\u007b"', 1, None, True), - (u'{', 8)) - - self.assertEquals( - scanstring('"A JSON payload should be an object or array, not a string."', 1, None, True), - (u'A JSON payload should be an object or array, not a string.', 60)) - - self.assertEquals( - scanstring('["Unclosed array"', 2, None, True), - (u'Unclosed array', 17)) - - self.assertEquals( - scanstring('["extra comma",]', 2, None, True), - (u'extra comma', 14)) - - self.assertEquals( - scanstring('["double extra comma",,]', 2, None, True), - (u'double extra comma', 21)) - - self.assertEquals( - scanstring('["Comma after the close"],', 2, None, True), - (u'Comma after the close', 24)) - - self.assertEquals( - scanstring('["Extra close"]]', 2, None, True), - (u'Extra close', 14)) - - self.assertEquals( - scanstring('{"Extra comma": true,}', 2, None, True), - (u'Extra 
comma', 14)) - - self.assertEquals( - scanstring('{"Extra value after close": true} "misplaced quoted value"', 2, None, True), - (u'Extra value after close', 26)) - - self.assertEquals( - scanstring('{"Illegal expression": 1 + 2}', 2, None, True), - (u'Illegal expression', 21)) - - self.assertEquals( - scanstring('{"Illegal invocation": alert()}', 2, None, True), - (u'Illegal invocation', 21)) - - self.assertEquals( - scanstring('{"Numbers cannot have leading zeroes": 013}', 2, None, True), - (u'Numbers cannot have leading zeroes', 37)) - - self.assertEquals( - scanstring('{"Numbers cannot be hex": 0x14}', 2, None, True), - (u'Numbers cannot be hex', 24)) - - self.assertEquals( - scanstring('[[[[[[[[[[[[[[[[[[[["Too deep"]]]]]]]]]]]]]]]]]]]]', 21, None, True), - (u'Too deep', 30)) - - self.assertEquals( - scanstring('{"Missing colon" null}', 2, None, True), - (u'Missing colon', 16)) - - self.assertEquals( - scanstring('{"Double colon":: null}', 2, None, True), - (u'Double colon', 15)) - - self.assertEquals( - scanstring('{"Comma instead of colon", null}', 2, None, True), - (u'Comma instead of colon', 25)) - - self.assertEquals( - scanstring('["Colon instead of comma": false]', 2, None, True), - (u'Colon instead of comma', 25)) - - self.assertEquals( - scanstring('["Bad value", truth]', 2, None, True), - (u'Bad value', 12)) diff --git a/subscriber/simplejson/tests/test_separators.py b/subscriber/simplejson/tests/test_separators.py deleted file mode 100644 index 4506c95..0000000 --- a/subscriber/simplejson/tests/test_separators.py +++ /dev/null @@ -1,42 +0,0 @@ -import textwrap -from unittest import TestCase - -import simplejson as S - - -class TestSeparators(TestCase): - def test_separators(self): - h = [['blorpie'], ['whoops'], [], 'd-shtaeou', 'd-nthiouh', 'i-vhbjkhnth', - {'nifty': 87}, {'field': 'yes', 'morefield': False} ] - - expect = textwrap.dedent("""\ - [ - [ - "blorpie" - ] , - [ - "whoops" - ] , - [] , - "d-shtaeou" , - "d-nthiouh" , - "i-vhbjkhnth" , - { - "nifty" : 87 - } , - { - "field" : "yes" , - "morefield" : false - } - ]""") - - - d1 = S.dumps(h) - d2 = S.dumps(h, indent=2, sort_keys=True, separators=(' ,', ' : ')) - - h1 = S.loads(d1) - h2 = S.loads(d2) - - self.assertEquals(h1, h) - self.assertEquals(h2, h) - self.assertEquals(d2, expect) diff --git a/subscriber/simplejson/tests/test_unicode.py b/subscriber/simplejson/tests/test_unicode.py deleted file mode 100644 index ff50c12..0000000 --- a/subscriber/simplejson/tests/test_unicode.py +++ /dev/null @@ -1,59 +0,0 @@ -from unittest import TestCase - -import simplejson as S - -class TestUnicode(TestCase): - def test_encoding1(self): - encoder = S.JSONEncoder(encoding='utf-8') - u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}' - s = u.encode('utf-8') - ju = encoder.encode(u) - js = encoder.encode(s) - self.assertEquals(ju, js) - - def test_encoding2(self): - u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}' - s = u.encode('utf-8') - ju = S.dumps(u, encoding='utf-8') - js = S.dumps(s, encoding='utf-8') - self.assertEquals(ju, js) - - def test_encoding3(self): - u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}' - j = S.dumps(u) - self.assertEquals(j, '"\\u03b1\\u03a9"') - - def test_encoding4(self): - u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}' - j = S.dumps([u]) - self.assertEquals(j, '["\\u03b1\\u03a9"]') - - def test_encoding5(self): - u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}' - j = S.dumps(u, ensure_ascii=False) - 
self.assertEquals(j, u'"%s"' % (u,)) - - def test_encoding6(self): - u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}' - j = S.dumps([u], ensure_ascii=False) - self.assertEquals(j, u'["%s"]' % (u,)) - - def test_big_unicode_encode(self): - u = u'\U0001d120' - self.assertEquals(S.dumps(u), '"\\ud834\\udd20"') - self.assertEquals(S.dumps(u, ensure_ascii=False), u'"\U0001d120"') - - def test_big_unicode_decode(self): - u = u'z\U0001d120x' - self.assertEquals(S.loads('"' + u + '"'), u) - self.assertEquals(S.loads('"z\\ud834\\udd20x"'), u) - - def test_unicode_decode(self): - for i in range(0, 0xd7ff): - u = unichr(i) - json = '"\\u%04x"' % (i,) - self.assertEquals(S.loads(json), u) - - def test_default_encoding(self): - self.assertEquals(S.loads(u'{"a": "\xe9"}'.encode('utf-8')), - {'a': u'\xe9'}) diff --git a/subscriber/simplejson/tool.py b/subscriber/simplejson/tool.py deleted file mode 100644 index caa1818..0000000 --- a/subscriber/simplejson/tool.py +++ /dev/null @@ -1,44 +0,0 @@ -r""" -Using simplejson from the shell to validate and -pretty-print:: - - $ echo '{"json":"obj"}' | python -msimplejson - { - "json": "obj" - } - $ echo '{ 1.2:3.4}' | python -msimplejson - Expecting property name: line 1 column 2 (char 2) - -Note that the JSON produced by this module's default settings -is a subset of YAML, so it may be used as a serializer for that as well. -""" -import simplejson - -# -# Pretty printer: -# curl http://mochikit.com/examples/ajax_tables/domains.json | python -msimplejson.tool -# - -def main(): - import sys - if len(sys.argv) == 1: - infile = sys.stdin - outfile = sys.stdout - elif len(sys.argv) == 2: - infile = open(sys.argv[1], 'rb') - outfile = sys.stdout - elif len(sys.argv) == 3: - infile = open(sys.argv[1], 'rb') - outfile = open(sys.argv[2], 'wb') - else: - raise SystemExit("%s [infile [outfile]]" % (sys.argv[0],)) - try: - obj = simplejson.load(infile) - except ValueError, e: - raise SystemExit(e) - simplejson.dump(obj, outfile, sort_keys=True, indent=4) - outfile.write('\n') - - -if __name__ == '__main__': - main() diff --git a/subscriber/static/agg.js b/subscriber/static/agg.js deleted file mode 100644 index c3aa253..0000000 --- a/subscriber/static/agg.js +++ /dev/null @@ -1,36 +0,0 @@ -function showStuff() { - var req = new XMLHttpRequest(); - req.open("GET", "/items", true); - req.onreadystatechange = function() { - if (req.readyState != 4) { - return; - } - gotStuff(req.status, req.responseText); - }; - req.send(null); -} - -function gotStuff(status, text) { - if (status != 200) { - window.setTimeout(showStuff, 5000); - return; - } - - var content = ""; - var items = eval(text); - if (items.length == 0) { - content = "Nothing yet.\n" - } else { - for (var i = 0; i < items.length; ++i) { - content += "
" + items[i].title + - " at " + items[i].time + "
" + - "
" + items[i].content + "
\n"; - } - } - - document.getElementById("content").innerHTML = content; - window.setTimeout(showStuff, 5000); -} - -window.onload = showStuff; diff --git a/subscriber/subscriber.html b/subscriber/subscriber.html deleted file mode 100644 index 3647d1d..0000000 --- a/subscriber/subscriber.html +++ /dev/null @@ -1,35 +0,0 @@ - - - Subscriber - - - - - -

Subscriber aggregation page

- -
-{% if not entries %} -Nothing yet. -{% else %} -{% for entry in entries %} -
-
- {% if entry.title %}{{entry.title}}{% else %}No title{% endif %} - at {{entry.updated}} - {% if entry.link %} - - {% endif %} -
-
- {% if entry.content %}{{entry.content}}{% else %}No content{% endif %} -
-
-{% endfor %} -{% endif %} -
- - - diff --git a/subscriber_clients/php/example.php b/subscriber_clients/php/example.php deleted file mode 100644 index c247676..0000000 --- a/subscriber_clients/php/example.php +++ /dev/null @@ -1,25 +0,0 @@ -subscribe($feed); - -// unsubscribe from a feed -$s->unsubscribe($feed); - -?> - diff --git a/subscriber_clients/php/subscriber.php b/subscriber_clients/php/subscriber.php deleted file mode 100644 index 139ec27..0000000 --- a/subscriber_clients/php/subscriber.php +++ /dev/null @@ -1,120 +0,0 @@ -hub_url = $hub_url; - $this->callback_url = $callback_url; - $this->credentials = $credentials; - } - - // $use_regexp lets you choose whether to use google AJAX feed api (faster, but cached) or a regexp to read from site - public function find_feed($url, $http_function = false) { - // using google feed API - $url = "http://ajax.googleapis.com/ajax/services/feed/lookup?key={$this->google_key}&v=1.0&q=".urlencode($url); - // fetch the content - if ($http_function) - $response = $http_function($url); - else - $response = $this->http($url); - - $result = json_decode($response, true); - $rss_url = $result['responseData']['url']; - return $rss_url; - } - - public function subscribe($topic_url, $http_function = false) { - return $this->change_subscription("subscribe", $topic_url, $http_function = false); - } - - public function unsubscribe($topic_url, $http_function = false) { - return $this->change_subscription("unsubscribe", $topic_url, $http_function = false); - } - - // helper function since sub/unsub are handled the same way - private function change_subscription($mode, $topic_url, $http_function = false) { - if (!isset($topic_url)) - throw new Exception('Please specify a topic url'); - - // lightweight check that we're actually working w/ a valid url - if (!preg_match("|^https?://|i",$topic_url)) - throw new Exception('The specified topic url does not appear to be valid: '.$topic_url); - - // set the mode subscribe/unsubscribe - $post_string = "hub.mode=".$mode; - $post_string .= "&hub.callback=".urlencode($this->callback_url); - $post_string .= "&hub.verify=".$this->verify; - $post_string .= "&hub.verify_token=".$this->verify_token; - $post_string .= "&hub.lease_seconds=".$this->lease_seconds; - - // append the topic url parameters - $post_string .= "&hub.topic=".urlencode($topic_url); - - // make the http post request and return true/false - // easy to over-write to use your own http function - if ($http_function) - return $http_function($this->hub_url,$post_string); - else - return $this->http($this->hub_url,$post_string); - } - - // default http function that uses curl to post to the hub endpoint - private function http($url, $post_string) { - - // add any additional curl options here - $options = array(CURLOPT_URL => $url, - CURLOPT_USERAGENT => "PubSubHubbub-Subscriber-PHP/1.0", - CURLOPT_RETURNTRANSFER => true); - - if ($post_string) { - $options[CURLOPT_POST] = true; - $options[CURLOPT_POSTFIELDS] = $post_string; - } - - if ($this->credentials) - $options[CURLOPT_USERPWD] = $this->credentials; - - $ch = curl_init(); - curl_setopt_array($ch, $options); - - $response = curl_exec($ch); - $info = curl_getinfo($ch); - - // all good -- anything in the 200 range - if (substr($info['http_code'],0,1) == "2") { - return $response; - } - return false; - } -} - - -?> \ No newline at end of file diff --git a/testsuite/README b/testsuite/README deleted file mode 100644 index 3b857e3..0000000 --- a/testsuite/README +++ /dev/null @@ -1,20 +0,0 @@ -PubSubHubbub Hub Test Suite 
---------------------------- - -This suite is intended to be used for hub compliancy. The test examples came -directly from the current 0.1 PubSubHubbub Core working draft. However, not -everything can be tested currently. This may change as the spec changes. - -This requires Ruby, RSpec and Mechanize, the later two can be installed with -Ruby Gems using "gem install rspec mechanize". RSpec is a great approach to -testing and Ruby allows for nice DSL-like testing. The interface is HTTP, so -it's language independent. - -Using the Test Suite: - -$ HUB_URL= spec hub_spec.rb --format specdoc - -Example: - -$ HUB_URL=http://localhost:8000 spec hub_spec.rb --format specdoc - diff --git a/testsuite/hub.rb b/testsuite/hub.rb deleted file mode 100644 index 54bf3c6..0000000 --- a/testsuite/hub.rb +++ /dev/null @@ -1,69 +0,0 @@ -require 'net/http' -require 'uri' - -require 'mechanize' - -class Hub - attr_reader :endpoint - - def initialize(endpoint) - @endpoint = URI.parse(endpoint) - @endpoint.path = '/' if @endpoint.path.empty? - - # This is for a hack to deal with non-auto running tasks on App Engine!? - @is_gae = Net::HTTP.get(@endpoint.host, '/_ah/admin/queues', @endpoint.port).include?('Google') - end - - def subscribe(callback, topic, verify, verify_token=nil) - post_as_subscriber('subscribe', callback, topic, verify, verify_token) - end - - def unsubscribe(callback, topic, verify, verify_token=nil) - post_as_subscriber('unsubscribe', callback, topic, verify, verify_token) - end - - def publish(url) - post_as_publisher('publish', url) - end - - def post_as_subscriber(mode, callback, topic, verify, verify_token=nil) - form_data = { - 'hub.mode' => mode, - 'hub.callback' => callback, - 'hub.topic' => topic, - } - form_data['hub.verify_token'] = verify_token if verify_token - if verify.is_a? String - form_data['hub.verify'] = verify - elsif verify.is_a? Array - # Part 1/2 of multivalue hack - verify.each_with_index do |v, i| - form_data["hub.verify--.#{i}"] = v - end - end - req = Net::HTTP::Post.new(@endpoint.path) - req.form_data = form_data - req.body = req.body.gsub(/\-\-\.\d/, '') # Part 2/2 of multivalue hack - Net::HTTP.new(@endpoint.host, @endpoint.port).start do |http| - http.request(req) - end - end - - def post_as_publisher(mode, url) - res = Net::HTTP.post_form(@endpoint, { - 'hub.mode' => mode, - 'hub.url' => url, - }) - run_feed_pull_task if @is_gae && res.kind_of?(Net::HTTPSuccess) - return res - end - - # In response to http://code.google.com/p/googleappengine/issues/detail?id=1796 - def run_feed_pull_task - page = WWW::Mechanize.new.get("http://#{@endpoint.host}:#{@endpoint.port}/_ah/admin/tasks?queue=feed-pulls") - payload = page.form_with(:action => '/work/pull_feeds')['payload'] rescue nil - return unless payload - Net::HTTP.start(@endpoint.host, @endpoint.port) {|http| http.request_post('/work/pull_feeds', payload, {'X-AppEngine-Development-Payload'=>'1'}) } - page.form_with(:action => '/_ah/admin/tasks').click_button # Delete the task - end -end \ No newline at end of file diff --git a/testsuite/hub_spec.rb b/testsuite/hub_spec.rb deleted file mode 100644 index cbae0fe..0000000 --- a/testsuite/hub_spec.rb +++ /dev/null @@ -1,137 +0,0 @@ -require 'hub' -require 'mocks' -require 'timeout' - -HUB_URL = ENV['HUB_URL'] -raise "Specify a hub URL by setting the HUB_URL environment variable." unless HUB_URL - -def wait_on(something) - begin - Timeout::timeout(3) { break unless something.nil? 
while true } - rescue Timeout::Error - nil - end -end - -def as_optional - # TODO: record as optional spec failure -end - -shared_examples_for "a hub with publisher and subscriber" do - before(:all) do - @hub = Hub.new(HUB_URL) - @publisher = Publisher.new(@hub) - @subscriber = Subscriber.new(@hub) - end - - after(:all) do - @publisher.stop - @subscriber.stop - end -end - -describe Hub, "publisher interface" do - it_should_behave_like "a hub with publisher and subscriber" - - it "accepts POST request for publish notifications" do - @hub.publish(@publisher.content_url).should be_a_kind_of(Net::HTTPSuccess) - end - - it "SHOULD arrange for a content fetch request after publish notification" # shouldn't it always? - - it "MUST return 204 No Content if publish notification was accepted" do - @hub.publish(@publisher.content_url).should be_an_instance_of(Net::HTTPNoContent) - end - - it "MUST return appropriate HTTP error response code if not accepted" do - @hub.post_as_publisher(nil, nil) .should be_a_kind_of(Net::HTTPClientError) - @hub.post_as_publisher('not publish', nil).should be_a_kind_of(Net::HTTPClientError) - end - - it "sends an HTTP GET request to the topic URL to fetch content" do - # Because GAE-PSH doesn't fetch content unless there are subscriptions, we subscribe - @hub.subscribe(@subscriber.callback_url, @publisher.content_url, 'sync', Subscriber::VERIFY_TOKEN) - - @publisher.last_request_method = nil - @hub.publish(@publisher.content_url) - sleep 1 - wait_on @publisher.last_request_method - @publisher.last_request_method.should == "GET" - end - - it "SHOULD include a header field X-Hub-Subscribers whose value is an integer in content fetch request" do - # Because GAE-PSH doesn't fetch content unless there are subscriptions, we subscribe - @hub.subscribe(@subscriber.callback_url, @publisher.content_url, 'sync', Subscriber::VERIFY_TOKEN) - - @publisher.last_headers = nil - @hub.publish(@publisher.content_url) - sleep 1 - wait_on @publisher.last_headers - @publisher.last_headers.should include("X-Hub-Subscribers") rescue as_optional - end - - -end - -describe Hub, "subscriber interface" do - it_should_behave_like "a hub with publisher and subscriber" - - it "accepts POST request for subscription requests" do - @hub.subscribe(@subscriber.callback_url, @publisher.content_url, 'sync', Subscriber::VERIFY_TOKEN).should be_a_kind_of(Net::HTTPSuccess) - end - - it "REQUIRES mode, callback, topic, and verify parameters in the subscription request" do - @hub.post_as_subscriber(nil, @subscriber.callback_url, @publisher.content_url, nil, Subscriber::VERIFY_TOKEN).should_not be_a_kind_of(Net::HTTPSuccess) - @hub.subscribe(nil, @publisher.content_url, 'sync', Subscriber::VERIFY_TOKEN).should_not be_a_kind_of(Net::HTTPSuccess) - @hub.subscribe(@subscriber.callback_url, nil, 'sync', Subscriber::VERIFY_TOKEN).should_not be_a_kind_of(Net::HTTPSuccess) - @hub.subscribe(@subscriber.callback_url, @publisher.content_url, nil, Subscriber::VERIFY_TOKEN).should_not be_a_kind_of(Net::HTTPSuccess) - end - - it "MUST ignore verify keywords it does not understand" do - @hub.subscribe(@subscriber.callback_url, @publisher.content_url, ['sync','foobar','async'], Subscriber::VERIFY_TOKEN).should be_a_kind_of(Net::HTTPSuccess) - end - - it "MUST return 204 No Content if subscription was created and verified" do - @hub.subscribe(@subscriber.callback_url, @publisher.content_url, 'sync', Subscriber::VERIFY_TOKEN).should be_a_kind_of(Net::HTTPNoContent) - end - - it "MUST return 202 Accepted if the subscription was 
created but has yet to be verified" do - @hub.subscribe(@subscriber.callback_url, @publisher.content_url, 'async', Subscriber::VERIFY_TOKEN).should be_a_kind_of(Net::HTTPAccepted) - end - - it "MUST return appropriate HTTP error response code in case of any error" do - @hub.post_as_subscriber(nil, nil, nil, nil) .should be_a_kind_of(Net::HTTPClientError) - @hub.post_as_subscriber('not subscribe', nil, nil, nil) .should be_a_kind_of(Net::HTTPClientError) - end - - it "SHOULD return a description of an error in the response body in plain text" - - it "MUST complete verification before returning a response in synchronous mode" - - it "must verify subscriber with a GET request to the callback URL" do - request_method = nil - @subscriber.onrequest = lambda {|req| request_method = req.request_method } - @hub.subscribe(@subscriber.callback_url, @publisher.content_url, 'sync', Subscriber::VERIFY_TOKEN) - wait_on request_method - request_method.should == "GET" - end - - it "is REQUIRED to include mode, topic and challenge query parameters in the verification request" do - query_string = nil - @subscriber.onrequest = lambda {|req| query_string = req.query_string } - @hub.subscribe(@subscriber.callback_url, @publisher.content_url, 'sync', Subscriber::VERIFY_TOKEN) - wait_on query_string - query_string.should include("hub.mode=") - query_string.should include("hub.topic=") - query_string.should include("hub.challenge=") - end - - it "expects an HTTP success code with the challenge parameter in the response body to verify subscription" - - it "expects subscriber to return 404 Not Found if they do not agree with the subscription" - - it "MUST consider other client and server response codes to mean subscription is not verified" - - it "SHOULD retry verification until a definite acknowledgement is received" - -end diff --git a/testsuite/mocks.rb b/testsuite/mocks.rb deleted file mode 100644 index f466a58..0000000 --- a/testsuite/mocks.rb +++ /dev/null @@ -1,132 +0,0 @@ -require 'webrick' - -class Subscriber - PORT = 8089 - VERIFY_TOKEN = 'qfwef9' - - attr_reader :callback_url - attr_accessor :onrequest - - def initialize(hub) - @hub = hub - @server = WEBrick::HTTPServer.new(:Port => PORT, :Logger => WEBrick::Log.new(nil, 0), :AccessLog => WEBrick::Log.new(nil, 0)) - @callback_url = "http://localhost:#{PORT}/callback" - @onrequest = lambda {|req|} - mount "/callback" do |req,res| - @onrequest.call(req) - res.status = 200 - if req.request_method == 'GET' - res.body = /hub.challenge=([^$|&]+)/.match(req.query_string)[1] - else - - end - end - @server_thread = Thread.new do - trap("INT"){ @server.shutdown } - @server.start - end - end - - def mount(path, &block) - @server.mount(path, WEBrick::HTTPServlet::ProcHandler.new(block)) - end - - def stop - @server.shutdown - @server_thread.join - end -end - - -class Publisher - PORT = 8088 - - attr_reader :content_url - attr_reader :content - attr_accessor :onrequest - - attr_accessor :last_request_method - attr_accessor :last_headers - - def initialize(hub) - @hub = hub - @server = WEBrick::HTTPServer.new(:Port => PORT, :Logger => WEBrick::Log.new(nil, 0), :AccessLog => WEBrick::Log.new(nil, 0)) - @content_url = "http://localhost:#{PORT}/happycats.xml" - @last_request_method = nil - @last_headers = nil - @onrequest = lambda {|req|} - @content =< - - - - - - 2008-08-11T02:15:01Z - - - - Heathcliff - - http://publisher.example.com/happycat25.xml - 2008-08-11T02:15:01Z - - What a happy cat. Full content goes here. 
- - - - - - Heathcliff - - http://publisher.example.com/happycat25.xml - 2008-08-11T02:15:01Z - - What a happy cat! - - - - - - Garfield - - http://publisher.example.com/happycat25.xml - 2008-08-11T02:15:01Z - - - - - Nermal - - http://publisher.example.com/happycat25.xml - 2008-07-10T12:28:13Z - - - -EOF - mount "/happycats.xml" do |req,res| - @onrequest.call(req) - @last_request_method = req.request_method - @last_headers = req.header - res.status = 200 - res['Content-Type'] = 'application/atom+xml' - res.body = @content - end - @server_thread = Thread.new do - trap("INT"){ @server.shutdown } - @server.start - end - end - - def mount(path, &block) - @server.mount(path, WEBrick::HTTPServlet::ProcHandler.new(block)) - end - - def stop - @server.shutdown - @server_thread.join - end -end
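
The PHP subscriber client, the Ruby test-suite driver (hub.rb) and the WEBrick mocks above all exercise the same subscriber-side flow: POST a subscription request with hub.mode, hub.callback, hub.topic, hub.verify, hub.verify_token and hub.lease_seconds, then answer the hub's verification GET by echoing hub.challenge, and finally accept content pushes on the callback URL. As a reading aid only, here is a minimal sketch of that flow in Python using the standard library. It is not part of this repository; the module layout, port numbers, URLs and token mirror the values used in testsuite/mocks.rb (ports 8088/8089, happycats.xml, qfwef9) and the hub URL from the test-suite README, all of which are assumptions for illustration.

# Hypothetical sketch of the subscriber-side PubSubHubbub flow; names,
# URLs and ports are illustrative and mirror the test-suite mocks above.
import threading
import urllib.parse
import urllib.request
from http.server import BaseHTTPRequestHandler, HTTPServer

HUB_URL = "http://localhost:8000/"              # assumed local hub, as in the README example
CALLBACK_URL = "http://localhost:8089/callback" # same port/path as the mock Subscriber
TOPIC_URL = "http://localhost:8088/happycats.xml"
VERIFY_TOKEN = "qfwef9"                         # token used by the mock Subscriber


class Callback(BaseHTTPRequestHandler):
    """Callback endpoint: verification GETs and content-delivery POSTs."""

    def do_GET(self):
        # Verification request: the hub sends hub.mode, hub.topic and
        # hub.challenge; the subscriber confirms by echoing the challenge.
        query = urllib.parse.parse_qs(urllib.parse.urlparse(self.path).query)
        challenge = query.get("hub.challenge", [""])[0]
        # A real subscriber should also check hub.mode, hub.topic and its
        # verify token before confirming.
        self.send_response(200)
        self.end_headers()
        self.wfile.write(challenge.encode())

    def do_POST(self):
        # Content delivery: the hub POSTs the updated feed body here.
        length = int(self.headers.get("Content-Length", 0))
        _body = self.rfile.read(length)
        self.send_response(200)
        self.end_headers()


def subscribe():
    """POST a subscription request, mirroring subscriber.php / hub.rb."""
    form = urllib.parse.urlencode({
        "hub.mode": "subscribe",
        "hub.callback": CALLBACK_URL,
        "hub.topic": TOPIC_URL,
        "hub.verify": "sync",
        "hub.verify_token": VERIFY_TOKEN,
        "hub.lease_seconds": "86400",
    }).encode()
    with urllib.request.urlopen(HUB_URL, data=form) as resp:
        # Per the spec text above: 204 = created and verified (sync mode),
        # 202 = accepted but verification still pending (async mode).
        return resp.status


if __name__ == "__main__":
    # The callback server must be running before subscribing in sync mode,
    # because the hub verifies during the subscription request.
    server = HTTPServer(("localhost", 8089), Callback)
    threading.Thread(target=server.serve_forever, daemon=True).start()
    print("subscription response:", subscribe())

The sketch uses only the standard library so it runs without extra dependencies; swapping urllib/http.server for a real web framework does not change the protocol steps shown.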