diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..8da6fd5
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,6 @@
+__authors__ = ['The Doctor ','haxwithaxe ']
+__license__ = "GPLv3"
+
+__help__ = "OH MOD! FIX ME DAMNIT!"
+
+__all__ = ['model', 'view', 'control', 'util']
diff --git a/control/__init__.py b/control/__init__.py
new file mode 100644
index 0000000..f476fb3
--- /dev/null
+++ b/control/__init__.py
@@ -0,0 +1,13 @@
+"""
+Control (MVC) module
+"""
+
+__authors__ = ["haxwithaxe "]
+
+__license__ = "GPLv3"
+
+__help__ = "OH MOD! FIX ME DAMNIT!"
+
+
+# FIXME : Add module names
+__all__ = []
diff --git a/control/db.py b/control/db.py
new file mode 100644
index 0000000..9ed7926
--- /dev/null
+++ b/control/db.py
@@ -0,0 +1,57 @@
+"""
+This is the back-end that provides the information data for the UI by
+gathering and collating findings.
+"""
+
+from ..model.db import DB
+
+
+class MissingDB(Exception):
+    """Raised when a required database is not found."""
+    pass
+
+
+# Define Classes
+class UDBI:
+    """Unified database interface"""
+
+    def __init__(self):
+        # Sources handed to model.db.DB by _open_all().
+        # NOTE(review): nothing in this patch fills this list yet.
+        self.sources = []
+        # DB objects that _open_all() managed to open.
+        self.db_list = []
+
+    def find(self, *sources, **criteria):
+        """ Find database entries from all sources `sources` (if specified otherwise find from any source) with the key-value style criteria specified by `criteria` otherwise find all entries from the specified source.
+        @param sources arguments specifying sources to query.
+        @param criteria keyword arguments giving criteria for the query
+        @return dict of results [empty dict if no results]"""
+        pass
+
+    def _db_exist(self, source):
+        """ See if a required database exists.
+        @throws MissingDB exception if a required db is not found
+        @returns bool
+        """
+        pass
+
+    def _open_all(self):
+        """ Open every source database that exists and keep it in self.db_list. """
+        for source in self.sources:
+            database = DB(source)
+            if database.exists():
+                database.open()
+                self.db_list.append(database)
+
+    def _end_search(self):
+        """ Close the search connections to the databases. """
+        pass
+
+    def _clean_up(self):
+        """ Clean up after ourselves. """
+        pass
+
+    def close(self):
+        """ Close the databases. """
+        pass
diff --git a/control/rss.py b/control/rss.py
new file mode 100644
index 0000000..abed0fe
--- /dev/null
+++ b/control/rss.py
@@ -0,0 +1,21 @@
+"""
+RSS Control (MVC) aka Downloader
+"""
+
+
+# not sure if this is still maintained but i recall it was easy to use
+import feedparser
+
+from .url import URL
+
+
+class RSSFeed(URL):
+    """Sub classes control.URL"""
+    def __init__(self, target, config):
+        pass
+
+    def parse(self):
+        pass
+
+    def dump(self):
+        pass
diff --git a/control/twitter.py b/control/twitter.py
new file mode 100644
index 0000000..0cc4624
--- /dev/null
+++ b/control/twitter.py
@@ -0,0 +1,92 @@
+"""This is the module that pulls and indexes a Twitter feed."""
+
+# control/twitter.py shares its name with the third-party "twitter" package;
+# without absolute imports Python 2 resolves "import twitter" to this module.
+from __future__ import absolute_import
+
+__authors__ = ["The Doctor ", "haxwithaxe "]
+
+__license__ = "GPLv3"
+
+import json
+import time
+
+import twitter
+
+from ..model.bot import Bot
+
+
+# Classes
+class TwitterBot(Bot):
+    """ Twitter scraper Bot.
+    One feed per Bot """
+
+    def __init__(self, feed_id, config_obj):
+        """ Give it the global Config object or else it gets the hose again ...
+        @param feed_id identifier of the feed; selects the "db-FEED_ID" option
+        @param config_obj the global Config object
+        """
+        # Bot.__init__ runs the pykka actor setup and stores self.conf.
+        super(TwitterBot, self).__init__(config_obj)
+        # NOTE(review): model/twitter.py is not added by this patch; the import
+        # is kept local so this module can still be imported without it.
+        from ..model.twitter import TwitterDB
+        self.db = TwitterDB(self.conf.get("twitter-bot", "db-%s" % feed_id))
+
+    def connect(self):
+        """ Attempt to connect to Twitter's API server. """
+        pass
+
+    def disconnect(self):
+        """ Disconnect from Twitter. """
+        pass
+
+    def init_db(self):
+        """ If we had to create the database then we have to load content into it by downloading the whole timeline in chunks and process the information. """
+        pass
+
+    def on_start(self):
+        """ Poll loop: update, sleep for the configured interval, reload the config. """
+        # Go into a loop in which we wait for X seconds and download new tweets
+        while True:
+            self._update()
+            # After catching up, send the Indexer a call to update.
+            # FIXME: add code to poke indexer
+            # Sleep
+            time.sleep(self.conf.get_int("twitter-bot", "wait-interval", 3600))
+            # Reload the config so config changes can be applied.
+            self.conf.reload()
+
+    def _update(self):
+        """ Download tweets posted since the last recorded tweet in the database. """
+        last_tweet = self._get_last_recorded()
+        # FIXME: tweets = download from last_tweet['datestamp-field-name'] to now
+        tweets = []
+        tweets = self._mangle(tweets)
+        self._record(tweets)
+
+    def _mangle(self, tweets):
+        """ Format tweets for passing to the database as key value pairs """
+        pass
+
+    def _record(self, tweets):
+        """ Send tweets to the database.
+        @param tweets iterable of keyword dicts; None or empty records nothing
+        """
+        # _mangle() is still a stub that returns None, so tolerate "no tweets".
+        for tweet in tweets or ():
+            self.db.set(**tweet)
+
+    def _get_last_recorded(self):
+        """ Get the datestamp of the last tweet added to the database. """
+        pass
+
+    def clean(self):
+        """Clean up after ourselves.
+        - Delete tempfiles.
+        """
+        pass
+
+    def release_db(self):
+        """Close the databases."""
+        self.db.close()
diff --git a/control/url.py b/control/url.py
new file mode 100644
index 0000000..a531c43
--- /dev/null
+++ b/control/url.py
@@ -0,0 +1,39 @@
+"""
+URL Control (MVC)
+"""
+
+__authors__ = ["haxwithaxe "]
+
+__license__ = "GPLv3"
+
+import urllib2
+
+
+class URL:
+    def __init__(self, target):
+        """ Store and validate the target.
+        @param target a string or urllib2.Request object
+        @throws ValueError if target is not acceptable (see validate_target)
+        """
+        self.target = target
+        self.validate_target() #die hard if we don't have what we need
+
+    def validate_target(self):
+        """ Fail early ... Fail often ... Fail before production
+        @throws ValueError if self.target is empty or is neither a string nor a urllib2.Request
+        """
+        # One isinstance call with a tuple of types: testing the two types
+        # separately and joining the tests with "or" rejects every value.
+        if not self.target or not isinstance(self.target, (str, urllib2.Request)):
+            raise ValueError("target must be a URI string or a urllib2.Request object.")
+
+    def get(self, ignore_bad_cert=False):
+        """ Get page from target.
+        @param ignore_bad_cert bool, if true don't throw errors when ssl cert is invalid/selfsigned
+        @return the page retrieved or None if not found.
+        """
+        pass
+
+    def clean(self):
+        """ Clean up files and open sockets or objects. """
+        pass
diff --git a/exocortex.py b/exocortex.py
deleted file mode 100644
index e97d063..0000000
--- a/exocortex.py
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/usr/bin/env python2
-
-# This is the front-end that provides the user interface for the information
-# gathering and analysis agents.
-# By: The Doctor
-
-# License: GPLv3
-
-# Modules
-import cherrypy
-from mako.lookup import TemplateLookup
-import xapian
-
-# Global variables
-
-# Classes
-
-# Helper methods
-
-# Core code
-# Read configuration file.
-
-# See if a database for the Twitter feed exists. If not, create it with the
-# proper schema.
-# For every directory in database_directory...
-# xapian create-or-open directory
-
-# Schema:
-# - @username (relevant for retweets)
-# - datestamp
-# - permalink
-# - body
-# - list of hashtags
-# - URLs found in body
-# - Need to figure out how to determine whether or not to run them through
-# a URL expander (goo.gl, t.co, ow.ly, et cetera).
-
-# Start up the user interface because it runs in the background in a separate
-# thread.A
-
-# User interface allows the user to search based upon any or all of the fields
-# in the schema. Not sure how to do this year - drop-downs? Multiple
-# fields, each of which allows Boolean operations?
-# Xapian has the ability to search multiple databases simultaneously, so for
-# every database found on startup, open a connection.
-# Execute a search on the Xapian database inside a try..except block to catch
-# NOTFOUND cases.
-# Count the number of results.
-# Sort the results in descending order based upon timestamps.
-# Generate a page of X results (with Y not shown yet) and display them on the
-# result page.
-# If the first result shown != 0, add a <- Previous link to earlier results.
-# If there are more results in the list, add a Next-> link.
-# Close the search connections to the databases.
-
-# Clean up after ourselves.
-# Close the databases.
-# Delete tempfiles.
-
-# Fin.
-
diff --git a/model/__init__.py b/model/__init__.py
new file mode 100644
index 0000000..5a8a860
--- /dev/null
+++ b/model/__init__.py
@@ -0,0 +1,13 @@
+"""
+Model (MVC) module
+"""
+
+__authors__ = ["haxwithaxe "]
+
+__license__ = "GPLv3"
+
+__help__ = "OH MOD! FIX ME DAMNIT!"
+
+
+# FIXME : Add module names
+__all__ = []
diff --git a/model/bot.py b/model/bot.py
new file mode 100644
index 0000000..4a6258b
--- /dev/null
+++ b/model/bot.py
@@ -0,0 +1,65 @@
+"""
+Bot model
+"""
+
+__authors__ = ["haxwithaxe "]
+
+__license__ = "GPLv3"
+
+import logging
+
+import pykka
+from ..util.actor import *
+
+CONNECT = Action("connect")
+DISCONNECT = Action("disconnect")
+CLEAN = Action("clean")
+
+class Bot(pykka.ThreadingActor):
+    def __init__(self, config):
+        """ config is a model.Config object already loaded"""
+        super(Bot, self).__init__()
+        self.conf = config
+
+    def on_start(self):
+        """ Hook for doing any setup that should be done after the actor is started, but before it starts processing messages. AKA the main loop """
+        pass
+
+    def on_stop(self):
+        """ Hook for doing any cleanup that should be done after the actor has processed the last message, and before the actor stops. """
+        self.disconnect()
+        self.clean()
+
+    def on_failure(self, exception_type, exception_value, traceback):
+        """ Hook for doing any cleanup after an unhandled exception is raised, and before the actor stops. """
+        # The first positional argument of logging.error is the format string,
+        # so the exception triple goes in through exc_info instead.
+        logging.error("Unhandled exception in %s", self.__class__.__name__,
+                      exc_info=(exception_type, exception_value, traceback))
+
+    def on_receive(self, message):
+        """ When a message is received decide what to do with it
+        @param message (picklable dict) - the message to handle
+        @return anything that should be sent as a reply to the sender
+        """
+        if message == DISCONNECT:
+            self.disconnect()
+        elif message == CONNECT:
+            self.connect()
+        elif message == CLEAN:
+            self.clean()
+        # NOTE(review): the fallback branch was left unfinished ("return self._");
+        # unknown messages are ignored and no reply is sent until that is decided.
+        return None
+
+    def connect(self):
+        """ Connect to target. """
+        pass
+
+    def disconnect(self):
+        """ Disconnect from target. """
+        pass
+
+    def clean(self):
+        """ Clean up files and open sockets or objects. """
+        pass
diff --git a/model/db.py b/model/db.py
new file mode 100644
index 0000000..41303dc
--- /dev/null
+++ b/model/db.py
@@ -0,0 +1,56 @@
+class Record:
+    """ Database Record model"""
+    default_values = {}
+
+    def __init__(self, **values):
+        self.values = values or {}
+
+    def get(self):
+        """ Get the contents of the record """
+        self._load_defaults()
+        return self.values
+
+    def _load_defaults(self):
+        """Load the default values so we don't pass any empty fields we don't want to be empty to the database backend"""
+        # shallow copy the default values so we don't mangle it too much
+        defaults = self.default_values.copy()
+        # load the given values over the defaults
+        defaults.update(self.values)
+        # set values to be the updated dictionary
+        self.values = defaults
+
+
+class DB:
+    """ Database model """
+
+    def __init__(self, source):
+        """ Set the database source """
+        pass
+
+    def open(self):
+        """ Open and/or load the database object. """
+        pass
+
+    def get(self, **criteria):
+        """ Find and return the Records that match the criteria. """
+        pass
+
+    def set(self, **records):
+        """ Set the values for the matching Records or create one if there is no match. """
+        pass
+
+    def close(self):
+        """ Close the DB object if required """
+        pass
+
+    def compact(self):
+        """For every database in database_directory, instantiate a copy of Compactor() and compact the database to free up disk space and make searches more efficient."""
+        pass
+
+    def _file_exists(self):
+        """ Does the database file exist? """
+        pass
+
+    def exists(self):
+        """ Does the database file and/or table exist """
+        pass
diff --git a/model/decision.py b/model/decision.py
new file mode 100644
index 0000000..17a2eef
--- /dev/null
+++ b/model/decision.py
@@ -0,0 +1,30 @@
+"""
+Decision making code interface
+"""
+
+from ..util.arg_helper import handle_kwargs
+
+
+class Decision:
+    default_init_kwargs = {}
+
+    def __init__(self, **kwargs):
+        """ Build a decision from keyword arguments.
+        @param kwargs must contain 'certianty'; 'input' and 'result' are optional
+        @throws KeyError if 'certianty' is neither given nor in default_init_kwargs
+        """
+        # handle_kwargs returns a merged copy; it does not modify kwargs in place.
+        kwargs = handle_kwargs(kwargs, self.default_init_kwargs)
+        self.certianty = kwargs['certianty']
+        # NOTE(review): both assignments were left without a right-hand side;
+        # reading the like-named keys is an assumption to confirm.
+        self.input = kwargs.get('input')
+        self.result = kwargs.get('result')
+
+
+class Decider:
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def ask(self, *args, **kwargs):
+        pass
diff --git a/model/indexer.py b/model/indexer.py
new file mode 100644
index 0000000..e69de29
diff --git a/model/mind.py b/model/mind.py
new file mode 100644
index 0000000..abcef8b
--- /dev/null
+++ b/model/mind.py
@@ -0,0 +1,6 @@
+""" Hive mind for bot groups """
+
+
+class Mind:
+    """ Placeholder for the hive mind shared by a group of bots. """
+    pass
diff --git a/model/rss.py b/model/rss.py
new file mode 100644
index 0000000..e69de29
diff --git a/model/spider.py b/model/spider.py
new file mode 100644
index 0000000..46d6eb5
--- /dev/null
+++ b/model/spider.py
@@ -0,0 +1,62 @@
+"""
+Web Spider model (MVC)
+"""
+
+__authors__ = ["haxwithaxe "]
+
+__license__ = "GPLv3"
+
+from .bot import *
+# NOTE(review): this imported "actor_utils", a module this patch does not add;
+# util/actor.py looks like the intended target - confirm.
+from ..util.actor import *
+
+TARGET_KEY = "target"
+
+class WebSpider(Bot):
+    """ Web Spider model
+    Crawl a page and decide whether to crawl further based on what is found """
+
+    def __init__(self, target, mind, config):
+        """ config is a model.Config object already loaded
+        @param target site to crawl
+        @param mind spider group hive mind
+        @param config model.Config object"""
+        super(WebSpider, self).__init__(config)
+        self.target = target
+        self.mind = mind
+
+    def on_stop(self):
+        """ Hook for doing any cleanup that should be done after the actor has processed the last message, and before the actor stops. """
+        self.disconnect()
+        self.clean()
+
+    def on_receive(self, message):
+        """ When a message is received decide what to do with it
+        @param message (picklable dict) - the message to handle
+        @return anything that should be sent as a reply to the sender
+        """
+        if TARGET_KEY in message:
+            self.target = message[TARGET_KEY]
+        else:
+            return super(WebSpider, self).on_receive(message)
+
+    def clean(self):
+        """ Clean up files and open sockets or objects. """
+        pass
+
+    def parse(self, page):
+        """ Do stuff to parse the page for data and return that data """
+        pass
+
+    def dump(self, data):
+        """ stick the data someplace """
+        pass
+
+    def download(self, path):
+        """ download the uri made of self.target+path """
+        pass
+
+    def decide(self, data):
+        """ decide what to do based on input """
+        pass
diff --git a/twitter.py b/twitter.py
deleted file mode 100644
index 49da06d..0000000
--- a/twitter.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/bin/env python2
-
-# This is the module that pulls and indexes Twitter feeds.
-# By: The Doctor
-
-# License: GPLv3
-
-# TODO:
-# - Make it possible to add, index, and follow arbitrary numbers of Twitter
-# feeds. One database per feed? How far can Xapian scale?
-
-# Modules
-import json
-
-# Global variables.
-database_directory = ''
-
-# Classes
-
-# Helper methods
-
-# Core code
-
-# Attempt to connect to Twitter's API server. If we can't, stop trying but
-# make it possible for the user to search already indexed content.
-
-# For every database in database_directory, instantiate a copy of Compactor()
-# and compact the database to free up disk space and make searches more
-# efficient.
-
-# If we had to create the database then we have to load content into it by
-# downloading the whole timeline in chunks and process the information.
-
-# Go into a loop in which we wait for X minutes or Y seconds and download
-# tweets posted since the last time the database was updated.
-# For every Twitter feed configured for this instance,
-# Get the datestamp of the last tweet added to the database.
-# Pull all of the tweets posted since the latest one in the database was
-# added.
-# For each tweet, dissect it to extract the content and add it to the Xapian
-# database.
-# After catching up, call the Xapian indexer.
-# Go to sleep.
-
-# Clean up after ourselves.
-# Close the databases.
-# Delete tempfiles.
-
-# Fin.
-
diff --git a/util/__init__.py b/util/__init__.py
new file mode 100644
index 0000000..cc5223c
--- /dev/null
+++ b/util/__init__.py
@@ -0,0 +1,13 @@
+"""
+Utilities module
+"""
+
+__authors__ = ["haxwithaxe "]
+
+__license__ = "GPLv3"
+
+__help__ = "OH MOD! FIX ME DAMNIT!"
+
+
+# FIXME : Add module names
+__all__ = []
diff --git a/util/actor.py b/util/actor.py
new file mode 100644
index 0000000..9e52d71
--- /dev/null
+++ b/util/actor.py
@@ -0,0 +1,34 @@
+"""
+Actor Class Utilities
+"""
+
+__authors__ = ["haxwithaxe "]
+
+__license__ = "GPLv3"
+
+import pykka
+
+class Message(object):
+    """ A message payload: keyword values plus the positional words stored under `key`. """
+    key = 'message'
+
+    def __init__(self, *args, **values):
+        self.value = values
+        # Join the arguments themselves; joining str(args) would put a space
+        # between every character of the tuple's repr.
+        self.value[self.key] = ' '.join(str(arg) for arg in args)
+
+    def __eq__(self, other):
+        """ Equal to another Message or to a plain dict carrying the same values. """
+        if isinstance(other, Message):
+            other = other.value
+        return self.value == other
+
+    def __ne__(self, other):
+        return not self == other
+
+    # Instances wrap a mutable dict, so they are deliberately unhashable.
+    __hash__ = None
+
+class Action(Message):
+    key = 'action'
diff --git a/util/arg_helper.py b/util/arg_helper.py
new file mode 100644
index 0000000..8ed7082
--- /dev/null
+++ b/util/arg_helper.py
@@ -0,0 +1,10 @@
+"""
+functions to help handle *args and **kwargs
+"""
+
+def handle_kwargs(kwargs, defaults):
+    """ make sure that kwargs has all the values we need so we don't need to check as we go """
+    defaults_copy = defaults.copy()
+    defaults_copy.update(kwargs)
+    return defaults_copy
+
diff --git a/view/__init__.py b/view/__init__.py
new file mode 100644
index 0000000..48dfe3e
--- /dev/null
+++ b/view/__init__.py
@@ -0,0 +1,13 @@
+"""
+View (MVC) module
+"""
+
+__authors__ = ["haxwithaxe "]
+
+__license__ = "GPLv3"
+
+__help__ = "OH MOD! FIX ME DAMNIT!"
+
+
+# FIXME : Add module names
+__all__ = []