Increased Unicode compatibility and fixed bulk upload timeout

buske · buske · commit 5d4992872010 · 2016-04-14T06:12:12.000-04:00
diff --git a/.travis.yml b/.travis.yml
@@ -2,7 +2,6 @@
 #   http://lint.travis-ci.org/
 language: python
 python:
-  - "2.6"
   - "2.7"
   - "3.2"
   - "3.3"
diff --git a/datastore.py b/datastore.py
@@ -11,6 +11,7 @@
 
 import json
 import logging
+import codecs
 
 from elasticsearch import Elasticsearch
 from parsers import OBOParser, GeneParser
@@ -57,7 +58,7 @@ def index(self, filename):
         """Populate the database with patient data from the given file"""
         from models import Patient
 
-        with open(filename) as ifp:
+        with codecs.open(filename, encoding='utf-8') as ifp:
             data = json.load(ifp)
 
         for record in data:
@@ -181,7 +182,7 @@ def index(self, index, filename, Parser):
             commands.extend(command)
 
         data = "".join([json.dumps(command) + "\n" for command in commands])
-        self._db.bulk(data, index=index, doc_type=self.TERM_TYPE_NAME, refresh=True)
+        self._db.bulk(data, index=index, doc_type=self.TERM_TYPE_NAME, refresh=True, request_timeout=60)
 
         n = self._db.count(index=index, doc_type=self.TERM_TYPE_NAME)
         logger.info('Index now contains {} terms'.format(n['count']))
diff --git a/obo.py b/obo.py
@@ -38,9 +38,6 @@
 except ImportError:
     from cStringIO import StringIO
 
-if sys.version_info >= (3,):
-    unicode = str
-
 
 class ParseError(Exception):
     pass
@@ -215,7 +212,7 @@ def _parse_line(self, line):
         # If the value starts with a quotation mark, we parse it as a
         # Python string -- luckily this is the same as an OBO string
         if value_and_mod and value_and_mod[0] == '"':
-            stringio = StringIO(unicode(value_and_mod))
+            stringio = StringIO(value_and_mod)
             gen = tokenize.generate_tokens(stringio.readline)
             for toknum, tokval, _, (_, ecol), _ in gen:
                 if toknum == tokenize.STRING:
diff --git a/parsers.py b/parsers.py
@@ -4,6 +4,8 @@
 
 from __future__ import with_statement, division, unicode_literals
 
+import codecs
+
 from csv import DictReader
 from collections import defaultdict
 
@@ -23,7 +25,7 @@ def __iter__(self):
 
 class OBOParser(BaseParser):
     def documents(self):
-        parser = BaseOBOParser(open(self._filename))
+        parser = BaseOBOParser(codecs.open(self._filename, encoding='utf-8'))
 
         # Parse all terms first
         terms = {}
@@ -69,7 +71,7 @@ def get_ancestors(node_id, ancestors=None):
 
 class TSVParser(BaseParser):
     def _documents(self, columns):
-        with open(self._filename) as ifp:
+        with codecs.open(self._filename, encoding='utf-8') as ifp:
             reader = DictReader(ifp, delimiter=str('\t'))
             for row in reader:
                 term = defaultdict(list)