Skip to content

Commit 5d49928

Browse files
committed
Increased Unicode compatibility and fixed bulk upload timeout
1 parent: 84ffea6 · commit: 5d49928

File tree

4 files changed: 8 additions (+8) and 9 deletions (-9)

4 files changed

+8
-9
lines changed

.travis.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
2 2    # http://lint.travis-ci.org/
3 3    language: python
4 4    python:
5   -  - "2.6"
6 5    - "2.7"
7 6    - "3.2"
8 7    - "3.3"

datastore.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
import json
1313
import logging
14+
import codecs
1415

1516
from elasticsearch import Elasticsearch
1617
from parsers import OBOParser, GeneParser
@@ -57,7 +58,7 @@ def index(self, filename):
5758
"""Populate the database with patient data from the given file"""
5859
from models import Patient
5960

60-
with open(filename) as ifp:
61+
with codecs.open(filename, encoding='utf-8') as ifp:
6162
data = json.load(ifp)
6263

6364
for record in data:
@@ -181,7 +182,7 @@ def index(self, index, filename, Parser):
181182
commands.extend(command)
182183

183184
data = "".join([json.dumps(command) + "\n" for command in commands])
184-
self._db.bulk(data, index=index, doc_type=self.TERM_TYPE_NAME, refresh=True)
185+
self._db.bulk(data, index=index, doc_type=self.TERM_TYPE_NAME, refresh=True, request_timeout=60)
185186

186187
n = self._db.count(index=index, doc_type=self.TERM_TYPE_NAME)
187188
logger.info('Index now contains {} terms'.format(n['count']))

obo.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,6 @@
3838
except ImportError:
3939
from cStringIO import StringIO
4040

41-
if sys.version_info >= (3,):
42-
unicode = str
43-
4441

4542
class ParseError(Exception):
4643
pass
@@ -215,7 +212,7 @@ def _parse_line(self, line):
215212
# If the value starts with a quotation mark, we parse it as a
216213
# Python string -- luckily this is the same as an OBO string
217214
if value_and_mod and value_and_mod[0] == '"':
218-
stringio = StringIO(unicode(value_and_mod))
215+
stringio = StringIO(value_and_mod)
219216
gen = tokenize.generate_tokens(stringio.readline)
220217
for toknum, tokval, _, (_, ecol), _ in gen:
221218
if toknum == tokenize.STRING:

parsers.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
from __future__ import with_statement, division, unicode_literals
66

7+
import codecs
8+
79
from csv import DictReader
810
from collections import defaultdict
911

@@ -23,7 +25,7 @@ def __iter__(self):
2325

2426
class OBOParser(BaseParser):
2527
def documents(self):
26-
parser = BaseOBOParser(open(self._filename))
28+
parser = BaseOBOParser(codecs.open(self._filename, encoding='utf-8'))
2729

2830
# Parse all terms first
2931
terms = {}
@@ -69,7 +71,7 @@ def get_ancestors(node_id, ancestors=None):
6971

7072
class TSVParser(BaseParser):
7173
def _documents(self, columns):
72-
with open(self._filename) as ifp:
74+
with codecs.open(self._filename, encoding='utf-8') as ifp:
7375
reader = DictReader(ifp, delimiter=str('\t'))
7476
for row in reader:
7577
term = defaultdict(list)

0 commit comments

Comments
 (0)