-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsgas-db-migrate
More file actions
executable file
·123 lines (91 loc) · 3.55 KB
/
sgas-db-migrate
File metadata and controls
executable file
·123 lines (91 loc) · 3.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python
# SGAS CouchDB migration script
#
# This script will read all documents/records from one database, change the
# _id field to a smaller one (created by a hash of the record_id value) and
# insert the changed records into a new database. No changes will be made to
# the original database.
#
# Please do not try and have the same source and target database.
#
# usage: sgas-db-migrate http://localhost:5984/oldcollection http://localhost:5984/newcollection
# (make sure the new database is created before starting)
import sys
import hashlib
from itertools import product, chain, starmap
from twisted.internet import reactor, defer
from sgasclient import couchdb, baseconvert
# Lower-case hexadecimal digits; main() combines them into three-character
# prefixes that serve as start keys when paging through the source database.
HEX_ALPHABET = '0123456789abcdef'
# copied from sgas.server.usagerecord
# new-style hash, produces smaller/faster b-trees in couchdb
def _create12byteb62hash(record_id):
    """Return a short base-62 identifier derived from *record_id*.

    The SHA-1 digest of record_id is interpreted as an integer, reduced
    modulo 62**12 and handed to baseconvert.base10to62. Assuming that helper
    is a plain base-62 encoder, the result is at most 12 characters long.
    """
    digest_as_int = int(hashlib.sha1(record_id).hexdigest(), 16)
    # Keep only as much of the digest as fits in twelve base-62 digits.
    return baseconvert.base10to62(digest_as_int % (62 ** 12))
def convertRecord(record):
    """Give *record* its new, shorter _id and mark it as converted.

    The record is modified in place: '_id' is replaced by the hash of its
    'record_id' value and 'convert_version' is set to 4. The same record
    object is returned for convenience.
    """
    record.update(
        _id=_create12byteb62hash(record['record_id']),
        convert_version=4,
    )
    return record
def _keyRanges():
    """Yield the (startkey, endkey) pairs used to page through the source database.

    One pair is produced per three-character hexadecimal prefix ('000' to
    'fff'), followed by a pair starting at 'g' and a pair starting at '_'
    (presumably the latter is what picks up the '_design' documents).
    """
    # NOTE(review): in a Python 2 byte-string literal '\u9999' is NOT an
    # escape sequence; it is the six characters backslash, 'u', '9', '9',
    # '9', '9'. The literal is kept exactly as it was - confirm that this is
    # the intended end-of-range sentinel for the database client.
    for prefix in product(HEX_ALPHABET, repeat=3):
        key = ''.join(prefix)
        yield key, key + '\u9999'
    yield 'g', '\u9999'
    yield '_', '_\u9999'


@defer.inlineCallbacks
def main():
    """Copy all documents from the source database into the target database.

    The source and target database URLs are taken from sys.argv[1] and
    sys.argv[2]. Regular documents are given a new _id by convertRecord and
    bulk-inserted into the target; design documents are collected and created
    under their original id after all key ranges have been processed. The
    source database is only read from.

    The reactor is always stopped when this function finishes. Usage errors
    are reported on stderr, and any other failure is reported with a
    traceback, instead of leaving the reactor running forever.
    """
    import traceback  # local import: only needed for error reporting

    try:
        # Raising SystemExit here would merely be captured by inlineCallbacks
        # into the returned Deferred, so report usage errors directly.
        if len(sys.argv) < 3:
            sys.stderr.write("Not enough arguments\n")
            return
        if sys.argv[1] == sys.argv[2]:
            sys.stderr.write("Can't have identical source and target databases\n")
            return

        source_db_url = sys.argv[1]
        target_db_url = sys.argv[2]
        print("Source: %s" % (source_db_url,))
        print("Target: %s" % (target_db_url,))

        source_db = couchdb.Database(source_db_url)
        target_db = couchdb.Database(target_db_url)

        total_rows = None
        total_converted = 0
        design_docs = {}
        pending_insert = None

        for startkey, endkey in _keyRanges():
            docs = yield source_db.retrieveDocuments(startkey=startkey, endkey=endkey)
            if total_rows is None:
                total_rows = docs['total_rows']
                print("Number of rows to migrate: %s" % (total_rows,))

            rows = docs['rows']
            # Progress indicator: number of rows in each key range.
            sys.stdout.write(str(len(rows)) + ',')
            sys.stdout.flush()

            converted_records = []
            for row in rows:
                doc = row['doc']
                if row['id'].startswith('_design'):  # design documents
                    del doc['_rev']  # otherwise we'll get an update conflict
                    design_docs[row['id']] = doc
                else:  # regular documents
                    converted_records.append(convertRecord(doc))
                    total_converted += 1

            # Wait for the previous bulk insert only now, so that it overlaps
            # with the retrieval of the current key range.
            if pending_insert is not None:
                yield pending_insert
            pending_insert = target_db.insertDocuments(converted_records)

        # The insert started in the last iteration must also complete (and
        # have its errors surfaced) before the reactor is stopped.
        if pending_insert is not None:
            yield pending_insert

        print("")  # newline after stdout writers
        print("Creating design documents (%i)" % len(design_docs))
        for idname, ddoc in design_docs.items():
            yield target_db.createDocument(ddoc, doc_id=str(idname))
            total_converted += 1
        print("Documents converted %s" % (total_converted,))
    except Exception:
        # Without this the failure would sit in an unhandled Deferred.
        sys.stderr.write("\nMigration failed:\n")
        traceback.print_exc()
    finally:
        reactor.stop()
if __name__ == "__main__":
    # Schedule main() to run as soon as the reactor is up, then enter the
    # event loop; reactor.run() returns once the reactor has been stopped.
    reactor.callWhenRunning(main)
    reactor.run()