luts3-client/sgas-db-migrate at master · sgas/luts3-client · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python

# SGAS CouchDB migration script
#
# This script reads all documents/records from one database, changes the _id
# field to a smaller one (created from a hash of the record_id value) and
# inserts the changed records into a new database. No changes are made to the
# original database.
#
# Do not use the same database as both source and target.
#
# usage: sgas-db-migrate http://localhost:5984/oldcollection http://localhost:5984/newcollection
# (make sure the new database is created before starting)


import sys
import hashlib
from itertools import product, chain, starmap

from twisted.internet import reactor, defer

from sgasclient import couchdb, baseconvert


# Lowercase hexadecimal digits. main() combines them into every
# three-character prefix to split the source database into key ranges.
HEX_ALPHABET = '0123456789abcdef'


# copied from sgas.server.usagerecord
# new-style hash, produces smaller/faster b-trees in couchdb
def _create12byteb62hash(record_id):
    """Return a short base-62 document id derived from *record_id*.

    The id is the SHA-1 digest of the record id, read as an integer, reduced
    modulo 62**12 and rendered with ``baseconvert.base10to62``.

    Text (unicode) record ids are encoded as UTF-8 before hashing; anything
    else is handed to ``hashlib.sha1`` unchanged.  For ASCII-only ids this
    gives the same digest as the implicit conversion did, while ids with
    non-ASCII characters no longer raise ``UnicodeEncodeError``.

    NOTE(review): this is a copy of the function in sgas.server.usagerecord;
    confirm that the server hashes non-ASCII record ids as UTF-8 as well.
    """
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3, i.e. the
    # text type that hashlib cannot digest without an explicit encoding.
    if isinstance(record_id, type(u'')):
        record_id = record_id.encode('utf-8')

    sha_160 = int(hashlib.sha1(record_id).hexdigest(), 16)
    # values below 62**12 need at most 12 base-62 digits
    b62_12byte_max_length = 62 ** 12
    return baseconvert.base10to62(sha_160 % b62_12byte_max_length)


def convertRecord(record):
    """Re-key *record* for the new database and return it.

    The record is modified in place: ``_id`` is replaced by the short hash of
    its ``record_id`` value and ``convert_version`` is set to 4.
    """
    short_id = _create12byteb62hash(record['record_id'])
    record.update({'_id': short_id, 'convert_version': 4})
    return record


@defer.inlineCallbacks
def main():
    """Copy every document from the source database into the target database.

    The source and target database URLs are read from ``sys.argv[1]`` and
    ``sys.argv[2]``.  Regular documents are passed through ``convertRecord``
    and inserted into the target in batches, one batch per key range.  Design
    documents are collected along the way and created in the target under
    their original ids after all key ranges have been processed.

    Once the migration has started, the reactor is stopped when it ends,
    whether it succeeded or failed; a failure still propagates through the
    returned Deferred.

    Raises SystemExit (after stopping the reactor) if fewer than two
    arguments are given or if the source and target URLs are identical.
    """
    if len(sys.argv) < 3:
        reactor.stop()
        raise SystemExit("Not enough arguments")

    if sys.argv[1] == sys.argv[2]:
        reactor.stop()
        raise SystemExit("Can't have identical source and target databases")

    source_db_url = sys.argv[1]
    target_db_url = sys.argv[2]

    sys.stdout.write("Source: %s\n" % (source_db_url,))
    sys.stdout.write("Target: %s\n" % (target_db_url,))

    # Everything below runs under try/finally so that an error (failed
    # request, malformed record, ...) stops the reactor instead of leaving
    # the script hanging forever.
    try:
        source_db = couchdb.Database(source_db_url)
        target_db = couchdb.Database(target_db_url)

        # Build a (startkey, endkey) pair from the characters of a prefix.
        # NOTE(review): in a Python 2 byte-string literal '\u9999' is the six
        # characters backslash-u-9-9-9-9, not the character U+9999.
        # Presumably the couchdb helper forwards it so that CouchDB decodes
        # it as a JSON escape -- confirm before changing these literals.
        def ejoin(*args):
            key = ''.join(args)
            return key, key + '\u9999'

        # Key ranges: every three-character hex prefix, then everything from
        # 'g' upwards, then ids starting with an underscore.
        hex_iterator = starmap(ejoin, product(HEX_ALPHABET, repeat=3))
        remainder_iterator = [('g', '\u9999')]
        underscore_iterator = [('_', '_\u9999')]
        key_iterator = chain(hex_iterator, remainder_iterator, underscore_iterator)

        total_rows = None
        total_converted = 0
        design_docs = {}
        insert_def = None  # result of the most recent insertDocuments() call

        for startkey, endkey in key_iterator:
            converted_records = []
            docs = yield source_db.retrieveDocuments(startkey=startkey, endkey=endkey)
            if total_rows is None:
                total_rows = docs['total_rows']
                sys.stdout.write("Number of rows to migrate: %s\n" % (total_rows,))

            # progress output: number of rows in each key range
            n_rows = len(docs['rows'])
            sys.stdout.write(str(n_rows) + ',')
            sys.stdout.flush()

            for row in docs['rows']:
                doc = row['doc']
                if row['id'].startswith('_design'):  # design documents
                    del doc['_rev']  # otherwise we'll get an update conflict
                    design_docs[row['id']] = doc
                else:  # regular documents
                    converted_records.append(convertRecord(doc))
                    total_converted += 1

            # Wait for the previous batch only now, after the next batch has
            # been fetched, so at most one insert is outstanding at a time.
            if insert_def is not None:
                yield insert_def
            insert_def = target_db.insertDocuments(converted_records)

        # The insert started in the last iteration has not been waited for
        # yet; do so before continuing so its outcome is not lost.
        if insert_def is not None:
            yield insert_def

        sys.stdout.write("\n")  # newline after stdout writers
        sys.stdout.write("Creating design documents (%i)\n" % len(design_docs))
        for idname, ddoc in design_docs.items():
            yield target_db.createDocument(ddoc, doc_id=str(idname))
            total_converted += 1

        sys.stdout.write("Documents converted %s\n" % (total_converted,))
    finally:
        reactor.stop()


# Script entry point: schedule main() to be called once the reactor is
# running, then start the reactor.
if __name__ == '__main__':
    reactor.callWhenRunning(main)
    reactor.run()