forked from InMobi/docker-hive
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdatagen.py
More file actions
104 lines (96 loc) · 2.61 KB
/
datagen.py
File metadata and controls
104 lines (96 loc) · 2.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import getopt, sys
import random
import string
def _usage():
    """Print a one-line synopsis of the supported command-line options."""
    print("usage: %s -c NCOLUMNS -r NROWS [-h|--help]" % sys.argv[0])


def _parse_count(flag, text):
    """Return ``text`` as an int, or report the bad value and exit with 2."""
    try:
        return int(text)
    except ValueError:
        print("option %s requires an integer argument, got %r" % (flag, text))
        _usage()
        sys.exit(2)


def _pick_column_types(ncolumns):
    """Return a list of ``ncolumns`` randomly chosen Hive column types."""
    # Each value is a cumulative upper bound: a draw in [0, 1) selects the
    # first type (in ascending bound order) whose bound it does not exceed.
    thresholds = {
        "string": 0.05,
        "varchar(64)": 0.25,
        "char(4)": 0.4,
        "boolean": 0.45,
        "decimal(16, 10)": 0.55,
        "float": 0.7,
        "int": 1,
    }
    ordered = sorted(thresholds, key=thresholds.get)
    columns = []
    for _ in range(ncolumns):
        rand = random.random()
        for name in ordered:
            if rand <= thresholds[name]:
                columns.append(name)
                break
    return columns


def _write_ddl(basename, columns):
    """Write ``<basename>.ddl``: table creation, CSV load and ORC copy."""
    colstrings = ["\tc%d %s" % (i, column)
                  for i, column in enumerate(columns, 1)]
    with open("%s.ddl" % basename, "w") as fd:
        fd.write("drop table if exists test_txt;\n")
        fd.write("drop table if exists test_orc;\n")
        fd.write("create table test_txt(\n")
        fd.write(",\n".join(colstrings))
        fd.write("\n);\n")
        fd.write("load data local inpath '%s.csv' overwrite into table test_txt;\n" % basename)
        fd.write("create table test_orc like test_txt;\n")
        fd.write("alter table test_orc set fileformat orc;\n")
        fd.write("insert into table test_orc select * from test_txt;\n")


def _write_csv(basename, columns, nrows):
    """Write ``<basename>.csv`` with ``nrows`` comma-separated random rows."""
    # The ``with`` block guarantees the data file is flushed and closed.
    with open("%s.csv" % basename, "w") as fd:
        for _ in range(nrows):
            fd.write(",".join([getRandomStuff(column) for column in columns]))
            fd.write("\n")


def main():
    """Generate a random Hive test table definition and matching CSV data.

    Command-line options (read from ``sys.argv``):
        -c N        number of columns to generate (required)
        -r N        number of data rows to generate (required)
        -h, --help  print usage and exit

    Two files are written to the current directory, both named after the
    column count: ``<N>_data.ddl`` and ``<N>_data.csv``. The random generator
    is seeded with a constant before any value is drawn, so the same
    arguments reproduce the same files.

    Exits with status 2 on an unknown option, a non-integer count, a missing
    ``-c``/``-r``, or an option that is parsed but not handled.
    """
    try:
        opts, _ = getopt.getopt(sys.argv[1:], "hc:r:", ["help", "output="])
    except getopt.GetoptError as err:
        # Prints something like "option -a not recognized".
        print(err)
        _usage()
        sys.exit(2)

    ncolumns = None
    nrows = None
    for o, a in opts:
        if o == "-c":
            ncolumns = _parse_count(o, a)
        elif o == "-r":
            nrows = _parse_count(o, a)
        elif o in ("-h", "--help"):
            _usage()
            sys.exit()
        else:
            # "--output" is accepted by the parser but nothing implements it;
            # fail explicitly rather than via an assert that -O would strip.
            print("unhandled option %s" % o)
            _usage()
            sys.exit(2)

    if ncolumns is None or nrows is None:
        print("both -c and -r are required")
        _usage()
        sys.exit(2)

    random.seed(1)
    basename = str(ncolumns) + "_data"
    columns = _pick_column_types(ncolumns)
    _write_ddl(basename, columns)
    _write_csv(basename, columns, nrows)
def getRandomStuff(column):
    """Return one random CSV field, as a string, for the given column type.

    The type is recognised by its first three characters, so parameterised
    names such as "varchar(64)" or "decimal(16, 10)" match their base type.

    Args:
        column: a column type name beginning with "str", "var", "cha",
            "boo", "dec", "flo" or "int".

    Returns:
        Random text of 16, 24 or 4 characters for string, varchar and char;
        "T" or "F" for boolean; "<1-9999>.<1-999>" for decimal; the first
        seven characters of a float below 500 for float; an integer from
        1 to 9999 for int.

    Raises:
        ValueError: if the type name matches none of the prefixes above.
    """
    if column.startswith("str"):
        return randomText(16)
    if column.startswith("var"):
        return randomText(24)
    if column.startswith("cha"):
        return randomText(4)
    if column.startswith("boo"):
        return "T" if random.random() > 0.5 else "F"
    if column.startswith("dec"):
        # Whole part is drawn first, then the fractional digits.
        whole = random.randint(1, 9999)
        fraction = random.randint(1, 999)
        return str(whole) + "." + str(fraction)
    if column.startswith("flo"):
        return str(random.random() * 500)[0:7]
    if column.startswith("int"):
        return str(random.randint(1, 9999))
    # Returning None here would only surface later as a TypeError when the
    # row is joined; fail at the point where the unknown type is seen.
    raise ValueError("unsupported column type: %r" % (column,))
def randomText(length):
    """Return ``length`` random characters drawn from "A"-"N" and space.

    Characters are sampled without replacement from three copies of the
    15-character alphabet, so a given character appears at most three times
    and ``random.sample`` raises ValueError when ``length`` exceeds 45.
    """
    alphabet = "ABCDEFGHIJKLMN "
    pool = alphabet * 3
    picked = random.sample(pool, length)
    return "".join(picked)
# Run the generator only when this file is executed as a script, so that
# importing the module does not parse arguments or write any files.
if __name__ == "__main__":
    main()