-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfht16.py
More file actions
executable file
·122 lines (95 loc) · 3.5 KB
/
fht16.py
File metadata and controls
executable file
·122 lines (95 loc) · 3.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
import math
def _header_fingerprint(path, count):
    """Return a ``count`` x 256 matrix describing the first ``count`` bytes of *path*.

    Row ``p`` has a 1 in the column equal to the value of byte ``p`` and 0
    elsewhere.  Rows for positions past the end of a short file are all -1.
    """
    with open(path, "rb") as handle:
        # bytearray iterates as ints on both Python 2 and Python 3.
        leading = bytearray(handle.read(count))
    rows = []
    for value in leading:
        row = [0] * 256
        row[value] = 1
        rows.append(row)
    rows.extend([-1] * 256 for _ in range(count - len(leading)))
    return rows


def _trailer_fingerprint(path, count):
    """Return a ``count`` x 256 matrix describing the last ``count`` bytes of *path*.

    The final row corresponds to the last byte of the file, the row before it
    to the second-to-last byte, and so on.  When the file holds fewer than
    ``count`` bytes the leading rows (positions that do not exist) are all -1,
    mirroring how the header fingerprint marks missing positions.
    """
    with open(path, "rb") as handle:
        handle.seek(0, os.SEEK_END)
        size = handle.tell()
        handle.seek(max(size - count, 0))
        trailing = bytearray(handle.read(count))
    rows = [[-1] * 256 for _ in range(count - len(trailing))]
    for value in trailing:
        row = [0] * 256
        row[value] = 1
        rows.append(row)
    return rows


def _mean_fingerprint(paths, fingerprint, count):
    """Average ``fingerprint(path, count)`` element-wise over all *paths*.

    Uses the same incremental-mean update as the original implementation:
    ``mean = (mean * seen + sample) / (seen + 1)``.
    """
    mean = [[0.0] * 256 for _ in range(count)]
    for seen, path in enumerate(paths):
        sample = fingerprint(path, count)
        for mean_row, sample_row in zip(mean, sample):
            for column in range(256):
                mean_row[column] = (
                    mean_row[column] * seen + sample_row[column]
                ) / float(seen + 1)
    return mean


def _write_fingerprint_csv(csv_path, matrix):
    """Write *matrix* to *csv_path* below a ``0,1,...,255`` header line."""
    with open(csv_path, "w") as out_file:
        out_file.write(",".join(str(i) for i in range(256)) + "\n")
        # No trailing newline after the last row, as in the original output.
        out_file.write("\n".join(",".join(str(v) for v in row) for row in matrix))


def fht(data_path, type, out_dir="../fht16/"):
    """Build averaged 16-byte header and trailer fingerprints for one type directory.

    The files of ``<data_path>/<type>`` are listed, the first 75% of that
    listing (rounded up) is sampled, and for each sampled file the first and
    the last 16 bytes are turned into 16 x 256 one-hot matrices (row = byte
    position, column = byte value, -1 rows for positions a short file does
    not have).  The element-wise mean over all sampled files is written to
    ``<out_dir>/<type>-head4overall.csv`` and
    ``<out_dir>/<type>-trail4overall.csv``.

    Nothing is written (and the function returns early) when the directory
    holds fewer than 100 entries or when ``type`` is one of
    ``application-octet-stream``, ``application-xhtml+xml`` or ``text-html``.

    Args:
        data_path: Directory containing one sub-directory per type.
        type: Name of the sub-directory to process; also the prefix of the
            output file names.  (Shadows the builtin, kept for interface
            compatibility.)
        out_dir: Directory receiving the CSV files; created if missing.
            Defaults to the location the original code hard-coded.

    Returns:
        None.
    """
    # The "4" in the variable and file names is historical; 16 bytes are used.
    count_4 = 16

    type_path = os.path.join(data_path, type)
    files = os.listdir(type_path)
    if len(files) < 100:
        return
    if type in ("application-octet-stream", "application-xhtml+xml", "text-html"):
        # These types are skipped entirely.  The original had an unreachable
        # 25% sampling assignment after this return; it has been dropped.
        return

    # NOTE(review): os.listdir order is arbitrary, so which 75% gets sampled
    # is platform dependent -- confirm whether a sorted listing is wanted.
    end_idx = int(math.ceil(len(files) * 0.75))
    paths = [os.path.join(type_path, name) for name in files[:end_idx]]

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    head_4_overall = _mean_fingerprint(paths, _header_fingerprint, count_4)
    _write_fingerprint_csv(
        os.path.join(out_dir, type + "-head4overall.csv"), head_4_overall
    )

    trail_4_overall = _mean_fingerprint(paths, _trailer_fingerprint, count_4)
    _write_fingerprint_csv(
        os.path.join(out_dir, type + "-trail4overall.csv"), trail_4_overall
    )
if __name__ == "__main__":
    import sys

    # Root directory holding one sub-directory per type.  The first
    # command-line argument overrides the original hard-coded location.
    default_data_path = "/Users/minhpham/projects/Ass1-ContentDetection/data/"
    data_path = sys.argv[1] if len(sys.argv) > 1 else default_data_path

    for type_name in os.listdir(data_path):
        # Only directories can be processed; this also skips stray files such
        # as macOS ".DS_Store", which the original filtered out by name.
        if not os.path.isdir(os.path.join(data_path, type_name)):
            continue
        fht(data_path, type_name)