-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path08 - BagOfWords.cpp
More file actions
86 lines (71 loc) · 1.92 KB
/
08 - BagOfWords.cpp
File metadata and controls
86 lines (71 loc) · 1.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#include<bits/stdc++.h>
#include "02 - Preprocessing.cpp"
#pragma once
/// Builds a bag-of-words count matrix from tokenised documents and writes it
/// to a CSV file.
class BagOfWords
{
public :
    /**
     * @brief Writes per-document word counts to "08 - BagOfWords.csv".
     *
     * The vocabulary (the CSV columns) is read as whitespace-separated tokens
     * from "05 - unique words.txt". The output has one header line listing the
     * vocabulary, followed by one line per document holding, for each
     * vocabulary word, how many times it occurs in that document. Every cell,
     * including the last one on a line, is followed by a comma, and there is
     * no newline after the final document row.
     *
     * Progress is logged to std::cout. If either file cannot be opened, a
     * message is written to std::cerr and the function returns; a missing
     * vocabulary file leaves any existing output file untouched.
     *
     * @param processedData One inner vector of tokens per document. Tokens
     *                      that are not in the vocabulary are ignored.
     */
    void transform(const std::vector<std::vector<std::string>>& processedData);
};

void BagOfWords::transform(const std::vector<std::vector<std::string>>& processedData)
{
    /*
    Example: for the documents
        [["hi","hello"],
         ["you","there","hello"]]
    and the vocabulary "hi hello you there", the resulting table is

        index  hi  hello  you  there
        0      1   1      0    0
        1      0   1      1    1
    */

    // 1. Read the vocabulary. Bail out before creating the output file so a
    //    missing input cannot overwrite a previously generated CSV.
    std::ifstream in("05 - unique words.txt");
    if (!in)
    {
        std::cerr << "BagOfWords: cannot open \"05 - unique words.txt\"" << std::endl;
        return;
    }

    std::vector<std::string> uniquewords;
    std::string word;
    while (in >> word)
    {
        std::cout << "Reading...." << uniquewords.size() + 1 << " " << word << std::endl;
        uniquewords.push_back(word);
    }
    // The original author noted an expected count of 21768 for their dataset.
    std::cout << "Unique words : " << uniquewords.size() << std::endl;

    std::ofstream out("08 - BagOfWords.csv");
    if (!out)
    {
        std::cerr << "BagOfWords: cannot open \"08 - BagOfWords.csv\"" << std::endl;
        return;
    }

    // 2. Header row: one column per vocabulary word.
    for (const std::string& column : uniquewords)
    {
        out << column << ',';
    }
    out << '\n';   // '\n' rather than std::endl: no need to flush the file per line

    // 3. One row of counts per document.
    std::unordered_map<std::string, int> counts;   // reused across rows
    for (std::size_t row = 0; row < processedData.size(); ++row)
    {
        // The separator goes *before* every row except the first, so the file
        // ends without a trailing newline.
        if (row != 0)
        {
            out << '\n';
        }

        counts.clear();
        for (const std::string& token : processedData[row])
        {
            ++counts[token];
        }

        for (const std::string& column : uniquewords)
        {
            // Single lookup; a word absent from this document counts as 0.
            const auto hit = counts.find(column);
            out << (hit == counts.end() ? 0 : hit->second) << ',';
        }

        std::cout << row + 1 << std::endl;   // progress: documents written so far
    }
}