-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrial4_strcmp.cpp
More file actions
134 lines (113 loc) · 3.73 KB
/
trial4_strcmp.cpp
File metadata and controls
134 lines (113 loc) · 3.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
// Compares two filtered source files using the built-in strcmp.
#include <stdio.h>

#include <algorithm>
#include <chrono>   // timing
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>
using namespace std;
//time
using namespace std::chrono;
/*
Global lookup sets used by the filtering pass.
  mp       - words recorded by converttostring (the word that follows a keyword)
  keywords - tokens registered by insertion()
*/
std::unordered_set<std::string> mp;
std::unordered_set<std::string> keywords;

// Registers the four recognised tokens in `keywords`.
// Calling it more than once is harmless: a set ignores duplicate inserts.
void insertion(){
    for (const char* token : {"int", "float", "void", "return"}) {
        keywords.insert(token);
    }
}
//FILE TO STRING AND FILTER

// True for the layout characters the filter throws away: space, newline, tab.
static bool isLayoutChar(int c){
    return c == ' ' || c == '\n' || c == '\t';
}

// True for ASCII letters (A-Z, a-z); a word must start with one of these.
static bool isAsciiLetter(int c){
    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}

// True for characters that may continue a word: ASCII letters and digits.
static bool isWordChar(int c){
    return isAsciiLetter(c) || (c >= '0' && c <= '9');
}

/*
Reads `filename` and returns its text with layout and recorded names removed.

Filtering rules, applied character by character:
  - spaces, newlines and tabs are dropped (C++ ignores them between tokens, so
    removing them shrinks the text and keeps formatting out of the comparison);
  - a word (a letter followed by letters/digits) that comes right after a word
    found in `keywords` is recorded in the global set `mp` and is NOT copied
    to the output;
  - any other word is copied to the output unless it is already in `mp`;
    if it is in `keywords`, the next word will be recorded as described above;
  - every other character is copied unchanged.

`mp` is a global that this function only ever adds to, so names recorded while
reading one file are also filtered out of files read afterwards.

If the file cannot be opened, "invalid filename" is printed to std::cout and
an empty string is returned.
*/
std::string converttostring(std::string filename = "/Users/sayash/Desktop/Similarity/file1.txt") //default local file
{
    std::string s;                       //string to return
    std::ifstream in_file(filename);     //closed automatically when it goes out of scope
    if (!in_file) {                      //filename error handled
        std::cout<<"invalid filename\n";
        return s;                        //nothing was opened, so there is nothing to read or close
    }

    // get() returns an int so that end-of-input can be told apart from every
    // real byte value; storing it in a char would lose that distinction.
    const int end_of_input = std::ifstream::traits_type::eof();
    std::string word;
    bool after_keyword = false;          //true when the previous word was in `keywords`

    int ch = in_file.get();
    while (ch != end_of_input) {
        if (isLayoutChar(ch)) {          //de-formatting the code
            ch = in_file.get();
            continue;
        }
        if (!isAsciiLetter(ch)) {        //digits, punctuation, operators: copy as-is
            s.push_back(static_cast<char>(ch));
            ch = in_file.get();
            continue;
        }

        // Collect the whole word; afterwards `ch` holds the first character
        // that is not part of it (or end_of_input).
        word.clear();
        do {
            word.push_back(static_cast<char>(ch));
            ch = in_file.get();
        } while (ch != end_of_input && isWordChar(ch));

        if (after_keyword) {
            mp.insert(word);             //remember the name, keep it out of the output
            after_keyword = false;
        } else {
            if (mp.find(word) == mp.end()) s += word;
            if (keywords.find(word) != keywords.end()) after_keyword = true;
        }
        // The character that ended the word is deliberately not consumed here:
        // the next loop iteration copies or drops it, so it lands AFTER the
        // word in the output and end-of-input is never appended.
    }
    return s;
}
//LEVENSHTEIN
/*
Returns the Levenshtein edit distance between `a` and `b`: the minimum number
of single-character insertions, deletions and substitutions needed to turn one
string into the other.

Only two rows of the dynamic-programming table are kept, stored on the heap,
so memory use is proportional to the shorter string and long inputs cannot
overflow the stack.
*/
int levenshtein(std::string a, std::string b){
    // The distance is symmetric, so make `b` the shorter string to keep the rows small.
    if (a.size() < b.size()) a.swap(b);

    const std::size_t rows = a.size();
    const std::size_t cols = b.size();

    // previous[j] = distance between the first i-1 chars of `a` and the first j chars of `b`
    // current[j]  = same for the first i chars of `a`
    std::vector<int> previous(cols + 1);
    std::vector<int> current(cols + 1);

    // Turning an empty prefix into the first j chars of `b` takes j insertions.
    for (std::size_t j = 0; j <= cols; ++j)
        previous[j] = static_cast<int>(j);

    for (std::size_t i = 1; i <= rows; ++i) {
        // Turning the first i chars of `a` into an empty string takes i deletions.
        current[0] = static_cast<int>(i);
        for (std::size_t j = 1; j <= cols; ++j) {
            if (a[i - 1] == b[j - 1]) {
                current[j] = previous[j - 1];           // characters match: no extra cost
            } else {
                current[j] = 1 + std::min({current[j - 1],    // insertion
                                           previous[j],       // deletion
                                           previous[j - 1]}); // substitution
            }
        }
        previous.swap(current);
    }
    // After the final swap the last computed row lives in `previous`
    // (and for an empty `a` it still holds the initial row).
    return previous[cols];
}
//MAIN
/*
Filters two source files, prints both filtered strings, prints the result of
strcmp on them, and reports the elapsed time in microseconds.

Usage: program [file1 [file2]]
When a path is not given on the command line the original default is used.
*/
int main(int argc, char* argv[]){
    insertion();

    const std::string first_path  = argc > 1 ? argv[1] : "/Users/sayash/Desktop/Similarity/file1.txt";
    const std::string second_path = argc > 2 ? argv[2] : "/Users/sayash/Desktop/Similarity/file3.txt";

    //starting clock (steady_clock never jumps backwards, so it is safe for measuring intervals)
    const auto start = std::chrono::steady_clock::now();

    const std::string s1 = converttostring(first_path);
    const std::string s2 = converttostring(second_path);
    std::cout << s1 << '\n';
    std::cout << s2 << '\n';

    //levenshtein comparison
    // std::cout<<levenshtein(s1,s2)<<std::endl; //final output

    //strcmp: 0 when the filtered texts are identical, otherwise negative or
    //positive depending on the first differing character
    std::cout << std::strcmp(s1.c_str(), s2.c_str()) << '\n';

    //stopping clock
    const auto stop = std::chrono::steady_clock::now();
    const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
    std::cout << "Time taken: " << duration.count() << " microseconds" << std::endl;
    return 0;
}