22import re
33from datetime import datetime
44from time import perf_counter
5+ from typing import Literal , Optional
56
67import matplotlib .pyplot as plt
78import numpy as np
89import pandas as pd
9- from decouple import Config , RepositoryEnv
10- from scipy .optimize import curve_fit
11-
1210from codeplag .algorithms .featurebased import counter_metric , struct_compare
1311from codeplag .algorithms .stringbased import gst
1412from codeplag .algorithms .tokenbased import value_jakkar_coef
1513from codeplag .pyplag .utils import get_ast_from_content , get_features_from_ast
14+ from decouple import Config , RepositoryEnv
15+ from scipy .optimize import curve_fit
1616from webparsers .github_parser import GitHubParser
1717
1818
def square_func(x: float, a: float, b: float, c: float) -> float:
    """Evaluate the quadratic polynomial ``a*x**2 + b*x + c``.

    Used as the model function handed to ``curve_fit`` for the 'n^2' trend.

    Args:
        x: Point at which the polynomial is evaluated.
        a: Coefficient of the quadratic term.
        b: Coefficient of the linear term.
        c: Constant term.

    Returns:
        The value of the polynomial at ``x``.
    """
    return a * x ** 2 + b * x + c
2121
2222
def cube_func(x: float, a: float, b: float, c: float, d: float) -> float:
    """Evaluate the cubic polynomial ``a*x**3 + b*x**2 + c*x + d``.

    Used as the model function handed to ``curve_fit`` for the 'n^3' trend.

    Args:
        x: Point at which the polynomial is evaluated.
        a: Coefficient of the cubic term.
        b: Coefficient of the quadratic term.
        c: Coefficient of the linear term.
        d: Constant term.

    Returns:
        The value of the polynomial at ``x``.
    """
    return a * x ** 3 + b * x ** 2 + c * x + d
2525
2626
def quart_func(
    x: float, a: float, b: float, c: float, d: float, e: float
) -> float:
    """Evaluate the quartic polynomial ``a*x**4 + b*x**3 + c*x**2 + d*x + e``.

    Used as the model function handed to ``curve_fit`` for the 'n^4' trend.

    Args:
        x: Point at which the polynomial is evaluated.
        a: Coefficient of the quartic term.
        b: Coefficient of the cubic term.
        c: Coefficient of the quadratic term.
        d: Coefficient of the linear term.
        e: Constant term.

    Returns:
        The value of the polynomial at ``x``.
    """
    return a * x ** 4 + b * x ** 3 + c * x ** 2 + d * x + e
2929
3030
def remove_unnecessary_blank_lines(source_code: str) -> str:
    """Collapse every run of consecutive newlines into a single newline.

    Args:
        source_code: Text to normalise.

    Returns:
        ``source_code`` with each run of one or more ``\\n`` characters
        replaced by exactly one ``\\n``.
    """
    # The replacement is a bare newline: any trailing whitespace here would
    # indent every following line and change how the source parses.
    pattern = r"\n+"
    return re.sub(pattern, "\n", source_code)
3434
3535
36- def get_data_from_dir (path = './data' , max_count_lines = None ):
36+ def get_data_from_dir (
37+ path : str = './data' ,
38+ max_count_lines : Optional [int ] = None
39+ ) -> pd .DataFrame :
3740 df = pd .DataFrame ()
3841 for filename in os .listdir (path ):
3942 if not re .search (r'.csv$' , filename ):
@@ -48,7 +51,7 @@ def get_data_from_dir(path='./data', max_count_lines=None):
4851 return df
4952
5053
51- def save_works_from_repo_url (url , check_policy = True ):
54+ def save_works_from_repo_url (url : str , check_policy : bool = True ) -> None :
5255 current_repo_name = url .split ('/' )[- 1 ]
5356 env_config = Config (RepositoryEnv ('../../.env' ))
5457 gh = GitHubParser (
@@ -57,7 +60,7 @@ def save_works_from_repo_url(url, check_policy=True):
5760 access_token = env_config .get ('ACCESS_TOKEN' )
5861 )
5962 files = list (gh .get_files_generator_from_repo_url (url ))
60- files = [(remove_unnecessary_blank_lines (file [ 0 ] ), file [ 1 ] ) for file in files ]
63+ files = [(remove_unnecessary_blank_lines (file . code ), file . link ) for file in files ]
6164
6265 df = pd .DataFrame (
6366 {
@@ -66,18 +69,25 @@ def save_works_from_repo_url(url, check_policy=True):
6669 'extension' : ['py' ] * (len (files ) - 1 ),
6770 'repo_name' : [current_repo_name ] * (len (files ) - 1 ),
6871 'content_len' : [len (file_ [0 ]) for file_ in files [:- 1 ]],
69- 'content_len_without_blank' : [len (file_ [0 ].replace (' ' , '' ).replace ('\n ' , '' ).replace ('\t ' , '' )) for file_ in files [:- 1 ]],
70- 'count_lines_without_blank_lines' : [len (file_ [0 ].splitlines ()) for file_ in files [:- 1 ]]
72+ 'content_len_without_blank' : [
73+ len (file_ [0 ].replace (' ' , '' ).replace ('\n ' , '' ).replace ('\t ' , '' ))
74+ for file_ in files [:- 1 ]
75+ ],
76+ 'count_lines_without_blank_lines' : [
77+ len (file_ [0 ].splitlines ()) for file_ in files [:- 1 ]
78+ ]
7179 }
7280 )
7381 df = df [df ['count_lines_without_blank_lines' ] > 5 ]
7482 df .to_csv (os .path .join ('./data/' , current_repo_name + '.csv' ), sep = ';' )
7583
7684
77- def get_time_to_meta (df , iterations = 10 ):
85+ def get_time_to_meta (df : pd . DataFrame , iterations : int = 10 ) -> pd . DataFrame :
7886 count_lines = []
7987 to_meta_time = []
80- for (index , content ) in df [['content' , 'link' , 'count_lines_without_blank_lines' ]].iterrows ():
88+ for (index , content ) in df [
89+ ['content' , 'link' , 'count_lines_without_blank_lines' ]
90+ ].iterrows ():
8191 print (index , " " * 20 , end = '\r ' )
8292 for _ in range (iterations ):
8393 tree = get_ast_from_content (content [0 ], content [1 ])
@@ -102,8 +112,14 @@ def get_time_to_meta(df, iterations=10):
102112 return output
103113
104114
105- def plot_and_save_result (df , xlabel , ylabel , title , what ,
106- trend = 'linear' ):
115+ def plot_and_save_result (
116+ df : pd .DataFrame ,
117+ xlabel : str ,
118+ ylabel : str ,
119+ title : str ,
120+ what : str ,
121+ trend : Literal ['linear' , 'n^2' , 'n^3' , 'n^4' ] = 'linear'
122+ ) -> None :
107123 # Simple Moving average
108124 unique_count_lines = np .unique (df .count_lines )
109125 mean_times = []
@@ -122,19 +138,50 @@ def plot_and_save_result(df, xlabel, ylabel, title, what,
122138 if trend == 'linear' :
123139 z = np .polyfit (unique_count_lines , mean_times , 1 )
124140 p = np .poly1d (z )
125- plt .plot (unique_count_lines , p (unique_count_lines ), "r--" , label = 'Линейный тренд.' )
141+ plt .plot (
142+ unique_count_lines , p (unique_count_lines ), "r--" , label = 'Линейный тренд.'
143+ )
126144 elif trend == 'n^2' :
127- popt_cons , _ = curve_fit (square_func , unique_count_lines , mean_times , bounds = ([- np .inf , 0. , 0. ], [np .inf , 0.1 ** 100 , 0.1 ** 100 ]))
145+ popt_cons , _ = curve_fit (
146+ square_func ,
147+ unique_count_lines ,
148+ mean_times ,
149+ bounds = ([- np .inf , 0. , 0. ], [np .inf , 0.1 ** 100 , 0.1 ** 100 ])
150+ )
128151 p = np .poly1d (popt_cons )
129- plt .plot (unique_count_lines , p (unique_count_lines ), "r--" , label = 'Квадратичный тренд.' )
152+ plt .plot (
153+ unique_count_lines ,
154+ p (unique_count_lines ),
155+ "r--" , label = 'Квадратичный тренд.'
156+ )
130157 elif trend == 'n^3' :
131- popt_cons , _ = curve_fit (cube_func , unique_count_lines , mean_times , bounds = ([- np .inf , 0. , 0. , 0. ], [np .inf , 0.1 ** 100 , 0.1 ** 100 , 0.1 ** 100 ]))
158+ popt_cons , _ = curve_fit (
159+ cube_func ,
160+ unique_count_lines ,
161+ mean_times ,
162+ bounds = ([- np .inf , 0. , 0. , 0. ], [np .inf , 0.1 ** 100 , 0.1 ** 100 , 0.1 ** 100 ])
163+ )
132164 p = np .poly1d (popt_cons )
133- plt .plot (unique_count_lines , p (unique_count_lines ), "r--" , label = 'Кубический тренд.' )
165+ plt .plot (
166+ unique_count_lines ,
167+ p (unique_count_lines ),
168+ "r--" ,
169+ label = 'Кубический тренд.'
170+ )
134171 elif trend == 'n^4' :
135- popt_cons , _ = curve_fit (quart_func , unique_count_lines , mean_times , bounds = ([- np .inf , 0. , 0. , 0. , 0. ], [np .inf , 0.1 ** 100 , 0.1 ** 100 , 0.1 ** 100 , 0.1 ** 100 ]))
172+ popt_cons , _ = curve_fit (
173+ quart_func ,
174+ unique_count_lines ,
175+ mean_times ,
176+ bounds = (
177+ [- np .inf , 0. , 0. , 0. , 0. ],
178+ [np .inf , 0.1 ** 100 , 0.1 ** 100 , 0.1 ** 100 , 0.1 ** 100 ]
179+ )
180+ )
136181 p = np .poly1d (popt_cons )
137182 plt .plot (unique_count_lines , p (unique_count_lines ), "r--" , label = 'n^4.' )
183+ else :
184+ raise Exception (f"Incorrect tred '{ trend } '." )
138185
139186 rolling = pd .DataFrame (
140187 {
@@ -143,24 +190,40 @@ def plot_and_save_result(df, xlabel, ylabel, title, what,
143190 }
144191 )
145192 num_window = 20
146- plt .plot (rolling .unique_count_lines , rolling .mean_times .rolling (window = num_window ).mean (), label = f'Скользящее среднее по { num_window } ти замерам.' )
193+ plt .plot (
194+ rolling .unique_count_lines ,
195+ rolling .mean_times .rolling (window = num_window ).mean (),
196+ label = f'Скользящее среднее по { num_window } ти замерам.'
197+ )
147198
148199 plt .ylabel (ylabel , fontsize = 15 )
149200 plt .xlabel (xlabel , fontsize = 15 )
150201 plt .title (title , fontsize = 17 )
151202 plt .legend (loc = 'upper left' )
152- plt .savefig ('./graphics/need_time_{}_{}.png' .format (what , datetime .now ().strftime ("%d%m%Y_%H%M%S" )))
203+ plt .savefig (
204+ './graphics/need_time_{}_{}.png' .format (
205+ what ,
206+ datetime .now ().strftime ("%d%m%Y_%H%M%S" )
207+ )
208+ )
153209
154210
155- def get_time_algorithms (df , work , iterations = 5 , metric = 'fast' ):
211+ def get_time_algorithms (
212+ df : pd .DataFrame ,
213+ work ,
214+ iterations : int = 5 ,
215+ metric : Literal ['fast' , 'gst' , 'structure' ] = 'fast'
216+ ) -> pd .DataFrame :
156217 count_lines = []
157218 times = []
158219 tree1 = get_ast_from_content (work .content , work .link )
159220 if tree1 is None :
160221 raise Exception ("Unexpected error when parsing first work." )
161222
162223 features1 = get_features_from_ast (tree1 , work .link )
163- for (index , content ) in df [['content' , 'link' , 'count_lines_without_blank_lines' ]].iterrows ():
224+ for (index , content ) in df [
225+ ['content' , 'link' , 'count_lines_without_blank_lines' ]
226+ ].iterrows ():
164227 for _ in range (iterations ):
165228 print (index , " " * 20 , end = '\r ' )
166229 tree2 = get_ast_from_content (content [0 ], content [1 ])
@@ -190,8 +253,7 @@ def get_time_algorithms(df, work, iterations=5, metric='fast'):
190253 end = perf_counter () - start
191254 times .append (end )
192255 else :
193- print ('Incorrect metric!' )
194- return 1
256+ raise Exception ('Incorrect metric!' )
195257
196258 count_lines .append (content [2 ])
197259
0 commit comments