 import os
 import re
+import sys
 from datetime import datetime
 from time import perf_counter
 from typing import Literal
@@ -34,28 +35,32 @@ def remove_unnecessary_blank_lines(source_code: str) -> str:
     return re.sub(pattern, "\n", source_code)


-def get_data_from_dir(path: str = "./data", max_count_lines: int | None = None) -> pd.DataFrame:
+def get_data_from_dir(
+    path: str = "./data", max_count_lines: int | None = None
+) -> pd.DataFrame | None:
     df = pd.DataFrame()
     for filename in os.listdir(path):
         if not re.search(r".csv$", filename):
             continue

-        tmp_df = pd.read_csv(os.path.join(path, filename), sep=";", index_col=0)
+        tmp_df = pd.read_csv(os.path.join(path, filename), sep=";", index_col=0)  # type: ignore
         df = df.append(tmp_df, ignore_index=True)

     if max_count_lines:
-        return df[df.count_lines_without_blank_lines < max_count_lines]
+        result = df[df.count_lines_without_blank_lines < max_count_lines]
+        assert isinstance(result, pd.DataFrame) or result is None
+        return result

     return df


-def save_works_from_repo_url(url: str, check_policy: bool = True) -> None:
+def save_works_from_repo_url(url: str, check_policy: bool = True, min_lines: int = 5) -> None:
     current_repo_name = url.split("/")[-1]
     env_config = Config(RepositoryEnv("../../.env"))
     gh = GitHubParser(
         file_extensions=(re.compile(r".py$"),),
         check_all=check_policy,
-        access_token=env_config.get("ACCESS_TOKEN"),
+        access_token=env_config.get("ACCESS_TOKEN", default=""),  # type: ignore
     )
     files = list(gh.get_files_generator_from_repo_url(url))
     files = [(remove_unnecessary_blank_lines(file.code), file.link) for file in files]
@@ -76,22 +81,34 @@ def save_works_from_repo_url(url: str, check_policy: bool = True) -> None:
             ],
         }
     )
-    df = df[df["count_lines_without_blank_lines"] > 5]
+    filtered_df = df["count_lines_without_blank_lines"]
+    assert filtered_df is not None
+    df = df[filtered_df > min_lines]
+    if df is None:
+        print(f"Nothing to save with minimal count of lines '{min_lines}'.", file=sys.stderr)
+        return
     df.to_csv(os.path.join("./data/", current_repo_name + ".csv"), sep=";")


 def get_time_to_meta(df: pd.DataFrame, iterations: int = 10) -> pd.DataFrame:
     count_lines = []
     to_meta_time = []
-    for index, content in df[["content", "link", "count_lines_without_blank_lines"]].iterrows():
+    filtered_df = df[["content", "link", "count_lines_without_blank_lines"]]
+    if filtered_df is None:
+        raise Exception("DataFrame is empty, nothing to parse.")
+    for index, content in filtered_df.iterrows():
+        code = content[0]
+        filepath = content[1]
+        assert isinstance(code, str)
+        assert isinstance(filepath, str)
         print(index, " " * 20, end="\r")
         for _ in range(iterations):
-            tree = get_ast_from_content(content[0], content[1])
+            tree = get_ast_from_content(code, filepath)
             if tree is None:
                 break
             try:
                 start = perf_counter()
-                get_features_from_ast(tree, content[1])
+                get_features_from_ast(tree, filepath)
                 end = perf_counter() - start
                 to_meta_time.append(end)
                 count_lines.append(content[2])
@@ -130,7 +147,7 @@ def plot_and_save_result(
         p = np.poly1d(z)
         plt.plot(unique_count_lines, p(unique_count_lines), "r--", label="Линейный тренд.")
     elif trend == "n^2":
-        popt_cons, _ = curve_fit(
+        popt_cons, _ = curve_fit(  # type: ignore
            square_func,
            unique_count_lines,
            mean_times,
@@ -144,7 +161,7 @@ def plot_and_save_result(
            label="Квадратичный тренд.",
         )
     elif trend == "n^3":
-        popt_cons, _ = curve_fit(
+        popt_cons, _ = curve_fit(  # type: ignore
            cube_func,
            unique_count_lines,
            mean_times,
@@ -156,7 +173,7 @@ def plot_and_save_result(
         p = np.poly1d(popt_cons)
         plt.plot(unique_count_lines, p(unique_count_lines), "r--", label="Кубический тренд.")
     elif trend == "n^4":
-        popt_cons, _ = curve_fit(
+        popt_cons, _ = curve_fit(  # type: ignore
            quart_func,
            unique_count_lines,
            mean_times,
@@ -200,14 +217,21 @@ def get_time_algorithms(
         raise Exception("Unexpected error when parsing first work.")

     features1 = get_features_from_ast(tree1, work.link)
-    for index, content in df[["content", "link", "count_lines_without_blank_lines"]].iterrows():
+    filtered_df = df[["content", "link", "count_lines_without_blank_lines"]]
+    if filtered_df is None:
+        raise Exception("DataFrame is empty, nothing to parse.")
+    for index, content in filtered_df.iterrows():
+        code = content[0]
+        filepath = content[1]
+        assert isinstance(code, str)
+        assert isinstance(filepath, str)
         for _ in range(iterations):
             print(index, " " * 20, end="\r")
-            tree2 = get_ast_from_content(content[0], content[1])
+            tree2 = get_ast_from_content(code, filepath)
             if tree2 is None:
                 continue
             try:
-                features2 = get_features_from_ast(tree2, content[1])
+                features2 = get_features_from_ast(tree2, filepath)
             except Exception:
                 continue

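
A minimal usage sketch of the revised helpers, assuming they are importable from a hypothetical module named `utils`, that `ACCESS_TOKEN` is configured in `../../.env`, and that a `./data` directory exists; the repository URL below is illustrative only.

# Usage sketch (assumptions: hypothetical module name `utils`, an existing
# ./data directory, ACCESS_TOKEN in ../../.env; the URL is illustrative only).
from utils import get_data_from_dir, get_time_to_meta, save_works_from_repo_url

# Cache the .py files of one repository as ./data/<repo>.csv, keeping only
# files with more than `min_lines` non-blank lines.
save_works_from_repo_url("https://github.com/example/example-repo", min_lines=5)

# Load every cached CSV into a single DataFrame and time feature extraction.
df = get_data_from_dir("./data", max_count_lines=500)
if df is not None and not df.empty:
    timings = get_time_to_meta(df, iterations=10)
    print(timings.head())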