1+ using System ;
2+ using System . Collections . Generic ;
3+ using System . IO ;
4+ using System . Linq ;
5+ using System . Text ;
6+ using System . Text . RegularExpressions ;
7+ using System . Threading . Tasks ;
8+ using Octokit ;
9+ using Storage . Remote . GitHub ;
10+
11+ namespace Platform . Bot . Services
12+ {
13+ /// <summary>
14+ /// <para>
15+ /// Represents the code duplication analysis service.
16+ /// </para>
17+ /// <para></para>
18+ /// </summary>
19+ public class CodeDuplicationAnalysisService
20+ {
/// <summary>
/// <para>
/// The smallest number of lines a candidate fragment may span; the same value is used
/// as the minimum count of non-blank and of code-bearing lines inside a fragment.
/// </para>
/// </summary>
private const int MinimumCodeFragmentLength = 3;

/// <summary>
/// <para>
/// A similarity threshold value. No member of this class currently reads it.
/// </para>
/// </summary>
private const int MinimumSimilarityThreshold = 80;

/// <summary>
/// <para>
/// The GitHub storage whose client is used to list and read repository contents.
/// </para>
/// </summary>
private readonly GitHubStorage _storage;

/// <summary>
/// <para>
/// Initializes a new <see cref="CodeDuplicationAnalysisService"/> instance.
/// </para>
/// </summary>
/// <param name="storage">
/// <para>The GitHub storage used for all repository content requests.</para>
/// </param>
public CodeDuplicationAnalysisService(GitHubStorage storage) => _storage = storage;
39+
/// <summary>
/// <para>
/// Represents a contiguous run of source lines together with a hash of its
/// comment-free, whitespace-normalized text.
/// </para>
/// </summary>
public class CodeFragment
{
    /// <summary>
    /// <para>The raw text of the fragment as it was passed to the constructor.</para>
    /// </summary>
    public string Content { get; set; }

    /// <summary>
    /// <para>The path of the file the fragment was taken from.</para>
    /// </summary>
    public string FilePath { get; set; }

    /// <summary>
    /// <para>The line number of the first line of the fragment.</para>
    /// </summary>
    public int StartLine { get; set; }

    /// <summary>
    /// <para>The line number of the last line of the fragment.</para>
    /// </summary>
    public int EndLine { get; set; }

    /// <summary>
    /// <para>
    /// Base64-encoded SHA-256 of the normalized content. It is computed once in the
    /// constructor and is not refreshed when <see cref="Content"/> is reassigned.
    /// </para>
    /// </summary>
    public string Hash { get; set; }

    /// <summary>
    /// <para>
    /// Initializes a new <see cref="CodeFragment"/> instance and computes its hash.
    /// </para>
    /// </summary>
    /// <param name="content">
    /// <para>The fragment text.</para>
    /// </param>
    /// <param name="filePath">
    /// <para>The path of the originating file.</para>
    /// </param>
    /// <param name="startLine">
    /// <para>The first line number of the fragment.</para>
    /// </param>
    /// <param name="endLine">
    /// <para>The last line number of the fragment.</para>
    /// </param>
    public CodeFragment(string content, string filePath, int startLine, int endLine)
    {
        Content = content;
        FilePath = filePath;
        StartLine = startLine;
        EndLine = endLine;
        Hash = ComputeHash(content);
    }

    /// <summary>
    /// <para>
    /// Returns the Base64-encoded SHA-256 digest of the normalized form of
    /// <paramref name="content"/>.
    /// </para>
    /// </summary>
    private static string ComputeHash(string content)
    {
        var normalized = NormalizeCode(content);
        using var sha256 = System.Security.Cryptography.SHA256.Create();
        var hash = sha256.ComputeHash(Encoding.UTF8.GetBytes(normalized));
        return Convert.ToBase64String(hash);
    }

    /// <summary>
    /// <para>
    /// Removes <c>//</c> and <c>/* */</c> comments, collapses every whitespace run
    /// to a single space and trims the result.
    /// </para>
    /// </summary>
    private static string NormalizeCode(string code)
    {
        // Comments are stripped while line breaks are still present, so a line comment
        // ends at its own end of line. Collapsing whitespace first would let the first
        // "//" swallow the rest of the fragment. One left-to-right alternation lets
        // whichever comment opener appears first win, so "//" inside a block comment
        // and "/*" inside a line comment are both handled.
        code = Regex.Replace(code, @"//[^\n]*|/\*.*?\*/", "", RegexOptions.Singleline);
        code = Regex.Replace(code, @"\s+", " ");
        return code.Trim();
    }
}
79+
/// <summary>
/// <para>
/// Represents a set of code fragments that were grouped together as duplicates.
/// </para>
/// </summary>
public class DuplicationGroup
{
    /// <summary>
    /// <para>The fragments that belong to this group; starts out empty.</para>
    /// </summary>
    public List<CodeFragment> Fragments { get; set; } = new List<CodeFragment>();

    /// <summary>
    /// <para>The number of fragments currently in <see cref="Fragments"/>.</para>
    /// </summary>
    public int Count
    {
        get { return Fragments.Count; }
    }

    /// <summary>
    /// <para>The similarity score assigned to the group.</para>
    /// </summary>
    public double SimilarityScore { get; set; }

    /// <summary>
    /// <para>A suggested name for a method extracted from the group; empty by default.</para>
    /// </summary>
    public string SuggestedMethodName { get; set; } = "";
}
93+
/// <summary>
/// <para>
/// Extracts code fragments from every code file of the repository and groups
/// the fragments that share the same hash.
/// </para>
/// </summary>
/// <param name="repository">
/// <para>The repository to analyze.</para>
/// </param>
/// <returns>
/// <para>The duplication groups found, as produced by the grouping step.</para>
/// </returns>
public async Task<List<DuplicationGroup>> AnalyzeRepositoryAsync(Repository repository)
{
    List<CodeFragment> extracted = await ExtractCodeFragmentsAsync(repository);
    return FindDuplications(extracted);
}
114+
/// <summary>
/// <para>
/// Lists the repository contents, downloads each entry whose name looks like a
/// code file and gathers the fragments extracted from it.
/// </para>
/// </summary>
/// <param name="repository">
/// <para>The repository to read.</para>
/// </param>
/// <returns>
/// <para>All fragments extracted from the repository's code files.</para>
/// </returns>
private async Task<List<CodeFragment>> ExtractCodeFragmentsAsync(Repository repository)
{
    var entries = await GetRepositoryContentsAsync(repository);
    var collected = new List<CodeFragment>();

    foreach (var entry in entries)
    {
        if (!IsCodeFile(entry.Name))
        {
            continue;
        }

        // Files are fetched one at a time, in listing order.
        var text = await GetFileContentAsync(repository, entry.Path);
        collected.AddRange(ExtractFragmentsFromFile(text, entry.Path));
    }

    return collected;
}
132+
/// <summary>
/// <para>
/// Returns every file entry of the repository, starting the recursive listing at
/// the root path. Any exception is written to the console and an empty list is
/// returned instead.
/// </para>
/// </summary>
/// <param name="repository">
/// <para>The repository to list.</para>
/// </param>
/// <returns>
/// <para>The file entries found, or an empty list on failure.</para>
/// </returns>
private async Task<IReadOnlyList<RepositoryContent>> GetRepositoryContentsAsync(Repository repository)
{
    IReadOnlyList<RepositoryContent> result;

    try
    {
        result = await GetAllContentsRecursively(repository, "");
    }
    catch (Exception error)
    {
        Console.WriteLine($"Error getting repository contents: {error.Message}");
        result = new List<RepositoryContent>();
    }

    return result;
}
145+
/// <summary>
/// <para>
/// Collects the file entries under <paramref name="path"/>, descending into each
/// directory entry. If listing a path throws, the error is written to the console
/// and whatever was collected for that path so far is returned.
/// </para>
/// </summary>
/// <param name="repository">
/// <para>The repository to list.</para>
/// </param>
/// <param name="path">
/// <para>The path inside the repository to list.</para>
/// </param>
/// <returns>
/// <para>The file entries found at or below the path.</para>
/// </returns>
private async Task<List<RepositoryContent>> GetAllContentsRecursively(Repository repository, string path)
{
    var collected = new List<RepositoryContent>();

    try
    {
        foreach (var entry in await _storage.Client.Repository.Content.GetAllContents(repository.Id, path))
        {
            if (entry.Type == ContentType.File)
            {
                collected.Add(entry);
                continue;
            }

            if (entry.Type == ContentType.Dir)
            {
                collected.AddRange(await GetAllContentsRecursively(repository, entry.Path));
            }

            // Entries of any other type are ignored.
        }
    }
    catch (Exception error)
    {
        Console.WriteLine($"Error getting contents for path {path}: {error.Message}");
    }

    return collected;
}
174+
/// <summary>
/// <para>
/// Requests the contents at <paramref name="path"/> and returns the content of the
/// first entry. Any exception is written to the console and an empty string is
/// returned instead.
/// </para>
/// </summary>
/// <param name="repository">
/// <para>The repository that holds the file.</para>
/// </param>
/// <param name="path">
/// <para>The path of the file inside the repository.</para>
/// </param>
/// <returns>
/// <para>The content of the first returned entry, or an empty string on failure.</para>
/// </returns>
private async Task<string> GetFileContentAsync(Repository repository, string path)
{
    string text;

    try
    {
        var entries = await _storage.Client.Repository.Content.GetAllContents(repository.Id, path);
        text = entries.First().Content;
    }
    catch (Exception error)
    {
        Console.WriteLine($"Error getting file content for {path}: {error.Message}");
        text = string.Empty;
    }

    return text;
}
188+
/// <summary>
/// <para>
/// Tells whether <paramref name="fileName"/> ends, ignoring case, with one of the
/// recognized source-code extensions.
/// </para>
/// </summary>
/// <param name="fileName">
/// <para>The file name to check.</para>
/// </param>
/// <returns>
/// <para><c>true</c> when the name ends with a recognized extension; otherwise <c>false</c>.</para>
/// </returns>
private static bool IsCodeFile(string fileName)
{
    string[] knownExtensions =
    {
        ".cs", ".js", ".ts", ".py", ".java", ".cpp", ".c", ".h", ".php", ".rb", ".go", ".rs", ".swift"
    };

    foreach (var extension in knownExtensions)
    {
        if (fileName.EndsWith(extension, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
194+
/// <summary>
/// <para>
/// Slides a window over the lines of <paramref name="content"/> and returns every
/// window of <c>MinimumCodeFragmentLength</c> to 10 lines that passes
/// <c>IsValidCodeFragment</c>. Windows overlap, so one line can appear in several
/// fragments.
/// </para>
/// </summary>
/// <param name="content">
/// <para>The file text; <c>null</c> or empty text yields no fragments.</para>
/// </param>
/// <param name="filePath">
/// <para>The path recorded on every produced fragment.</para>
/// </param>
/// <returns>
/// <para>The accepted fragments, with 1-based inclusive start and end lines.</para>
/// </returns>
private List<CodeFragment> ExtractFragmentsFromFile(string content, string filePath)
{
    const int maximumFragmentLength = 10;

    var fragments = new List<CodeFragment>();

    // A failed or content-less download must not crash the whole analysis.
    if (string.IsNullOrEmpty(content))
    {
        return fragments;
    }

    var lines = content.Split('\n');

    for (int i = 0; i <= lines.Length - MinimumCodeFragmentLength; i++)
    {
        // The longest window starting at line i, capped by the remaining line count.
        int longestWindow = Math.Min(maximumFragmentLength, lines.Length - i);

        for (int length = MinimumCodeFragmentLength; length <= longestWindow; length++)
        {
            // Join the slice in place; Skip/Take would re-walk the array from the
            // start for every window.
            var fragmentContent = string.Join("\n", lines, i, length);

            if (IsValidCodeFragment(fragmentContent))
            {
                fragments.Add(new CodeFragment(fragmentContent, filePath, i + 1, i + length));
            }
        }
    }

    return fragments;
}
216+
/// <summary>
/// <para>
/// Decides whether a fragment is substantial enough to be compared. The trimmed
/// text must be at least 50 characters long and must contain at least
/// <c>MinimumCodeFragmentLength</c> non-blank lines, of which at least that many
/// are neither comment-like (starting with <c>//</c>, <c>/*</c> or <c>*</c>) nor
/// a lone brace.
/// </para>
/// </summary>
/// <param name="content">
/// <para>The fragment text; <c>null</c> or blank text is rejected.</para>
/// </param>
/// <returns>
/// <para><c>true</c> when the fragment qualifies; otherwise <c>false</c>.</para>
/// </returns>
private static bool IsValidCodeFragment(string content)
{
    if (string.IsNullOrWhiteSpace(content))
    {
        return false;
    }

    content = content.Trim();
    if (content.Length < 50)
    {
        return false;
    }

    // Trim every line once up front instead of once per comparison.
    var trimmedLines = content.Split('\n')
        .Select(line => line.Trim())
        .Where(line => line.Length > 0)
        .ToList();

    if (trimmedLines.Count < MinimumCodeFragmentLength)
    {
        return false;
    }

    // Prefix checks are ordinal: these are source-code tokens, not linguistic text.
    var codeLineCount = trimmedLines.Count(line =>
        !line.StartsWith("//", StringComparison.Ordinal) &&
        !line.StartsWith("/*", StringComparison.Ordinal) &&
        !line.StartsWith("*", StringComparison.Ordinal) &&
        line != "{" &&
        line != "}");

    return codeLineCount >= MinimumCodeFragmentLength;
}
235+
/// <summary>
/// <para>
/// Groups fragments by hash and returns the groups that contain more than one
/// fragment, ordered by fragment count and then by the content length of the
/// group's first fragment, both descending. Each returned group gets a similarity
/// score of 100 and a suggested method name derived from its first fragment.
/// </para>
/// </summary>
/// <param name="fragments">
/// <para>The fragments to group.</para>
/// </param>
/// <returns>
/// <para>The groups with at least two fragments.</para>
/// </returns>
private List<DuplicationGroup> FindDuplications(List<CodeFragment> fragments)
{
    var groups = new Dictionary<string, DuplicationGroup>();

    foreach (var fragment in fragments)
    {
        // Single lookup instead of ContainsKey followed by the indexer.
        if (groups.TryGetValue(fragment.Hash, out var existing))
        {
            existing.Fragments.Add(fragment);
        }
        else
        {
            groups.Add(fragment.Hash, new DuplicationGroup
            {
                Fragments = new List<CodeFragment> { fragment },
                SimilarityScore = 100.0
            });
        }
    }

    // Every group holds at least the fragment it was created with, so [0] is safe.
    var duplications = groups.Values
        .Where(g => g.Count > 1)
        .OrderByDescending(g => g.Count)
        .ThenByDescending(g => g.Fragments[0].Content.Length)
        .ToList();

    foreach (var group in duplications)
    {
        group.SuggestedMethodName = GenerateMethodName(group.Fragments[0].Content);
    }

    return duplications;
}
269+
/// <summary>
/// <para>
/// Builds a method-name suggestion from fragment text. The name starts with the
/// first verb from a fixed list that occurs anywhere in the lower-cased text
/// (or "Process" when none does), followed by those of the first two capitalized
/// single words in the text that are not already part of the name. "Data" is
/// appended when nothing else was added.
/// </para>
/// </summary>
/// <param name="content">
/// <para>The fragment text to derive the name from.</para>
/// </param>
/// <returns>
/// <para>The concatenated name parts.</para>
/// </returns>
private static string GenerateMethodName(string content)
{
    string[] verbs =
    {
        "get", "set", "create", "update", "delete", "find", "search", "calculate", "process", "validate", "convert"
    };

    var lowered = content.ToLowerInvariant();

    // The verb is chosen by its position in the list, not by its position in the text.
    string verb = null;
    foreach (var candidate in verbs)
    {
        if (lowered.Contains(candidate))
        {
            verb = candidate;
            break;
        }
    }

    var parts = new List<string>
    {
        string.IsNullOrEmpty(verb) ? "Process" : char.ToUpper(verb[0]) + verb.Substring(1)
    };

    // Only the first two matches are considered, even if both end up rejected as repeats.
    var considered = 0;
    foreach (Match identifier in Regex.Matches(content, @"\b[A-Z][a-z]+\b"))
    {
        if (considered == 2)
        {
            break;
        }

        considered++;

        if (!parts.Contains(identifier.Value))
        {
            parts.Add(identifier.Value);
        }
    }

    if (parts.Count == 1)
    {
        parts.Add("Data");
    }

    return string.Concat(parts);
}
303+ }
304+ }
0 commit comments