Commit 8669149

Add semantic analysis scripts (adoptium#1102)
1 parent ed28db8 commit 8669149

5 files changed: +760 -0 lines
Lines changed: 197 additions & 0 deletions
@@ -0,0 +1,197 @@
#!/usr/bin/env python3
"""
Java file analysis script for semantic metrics extraction
"""

import sys
import os
import javalang
import subprocess
from collections import defaultdict
import csv
import re

def get_cyclomatic_complexity(method):
    # Start at 1 and add one per branching construct found in the method body.
    complexity = 1
    for _, node in method.filter(javalang.tree.IfStatement):
        complexity += 1
    for _, node in method.filter(javalang.tree.ForStatement):
        complexity += 1
    for _, node in method.filter(javalang.tree.WhileStatement):
        complexity += 1
    for _, node in method.filter(javalang.tree.DoStatement):
        complexity += 1
    for _, node in method.filter(javalang.tree.SwitchStatement):
        complexity += len([s for s in node.cases if s.statements])
    for _, node in method.filter(javalang.tree.CatchClause):
        complexity += 1
    return complexity

def get_bug_count(file_path, repo_dir):
    """Count git log lines for this file that mention fix-related keywords."""
    try:
        relative_path = os.path.relpath(file_path, repo_dir)
        result = subprocess.run(
            ['git', '-C', repo_dir, 'log', '--follow', '--', relative_path],
            capture_output=True,
            text=True
        )
        if result.returncode != 0:
            return 0
        bug_count = len([line for line in result.stdout.splitlines()
                         if re.search(r'\b(fix|hotfix|bugfix|chore|refactor|test-fix)\b',
                                      line, re.IGNORECASE)])
        return bug_count
    except Exception:
        return 0

def analyze_file(file_path, project_name, version, repo_dir):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            code = f.read()
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return None

    try:
        tree = javalang.parse.parse(code)
    except Exception as e:
        print(f"Error parsing Java code in {file_path}: {e}")
        return None

    try:
        # Count classes found
        classes = list(tree.filter(javalang.tree.ClassDeclaration))
        if not classes:
            print(f"No classes found in {file_path}")
            return None

        print(f"Found {len(classes)} class(es) in {file_path}")

        # Metrics are computed and returned for the first class declaration.
        for _, class_node in classes:
            fully_qualified_name = f"{tree.package.name}.{class_node.name}" if tree.package else class_node.name

            metrics = {
                'project_name': project_name,
                'version': version,
                'class_name': fully_qualified_name,
                'wmc': 0,
                'rfc': 0,
                'loc': len(code.splitlines()),
                'max_cc': 0,
                'avg_cc': 0,
                'cbo': 0,
                'ca': 0,
                'ce': 0,
                'ic': 0,
                'cbm': 0,
                'lcom': 0,
                'lcom3': 0,
                'dit': 0,
                'noc': 0,
                'mfa': 0,
                'npm': 0,
                'dam': 0,
                'moa': 0,
                'cam': 0,
                'amc': 0,
                'bug': get_bug_count(file_path, repo_dir)
            }

            # Methods and complexity
            methods = class_node.methods
            metrics['wmc'] = len(methods)
            cc_values = []
            method_names = set()
            for method in methods:
                cc = get_cyclomatic_complexity(method)
                cc_values.append(cc)
                method_names.add(method.name)
                if isinstance(method, javalang.tree.MethodDeclaration):
                    metrics['npm'] += 1 if method.modifiers and 'public' in method.modifiers else 0

            metrics['max_cc'] = max(cc_values) if cc_values else 0
            metrics['avg_cc'] = sum(cc_values) / len(cc_values) if cc_values else 0
            metrics['amc'] = metrics['loc'] / metrics['wmc'] if metrics['wmc'] > 0 else 0

            # Inheritance metrics
            metrics['dit'] = 1 if class_node.extends else 0
            metrics['ic'] = metrics['dit']

            # Coupling and cohesion
            fields = [f for f in class_node.fields if isinstance(f, javalang.tree.FieldDeclaration)]
            metrics['moa'] = sum(1 for f in fields if f.type and isinstance(f.type, javalang.tree.ReferenceType))
            total_fields = len(fields)
            private_fields = sum(1 for f in fields if f.modifiers and ('private' in f.modifiers or 'protected' in f.modifiers))
            metrics['dam'] = private_fields / total_fields if total_fields > 0 else 0

            # LCOM calculation: count method pairs that share no field accesses.
            field_usage = defaultdict(set)
            field_names = [f.declarators[0].name for f in fields]
            for method in methods:
                for _, node in method.filter(javalang.tree.MemberReference):
                    if node.qualifier in field_names:
                        field_usage[method.name].add(node.qualifier)
            lcom = 0
            for i, m1 in enumerate(methods):
                for m2 in methods[i+1:]:
                    if not (field_usage[m1.name] & field_usage[m2.name]):
                        lcom += 1
            metrics['lcom'] = lcom
            # LCOM3 normalises LCOM by the number of method pairs, n*(n-1)/2.
            metrics['lcom3'] = 2 * lcom / (len(methods) * (len(methods) - 1)) if len(methods) > 1 else 0

            # RFC and CBO
            called_methods = set()
            for method in methods:
                for _, node in method.filter(javalang.tree.MethodInvocation):
                    called_methods.add(node.member)
            metrics['rfc'] = len(methods) + len(called_methods)
            metrics['cbo'] = len(called_methods)

            # CBM: count intra-class method calls
            intra_class_calls = 0
            for method in methods:
                for _, node in method.filter(javalang.tree.MethodInvocation):
                    if node.member in method_names:
                        intra_class_calls += 1
            metrics['cbm'] = intra_class_calls

            # CAM: cohesion among methods (simplified)
            metrics['cam'] = 0.5  # Default value

            # MFA: measure of functional abstraction
            metrics['mfa'] = 0.0  # Default value

            print(f"Successfully analyzed class: {fully_qualified_name}")
            return metrics
    except Exception as e:
        print(f"Error analyzing class in {file_path}: {e}")
        return None

    return None

if __name__ == "__main__":
    if len(sys.argv) != 5:
        print("Usage: python analyze_java_file.py <file_path> <project_name> <version> <repo_dir>")
        sys.exit(1)

    file_path = sys.argv[1]
    project_name = sys.argv[2]
    version = sys.argv[3]
    repo_dir = sys.argv[4]

    try:
        metrics = analyze_file(file_path, project_name, version, repo_dir)
        if metrics:
            # Write to CSV only if we have actual data
            with open('temp_metrics.csv', 'w', newline='') as f:
                fieldnames = ['project_name', 'version', 'class_name', 'wmc', 'rfc', 'loc', 'max_cc', 'avg_cc',
                              'cbo', 'ca', 'ce', 'ic', 'cbm', 'lcom', 'lcom3', 'dit', 'noc', 'mfa',
                              'npm', 'dam', 'moa', 'cam', 'amc', 'bug']
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()  # Write the header first
                writer.writerow(metrics)
            print("Metrics extracted successfully")
            sys.exit(0)
        else:
            print("No classes found in file or parsing failed")
            sys.exit(1)
    except Exception as e:
        print(f"Unexpected error processing {file_path}: {e}")
        sys.exit(1)
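
For reference, a minimal driver sketch for the script above, assuming it is saved as analyze_java_file.py (the name from its own usage string); the Java source path, project name, version, and repo directory below are hypothetical placeholders:

# Hedged usage sketch: invokes the analysis script as a subprocess and reads
# back the one-row temp_metrics.csv it writes on success. All paths and names
# here are hypothetical examples, not part of the original commit.
import csv
import subprocess

result = subprocess.run(
    ['python', 'analyze_java_file.py',
     'repo/src/main/java/Example.java',  # hypothetical Java source file
     'example-project', '1.0.0', 'repo'],
    capture_output=True, text=True
)
print(result.stdout)

if result.returncode == 0:
    # On exit code 0 the script has written temp_metrics.csv in the CWD.
    with open('temp_metrics.csv', newline='') as f:
        for row in csv.DictReader(f):
            print(row['class_name'], row['wmc'], row['bug'])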
Lines changed: 167 additions & 0 deletions
@@ -0,0 +1,167 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
TensorFlow 2.x compatible AutoEncoder for GlitchWitcher Semantic Analysis
"""

import os
import tensorflow as tf
import math
import numpy as np

# Suppress TensorFlow progress bars and logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
tf.get_logger().setLevel('ERROR')

def default_error_function(x):
    """Default error function for AutoEncoder"""
    return tf.norm(x, axis=1)

class AutoEncoder:

    def __init__(self, layers, lr=0.01, epoch=200, batch_size=512, transfer_function=tf.nn.relu, error_func=None, print_device=False):
        if error_func is None:
            error_func = default_error_function

        self.layers = layers
        self.lr = lr
        self.epoch = epoch
        self.batch_size = batch_size
        self.transfer_function = transfer_function
        self.error_func = error_func

        # Check for GPU (stored for reference; Keras places ops automatically)
        device = '/CPU:0'
        if tf.config.list_physical_devices('GPU'):
            if print_device:
                print("\nUsing GPU\n")
            device = '/GPU:0'
        self.device = device

        # Build the model
        self._build_model()

    def _build_model(self):
        """Build the autoencoder model using TensorFlow 2.x"""
        # Encoder
        encoder_input = tf.keras.Input(shape=(self.layers[0],))
        x = encoder_input

        for i in range(len(self.layers) - 1):
            limit = 1.0 / math.sqrt(self.layers[i])
            x = tf.keras.layers.Dense(
                self.layers[i + 1],
                activation=self.transfer_function,
                kernel_initializer=tf.keras.initializers.RandomUniform(-limit, limit),
                bias_initializer='zeros'
            )(x)

        # Encoder output (latent representation)
        self.encoded = x

        # Decoder
        decoder_input = tf.keras.Input(shape=(self.layers[-1],))
        y = decoder_input

        for i in reversed(range(len(self.layers) - 1)):
            limit = 1.0 / math.sqrt(self.layers[i])
            if i == 0:
                # Output layer - no activation for reconstruction
                y = tf.keras.layers.Dense(
                    self.layers[i],
                    kernel_initializer=tf.keras.initializers.RandomUniform(-limit, limit),
                    bias_initializer='zeros'
                )(y)
            else:
                y = tf.keras.layers.Dense(
                    self.layers[i],
                    activation=self.transfer_function,
                    kernel_initializer=tf.keras.initializers.RandomUniform(-limit, limit),
                    bias_initializer='zeros'
                )(y)

        # Create models
        self.encoder = tf.keras.Model(encoder_input, self.encoded)
        self.decoder = tf.keras.Model(decoder_input, y)

        # Autoencoder: encoder followed by decoder
        self.autoencoder = tf.keras.Model(encoder_input, self.decoder(self.encoded))

        # Compile
        self.autoencoder.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.lr),
            loss='mse'
        )

    def fit(self, X, print_progress=False):
        """Train the autoencoder to reconstruct its input (Keras handles batching)"""
        self.autoencoder.fit(
            X, X,
            epochs=self.epoch,
            batch_size=self.batch_size,
            verbose=1 if print_progress else 0,
            shuffle=True
        )
        return self

    def transform(self, X):
        """Encode the input data"""
        return self.encoder.predict(X, verbose=0)

    def inverse_transform(self, X):
        """Decode the encoded data"""
        return self.decoder.predict(X, verbose=0)

    def fit_transform(self, X):
        """Fit the model and transform the data"""
        self.fit(X)
        return self.transform(X)

    def close(self):
        """Clean up resources (no-op: TF 2.x needs no explicit session cleanup)"""
        pass

    def debugPrint(self):
        """Print model architecture"""
        print("AutoEncoder Architecture:")
        self.autoencoder.summary()

def load_autoencoder(model_path):
    """Load a saved autoencoder model"""
    try:
        # Try to load as Keras model
        autoencoder = tf.keras.models.load_model(model_path)

        # Create wrapper
        ae = AutoEncoder([autoencoder.input_shape[1], autoencoder.output_shape[1]])
        ae.autoencoder = autoencoder

        return ae
    except Exception as e:
        print(f"Failed to load model: {e}")
        return None

def main():
    """Test the autoencoder"""
    # Create sample data
    X = np.random.rand(100, 20)

    # Create and train autoencoder
    ae = AutoEncoder([20, 18, 14, 8, 4, 2])
    ae.fit(X, print_progress=False)

    # Test transform
    encoded = ae.transform(X)
    decoded = ae.inverse_transform(encoded)

    print(f"Original shape: {X.shape}")
    print(f"Encoded shape: {encoded.shape}")
    print(f"Decoded shape: {decoded.shape}")
    print(f"Reconstruction error: {np.mean((X - decoded) ** 2)}")

if __name__ == '__main__':
    main()
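
For reference, a minimal save/reload sketch for the class above, assuming the module is saved as autoencoder.py (a hypothetical name); it persists the underlying tf.keras model with Model.save() and restores it through the module's own load_autoencoder() helper:

# Hedged save/reload sketch. Module and file names are hypothetical; the
# .keras save format assumes a reasonably recent TF 2.x (>= 2.12).
import numpy as np
from autoencoder import AutoEncoder, load_autoencoder

X = np.random.rand(100, 20)
ae = AutoEncoder([20, 8, 2], epoch=10)
ae.fit(X)

# Persist the full autoencoder, then rebuild a wrapper around it.
ae.autoencoder.save('autoencoder_model.keras')
restored = load_autoencoder('autoencoder_model.keras')
if restored is not None:
    restored.debugPrint()

Note that load_autoencoder() only swaps in the loaded autoencoder; the wrapper's encoder and decoder sub-models are rebuilt with fresh weights, so after a reload only the full autoencoder carries the trained parameters.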
