-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvalidate_model.py
More file actions
133 lines (105 loc) · 4.65 KB
/
validate_model.py
File metadata and controls
133 lines (105 loc) · 4.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from predict import TweetPredictor
import json
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_percentage_error
def load_and_preprocess_data(file_path, follower_count=350, exclude_recent_hours=48):
    """Load the validation CSV and derive the feature columns used for prediction.

    Args:
        file_path: Path of the CSV file to read. It must contain the columns
            ``date_posted``, ``time_posted``, ``length`` and the yes/no flag
            columns listed in ``bool_columns`` below.
        follower_count: Value written to every row's ``follower_count``
            column; anything already stored in the file is overwritten.
            Defaults to 350.
        exclude_recent_hours: Rows posted within this many hours of the
            current local time are dropped. Defaults to 48.

    Returns:
        A new DataFrame holding the remaining rows plus the derived columns
        ``datetime``, ``hour``, ``minute``, ``follower_count_log`` and
        ``length_log``. Flag columns are encoded as 1 ('yes') / 0 ('no');
        any other value becomes NaN.
    """
    df = pd.read_csv(file_path)

    # Every row gets the same follower count, regardless of the file content
    df['follower_count'] = follower_count

    # Combine the separate date and time strings into one timestamp
    df['datetime'] = pd.to_datetime(df['date_posted'] + ' ' + df['time_posted'])

    # Drop rows posted inside the exclusion window. The explicit copy makes
    # the column assignments below write to an independent frame instead of
    # a slice of the unfiltered one (avoids chained-assignment warnings).
    cutoff_time = datetime.now() - timedelta(hours=exclude_recent_hours)
    df = df[df['datetime'] < cutoff_time].copy()

    # Encode the yes/no flag columns as integers
    bool_columns = ['has_image', 'has_video', 'has_link', 'has_mention', 'has_crypto_mention', 'has_poll']
    yes_no_codes = {'yes': 1, 'no': 0}
    for col in bool_columns:
        df[col] = df[col].map(yes_no_codes)

    # Time-of-day features
    df['hour'] = df['datetime'].dt.hour
    df['minute'] = df['datetime'].dt.minute

    # log(1 + x) transforms of follower count and length
    df['follower_count_log'] = np.log1p(df['follower_count'])
    df['length_log'] = np.log1p(df['length'])

    print(f"Set all follower counts to {follower_count} (log value: {np.log1p(follower_count):.2f})")
    return df
def calculate_accuracy_buckets(actual, predicted):
    """Calculate the share of predictions falling within each error range.

    The error of a prediction is ``|actual - predicted| / |actual| * 100``.
    The ``within_*`` buckets are cumulative (a prediction within 5% is also
    counted as within 10%, 20% and 50%); ``over_50%`` is the complement of
    ``within_50%``.

    Args:
        actual: Array-like of true values.
        predicted: Array-like of predicted values, same shape as ``actual``.

    Returns:
        Dict mapping 'within_5%', 'within_10%', 'within_20%', 'within_50%'
        and 'over_50%' to the percentage (0-100, plain ``float``) of
        predictions in that bucket. All values are 0.0 for empty input.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    abs_diff = np.abs(actual - predicted)
    # A zero target makes the ratio undefined: silence the warnings here and
    # resolve those cases explicitly below.
    with np.errstate(divide='ignore', invalid='ignore'):
        percent_error = abs_diff / np.abs(actual) * 100
    # 0 predicted for a 0 target is an exact hit, not NaN. A non-zero
    # prediction for a 0 target stays at +inf and therefore lands in over_50%.
    percent_error = np.where((actual == 0) & (abs_diff == 0), 0.0, percent_error)

    thresholds = (5, 10, 20, 50)
    total = percent_error.size
    if total == 0:
        # Nothing to evaluate: report empty buckets rather than dividing by zero
        buckets = {f'within_{limit}%': 0.0 for limit in thresholds}
        buckets['over_50%'] = 0.0
        return buckets

    buckets = {
        f'within_{limit}%': float(np.count_nonzero(percent_error <= limit) / total * 100)
        for limit in thresholds
    }
    buckets['over_50%'] = float(np.count_nonzero(percent_error > 50) / total * 100)
    return buckets
def plot_accuracy_distribution(actual, predicted, metric_name):
    """Save a histogram of the absolute percentage error for one metric.

    Args:
        actual: Array of true values.
        predicted: Array of predicted values, same shape as ``actual``.
        metric_name: Name used in the plot title and in the output file
            name ``<metric_name>_error_distribution.png`` (relative path).
    """
    relative_miss = (actual - predicted) / actual
    percent_error = np.abs(relative_miss) * 100

    fig, ax = plt.subplots(figsize=(10, 6))
    # 50 bins over 0-100%; errors outside that range are not drawn
    ax.hist(percent_error, bins=50, range=(0, 100))
    ax.set_title(f'Distribution of Prediction Error for {metric_name}')
    ax.set_xlabel('Percentage Error')
    ax.set_ylabel('Number of Predictions')
    ax.grid(True, alpha=0.3)

    fig.savefig(f'{metric_name}_error_distribution.png')
    # Release the figure so repeated calls do not accumulate open figures
    plt.close(fig)
def main():
    """Run the saved model over the validation CSV and report its accuracy.

    For each of the targets 'likes', 'retweets' and 'replies' this prints the
    MAPE and the accuracy buckets, and saves an error-distribution plot. The
    bucket percentages for all targets are written to
    ``validation_results.json``.
    """
    # Build the predictor from the saved checkpoint and normalization parameters
    predictor = TweetPredictor(
        model_path="checkpoints/best_model.pt",
        normalization_params_path="normalization_params.json",
    )

    print("Loading and preprocessing validation data...")
    df = load_and_preprocess_data('temp_4.csv')

    # Columns handed to the predictor alongside the raw tweet text
    model_inputs = df[
        [
            'follower_count_log', 'content', 'hour', 'minute',
            'has_image', 'has_video', 'has_link', 'has_mention',
            'has_crypto_mention', 'length_log', 'has_poll',
        ]
    ].copy()

    print("Making predictions...")
    tweet_texts = model_inputs['content'].tolist()
    # NOTE(review): results is indexed by target name and exposes `.values`
    # below -- presumably a DataFrame; confirm against TweetPredictor.predict.
    results = predictor.predict(tweet_texts, model_inputs)

    overall_results = {}
    for metric in ('likes', 'retweets', 'replies'):
        print(f"\nAnalyzing {metric} predictions...")

        observed = df[metric].values
        estimated = results[metric].values

        # Share of predictions per error range, kept for the JSON report
        buckets = calculate_accuracy_buckets(observed, estimated)
        overall_results[metric] = buckets

        # sklearn returns MAPE as a fraction; scale it to a percentage
        mape = mean_absolute_percentage_error(observed, estimated) * 100

        print(f"\n{metric.upper()} Prediction Accuracy:")
        print(f"Mean Absolute Percentage Error: {mape:.2f}%")
        print("\nAccuracy Distribution:")
        for label, share in buckets.items():
            print(f"{label}: {share:.2f}%")

        plot_accuracy_distribution(observed, estimated, metric)

    # Persist the per-target bucket percentages
    with open('validation_results.json', 'w') as f:
        json.dump(overall_results, f, indent=4)

    print("\nValidation complete! Results saved to validation_results.json")
    print("Error distribution plots saved as PNG files.")
# Run the validation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()