# -----------------------
# UTILITY FUNCTIONS
# -----------------------
def compute_speaker_scores(df, speaker_col='speaker', label_col='label',
                           smoothing=2, prior=0.5):
    """
    Computes a smoothed credibility score for each speaker from the labels in df.

    For every speaker the score is
        (sum of labels + prior * smoothing) / (number of labelled rows + smoothing)
    so speakers with few rows are pulled toward `prior`.

    Parameters:
    - df: pandas DataFrame containing at least speaker_col and label_col
    - speaker_col: column name containing speaker names
    - label_col: column name containing numeric labels
    - smoothing: pseudo-count (>= 0) of imaginary rows scored at `prior`
    - prior: neutral score blended into every speaker's score

    Returns:
    - dictionary {speaker_name: credibility_score}; rows whose speaker is
      missing are left out because groupby drops NaN keys by default

    Raises:
    - ValueError: if smoothing is negative
    """
    if smoothing < 0:
        raise ValueError("smoothing must be non-negative")

    # Aggregate 'sum' rather than 'mean': a speaker whose labels are all
    # missing has count == 0 and mean == NaN, and NaN * 0 would make the
    # whole score NaN. The sum of no labels is 0, so such a speaker simply
    # receives the prior.
    stats = df.groupby(speaker_col)[label_col].agg(['sum', 'count'])
    scores = (stats['sum'] + prior * smoothing) / (stats['count'] + smoothing)

    # 0 / 0 is only possible with smoothing == 0 and no labelled rows;
    # fall back to the prior there as well.
    return scores.fillna(prior).to_dict()


def get_speaker_score_dynamic(speaker_name, speaker_scores, default=0.5):
    """
    Fetches the credibility score for a given speaker.

    Parameters:
    - speaker_name: name of the speaker to look up
    - speaker_scores: dictionary {speaker_name: score}
    - default: score returned when the speaker is not in speaker_scores

    Returns:
    - the stored score, or `default` (0.5 unless overridden) if unknown
    """
    return speaker_scores.get(speaker_name, default)


# -----------------------
# SAMPLE DATASET
# -----------------------
data = {
    'text': [
        "Breaking news: Market hits record high",
        "Aliens landed in New York City",
        "New study shows coffee improves memory",
        "Chocolate cures all diseases",
        "Local team wins championship",
        "Government hiding the truth about UFOs",
        "Scientists discover new species of bird",
        "Miracle weight loss pills exposed",
    ],
    'label': [1, 0, 1, 0, 1, 0, 1, 0],
    'speaker': ["Alice Smith", "John Doe", "Alice Smith", "John Doe",
                "Bob Lee", "Jane Roe", "Bob Lee", "Jane Roe"],
}

# Convert to DataFrame
df = pd.DataFrame(data)

# -----------------------
# COMPUTE SPEAKER CREDIBILITY
# -----------------------
# NOTE: these scores are derived from the labels of every row in df, so they
# are suitable for inspection but encode the label of each row they are
# attached to.
speaker_scores = compute_speaker_scores(df)
df['speaker_score'] = df['speaker'].apply(
    lambda x: get_speaker_score_dynamic(x, speaker_scores)
)

# Print speaker scores for verification
print("Speaker Credibility Scores:")
for speaker, score in speaker_scores.items():
    print(f"{speaker}: {score:.2f}")
# -----------------------
# TRAIN TEST SPLIT
# -----------------------
# Split the rows BEFORE fitting anything. The speaker score is a statistic of
# the labels and TF-IDF learns its vocabulary/IDF weights from the corpus, so
# fitting either on all rows lets information about the test rows (including
# their own labels) leak into the features and inflates the reported accuracy.
# The split depends only on the number of rows, test_size and random_state.
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)
y_train = train_df['label']
y_test = test_df['label']

# -----------------------
# SPEAKER CREDIBILITY (TRAINING ROWS ONLY)
# -----------------------
train_speaker_scores = compute_speaker_scores(train_df)

# Speakers that appear only in the test rows fall back to the neutral default.
train_speaker = train_df['speaker'].apply(
    lambda x: get_speaker_score_dynamic(x, train_speaker_scores)
).values.reshape(-1, 1)
test_speaker = test_df['speaker'].apply(
    lambda x: get_speaker_score_dynamic(x, train_speaker_scores)
).values.reshape(-1, 1)

# -----------------------
# TEXT FEATURE EXTRACTION
# -----------------------
vectorizer = TfidfVectorizer(max_features=50)
train_text = vectorizer.fit_transform(train_df['text'])
# transform (not fit_transform) so the test text is encoded with the
# vocabulary and weights learned from the training text.
test_text = vectorizer.transform(test_df['text'])

# Combine text features with speaker credibility
X_train = hstack([train_text, train_speaker])
X_test = hstack([test_text, test_speaker])

# -----------------------
# MODEL
# -----------------------
clf = RandomForestClassifier(n_estimators=50, random_state=42)
clf.fit(X_train, y_train)

# -----------------------
# EVALUATION
# -----------------------
accuracy = clf.score(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")
# Step 4: Split the rows into training and test sets before building features
# The speaker credibility score is computed from the labels and TF-IDF is
# fitted on the corpus; deriving either from all rows leaks test information
# (including each test row's own label) into the features and makes the
# reported accuracy optimistic. Splitting first keeps the test rows unseen.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Labels (1=real, 0=fake)
y_train = train_df['label']
y_test = test_df['label']

# Step 5: Recompute speaker credibility from the training rows only
# This rebinds speaker_scores, replacing the scores computed on the full dataset.
speaker_scores = compute_speaker_scores(
    train_df, speaker_col='speaker', label_col='label'
)

# Apply the dynamic score function to each row's speaker; speakers that only
# occur in the test rows receive the neutral default score.
speaker_train = train_df['speaker'].apply(
    lambda x: get_speaker_score_dynamic(x, speaker_scores)
).values.reshape(-1, 1)  # reshape to 2D array for stacking
speaker_test = test_df['speaker'].apply(
    lambda x: get_speaker_score_dynamic(x, speaker_scores)
).values.reshape(-1, 1)

# Step 6: Refit TF-IDF on the training text and reuse it for the test text
# fit_transform relearns the vocabulary from the training rows; transform
# encodes the test rows with that same vocabulary.
text_train = vectorizer.fit_transform(train_df['text'])
text_test = vectorizer.transform(test_df['text'])

# Combine text features with speaker credibility
# hstack allows us to combine sparse text matrix with dense speaker feature
X_train = hstack([text_train, speaker_train])
X_test = hstack([text_test, speaker_test])

# Step 7: Train the classifier
# Using Random Forest; you can replace with other models if desired
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Step 8: Evaluate the model
accuracy = clf.score(X_test, y_test)
print(f"Model Accuracy: {accuracy:.4f}")
def compute_speaker_scores(df, speaker_col='speaker', label_col='label',
                           smoothing=2, prior=0.5):
    """
    Computes credibility scores for each speaker based on historical article labels.

    Each score blends the speaker's own labels with a neutral prior:
        (sum of labels + prior * smoothing) / (number of labelled rows + smoothing)

    Parameters:
    - df: pandas DataFrame containing the news dataset
    - speaker_col: column name containing speaker/author names
    - label_col: column name containing article labels (1=real, 0=fake)
    - smoothing: pseudo-count (>= 0) of imaginary articles scored at `prior`;
      larger values pull speakers with few articles harder toward `prior`
    - prior: neutral score used for the pseudo-count articles

    Returns:
    - speaker_scores: dictionary {speaker_name: credibility_score}; rows with
      a missing speaker are skipped because groupby drops NaN keys by default

    Raises:
    - ValueError: if smoothing is negative
    """
    if smoothing < 0:
        raise ValueError("smoothing must be non-negative")

    # Group by speaker and take the label sum and the number of labelled
    # articles. Using 'sum' instead of 'mean' matters for a speaker whose
    # labels are all missing: the mean is NaN and NaN * 0 poisons the score,
    # whereas the sum is 0 and the score becomes the prior.
    speaker_counts = df.groupby(speaker_col)[label_col].agg(['sum', 'count'])

    # Compute smoothed credibility score: blends speaker's labels with the prior
    speaker_counts['score'] = (
        (speaker_counts['sum'] + prior * smoothing)
        / (speaker_counts['count'] + smoothing)
    )

    # 0 / 0 only happens with smoothing == 0 and no labelled articles;
    # report the prior for those speakers too.
    speaker_scores = speaker_counts['score'].fillna(prior).to_dict()
    return speaker_scores


def get_speaker_score_dynamic(speaker_name, speaker_scores, default=0.5):
    """
    Fetches the credibility score for a given speaker.

    Parameters:
    - speaker_name: name of the speaker/author
    - speaker_scores: dictionary from compute_speaker_scores
    - default: score returned when the speaker is not in speaker_scores

    Returns:
    - the stored credibility score
    - `default` (0.5 unless overridden) if speaker is unknown
    """
    return speaker_scores.get(speaker_name, default)