75 changes: 75 additions & 0 deletions test_fake_news.py
@@ -0,0 +1,75 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack

# -----------------------
# UTILITY FUNCTIONS
# -----------------------
def compute_speaker_scores(df, speaker_col='speaker', label_col='label'):
    speaker_counts = df.groupby(speaker_col)[label_col].agg(['mean', 'count'])
    smoothing = 2  # pseudo-count to avoid extremes
    speaker_counts['score'] = ((speaker_counts['mean'] * speaker_counts['count'] + 0.5 * smoothing)
                               / (speaker_counts['count'] + smoothing))
    return speaker_counts['score'].to_dict()

def get_speaker_score_dynamic(speaker_name, speaker_scores):
    return speaker_scores.get(speaker_name, 0.5)  # default 0.5 if unknown

# -----------------------
# SAMPLE DATASET
# -----------------------
data = {
    'text': [
        "Breaking news: Market hits record high",
        "Aliens landed in New York City",
        "New study shows coffee improves memory",
        "Chocolate cures all diseases",
        "Local team wins championship",
        "Government hiding the truth about UFOs",
        "Scientists discover new species of bird",
        "Miracle weight loss pills exposed"
    ],
    'label': [1, 0, 1, 0, 1, 0, 1, 0],
    'speaker': ["Alice Smith", "John Doe", "Alice Smith", "John Doe",
                "Bob Lee", "Jane Roe", "Bob Lee", "Jane Roe"]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# -----------------------
# COMPUTE SPEAKER CREDIBILITY
# -----------------------
speaker_scores = compute_speaker_scores(df)
df['speaker_score'] = df['speaker'].apply(lambda x: get_speaker_score_dynamic(x, speaker_scores))

# Print speaker scores for verification
print("Speaker Credibility Scores:")
for speaker, score in speaker_scores.items():
    print(f"{speaker}: {score:.2f}")

# -----------------------
# TEXT FEATURE EXTRACTION
# -----------------------
vectorizer = TfidfVectorizer(max_features=50)
text_features = vectorizer.fit_transform(df['text'])

# Combine text features with speaker credibility
speaker_features = df['speaker_score'].values.reshape(-1,1)
X = hstack([text_features, speaker_features])
y = df['label']

# -----------------------
# TRAIN TEST SPLIT AND MODEL
# -----------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
clf = RandomForestClassifier(n_estimators=50, random_state=42)
clf.fit(X_train, y_train)

# -----------------------
# EVALUATION
# -----------------------
accuracy = clf.score(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")
50 changes: 50 additions & 0 deletions train.py
@@ -0,0 +1,50 @@
# train.py
# Training script for Fake News Detection
# Adds dynamic speaker credibility as an additional feature

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack # to combine sparse matrices
from utils import compute_speaker_scores, get_speaker_score_dynamic

# Step 1: Load dataset
# Assume CSV has columns: 'text', 'label', 'speaker'
df = pd.read_csv('data/news_data.csv')

# Step 2: Compute speaker credibility scores dynamically
# This creates a dictionary {speaker_name: score} based on historical labels
speaker_scores = compute_speaker_scores(df, speaker_col='speaker', label_col='label')

# Step 3: Convert text into TF-IDF features
# max_features limits the number of features for efficiency
vectorizer = TfidfVectorizer(max_features=5000)
text_features = vectorizer.fit_transform(df['text'])

# Step 4: Generate speaker credibility feature
# Apply the dynamic score function to each row's speaker
speaker_features = df['speaker'].apply(
    lambda x: get_speaker_score_dynamic(x, speaker_scores)
).values.reshape(-1, 1)  # reshape to 2D array for stacking

# Step 5: Combine text features with speaker credibility
# hstack allows us to combine sparse text matrix with dense speaker feature
X = hstack([text_features, speaker_features])

# Labels (1=real, 0=fake)
y = df['label']

# Step 6: Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 7: Train the classifier
# Using Random Forest; you can replace with other models if desired
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Step 8: Evaluate the model
accuracy = clf.score(X_test, y_test)
print(f"Model Accuracy: {accuracy:.4f}")
45 changes: 45 additions & 0 deletions utils.py
@@ -0,0 +1,45 @@
# utils.py
# Utility functions for speaker credibility score

import pandas as pd

def compute_speaker_scores(df, speaker_col='speaker', label_col='label'):
    """
    Computes credibility scores for each speaker based on historical article labels.

    Parameters:
    - df: pandas DataFrame containing the news dataset
    - speaker_col: column name containing speaker/author names
    - label_col: column name containing article labels (1=real, 0=fake)

    Returns:
    - speaker_scores: dictionary {speaker_name: credibility_score}
    """
    # Group by speaker and calculate mean label (fraction of real articles) and count of articles
    speaker_counts = df.groupby(speaker_col)[label_col].agg(['mean', 'count'])

    # Apply smoothing to avoid extreme scores for speakers with very few articles
    smoothing = 2  # pseudo-count
    # Compute smoothed credibility score: blends speaker's mean with neutral 0.5
    speaker_counts['score'] = (
        (speaker_counts['mean'] * speaker_counts['count'] + 0.5 * smoothing)
        / (speaker_counts['count'] + smoothing)
    )
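    # Worked example with smoothing = 2: a speaker with 3 real articles out of 4
    # gets (0.75 * 4 + 0.5 * 2) / (4 + 2) = 4 / 6 ≈ 0.67 rather than the raw 0.75.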

    # Convert the result to a dictionary: {speaker_name: credibility_score}
    speaker_scores = speaker_counts['score'].to_dict()
    return speaker_scores

def get_speaker_score_dynamic(speaker_name, speaker_scores):
    """
    Fetches the credibility score for a given speaker.

    Parameters:
    - speaker_name: name of the speaker/author
    - speaker_scores: dictionary from compute_speaker_scores

    Returns:
    - credibility score (0 to 1)
    - defaults to 0.5 if speaker is unknown
    """
    return speaker_scores.get(speaker_name, 0.5)
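
A small usage example with toy data invented for illustration; with smoothing = 2, a speaker with two real articles out of three scores (2 + 1) / (3 + 2) = 0.6, and a speaker with a single fake article scores (0 + 1) / (1 + 2) ≈ 0.33:

import pandas as pd
from utils import compute_speaker_scores, get_speaker_score_dynamic

demo = pd.DataFrame({
    'speaker': ['Ann', 'Ann', 'Ann', 'Ben'],  # invented names
    'label':   [1, 1, 0, 0],
})
scores = compute_speaker_scores(demo)
print(scores)                                     # {'Ann': 0.6, 'Ben': 0.333...}
print(get_speaker_score_dynamic('Cara', scores))  # unknown speaker -> neutral 0.5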