75 changes: 75 additions & 0 deletions test_fake_news.py
@@ -0,0 +1,75 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack

# -----------------------
# UTILITY FUNCTIONS
# -----------------------
def compute_speaker_scores(df, speaker_col='speaker', label_col='label'):
    speaker_counts = df.groupby(speaker_col)[label_col].agg(['mean', 'count'])
    smoothing = 2  # pseudo-count to avoid extremes
    speaker_counts['score'] = ((speaker_counts['mean'] * speaker_counts['count'] + 0.5 * smoothing)
                               / (speaker_counts['count'] + smoothing))
    return speaker_counts['score'].to_dict()

def get_speaker_score_dynamic(speaker_name, speaker_scores):
    return speaker_scores.get(speaker_name, 0.5)  # default 0.5 if unknown

# -----------------------
# SAMPLE DATASET
# -----------------------
data = {
    'text': [
        "Breaking news: Market hits record high",
        "Aliens landed in New York City",
        "New study shows coffee improves memory",
        "Chocolate cures all diseases",
        "Local team wins championship",
        "Government hiding the truth about UFOs",
        "Scientists discover new species of bird",
        "Miracle weight loss pills exposed"
    ],
    'label': [1, 0, 1, 0, 1, 0, 1, 0],
    'speaker': ["Alice Smith", "John Doe", "Alice Smith", "John Doe",
                "Bob Lee", "Jane Roe", "Bob Lee", "Jane Roe"]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# -----------------------
# COMPUTE SPEAKER CREDIBILITY
# -----------------------
speaker_scores = compute_speaker_scores(df)
df['speaker_score'] = df['speaker'].apply(lambda x: get_speaker_score_dynamic(x, speaker_scores))

# Print speaker scores for verification
print("Speaker Credibility Scores:")
for speaker, score in speaker_scores.items():
    print(f"{speaker}: {score:.2f}")

# -----------------------
# TEXT FEATURE EXTRACTION
# -----------------------
vectorizer = TfidfVectorizer(max_features=50)
text_features = vectorizer.fit_transform(df['text'])

# Combine text features with speaker credibility
speaker_features = df['speaker_score'].values.reshape(-1,1)
X = hstack([text_features, speaker_features])
y = df['label']

# -----------------------
# TRAIN TEST SPLIT AND MODEL
# -----------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
clf = RandomForestClassifier(n_estimators=50, random_state=42)
clf.fit(X_train, y_train)

# -----------------------
# EVALUATION
# -----------------------
accuracy = clf.score(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")
50 changes: 50 additions & 0 deletions train.py
@@ -0,0 +1,50 @@
# train.py
# Training script for Fake News Detection
# Adds dynamic speaker credibility as an additional feature

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack # to combine sparse matrices
from utils import compute_speaker_scores, get_speaker_score_dynamic

# Step 1: Load dataset
# Assume CSV has columns: 'text', 'label', 'speaker'
df = pd.read_csv('data/news_data.csv')

# Step 2: Compute speaker credibility scores dynamically
# This creates a dictionary {speaker_name: score} based on historical labels
speaker_scores = compute_speaker_scores(df, speaker_col='speaker', label_col='label')

# Step 3: Convert text into TF-IDF features
# max_features limits the number of features for efficiency
vectorizer = TfidfVectorizer(max_features=5000)
text_features = vectorizer.fit_transform(df['text'])

# Step 4: Generate speaker credibility feature
# Apply the dynamic score function to each row's speaker
speaker_features = df['speaker'].apply(
    lambda x: get_speaker_score_dynamic(x, speaker_scores)
).values.reshape(-1, 1)  # reshape to 2D array for stacking

# Step 5: Combine text features with speaker credibility
# hstack allows us to combine sparse text matrix with dense speaker feature
X = hstack([text_features, speaker_features])

# Labels (1=real, 0=fake)
y = df['label']

# Step 6: Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 7: Train the classifier
# Using Random Forest; you can replace with other models if desired
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Step 8: Evaluate the model
accuracy = clf.score(X_test, y_test)
print(f"Model Accuracy: {accuracy:.4f}")
45 changes: 45 additions & 0 deletions utils.py
@@ -0,0 +1,45 @@
# utils.py
# Utility functions for speaker credibility score

import pandas as pd

def compute_speaker_scores(df, speaker_col='speaker', label_col='label'):
    """
    Computes credibility scores for each speaker based on historical article labels.

    Parameters:
    - df: pandas DataFrame containing the news dataset
    - speaker_col: column name containing speaker/author names
    - label_col: column name containing article labels (1=real, 0=fake)

    Returns:
    - speaker_scores: dictionary {speaker_name: credibility_score}
    """
    # Group by speaker and calculate mean label (fraction of real articles) and count of articles
    speaker_counts = df.groupby(speaker_col)[label_col].agg(['mean', 'count'])

    # Apply smoothing to avoid extreme scores for speakers with very few articles
    smoothing = 2  # pseudo-count
    # Compute smoothed credibility score: blends speaker's mean with neutral 0.5
    speaker_counts['score'] = (
        (speaker_counts['mean'] * speaker_counts['count'] + 0.5 * smoothing)
        / (speaker_counts['count'] + smoothing)
    )
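    # Worked example with smoothing = 2: a speaker with 3 real articles out of 4
    # gets (0.75 * 4 + 0.5 * 2) / (4 + 2) = 4 / 6 ≈ 0.67 rather than the raw 0.75.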

    # Convert the result to a dictionary: {speaker_name: credibility_score}
    speaker_scores = speaker_counts['score'].to_dict()
    return speaker_scores

def get_speaker_score_dynamic(speaker_name, speaker_scores):
    """
    Fetches the credibility score for a given speaker.

    Parameters:
    - speaker_name: name of the speaker/author
    - speaker_scores: dictionary from compute_speaker_scores

    Returns:
    - credibility score (0 to 1)
    - defaults to 0.5 if speaker is unknown
    """
    return speaker_scores.get(speaker_name, 0.5)
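
A small usage example with toy data invented for illustration; with smoothing = 2, a speaker with two real articles out of three scores (2 + 1) / (3 + 2) = 0.6, and a speaker with a single fake article scores (0 + 1) / (1 + 2) ≈ 0.33:

import pandas as pd
from utils import compute_speaker_scores, get_speaker_score_dynamic

demo = pd.DataFrame({
    'speaker': ['Ann', 'Ann', 'Ann', 'Ben'],  # invented names
    'label':   [1, 1, 0, 0],
})
scores = compute_speaker_scores(demo)
print(scores)                                     # {'Ann': 0.6, 'Ben': 0.333...}
print(get_speaker_score_dynamic('Cara', scores))  # unknown speaker -> neutral 0.5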