# ExplorAnalysis+FeatureCreation.py (Ibraheem-Ahmad/AITextClassifier):
# exploratory analysis and feature creation for the AI text classifier dataset.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Read the working dataset from disk.
df = pd.read_parquet("smalldata.parquet")

# Structural overview of the frame (df.info() prints its own report).
print("Dataset Info:")
df.info()

# Five randomly chosen rows as a quick look at the contents.
print("\nSample Data:")
sample_rows = df.sample(5)
print(sample_rows)

# Number of null entries in each column.
print("\nMissing Values:")
null_counts = df.isnull().sum()
print(null_counts)

# How many rows come from each source (shows the label breakdown).
print("\nSource Value Counts:")
source_counts = df['source'].value_counts()
print(source_counts)


def _label_from_source(source):
    """Return 0 when the source is exactly "Human", otherwise 1 (AI)."""
    return 0 if source == "Human" else 1


# Binary target: anything not labeled "Human" counts as AI.
df['label'] = df['source'].apply(_label_from_source)


# Bar chart of the label counts, with tick positions 0 and 1 relabeled.
label_ax = sns.countplot(data=df, x='label')
label_ax.set_title('Label Distribution (0 = Human, 1 = AI)')
label_ax.set_xlabel('Label')
label_ax.set_ylabel('Count')
label_ax.set_xticks([0, 1])
label_ax.set_xticklabels(['Human', 'AI'])
plt.show()

# Feature creation: per-row text statistics stored as new columns on df.
# NOTE: every computation below calls str methods on the value, so a null
# entry in df['text'] raises AttributeError; the text column is assumed to be
# fully populated (see the "Missing Values" report printed earlier).

# Characters counted by the punctuation feature.
_PUNCTUATION = frozenset('.,;:!?')


def _avg_word_length(text):
    """Return the mean length of the whitespace-separated words, or 0 if none."""
    words = text.split()  # tokenize once instead of once per use
    if not words:
        return 0
    return np.mean([len(word) for word in words])


# Average word length
df['avg_word_length'] = df['text'].apply(_avg_word_length)

# Punctuation count
df['punctuation_count'] = df['text'].apply(
    lambda text: sum(1 for c in text if c in _PUNCTUATION)
)

# Uppercase letter count
df['uppercase_count'] = df['text'].apply(
    lambda text: sum(1 for c in text if c.isupper())
)

# Special character count (neither alphanumeric nor whitespace)
df['special_char_count'] = df['text'].apply(
    lambda text: sum(1 for c in text if not c.isalnum() and not c.isspace())
)

# Sentence count (rough estimate: number of '.' characters)
df['sentence_count'] = df['text'].apply(lambda text: text.count('.'))

# 'word_count' (used just below) and 'text_length' (listed among the inspected
# features later in this script) are never created in this file; presumably
# they are already stored in the parquet. Derive them only when absent so a
# dataset without them does not fail with KeyError.
# NOTE(review): the fallback definitions (character count / whitespace-separated
# token count) are an assumption -- confirm they match the upstream step.
if 'text_length' not in df.columns:
    df['text_length'] = df['text'].apply(len)
if 'word_count' not in df.columns:
    df['word_count'] = df['text'].apply(lambda text: len(text.split()))

# Word density (words per sentence); a zero sentence count is replaced by 1
# in the divisor to avoid division by zero.
df['words_per_sentence'] = df['word_count'] / df['sentence_count'].replace(0, 1)

# Columns fed into the correlation check; 'label' is included so the matrix
# also shows how each feature correlates with the target.
features = [
    'text_length',
    'word_count',
    'avg_word_length',
    'punctuation_count',
    'uppercase_count',
    'special_char_count',
    'sentence_count',
    'words_per_sentence',
    'label',
]

# Pairwise correlations of the selected columns, drawn as an annotated heatmap.
plt.figure(figsize=(10, 8))
correlations = df[features].corr()
heatmap_ax = sns.heatmap(correlations, annot=True, fmt='.2f', cmap='coolwarm')
heatmap_ax.set_title('Feature Correlation Heatmap')
plt.show()

# Write the frame, including the columns added above, back to the same file
# it was loaded from, without the index.
df.to_parquet("smalldata.parquet", index=False)