Commit 03774fb

📝
1 parent e2551d3 commit 03774fb

1 file changed: +101 -0 lines changed

docs/docs/机器学习/传统算法/朴素贝叶斯.md

Lines changed: 101 additions & 0 deletions
@@ -62,6 +62,106 @@ P(spam|free, congratulations, hard work) = P(free|spam) * P(
P(normal|free, congratulations, hard work) = P(free|normal) * P(congratulations|normal) * P(hard work|normal) * P(normal) = (5+1/20)² * (6+1/20) * (2+1/20) * 0.2 = 0.012885
```
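
For reference, the hand calculation above and the `fit` method in the reference answer below both follow the standard multinomial Naive Bayes pattern with add-one (Laplace) smoothing. A sketch in LaTeX, where count(w, c) is how often word w occurs in class c, N_c is the total word count of class c, and |V| is the vocabulary size:

```latex
% Posterior score, up to a normalizing constant shared by all classes
P(c \mid w_1, \dots, w_n) \propto P(c) \prod_{i=1}^{n} P(w_i \mid c)

% Add-one (Laplace) smoothed word likelihood
P(w \mid c) = \frac{\mathrm{count}(w, c) + 1}{N_c + |V|}
```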

```python showLineNumbers

# Reference answer
import numpy as np


class NaiveBayes:
    def __init__(self):
        self.class_probs = {}  # Prior probability P(c) of each class
        self.word_probs = {}   # Conditional probability P(w|c) of each word per class
        self.vocab = set()     # Vocabulary of every word seen during training
        self.smooth = 1        # Laplace smoothing parameter

    def fit(self, X, y):
        # Unique classes and their sample counts
        classes, class_counts = np.unique(y, return_counts=True)
        self.class_probs = {label: count / len(y) for label, count in zip(classes, class_counts)}  # Prior probabilities

        # Initialize the vocabulary and word-frequency statistics
        word_count = {label: {} for label in classes}        # Word-frequency table of each class
        class_word_totals = {label: 0 for label in classes}  # Total number of words in each class

        # Tokenize each sample and accumulate counts
        for text, label in zip(X, y):
            words = text.split(" ")
            for word in words:
                self.vocab.add(word)  # Add to the vocabulary
                if word not in word_count[label]:
                    word_count[label][word] = 0
                word_count[label][word] += 1   # Update the word frequency
                class_word_totals[label] += 1  # Increment the word total of the current class

        # Conditional probabilities P(w|c) with Laplace smoothing
        vocab_size = len(self.vocab)  # Vocabulary size
        self.word_probs = {label: {} for label in classes}
        for label in classes:
            for word in self.vocab:
                count = word_count[label].get(word, 0)  # Word frequency, 0 if the word never appeared in this class
                self.word_probs[label][word] = (count + self.smooth) / (
                    class_word_totals[label] + vocab_size * self.smooth
                )

    def predict(self, X):
        predictions = []  # Prediction for every sample
        for text in X:
            words = text.split(" ")
            class_scores = {}  # Posterior score of each class

            # Compute the posterior P(c|w1,w2,...,wn), up to a constant factor
            for label in self.class_probs:
                class_scores[label] = self.class_probs[label]
                for word in words:
                    if word in self.word_probs[label]:  # The word is in the vocabulary
                        class_scores[label] *= self.word_probs[label][word]
                    else:
                        # Unseen word: fall back to a uniform probability 1/|V|
                        class_scores[label] *= 1 / len(self.vocab)

            # Pick the class with the largest posterior score as the prediction
            predictions.append(max(class_scores, key=class_scores.get))

        return predictions

    def score(self, X, y):
        predictions = self.predict(X)
        return np.mean(predictions == y)


# Data: (space-tokenized text, label), where "诈骗" = scam and "正常" = normal
data = np.array([
    ("恭喜 你 赢得 了 大奖 !", "诈骗"),
    ("请 立即 更新 您 的 账户 信息", "诈骗"),
    ("您的 账户 存在 异常 ,请 尽快 处理", "诈骗"),
    ("这是 您 的 账单 ,请 查看", "正常"),
    ("您的 订单 已 发货", "正常"),
    ("请 确认 您 的 注册 信息", "正常"),
    ("您 有 新的 消息 ,请 查看", "正常"),
    ("点击 此 链接 获取 优惠券", "诈骗"),
    ("您的 账户 已 被 锁定 ,请 立即 联系", "诈骗"),
    ("恭喜 您 获得 免费 试用 !", "诈骗"),
    ("请 不要 分享 您 的 密码", "正常"),
    ("您的 订阅 即将 到期 ,请 续费", "正常"),
    ("您 有 未 读 邮件 ,请 查看", "正常"),
    ("立即 行动 ,获取 限时 优惠 !", "诈骗"),
    ("您的 信用卡 信息 需要 更新", "诈骗"),
])
X = data[:, 0]  # Text data
y = data[:, 1]  # Label data

# Create and train the model
model = NaiveBayes()
model.fit(X, y)

# Print the accuracy: compare the predicted labels with the actual labels
print(model.score(X, y))


```
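
As a quick usage check of the class above, `predict` can also be called on new, unlabeled messages. The two messages below are hypothetical examples (not taken from the training data), pre-tokenized with spaces in the same way:

```python showLineNumbers
# Hypothetical unseen messages, space-tokenized like the training data
new_messages = [
    "恭喜 您 获得 大奖 ,请 立即 联系",
    "您的 订单 已 发货 ,请 查看",
]
# With the small dataset above, this is expected to lean towards ["诈骗", "正常"]
print(model.predict(new_messages))
```

Note that `predict` multiplies many probabilities together; for longer texts these products can underflow to zero, which is why practical implementations usually sum log-probabilities instead.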

### Using the sklearn module

```python showLineNumbers
from sklearn.naive_bayes import GaussianNB
import numpy as np
@@ -89,6 +189,7 @@ print("类别概率:", predicted_proba)

```

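The snippet above is built on `GaussianNB`. As an aside (not part of the original example), the space-tokenized text task from the reference answer maps more directly onto `CountVectorizer` plus `MultinomialNB`; a minimal sketch, assuming the `data` array from the reference answer is still in scope:

```python showLineNumbers
# Aside: the same scam/normal text task with sklearn's MultinomialNB,
# reusing the `data` array defined in the reference answer above.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

texts, labels = data[:, 0], data[:, 1]

# token_pattern=r"\S+" keeps every space-separated token, mirroring text.split(" ")
clf = make_pipeline(CountVectorizer(token_pattern=r"\S+"), MultinomialNB(alpha=1.0))
clf.fit(texts, labels)
print(clf.score(texts, labels))  # Training accuracy, comparable to model.score(X, y)
```
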
### Simple example
```python showLineNumbers
