# 统计学习方法读书笔记9-朴素贝叶斯习题

1147-柳同学

### 2.视频作业

本题与课本第 63–64 页的例题基本相同。


#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: liujie
@software: PyCharm
@file: naives 自编程实现.py
@time: 2020/10/22 10:03
"""
import numpy as np
import pandas as pd

# 定义朴素贝叶斯类
class NaiveBayes():
    """Naive Bayes classifier for categorical features with Bayesian smoothing.

    Priors and per-feature conditional probabilities are estimated by counting,
    with ``lambda_`` added to every count (``lambda_ = 0`` gives the maximum
    likelihood estimate, ``lambda_ = 1`` gives Laplace smoothing).
    """

    def __init__(self, lambda_):
        """Store the smoothing coefficient and initialise the fitted state.

        Args:
            lambda_: Smoothing coefficient added to every count.
        """
        self.lambda_ = lambda_         # smoothing coefficient; 0 means MLE
        self.y_types = None            # distinct class labels (set by fit)
        self.y_types_count = None      # number of samples per class
        self.y_types_proba = None      # smoothed prior probability per class
        # (feature index, feature value, class label) -> conditional probability
        self.x_types_prob = dict()

    def fit(self, x_train, y_train):
        """Estimate the priors and the conditional probability table.

        Args:
            x_train: 2-D array-like of categorical feature values,
                one row per sample.
            y_train: 1-D array-like of class labels, one per sample.
        """
        # All distinct class labels, e.g. [-1, 1].
        self.y_types = np.unique(y_train)
        # DataFrames make the counting below straightforward.
        x = pd.DataFrame(x_train)
        y = pd.DataFrame(y_train)

        # Per-class sample counts and smoothed priors.
        self.y_types_count = y[0].value_counts()
        self.y_types_proba = (self.y_types_count + self.lambda_) / (
            y.shape[0] + len(self.y_types) * self.lambda_
        )

        # Start from an empty table so a second fit() leaves no stale entries.
        self.x_types_prob = dict()
        for idx in x.columns:
            # The smoothing denominator needs S_j, the number of distinct values
            # the feature takes over the WHOLE training set, not only the values
            # that happen to occur inside the current class.
            feature_values = x[idx].unique()
            for j in self.y_types:
                # Counts of each value of feature idx among samples of class j.
                in_class = (y[0] == j).values
                value_counts = x.loc[in_class, idx].value_counts()
                denominator = (
                    self.y_types_count[j] + len(feature_values) * self.lambda_
                )
                for i in feature_values:
                    # Values unseen in this class get a count of 0, so they
                    # still receive a smoothed probability.
                    self.x_types_prob[(idx, i, j)] = (
                        value_counts.get(i, 0) + self.lambda_
                    ) / denominator

    def predict(self, x_new):
        """Print the joint score of every class and return the best label.

        Args:
            x_new: 1-D sequence with one value per feature, in training
                column order.

        Returns:
            The class label whose prior times the product of conditional
            probabilities is largest.

        Raises:
            KeyError: If a feature value was never seen during ``fit``.
        """
        res = []
        # Unnormalised posterior P(y) * prod_j P(x_j | y) for each class.
        for y in self.y_types:
            p_y = self.y_types_proba[y]
            p_xy = 1
            for idx, x in enumerate(x_new):
                p_xy *= self.x_types_prob[idx, x, y]
            res.append(p_y * p_xy)

        for label, score in zip(self.y_types, res):
            print('[{}]对应的概率 : {:.2%}'.format(label, score))

        # Label with the maximum (unnormalised) posterior.
        return self.y_types[np.argmax(res)]

def main():
    """Fit the hand-written NaiveBayes on the toy data set and classify one sample."""
    # The two feature columns are listed separately and zipped into rows.
    # numpy stores the mixed int/str rows as strings, so the query sample
    # below is built the same way.
    first_feature = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]
    second_feature = [
        "S", "M", "M", "S", "S",
        "S", "M", "M", "L", "L",
        "L", "M", "M", "L", "L",
    ]
    X_train = np.array([[a, b] for a, b in zip(first_feature, second_feature)])
    y_train = np.array([-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1])

    model = NaiveBayes(lambda_=0.2)
    model.fit(X_train, y_train)

    X_new = np.array([2, "S"])
    label = model.predict(X_new)
    print("{}被分类为:{}".format(X_new, label))


if __name__ == '__main__':
    main()

[-1]对应的概率 : 6.51%
[1]对应的概率 : 2.49%
['2' 'S']被分类为:-1


import numpy as np
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn import preprocessing  #预处理

def main():
    """Classify the same toy sample with scikit-learn's MultinomialNB on one-hot features."""
    # Feature columns are listed separately and zipped into [x1, x2] rows.
    first_feature = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]
    second_feature = [
        "S", "M", "M", "S", "S",
        "S", "M", "M", "L", "L",
        "L", "M", "M", "L", "L",
    ]
    raw_train = np.array([[a, b] for a, b in zip(first_feature, second_feature)])
    y_train = np.array([-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1])

    # Preprocessing: one-hot encode the categorical columns, then convert
    # the encoder output to a dense array.
    enc = preprocessing.OneHotEncoder(categories='auto')
    X_train = enc.fit(raw_train).transform(raw_train).toarray()
    print(X_train)

    # The tiny alpha keeps the additive smoothing negligible.
    clf = MultinomialNB(alpha=0.0000001)
    clf.fit(X_train, y_train)

    # Encode the query sample with the encoder fitted on the training data.
    X_new = enc.transform(np.array([[2, "S"]])).toarray()

    y_predict = clf.predict(X_new)
    print("{}被分类为:{}".format(X_new, y_predict))
    print(clf.predict_proba(X_new))


if __name__ == "__main__":
    main()


