# 4.3 朴素贝叶斯分类实战——新闻分类、垃圾邮件识别

### 一、sklearn20类新闻分类

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
# 特征抽取
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

if __name__ == '__main__':
    # Script: classify the 20 Newsgroups corpus with TF-IDF features and a
    # multinomial naive Bayes model, then print the held-out accuracy.

    # Load the dataset (subset='all' requests every available document).
    news = fetch_20newsgroups(subset='all')
    # Raw document texts are the features; news.target holds the labels.
    X = news.data
    y = news.target
    # Split the data: 70% for training, the remainder for testing.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    # Feature extraction: the vocabulary and IDF weights are fitted on the
    # training documents only and then re-used for the test documents.
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(X_train)
    # Print the learned vocabulary. get_feature_names() was removed in
    # scikit-learn 1.2; prefer get_feature_names_out() and fall back to the
    # old name only on releases that do not have the new one.
    if hasattr(tfidf, 'get_feature_names_out'):
        print(tfidf.get_feature_names_out())
    else:
        print(tfidf.get_feature_names())
    X_test = tfidf.transform(X_test)

    # Naive Bayes with Laplace (add-one) smoothing.
    model = MultinomialNB(alpha=1.0)
    # Show a dense preview of the first few rows only: converting the whole
    # sparse document-term matrix with toarray() can exhaust memory.
    print(X_train[:5].toarray())
    # Train
    model.fit(X_train, y_train)
    # Accuracy on the held-out documents (score() runs the prediction
    # itself, so no separate predict() call is needed).
    print('准确率=', model.score(X_test, y_test))



### 二、垃圾邮件识别

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Build the vocabulary list
def createVocabList(dataSet):
    """Return the sorted list of unique tokens found in all documents.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing every distinct token once, in ascending sort
        order. An empty ``dataSet`` yields an empty list.
    """
    vocabSet = set()
    for document in dataSet:
        # In-place union: avoids building a brand-new set per document.
        vocabSet.update(document)
    # sorted() accepts any iterable and already returns a list.
    return sorted(vocabSet)

# Pre-process the text of one e-mail
def textParse(bigString):
    """Split raw text into lower-cased tokens longer than two characters.

    The text is split on runs of non-word characters (anything other than
    letters, digits and underscore), so punctuation and whitespace act as
    separators and never end up inside a token.

    Args:
        bigString: the raw e-mail text as a single string.

    Returns:
        A list of lower-cased tokens whose length is greater than 2, in
        their order of appearance.
    """
    import re
    # r'\W+' (backslash, one-or-more) is the intended separator pattern;
    # the previous r'/W*' matched a literal '/' followed by 'W' characters.
    tokens = re.split(r'\W+', bigString)
    # Keep only tokens longer than two characters, converted to lower case.
    return [tok.lower() for tok in tokens if len(tok) > 2]

# Bag-of-words model
def bag0fWords2Vec(vocabList, inputSet):
    """Convert a tokenised document into a bag-of-words count vector.

    Args:
        vocabList: list of vocabulary words; position ``i`` of the result
            counts occurrences of ``vocabList[i]``.
        inputSet: iterable of tokens making up one document.

    Returns:
        A list of ints with the same length as ``vocabList``. Tokens that
        are not part of the vocabulary are ignored.
    """
    # Map each word to its first position once, instead of scanning the
    # vocabulary with list.index() for every token of the document.
    positionOf = {}
    for position, word in enumerate(vocabList):
        positionOf.setdefault(word, position)

    # The vector starts at zero and has the same length as the vocabulary.
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = positionOf.get(word)
        # Skip out-of-vocabulary tokens rather than raising ValueError.
        if position is not None:
            returnVec[position] += 1
    return returnVec

# Read the e-mails and vectorise them
def loadDataSet(spamDir='email/spam', hamDir='email/ham', num=26):
    """Read the spam / non-spam e-mails and build bag-of-words features.

    NOTE(review): the original text of this section had lost its ``def``
    line and the statements that open and parse each file (``wordList``
    was never assigned, and the non-spam loop was missing). The function
    name, the default directories and the ``<n>.txt`` file naming are
    assumptions -- confirm them against the actual data layout.

    Args:
        spamDir: directory holding the spam e-mails (label 1).
        hamDir: directory holding the non-spam e-mails (label 0).
        num: exclusive upper bound of the file numbers; files
            ``1.txt`` .. ``(num - 1).txt`` are read from each directory.

    Returns:
        A tuple ``(X, classList, vocabList)``: the list of bag-of-words
        count vectors, the matching list of 0/1 labels, and the sorted
        vocabulary the vectors are indexed by.

    Raises:
        OSError: if one of the expected e-mail files cannot be opened.
    """
    import os

    docList = []
    classList = []

    # Spam first (label 1), then non-spam (label 0).
    for folder, label in ((spamDir, 1), (hamDir, 0)):
        for i in range(1, num):
            path = os.path.join(folder, '%d.txt' % i)
            # errors='ignore' drops undecodable bytes instead of aborting
            # the whole load on a single badly encoded message.
            with open(path, encoding='utf-8', errors='ignore') as f:
                wordList = textParse(f.read())
            docList.append(wordList)
            classList.append(label)

    # The vocabulary spans every document, so each token has a column.
    vocabList = createVocabList(docList)

    X = [bag0fWords2Vec(vocabList, document) for document in docList]

    return X, classList, vocabList


if __name__ == '__main__':
    # Read the e-mails
    X, y, vocabList = loadDataSet()

    # Split the data: 70% for training, the remainder for testing.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    # Multinomial naive Bayes with Laplace (add-one) smoothing.
    model = MultinomialNB(alpha=1.0)
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    print('accuracy =', accuracy_score(y_test, y_hat))

# Sample output: accuracy = 1.0


