贝叶斯网络
贝叶斯网络(Bayesian network),又称信念网络(belief network)或是有向无环图模型(directed acyclic graphical model),是一种概率图型模型。贝叶斯网络的关键在于网络结构和条件概率表。
下列为一个简单的贝叶斯网络:
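有了网络结构和条件概率表,就可以得到任意变量的联合概率分布:每个变量只依赖于它在网络中的父节点,因此 $P(x_1, \dots, x_n) = \prod_{i=1}^{n} P(x_i \mid \mathrm{pa}(x_i))$,其中 $\mathrm{pa}(x_i)$ 表示 $x_i$ 的父节点集合。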
例题:
通过贝叶斯网络确定条件独立
朴素贝叶斯分类器
适合文本分类
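朴素贝叶斯分类器是应用贝叶斯公式的有监督学习算法,并采用了“条件独立性假设”(给定类别时各特征相互独立)。目标是求后验概率 $P(c \mid x_1, \dots, x_n)$。
利用贝叶斯公式有 $$P(c \mid x_1, \dots, x_n) = \frac{P(c)\, P(x_1, \dots, x_n \mid c)}{P(x_1, \dots, x_n)}$$
其中 $x_1, \dots, x_n$ 为给定样本的 n 个特征,$c$ 为类别,我们要根据已给特征来判定该样本属于哪个类别的概率大。
根据"条件独立性假设",对上式有 $$P(c \mid x_1, \dots, x_n) = \frac{P(c) \prod_{i=1}^{n} P(x_i \mid c)}{P(x_1, \dots, x_n)}$$
对于一个样本来说,无论它属于哪个类别,$P(x_1, \dots, x_n)$ 是相同的,因此我们可以得到 $P(c \mid x_1, \dots, x_n) \propto P(c) \prod_{i=1}^{n} P(x_i \mid c)$
朴素贝叶斯分类器: $$\hat{c} = \arg\max_{c} \; P(c) \prod_{i=1}^{n} P(x_i \mid c)$$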
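显然朴素贝叶斯分类器的训练方式,就是根据训练集样本 $D$,估计类先验概率 $P(c)$,并为每个属性估计条件概率 $P(x_i \mid c)$。
- 令 $D_c$ 表示训练集 $D$ 中类标签为 c 的样本的集合,则 $P(c) = \frac{|D_c|}{|D|}$
- 若特征 $x_i$ 为连续的,我们假设 $P(x_i \mid c) \sim \mathcal{N}(\mu_{c,i}, \sigma_{c,i}^2)$,其中 $\mu_{c,i}$ 和 $\sigma_{c,i}^2$ 表示第 c 类样本关于特征 $x_i$ 的均值和方差
- 若特征 $x_i$ 为离散的,令 $|D_{c,x_i}|$ 表示属于第 c 类的样本中,第 i 个属性取值为 $x_i$ 的样本个数,则 $P(x_i \mid c) = \frac{|D_{c,x_i}|}{|D_c|}$
- 注意:为避免其他属性携带的信息被训练集中未出现的属性值抹去(某个条件概率为 0 会使整个连乘为 0),我们采用拉普拉斯平滑,令 N 表示类别数,$N_i$ 表示第 i 个属性可能的取值数,则有 $\hat{P}(c) = \frac{|D_c| + 1}{|D| + N}$,$\hat{P}(x_i \mid c) = \frac{|D_{c,x_i}| + 1}{|D_c| + N_i}$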
举例说明:垃圾邮件分类
==朴素贝叶斯训练的参数就是先验概率 $P(c)$ 和条件概率 $P(x_i \mid c)$。==
Iris-NB
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
import matplotlib.patches as mpatches
from sklearn.decomposition import PCA
from matplotlib.font_manager import FontProperties
from sklearn.naive_bayes import GaussianNB, MultinomialNB
"""假设鸢尾花数据的特征服从高斯分布"""
# Font used for the Chinese axis labels and title.
# NOTE(review): this is a Windows-only font path; adjust on other platforms.
font = FontProperties(fname=r'C:\Windows\Fonts\simsun.ttc', size=12)

if __name__ == '__main__':
    # ---- Load the iris data ----
    list1 = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
    data = pd.read_csv('..\\8.Regression\\iris.data', header=None,
                       names=list1)
    x = data.iloc[:, 0:4]
    y = pd.Categorical(data['class']).codes  # encode class names as integer codes

    # ---- Reduce to 2 principal components so the decision regions can be drawn ----
    # NOTE(review): PCA is fitted on the full data set before the split, so the
    # test samples influence the projection; acceptable for a visual demo only.
    pca = PCA(n_components=2)
    x = pca.fit_transform(x)

    # ---- Train / test split: 80% train, 20% test ----
    # train_test_split returns (x_train, x_test, y_train, y_test) in that order.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # priors may be given explicitly, e.g. priors=[0.3, 0.35, 0.35];
    # priors=None estimates the class priors from the training samples.
    pipe_nb = Pipeline([('sc', StandardScaler()),
                        ('poly', PolynomialFeatures(degree=1)),
                        ('clc', GaussianNB(priors=None))])
    pipe_nb.fit(x_train, y_train)

    # Pipeline.score delegates to the classifier's score, i.e. mean accuracy.
    print('测试集准确率:', pipe_nb.score(x_test, y_test))
    print('训练集准确率:', pipe_nb.score(x_train, y_train))

    # ---- Draw the naive Bayes decision regions ----
    N = 500  # N x N evaluation grid
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, N)
    x1, x2 = np.meshgrid(t1, t2)
    x_new = np.stack((x1.ravel(), x2.ravel()), axis=1)  # shape (N * N, 2)

    # Predict a class for every grid point and fold back to the grid shape.
    y_new = pipe_nb.predict(x_new).reshape(N, N)

    cm_light = ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
    color_dark = ListedColormap(['g', 'r', 'b'])

    plt.figure()
    plt.pcolormesh(x1, x2, y_new, cmap=cm_light)
    # Training samples as circles, test samples as larger stars.
    plt.scatter(x_train[:, 0], x_train[:, 1], c=y_train, cmap=color_dark,
                edgecolors='k')
    plt.scatter(x_test[:, 0], x_test[:, 1], c=y_test, cmap=color_dark,
                marker='*', edgecolors='k', s=80)
    plt.xlabel(u'组分1', fontproperties=font)
    plt.ylabel(u'组分2', fontproperties=font)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.title(u'朴素贝叶斯分类器', fontproperties=font)
    plt.grid()
    # One legend entry per class, coloured like the decision regions.
    patchs = [mpatches.Patch(color='#77E0A0', label='Iris-setosa'),
              mpatches.Patch(color='#FF8080', label='Iris-versicolor'),
              mpatches.Patch(color='#A0A0FF', label='Iris-virginica')]
    plt.legend(handles=patchs, fancybox=True, framealpha=0.8)
    plt.show()
文本分类
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle as pkl
from time import time
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import GridSearchCV
"""||*****文本分类例子,首先将文本利用tf-idf映射为向量,再利用算法分类*****||||"""
def test_clf(clf):
    """Tune ``clf`` with a 5-fold grid search, then time and score it on the test set.

    The hyperparameter grid is chosen from the attributes the estimator
    exposes (``alpha``, ``n_neighbors``, ``C``/``gamma``, ``max_depth``).
    When several of these attributes are present, the last matching grid wins.

    The data is read from the module-level names ``x_train``, ``y_train``,
    ``x_test`` and ``y_test``, which must be defined before this is called.

    Args:
        clf: an unfitted scikit-learn style classifier.

    Returns:
        Tuple ``(y_hat, acc, t_test, t_train)``: test-set predictions, test-set
        score, prediction time in seconds, and the average time per single fit
        (total search time divided by ``5 * number of grid points``).

    Raises:
        ValueError: if ``clf`` exposes none of the supported hyperparameters.
    """
    n_folds = 5

    # Pick the search grid from the hyperparameters the estimator exposes.
    param_grid = None
    if hasattr(clf, 'alpha'):  # Ridge and naive Bayes: regularisation / Laplace smoothing
        param_grid = {'alpha': np.logspace(-3, 2, 10)}
    if hasattr(clf, 'n_neighbors'):  # k-nearest neighbours: number of neighbours
        param_grid = {'n_neighbors': np.arange(1, 15)}
    if hasattr(clf, 'C'):  # SVM: penalty C and kernel coefficient gamma
        param_grid = {'C': np.logspace(1, 3, 3),
                      'gamma': np.logspace(-3, 0, 3)}
    if hasattr(clf, 'max_depth'):  # random forest: tree depth
        param_grid = {'max_depth': np.arange(4, 10)}
    if param_grid is None:
        raise ValueError(
            'test_clf: no known hyperparameter (alpha, n_neighbors, C, '
            'max_depth) found on %r' % (clf,))

    # m = number of grid points, used to report the average time per fit.
    m = 1
    for candidates in param_grid.values():
        m *= len(candidates)

    model = GridSearchCV(clf, param_grid=param_grid, cv=n_folds)

    t_start = time()
    model.fit(x_train, y_train)
    t_end = time()
    # n_folds fits per grid point, so this is roughly the cost of one fit.
    t_train = (t_end - t_start) / (n_folds * m)
    print(u'5折交叉验证的训练时间为:%.3f秒/(5*%d)=%.3f秒' % ((t_end - t_start), m, t_train))
    print(u'最优超参数为:', model.best_params_)

    t_start = time()
    y_hat = model.predict(x_test)
    t_end = time()
    t_test = t_end - t_start
    print(u'测试时间:%.3f秒' % t_test)

    acc = model.score(x_test, y_test)
    print(u'测试集准确率',acc)
    return y_hat,acc,t_test,t_train
"""1、导入文本数据(pickle文件)"""
if __name__ == '__main__':
    # ---- 1. Load the pickled text data ----
    print('开始导入数据...\n')
    t_start = time()
    # SECURITY NOTE: pickle.load can execute arbitrary code; only load
    # 'data_test' / 'data_train' files that come from a trusted source.
    with open('data_test', 'rb') as f:
        data_test = pkl.load(f)
    with open('data_train', 'rb') as f:
        data_train = pkl.load(f)
    t_end = time()
    print('数据导入完成.\n花费时间为:{}s'.format(t_end - t_start))

    # The data is stored as sklearn.utils.Bunch objects. A Bunch is a dict
    # subclass, so len(bunch) counts its keys; the documents are in `.data`.
    print(u'数据类型:', type(data_train))
    print('训练集文本个数:{}\n测试集文本个数:{}\n'.format(len(data_train.data), len(data_test.data)))
    categories = data_test.target_names
    print('训练集和测试集总类别名称:{}'.format(categories))

    # These four names are module-level on purpose: test_clf reads them.
    y_train = data_train.target
    y_test = data_test.target

    print('--打印前3个文本的内容\n')
    for i, (label, text) in enumerate(zip(y_train[:3], data_train.data[:3]), start=1):
        print('文本{}属于类别{}'.format(i, categories[label]))
        print(text)

    # ---- 2. Map the texts to tf-idf vectors ----
    # stop_words='english' uses the built-in English stop-word list.
    vetor = TfidfVectorizer(input='content', stop_words='english', max_df=0.5, sublinear_tf=True)
    x_train = vetor.fit_transform(data_train.data)
    x_test = vetor.transform(data_test.data)  # reuse the vocabulary fitted on the training set
    print(u'训练集样本个数(文档个数):%d,特征个数(词库中词的总数):%d' % x_train.shape)

    # ---- 3. Train and compare the classifiers ----
    print(u'\n\n===================\n分类器的比较:\n')
    # Trailing comments are the original author's recorded results (timings, accuracy).
    clfs = {'MultinomialNB': MultinomialNB(),  # 0.87(0.017), 0.002, 90.39%
            'BernoulliNB': BernoulliNB(),  # 1.592(0.032), 0.010, 88.54%
            'KNeighbors': KNeighborsClassifier(),  # 19.737(0.282), 0.208, 86.03%
            'Ridge': RidgeClassifier(),  # 25.6(0.512), 0.003, 89.73%
            'RandomForest': RandomForestClassifier(n_estimators=200),  # 59.319(1.977), 0.248, 77.01%
            'SVM': SVC()  # 236.59(5.258), 1.574, 90.10%
            }
    s_acc, s_t_test, s_t_train = [], [], []
    for name, model in clfs.items():
        print('分类器:',name)
        print(model)
        y_hat, acc, t_test, t_train = test_clf(model)
        s_acc.append(acc)
        s_t_test.append(t_test)
        s_t_train.append(t_train)

    # ---- 4. Visualise accuracy and timings ----
    plt.figure()
    names = list(clfs)
    # One bar group per classifier; must match the length of the result lists.
    index = np.arange(len(names))
    bar_width = 0.2
    plt.bar(x=index, height=s_acc, width=bar_width)
    plt.bar(x=index + bar_width, height=s_t_test, width=bar_width)
    plt.bar(x=index + 2 * bar_width, height=s_t_train, width=bar_width)
    # Bars are centred at index, index + w and index + 2w, so the group centre is index + w.
    plt.xticks(index + bar_width, names)
    plt.legend(['acc','time_test','time_train'])
    plt.title('Model comparison')
    plt.show()