【实践】数据挖掘DM课程课业打卡实验2 朴素贝叶斯分类器
- 一、实验目的
- 二、实验内容
- 1、(难度1)编程实现朴素贝叶斯分类器的训练。假设数据只涉及连续属性。
- 2、(难度2)编程实现朴素贝叶斯分类器的训练和分类。
- 3、(难度2)编程实现朴素贝叶斯分类器的训练。
- 4、(难度3)编程实现朴素贝叶斯分类器的训练和分类。
一、实验目的
(1)理解朴素贝叶斯分类器的工作原理。
(2)编程实现朴素贝叶斯分类器。
二、实验内容
1、(难度1)编程实现朴素贝叶斯分类器的训练。假设数据只涉及连续属性。
测试代码如下:
% Load the training and test MAT-files into the workspace.
% NOTE(review): presumably trainingData.mat provides trAttr and trLabels, which
% the call below reads - confirm against the data files.
load('trainingData.mat');
load('testingData.mat');
% Train the naive Bayes model and collect its parameters, priors and label set.
% NOTE(review): the listing that follows declares the function as NaiveBayesTrain,
% while this call uses NaiveBayesTrain1 - confirm the .m file name matches the call.
[Params,prior,AllLabels] = NaiveBayesTrain1(trAttr, trLabels);
(将执行正确的代码粘贴在此处,核心代码要求有注释)
function [Params,prior,AllLabels] = NaiveBayesTrain(trAttr, trLabels)
%NAIVEBAYESTRAIN Estimate naive Bayes parameters, treating every attribute as continuous.
%   trAttr    - training attribute matrix: one row per sample, one column per attribute.
%   trLabels  - class label of each training sample.
%   Params    - numClass-by-numAttr cell array; Params{c,a} is the [mean, variance]
%               pair returned by GetParams4ContiAttr for attribute a within class c.
%   prior     - prior(c) is the fraction of training samples labelled AllLabels(c).
%   AllLabels - distinct class labels, as returned by unique(trLabels).
AllLabels = unique(trLabels);
classCount = length(AllLabels);
sampleCount = length(trLabels);
% Remember which samples belong to each class and derive the class priors.
for c = 1:classCount
    rowsOfClass{c,1} = find(trLabels == AllLabels(c));
    prior(c) = length(rowsOfClass{c,1}) / sampleCount;
end
attrCount = size(trAttr,2);
Params{classCount,attrCount} = []; % touching the last cell sizes the whole array
% One parameter pair per (class, attribute) combination.
for c = 1:classCount
    for a = 1:attrCount
        Params{c,a} = GetParams4ContiAttr(trAttr(:,a), rowsOfClass{c,1});
    end
end
end
function Params1=GetParams4ContiAttr(AttrVals,ind)
%GETPARAMS4CONTIATTR Mean and variance of one attribute restricted to one class.
%   AttrVals - values of a single attribute for all training samples.
%   ind      - indices of the samples that belong to the class of interest.
%   Params1  - [mean, variance] of AttrVals(ind); var is called with its
%              default normalisation.
classVals = AttrVals(ind);
mu = mean(classVals);
sigma2 = var(classVals);
Params1 = [mu, sigma2];
end
2、(难度2)编程实现朴素贝叶斯分类器的训练和分类。
假设数据只涉及连续属性。
测试代码如下:
% Load the training and test MAT-files into the workspace.
% NOTE(review): presumably these files provide trAttr, trLabels, tstAttr and
% tstLabels, which the statements below read - confirm against the data files.
load('trainingData.mat');
load('testingData.mat');
% Train on the training set and predict one label per row of tstAttr.
[Labels]=bys_classify1(trAttr, trLabels,tstAttr);
% Accuracy: fraction of predicted labels that equal the reference labels.
acc=sum(Labels ==tstLabels)/length(Labels);
(将执行正确的代码粘贴在此处,核心代码要求有注释)
% load('trainingData.mat');
% load('testingData.mat');
% [Labels]=bys_classify1(trAttr, trLabels,tstAttr);
% acc=sum(Labels ==tstLabels)/length(Labels);
%bys_classify1实现朴素贝叶斯分类器的训练和分类。数据只涉及连续属性。
function [Labels]=bys_classify1(trAttr,trLabels,tstAttr)
%BYS_CLASSIFY1 Train a naive Bayes model and label every test sample.
%   trAttr   - training attributes, one row per sample.
%   trLabels - class label of each training sample.
%   tstAttr  - test attributes, one row per sample.
%   Labels   - column vector; Labels(k) is the class whose score from
%              NaiveBayesPredict is largest for row k of tstAttr.
[model,classPrior,classNames] = NaiveBayesTrain(trAttr,trLabels);
sampleCount = size(tstAttr,1);
Labels = zeros(sampleCount,1);
for k = 1:sampleCount
    scores = NaiveBayesPredict(model,classPrior,tstAttr(k,:));
    [~,winner] = max(scores); % only the position of the maximum is needed
    Labels(k) = classNames(winner);
end
end
function post=NaiveBayesPredict(Params,prior,tstAttr)
%NAIVEBAYESPREDICT Unnormalised class posteriors of one sample (continuous attributes).
%   Params  - numClass-by-numAttr cell array; Params{c,a} = [mean, variance] of
%             attribute a within class c.
%   prior   - prior(c) is the prior probability of class c.
%   tstAttr - attribute values of a single test sample.
%   post    - 1-by-numClass row vector; post(c) is prior(c) times the product of
%             the Gaussian densities of every attribute value under class c.
numClass=length(prior);
numAttr=length(tstAttr);
post=zeros(1,numClass);
for i=1:numClass            % range needs ':'; 'for i=1;numClass' runs a single pass
    p_AttrCond=1;           % running product of the per-attribute likelihoods
    for AttrInd=1:numAttr
        x=tstAttr(AttrInd);
        mu=Params{i,AttrInd}(1);          % named mu so the builtin mean is not shadowed
        sig=sqrt(Params{i,AttrInd}(2));   % standard deviation from the stored variance
        % Gaussian density of x under N(mu, sig^2).
        p=(1/(sqrt(2*pi)*sig))*exp(-(x-mu)^2/(2*sig^2));
        p_AttrCond=p_AttrCond*p;
    end
    % Must be assigned inside the class loop so that every class gets a score.
    post(i)=p_AttrCond*prior(i);
end
end
function [Params,prior,AllLabels]=NaiveBayesTrain(trAttr,trLabels)
%NAIVEBAYESTRAIN Training step: class priors plus per-class [mean, variance] of each attribute.
%   trAttr    - training attribute matrix: one row per sample, one column per attribute.
%   trLabels  - class label of each training sample.
%   Params    - numClass-by-numAttr cell array filled by GetParams4ContiAttr.
%   prior     - prior(k) is the share of training samples labelled AllLabels(k).
%   AllLabels - distinct class labels, as returned by unique(trLabels).
AllLabels = unique(trLabels);
nClasses = length(AllLabels);
nTotal = length(trLabels);
% Member indices and prior probability of every class.
for k = 1:nClasses
    members{k,1} = find(trLabels == AllLabels(k));
    prior(k) = length(members{k,1}) / nTotal;
end
nAttrs = size(trAttr,2);
Params{nClasses,nAttrs} = []; % touching the last cell sizes the whole array
% Fill the parameter table class by class.
for k = 1:nClasses
    for col = 1:nAttrs
        Params{k,col} = GetParams4ContiAttr(trAttr(:,col), members{k,1});
    end
end
end
function Params1=GetParams4ContiAttr(AttrVals,ind)
%GETPARAMS4CONTIATTR Gaussian parameters of one attribute for the samples of one class.
%   AttrVals - values of a single attribute for all training samples.
%   ind      - indices of the samples that belong to the class.
%   Params1  - [mean, variance] of the selected values.
selected = AttrVals(ind);
avg = mean(selected);
spread = var(selected);
Params1 = [avg, spread];
end
3、(难度2)编程实现朴素贝叶斯分类器的训练。
假设数据既涉及连续属性,也涉及分类属性。
测试代码如下:
% Three attributes for ten training samples, each stored as a column vector.
refund=[1 2 2 1 2 2 1 2 2 2]';
mar=[1 2 1 2 3 2 3 1 2 1]';
inc=[125,100,70,120,95,60,220,85,75,90]';
% Attribute matrix: one row per sample, one column per attribute.
trAttr =[refund,mar,inc];
% Class label of each training sample.
trLabels=[1 1 1 1 2 1 1 2 1 2]';
% A single test sample; it is defined here but not passed to the training call below.
tstAttr=[2,2,120];
% Train the model and collect attribute types, parameters, priors and label set.
[type,Params,prior,AllLabels] = NaiveBayesTrain2(trAttr, trLabels);
(将执行正确的代码粘贴在此处,核心代码要求有注释)
% refund=[1 2 2 1 2 2 1 2 2 2]';
% mar=[1 2 1 2 3 2 3 1 2 1]';
% inc=[125,100,70,120,95,60,220,85,75,90]';
% trAttr =[refund,mar,inc];
% trLabels=[1 1 1 1 2 1 1 2 1 2]';
% tstAttr=[2,2,120];
% [type,Params,prior,AllLabels] = NaiveBayesTrain2(trAttr, trLabels);
function [type,Params,prior,AllLabels]=NaiveBayesTrain2(trAttr,trLabels)
%NAIVEBAYESTRAIN2 Train a naive Bayes model on mixed categorical/continuous attributes.
%   trAttr    - training attribute matrix: one row per sample, one column per attribute.
%   trLabels  - class label of each training sample.
%   type      - 1-by-numAttr row; type(a) is 0 for a categorical attribute and 1
%               for a continuous one.
%   Params    - numClass-by-numAttr cell array. For a categorical attribute
%               Params{c,a}(v) is the fraction of class-c samples with value v;
%               for a continuous attribute Params{c,a} is [mean, variance].
%   prior     - prior(c) is the fraction of training samples labelled AllLabels(c).
%   AllLabels - distinct class labels, as returned by unique(trLabels).
%
%   An attribute is treated as categorical only when its training values are
%   exactly the integers 1..k with k < 10; anything else is treated as continuous.
AllLabels=unique(trLabels);
numClass=length(AllLabels);      % number of distinct class labels
numSamples=length(trLabels);
Ind4Class=cell(numClass,1);
prior=zeros(1,numClass);
for i=1:numClass
    Ind4Class{i,1}=find(trLabels==AllLabels(i));
    prior(i)=length(Ind4Class{i,1})/numSamples;
end
numAttr=size(trAttr,2);
Params=cell(numClass,numAttr);
type=zeros(1,numAttr);
for AttrInd=1:numAttr
    AttrVals=trAttr(:,AttrInd);
    distinctVals=unique(AttrVals);
    numAttrVals=length(distinctVals);
    % Categorical parameters are later indexed by the raw attribute value, so the
    % values must be exactly 1..k. Checking the minimum as well as the maximum
    % rejects sets such as {0,1,3}, which have k distinct integers and max == k
    % but would be indexed at the wrong position (or at index 0).
    isCategorical=all(rem(AttrVals,1)==0) && numAttrVals<10 && ...
        min(distinctVals)==1 && max(distinctVals)==numAttrVals;
    if isCategorical
        type(AttrInd)=0;
        for i=1:numClass
            Params{i,AttrInd}=GetParams4CategoricalAttr(AttrVals,Ind4Class{i,1});
        end
    else
        type(AttrInd)=1;
        for i=1:numClass
            Params{i,AttrInd}=GetParams4ContiAttr(AttrVals,Ind4Class{i,1});
        end
    end
end
end
function Params1=GetParams4ContiAttr(AttrVals,ind)
%GETPARAMS4CONTIATTR Mean and variance of a continuous attribute within one class.
%   AttrVals - values of a single attribute for all training samples.
%   ind      - indices of the samples that belong to the class.
%   Params1  - [mean, variance] of AttrVals(ind).
inClass = AttrVals(ind);
centre = mean(inClass);
dispersion = var(inClass);
Params1 = [centre, dispersion];
end
function Params1=GetParams4CategoricalAttr(AttrVals,ind)
%GETPARAMS4CATEGORICALATTR Relative frequency of each attribute value within one class.
%   AttrVals - values of a single categorical attribute for all training samples.
%   ind      - indices of the samples that belong to the class of interest.
%   Params1  - row vector; Params1(k) is the fraction of the class samples whose
%              value equals the k-th entry of unique(AttrVals).
levels = unique(AttrVals);
classVals = AttrVals(ind);
classSize = length(ind);
Params1 = [];
for k = 1:length(levels)
    hits = nnz(classVals == levels(k)); % number of class samples taking this value
    Params1(k) = hits / classSize;
end
end
4、(难度3)编程实现朴素贝叶斯分类器的训练和分类。
假设数据既涉及连续属性,也涉及分类属性。
(将执行正确的代码粘贴在此处,核心代码要求有注释)
function [Labels]=bys_classify(trAttr,trLabels,tstAttr)
%BYS_CLASSIFY Train a mixed-attribute naive Bayes model and label every test sample.
%   trAttr   - training attributes, one row per sample.
%   trLabels - class label of each training sample.
%   tstAttr  - test attributes, one row per sample.
%   Labels   - column vector; Labels(k) is the class whose score from
%              NaiveBayesPredict is largest for row k of tstAttr.
[attrKind,model,classPrior,classNames] = NaiveBayesTrain2(trAttr,trLabels);
sampleCount = size(tstAttr,1);
Labels = zeros(sampleCount,1);
for k = 1:sampleCount
    scores = NaiveBayesPredict(attrKind,model,classPrior,tstAttr(k,:));
    [~,winner] = max(scores); % only the position of the maximum is needed
    Labels(k) = classNames(winner);
end
end
function post=NaiveBayesPredict(type,Params,prior,tstAttr)
%NAIVEBAYESPREDICT Unnormalised class posteriors of one sample with mixed attributes.
%   type    - type(a) is 0 for a categorical attribute, anything else for continuous.
%   Params  - numClass-by-numAttr cell array. Categorical: vector indexed by the
%             attribute value (values must be 1,2,...). Continuous: [mean, variance].
%   prior   - prior(c) is the prior probability of class c.
%   tstAttr - attribute values of a single test sample.
%   post    - row vector; post(c) is the product of the per-attribute
%             likelihoods under class c, multiplied by prior(c).

% Gaussian density of x for mean mu and standard deviation sd.
gaussPdf = @(x,mu,sd) (1/(sqrt(2*pi)*sd))*exp(-(x-mu)^2/(2*sd^2));
classCount = length(prior);
attrCount = length(tstAttr);
post = [];
for c = 1:classCount
    likelihood = 1;
    for a = 1:attrCount
        value = tstAttr(a);
        theta = Params{c,a};
        if type(a) ~= 0
            % Continuous attribute: evaluate the class-conditional Gaussian.
            factor = gaussPdf(value, theta(1), sqrt(theta(2)));
        else
            % Categorical attribute: the value itself is the index into the table.
            factor = theta(value);
        end
        likelihood = likelihood * factor;
    end
    post(c) = likelihood * prior(c);
end
end
function [type,Params,prior,AllLabels]=NaiveBayesTrain2(trAttr,trLabels)
%NAIVEBAYESTRAIN2 Train a naive Bayes model on mixed categorical/continuous attributes.
%   trAttr    - training attribute matrix: one row per sample, one column per attribute.
%   trLabels  - class label of each training sample.
%   type      - 1-by-numAttr row; type(a) is 0 for a categorical attribute and 1
%               for a continuous one.
%   Params    - numClass-by-numAttr cell array. For a categorical attribute
%               Params{c,a}(v) is the fraction of class-c samples with value v;
%               for a continuous attribute Params{c,a} is [mean, variance].
%   prior     - prior(c) is the fraction of training samples labelled AllLabels(c).
%   AllLabels - distinct class labels, as returned by unique(trLabels).
%
%   An attribute is treated as categorical only when its training values are
%   exactly the integers 1..k with k < 10; anything else is treated as continuous.
AllLabels=unique(trLabels);
numClass=length(AllLabels);      % number of distinct class labels
numSamples=length(trLabels);
Ind4Class=cell(numClass,1);
prior=zeros(1,numClass);
for i=1:numClass
    Ind4Class{i,1}=find(trLabels==AllLabels(i));
    prior(i)=length(Ind4Class{i,1})/numSamples;
end
numAttr=size(trAttr,2);
Params=cell(numClass,numAttr);
type=zeros(1,numAttr);
for AttrInd=1:numAttr
    AttrVals=trAttr(:,AttrInd);
    distinctVals=unique(AttrVals);
    numAttrVals=length(distinctVals);
    % The predictor indexes categorical parameters by the raw attribute value,
    % so the values must be exactly 1..k. Requiring min == 1 in addition to
    % max == k rejects sets such as {0,1,3}, which would otherwise pass and be
    % looked up at the wrong position (or at index 0).
    isCategorical=all(rem(AttrVals,1)==0) && numAttrVals<10 && ...
        min(distinctVals)==1 && max(distinctVals)==numAttrVals;
    if isCategorical
        type(AttrInd)=0;
        for i=1:numClass
            Params{i,AttrInd}=GetParams4CategoricalAttr(AttrVals,Ind4Class{i,1});
        end
    else
        type(AttrInd)=1;
        for i=1:numClass
            Params{i,AttrInd}=GetParams4ContiAttr(AttrVals,Ind4Class{i,1});
        end
    end
end
end
function Params1=GetParams4ContiAttr(AttrVals,ind)
%GETPARAMS4CONTIATTR Mean and variance of a continuous attribute within one class.
%   AttrVals - values of a single attribute for all training samples.
%   ind      - indices of the samples that belong to the class.
%   Params1  - [mean, variance] of AttrVals(ind).
subset = AttrVals(ind);
m = mean(subset);
v = var(subset);
Params1 = [m, v];
end
function Params1=GetParams4CategoricalAttr(AttrVals,ind)
%GETPARAMS4CATEGORICALATTR Conditional value frequencies of a categorical attribute.
%   AttrVals - values of a single categorical attribute for all training samples.
%   ind      - indices of the samples that belong to the class of interest.
%   Params1  - row vector; Params1(k) is the share of the class samples whose
%              value equals the k-th entry of unique(AttrVals).
categories = unique(AttrVals);
withinClass = AttrVals(ind);
denom = length(ind);
Params1 = [];
for k = 1:length(categories)
    count = nnz(withinClass == categories(k)); % class samples taking this value
    Params1(k) = count / denom;
end
end
Ending!