using Grass.Extend;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace DiscoverTest.Arithmetic
{
[TestClass]
public class NbTest2
{
    /// <summary>
    /// Trains a <see cref="NaiveBayesClassifier"/> on a small weather data set and prints
    /// the predicted category for three sample feature strings. The method only writes to
    /// the console; it makes no assertions.
    /// </summary>
    [TestMethod]
    public void NbType()
    {
        // Category code -> display name.
        var types = new Dictionary<string, string>
        {
            {"yes","好天气"},
            {"no","坏天气"},
        };

        // Each row is a category code followed by its keywords.
        var trainSet = new []
        {
            new []{"yes","晴朗","适中","微风"},
            new []{"yes","高温","微风"},
            new []{"no","阴天","低温","高温","强风"},
            new []{"no","雨天","潮湿"},
        };

        var nb = new NaiveBayesClassifier(types, trainSet);

        // Feature strings to classify, in the order they are reported.
        var samples = new []
        {
            "晴朗,高温,潮湿,微风",
            "阴天,低温,强风,晴朗",
            "阴天,低温,晴朗,晴朗,适中",
        };

        foreach (var content in samples)
        {
            Console.WriteLine(new string('~', 60));
            var type = nb.GetClassify(content);
            Console.WriteLine("结果={0},{2} ;待分类特征={1};", type, content, types[type]);
        }
    }
}
/// <summary>
/// 朴素贝叶斯分类器
/// </summary>
public class NaiveBayesClassifier
{
/// <summary>
/// Builds the classifier from a category dictionary and a labelled training set:
/// collects the training vocabulary, groups the keywords by category and
/// initialises the per-category term frequencies.
/// </summary>
/// <param name="types">Category dictionary (key = category code, value = category name).</param>
/// <param name="trainSet">
/// Training set as a jagged array; each row is [category code, keyword1, keyword2, ...].
/// Example: [["a","1","2","3"],["a","2","3","4","5","6"],["b","11","12","13"],["b","21","22","23"]]
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="types"/> or <paramref name="trainSet"/> is null.
/// </exception>
/// <exception cref="ArgumentException"><paramref name="trainSet"/> contains a null row.</exception>
public NaiveBayesClassifier(Dictionary<string, string> types
    , string[][] trainSet)
{
    // Fail fast with a descriptive exception before any field is touched.
    if (types == null)
    {
        throw new ArgumentNullException("types");
    }
    if (trainSet == null)
    {
        throw new ArgumentNullException("trainSet");
    }

    // Collect the distinct keywords of the whole training set.
    // Element 0 of every row is the category code, so keywords start at index 1.
    foreach (string[] row in trainSet)
    {
        if (row == null)
        {
            throw new ArgumentException("Training rows must not be null.", "trainSet");
        }
        for (int col = 1; col < row.Length; col++)
        {
            TrainTermSet.Add(row[col]);
        }
    }
    Console.WriteLine("训练集字典:{0}", TrainTermSet.ToJsonSerialize());

    // For each category store its distinct keywords and a zeroed frequency vector
    // of the same length. GroupByType clears and refills the scratch collections
    // on every call, so they can be shared across categories.
    var distinctTerms = new SortedSet<string>();
    var allTerms = new List<string>();
    foreach (string code in types.Keys)
    {
        GroupByType(code, trainSet, ref allTerms, ref distinctTerms);
        TrainTermSetTfGroup[code] = new List<int>(new int[distinctTerms.Count]);
        TrainTermSetGroup[code] = distinctTerms.ToList();
    }

    // Fill the frequency vectors from the training rows.
    InitTrainTf(trainSet);
}
#region 变量
/// <summary>
/// Distinct keywords of the whole training set.
/// </summary>
public SortedSet<string> TrainTermSet = new SortedSet<string>();
/// <summary>
/// Keywords grouped by category (key = category code, value = the category's distinct, sorted keywords).
/// </summary>
public Dictionary<string, List<string>> TrainTermSetGroup = new Dictionary<string, List<string>>();
/// <summary>
/// Term-frequency vector per category (key = category code, value = frequencies of the category's distinct, sorted keywords).
/// </summary>
public Dictionary<string, List<int>> TrainTermSetTfGroup = new Dictionary<string, List<int>>();
/// <summary>
/// For the text being classified:
/// frequency vector grouped by category (key = category code, value = frequency vector).
/// </summary>
public Dictionary<string, List<int>> TempTfSetGroup = new Dictionary<string, List<int>>();
/// <summary>
/// For the text being classified: list of extracted keywords (duplicates allowed).
/// </summary>
public List<string> TempTermList = new List<string>();
/// <summary>
/// Final score of each category.
/// </summary>
Dictionary<string, double> _typesScore = new Dictionary<string, double>();
#endregion
#region 方法
/// <summary>
/// Fills <see cref="TrainTermSetTfGroup"/> with the training term frequencies: for every
/// category, each training row labelled with that category adds the number of times each
/// of the category's keywords occurs among the row's keywords.
/// </summary>
/// <param name="trainSet">Training rows; element 0 is the category code, the rest are keywords.</param>
private void InitTrainTf(string[][] trainSet)
{
    foreach (KeyValuePair<string, List<string>> group in TrainTermSetGroup)
    {
        List<string> terms = group.Value;
        List<int> frequencies = TrainTermSetTfGroup[group.Key];

        foreach (string[] row in trainSet)
        {
            // Rows without a category code, or labelled with another category, contribute nothing.
            if (row == null || row.Length == 0 || !group.Key.Equals(row[0]))
            {
                continue;
            }

            // The loop index is the keyword's position, so no search is needed to find it.
            for (int termIndex = 0; termIndex < terms.Count; termIndex++)
            {
                // Count from index 1: element 0 is the category code, not a keyword, and
                // must not inflate the frequency of a keyword that happens to equal it.
                int occurrences = 0;
                for (int col = 1; col < row.Length; col++)
                {
                    if (string.Equals(row[col], terms[termIndex]))
                    {
                        occurrences++;
                    }
                }
                frequencies[termIndex] += occurrences;
            }
        }
    }
    Console.WriteLine("训练集分组:{0}", TrainTermSetGroup.ToJsonSerialize());
    Console.WriteLine("训练集词频:{0}", TrainTermSetTfGroup.ToJsonSerialize());
}
/// <summary>
/// Rebuilds <see cref="TempTfSetGroup"/> for the text being classified. For every category
/// and every keyword of that category it stores the keyword's training frequency in the
/// category multiplied by the number of times the keyword occurs in <paramref name="termList"/>.
/// </summary>
/// <param name="termList">Keywords extracted from the text being classified (duplicates allowed).</param>
private void InitTempTfSetGroup(List<string> termList)
{
    TempTfSetGroup.Clear();
    foreach (KeyValuePair<string, List<string>> group in TrainTermSetGroup)
    {
        List<string> keywords = group.Value;
        List<int> trainedFrequencies = TrainTermSetTfGroup[group.Key];
        List<int> weighted = new List<int>(new int[keywords.Count]);

        // The loop index already is the keyword's position, so no per-keyword search is needed.
        for (int position = 0; position < keywords.Count; position++)
        {
            string keyword = keywords[position];
            int occurrences = termList.Count(x => string.Equals(x, keyword));
            weighted[position] = trainedFrequencies[position] * occurrences;
        }

        TempTfSetGroup[group.Key] = weighted;
    }
}
/// <summary>
/// Collects the keywords of every training row labelled with <paramref name="type"/>.
/// Both output collections are emptied before being refilled.
/// </summary>
/// <param name="type">Category code whose keywords are collected.</param>
/// <param name="trainSet">Training rows; element 0 is the category code, the rest are keywords.</param>
/// <param name="list">Receives every keyword of the category (duplicates kept), sorted.</param>
/// <param name="set">Receives the distinct keywords of the category.</param>
private void GroupByType(string type, string[][] trainSet, ref List<string> list, ref SortedSet<string> set)
{
    list.Clear();
    set.Clear();

    foreach (string[] row in trainSet)
    {
        // Skip rows that carry no category code or belong to another category.
        if (row.Length == 0 || !type.Equals(row[0]))
        {
            continue;
        }

        // Index 0 is the category code; keywords start at index 1.
        for (int col = 1; col < row.Length; col++)
        {
            string keyword = row[col];
            list.Add(keyword);
            set.Add(keyword);
        }
    }

    list.Sort();
}
/// <summary>
/// Segments <paramref name="content"/> by looking up every keyword of the training
/// vocabulary (<see cref="TrainTermSet"/>) in it.
/// </summary>
/// <param name="content">Text to classify.</param>
/// <returns>The keywords found in the text, one entry per occurrence, sorted.</returns>
public List<string> GetTermSegment(string content)
{
    var found = new List<string>();

    foreach (string term in TrainTermSet)
    {
        // An empty pattern matches at every position and would flood the result with empty strings.
        if (string.IsNullOrEmpty(term))
        {
            continue;
        }

        // Keywords are literal text, not patterns: escape them so that regex
        // metacharacters (".", "+", "(", ...) neither throw nor match unrelated text.
        var literal = new Regex(Regex.Escape(term));
        foreach (Match match in literal.Matches(content))
        {
            found.Add(match.Value);
        }
    }

    found.Sort();
    Console.WriteLine("分词结果:{0}", found.ToJsonSerialize());
    return found;
}
/// <summary>
/// Classifies <paramref name="content"/> and returns the code of the best-scoring category.
/// </summary>
/// <param name="content">Text to classify.</param>
/// <returns>Code of the category with the highest score.</returns>
public string GetClassify(string content)
{
    // Extract the known keywords from the text and derive its per-category vectors.
    TempTermList = GetTermSegment(content);
    InitTempTfSetGroup(TempTermList);

    /*
     * Bayes' rule: P(C|X) = P(X|C) * P(C) / P(X)
     *              (posterior = likelihood * prior / evidence)
     *
     * P(X) is the same for every category, so categories are ranked by
     * P(X|C) * P(C) alone:
     *   P(C)   is supplied by GetPrior,
     *   P(X|C) is supplied by GetLikelihood.
     */
    _typesScore.Clear();
    foreach (string category in TrainTermSetGroup.Keys)
    {
        double categoryPrior = GetPrior(category);
        double categoryLikelihood = GetLikelihood(category);
        NoteTypeScore(category, categoryPrior * categoryLikelihood);
    }

    // The winner is the category with the highest recorded score.
    return GetMaxSoreType();
}
/// <summary>
/// Returns the code of the category with the highest recorded score.
/// </summary>
/// <returns>Category code of the top-scoring entry.</returns>
/// <exception cref="InvalidOperationException">No score has been recorded.</exception>
private string GetMaxSoreType()
{
    // Rank into a list: a list keeps the sorted order, whereas the enumeration
    // order of a Dictionary is not guaranteed and must not be used to pick the winner.
    List<KeyValuePair<string, double>> ranked = _typesScore
        .OrderByDescending(x => x.Value)
        .ToList();

    // The dictionary is built only so the log output keeps its shape.
    Dictionary<string, double> soretDic = ranked.ToDictionary(x => x.Key, x => x.Value);
    Console.WriteLine("排序后:{0}", soretDic.ToJsonSerialize());

    if (ranked.Count == 0)
    {
        throw new InvalidOperationException("No category scores have been recorded.");
    }

    // First entry of the descending ranking is the best-scoring category.
    return ranked[0].Key;
}
/// <summary>
/// Records the score of a category, replacing any score already stored for it.
/// </summary>
/// <param name="type">Category code.</param>
/// <param name="sore">Score computed for the category.</param>
private void NoteTypeScore(string type, double sore)
{
    Console.WriteLine("得分:{0}={1}",type, sore);
    // Indexer assignment inserts or overwrites; Dictionary.Add would throw
    // ArgumentException for a category that already has a score.
    _typesScore[type] = sore;
}
/// <summary>
/// Computes the prior of a category: the category's total training term frequency divided
/// by the total training term frequency of all categories.
/// </summary>
/// <param name="type">Category code.</param>
/// <returns>The prior; NaN when the overall term frequency is zero.</returns>
private double GetPrior(string type)
{
    // Term frequency of the requested category.
    int categoryTotal = TrainTermSetTfGroup[type].Sum();

    // Term frequency of the whole training set, accumulated category by category.
    int grandTotal = 0;
    foreach (List<int> frequencies in TrainTermSetTfGroup.Values)
    {
        grandTotal += frequencies.Sum();
    }

    double prior = (double)categoryTotal / grandTotal;
    Console.WriteLine("先验概率:{0}={1}", type, prior);
    return prior;
}
/// <summary>
/// Computes the likelihood score of the text being classified for one category:
/// (sum of the category's entries in <see cref="TempTfSetGroup"/> + 1) divided by
/// (the category's total training term frequency + the number of distinct keywords
/// in the whole training set). The "+ 1" keeps the score above zero when none of the
/// category's keywords occur in the text.
/// </summary>
/// <remarks>
/// NOTE(review): this is a single smoothed ratio, not a product of per-keyword
/// probabilities P(x1|c) * P(x2|c) * ... * P(xn|c) - confirm that this is intended.
/// </remarks>
/// <param name="type">Category code.</param>
/// <returns>The likelihood score of the category.</returns>
private double GetLikelihood(string type)
{
    int numerator = TempTfSetGroup[type].Sum() + 1;
    int denominator = TrainTermSetTfGroup[type].Sum() + TrainTermSet.Count;

    double likelihood = (double)numerator / denominator;
    Console.WriteLine("条件概率:{0}={1}",type,likelihood);
    return likelihood;
}
#endregion
}
}