Data Mining Homework Summary

  • Python basics
  • Experiment 1: Basic NumPy operations
  • 2. Data loading and plotting
  • 3. Data normalization and PCA dimensionality reduction
  • Evaluation and performance measurement of classification models
  • Decision tree construction and post-pruning
  • Naive Bayes; sensitivity of k-nearest neighbors to the parameter k; parameter tuning via cross-validation
  • K-means, agglomerative hierarchical clustering, DBSCAN, and evaluation of clustering results
  • Course notes
  • Chapter 1


Python basics

Experiment 1: Basic NumPy operations

Link: DataFrame material to review

I. Objective

Become familiar with the programming environment and with NumPy-based vector and matrix operations.

II. Tasks

1. Basic vector operations

1) Create the vector v: [48, 6, 51, 32, 4, 85]

2) Inspect the shape and data type of v

3) Convert v to a floating-point vector and inspect its shape and data type again

4) Sum the first three elements of v

5) Compute the mean of the squares of the last four elements of v

6) Replace the odd-indexed elements of v (0-based indexing) with their square roots

7) Find the minimum and maximum of v and their positions (indices)

8) Sort v; output the sorted vector and the pre-sort indices of its elements

2. Basic matrix operations

1) Create the matrix m:

[
   [45, 62, 31, 753],
   [78, 43, 12, 546],
   [146, 785, 2475, 7],
]

2) Inspect the shape and data type of m

3) Convert m to a floating-point matrix and inspect its shape and data type again

4) Transpose m

5) Sum m along columns and along rows

6) Compute the Euclidean distance between row 0 and row 1 of m (using vectorized NumPy operations).

7) Compute the Euclidean distance between row 0 and row 1 of m (using np.linalg.norm).

8) Normalize each row of m to a unit vector.

9) Print the length (norm) of each row of m and confirm that all are 1.

10) Compute the similarity matrix, i.e., the cosine similarity of each row of m with all rows.

Hint: cosine similarity is cos(a, b) = a·b / (‖a‖ ‖b‖); when the vectors have already been unit-normalized, it reduces to the dot product a·b, i.e., the numerator of this formula (see the sketch below).
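A minimal vectorized sketch of task 10 (assuming the rows have already been unit-normalized as in task 8, so the whole similarity matrix is a single matrix product):

import numpy as np

m = np.array([[45, 62, 31, 753],
              [78, 43, 12, 546],
              [146, 785, 2475, 7]], dtype=float)
u = m / np.linalg.norm(m, axis=1, keepdims=True)   # task 8: unit-normalize each row
sim = u @ u.T                                      # task 10: cosine similarity of every row with all rows
print(np.allclose(np.diag(sim), 1.0))              # each row is perfectly similar to itself
print(sim)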

# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""
import numpy as np
#1
v=np.array([48,6,51,32,4,85])
v=v.reshape(1,6)

#2
print (v.shape)
print (v.dtype)
#3
a=np.array(v,dtype=float)
print (a.shape)
print (a.dtype)
#4
print(np.sum(v[:,:3]))
#5
print(np.mean(v[:,-4:]**2))
#6: replace the odd-indexed elements (indices 1, 3, 5) with their square roots
a[0,1::2]=np.sqrt(a[0,1::2])
print (a)
#7: minimum and maximum of the vector and their positions (indices)
print(np.amin(a), np.argmin(a))
print(np.amax(a), np.argmax(a))

#8

print(np.argsort(a) )
print(np.sort(a))

2.py
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 13 16:41:10 2020

@author: FRANP
"""
import numpy as np
# 2.1
m=[
    [45, 62, 31, 753],
    [78, 43, 12, 546],
    [146, 785, 2475, 7],
   ]
m=np.asarray(m)
#2

print(m.shape)
print(m.dtype)
#3
m=np.array(m,dtype=float)
print(m.shape)
print(m.dtype)
#4
print(m.T)
#5
print(np.sum(m,axis=1))
print(np.sum(m,axis=0))
#6: Euclidean distance between row 0 and row 1 (vectorized)
vector1 = m[0,:]
vector2 = m[1,:]
op1=np.sqrt(np.sum(np.square(vector1-vector2)))
print(op1)
#7: the same distance via np.linalg.norm
op2=np.linalg.norm(vector1-vector2)
print(op2)

#8: normalize each row of m to a unit vector
n = m / np.linalg.norm(m, axis=1, keepdims=True)
print("@",n)
 #9: length (L2 norm) of each row of n -- should all be 1
print(np.linalg.norm(n, axis=1))
#10: cosine similarity of every row with all rows; the rows of n are unit
# vectors, so the similarity matrix is just the matrix of pairwise dot products
sim = np.dot(n, n.T)
print(sim)

2. Data loading and plotting

"""
Spyder Editor

This is a temporary script file.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot  as plt
from sklearn.datasets import load_iris
iris=load_iris()
print(iris.data[:5],"\n",
      iris.feature_names,
      iris.target_names,
      iris.target)

dfx=pd.DataFrame(iris.data)
dfy=pd.DataFrame(iris.target)
dfy.columns=["target"]

df=dfx.join(dfy)
print(df.iloc[1:5])
print(df.iloc[-3:])
print(df.loc[1:5,[1,3,'target']])
print(df.iloc[3,3])
df.to_csv('iris_new.csv')

x=df[0]
y=df[1]
t=df['target']
t=t.replace(0,'r')
t=t.replace(1,'g')
t=t.replace(2,'b')

plt.scatter(x,y,c=t)
plt.show()

df=pd.read_csv('flights.csv')
print(df.head(3))
print(df[df.isnull().any(axis=1)].head(3))
df=df.fillna(0)
print(df[df.isnull().any(axis=1)].head(3))
print(df['flight'],df['origin'],df['dest'],df['distance'])
for col in df.columns:
    if df[col].dtypes=='int64' or df[col].dtypes=='float64':
        print(df[col].mean())
print(df['hour'].median())
df = df.sort_values('distance')
### histogram of distance over the bins [0, 100), [100, 200), [200, 500), [500, 1000), [1000, 2000), [2000, 5000)
bin_edges = [0, 100, 200, 500, 1000, 2000, 5000]
n, bins, patches = plt.hist(df['distance'], bins=bin_edges)

plt.show()
#n, bins, patches = plt.hist(df['distance'],bins=10)
#plt.show()
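As a quick numeric check of the same binning, pandas.cut can count the flights per interval directly (a sketch; it assumes the distance column has no missing values left after fillna):

counts = pd.cut(df['distance'], bins=[0, 100, 200, 500, 1000, 2000, 5000], right=False).value_counts().sort_index()
print(counts)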

3. Data normalization and PCA dimensionality reduction

# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""
from sklearn import datasets
import pandas as pd
import numpy as np
from scipy import linalg
import matplotlib.pyplot as plt


wine=datasets.load_wine()
df=pd.DataFrame(wine.data,columns=wine.feature_names)
print(df.shape)
# 0-1 (min-max) normalization

df1=df.copy()
min=df.min(axis=0)
max=df.max()
for col in df1.columns:
    df1[col]=((df[col]-min[col])/(max[col]-min[col]))
    
 # z-score standardization (zero mean, unit standard deviation)
    
ms=df.mean()
st=df.std()
df2=df.copy()
for col in df2.columns:
    df2[col]=(df[col]-ms[col])/st[col]
    
    
# print(df1.max(),
# df1.min(),
# df1.mean(),
# df1.std())

# print(df2.max(),
# df2.min(),
# df2.mean(),
# df2.std())
def pca(data, k):
    # project the (already centered) data onto the top-k right singular vectors
    U,s,Vh=np.linalg.svd(data)
    sorted_indices = np.argsort(s)
    Vh_k = Vh[sorted_indices[:-k-1:-1],:]   # rows of Vh for the k largest singular values
    d2 = np.dot(data,Vh_k.T)
    return d2


ms=df.mean()
dn=df.copy()

for col in df.columns:
    dn[col]=(df[col]-ms[col])
dn= np.array(dn)
dfj=pca(dn,2)

ms=df2.mean()
dn1=df2.copy()

dn1= np.array(dn1)
df2j=pca(dn1,2)



plt.scatter(-dfj.T[:1],-dfj.T[1:], color='red', s=20)
plt.show()
plt.scatter(-df2j.T[:1],-df2j.T[1:], color='blue', s=20)
plt.show()

def kick(x,k):
    # squared reconstruction error when keeping only the top-k singular components
    U,s,Vh=linalg.svd(x,full_matrices=False)
    sigma=np.diag(s)
    # x1=np.dot(U,np.dot(sigma,Vh))
    # err=((x-x1)*(x-x1)).sum()
    U_k=U[:,:k]
    sigma_k=sigma[:k,:k]
    Vh_k=Vh[:k]
    x2=np.dot(U_k,np.dot(sigma_k,Vh_k))
    err_k=((x-x2)*(x-x2)).sum()
    return err_k
d4=pca(dn,4)
d6=pca(dn,6)
d8=pca(dn,8)
l=[2,4,6,8]
h=[kick(dn,2),kick(dn,4),kick(dn,6),kick(dn,8)]
plt.plot(l,h)
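As a rough cross-check of the hand-rolled SVD-based pca() above, sklearn's PCA should give the same 2-D projection up to the sign of each component (a sketch, reusing the centered array dn and the projection dfj from above):

from sklearn.decomposition import PCA

p2 = PCA(n_components=2).fit_transform(dn)     # dn is the mean-centered data from above
print(np.allclose(np.abs(p2), np.abs(dfj)))    # equal up to a sign flip per component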

Evaluation and performance measurement of classification models

# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""
from sklearn import datasets,svm
import pandas as pd
import numpy as np


def acc(pre,data):
    s=0
    err=0
    for i,j in zip(pre,data):
        s=s+1
        if(i!=j):
            err=err+1
    print(1-err/s)
    return 1-err/s
def precision(pre,data):
    # precision = TP / (TP + FP): true positives over predicted positives
    tp=0
    for i,j in zip(pre,data):
        if(i==1 and j==1):
            tp=tp+1
    a=sum(pre==1)#all 1 in pre
    return tp/a


def recall(pre,data):
    # recall = TP / (TP + FN): true positives over actual positives
    tp=0
    for i,j in zip(pre,data):
        if(i==1 and j==1):
            tp=tp+1
    a=sum(data==1)#all 1 in real
    return tp/a
def f1(pre,data):
    p=precision(pre,data)
    r=recall(pre,data)
    return (2*p*r)/(p+r)
# load the breast_cancer dataset
breastcan = datasets.load_breast_cancer() 
# x is the object-attribute (feature) matrix

# y holds the class labels
y= pd.DataFrame(breastcan.target,columns=['target'])
bc=pd.DataFrame(breastcan.data,columns=breastcan.feature_names)

bc=pd.concat([bc,y], axis=1)
# hold-out method: repeatedly take a 20% sample of each class as the test set
bc1=bc[bc['target']==1]
bc2=bc[bc['target']==0]
#test data t=1
ly1=bc1.sample(frac=0.2)
ly2=bc1.sample(frac=0.2)
ly3=bc1.sample(frac=0.2)
ly4=bc1.sample(frac=0.2)
ly5=bc1.sample(frac=0.2)
#test data t=0
zly1=bc2.sample(frac=0.2)
zly2=bc2.sample(frac=0.2)
zly3=bc2.sample(frac=0.2)
zly4=bc2.sample(frac=0.2)
zly5=bc2.sample(frac=0.2)

#test data
ts1=np.array(pd.concat([ly1,zly1],axis=0))
ts2=np.array(pd.concat([ly2,zly2],axis=0))
ts3=np.array(pd.concat([ly3,zly3],axis=0))
ts4=np.array(pd.concat([ly4,zly4],axis=0))
ts5=np.array(pd.concat([ly5,zly5],axis=0))
#train data
bc1=bc.append(ly1)
bc1=bc1.append(zly1)
tr1=bc1.drop_duplicates(subset=None,keep=False,inplace=False)
tr1=np.array(tr1)

bc2=bc.append(ly2)
bc2=bc2.append(zly2)
tr2=bc2.drop_duplicates(subset=None,keep=False,inplace=False)
tr2=np.array(tr2)

bc3=bc.append(ly3)
bc3=bc3.append(zly3)
tr3=bc3.drop_duplicates(subset=None,keep=False,inplace=False)
tr3=np.array(tr3)

bc4=bc.append(ly4)
bc4=bc4.append(zly4)
tr4=bc4.drop_duplicates(subset=None,keep=False,inplace=False)
tr4=np.array(tr4)

bc5=bc.append(ly5)
bc5=bc5.append(zly5)
tr5=bc5.drop_duplicates(subset=None,keep=False,inplace=False)
tr5=np.array(tr5)

# build an SVM classifier
classifier = svm.SVC(gamma=0.001)
# learn the model parameters from the training set
classifier.fit(tr1[:,:-1], tr1[:,-1]) 
# predict the test set with the trained model
pre1 = classifier.predict(ts1[:,:-1]) 
classifier = svm.SVC(gamma=0.001)
classifier.fit(tr2[:,:-1], tr2[:,-1])
pre2 = classifier.predict(ts2[:,:-1])
classifier = svm.SVC(gamma=0.001)     
classifier.fit(tr3[:,:-1], tr3[:,-1])
pre3 = classifier.predict(ts3[:,:-1])
classifier = svm.SVC(gamma=0.001)    
classifier.fit(tr4[:,:-1], tr4[:,-1])
pre4 = classifier.predict(ts4[:,:-1])
classifier = svm.SVC(gamma=0.001)
classifier.fit(tr5[:,:-1], tr5[:,-1])
pre5 = classifier.predict(ts5[:,:-1])


# average accuracy and F1 over the five hold-out splits
acc(pre1,ts1[:,-1])
acc(pre2,ts2[:,-1])
acc(pre3,ts3[:,-1])
acc(pre4,ts4[:,-1])
acc(pre5,ts5[:,-1])
am=acc(pre1,ts1[:,-1])+acc(pre2,ts2[:,-1])+acc(pre3,ts3[:,-1])+acc(pre4,ts4[:,-1])+acc(pre5,ts5[:,-1])
am=am/5
print('am',am)
ff1=f1(pre1,ts1[:,-1])
ff2=f1(pre2,ts2[:,-1])
ff3=f1(pre3,ts3[:,-1])
ff4=f1(pre4,ts4[:,-1])
ff5=f1(pre5,ts5[:,-1])
ffm=ff1+ ff2+ ff3+ ff4+ ff5
ffm=ffm/5
print('ffm',ffm)
#5-fold: split bc into five disjoint test folds of 113 rows each
pool=bc
ts1=pool.sample(113)#test
pool=pool.append(ts1)
pool=pool.drop_duplicates(subset=None,keep=False,inplace=False)#remaining pool
tr1=bc.append(ts1)
tr1=tr1.drop_duplicates(subset=None,keep=False,inplace=False)#train

ts2=pool.sample(113)#test
pool=pool.append(ts2)
pool=pool.drop_duplicates(subset=None,keep=False,inplace=False)#remaining pool
tr2=bc.append(ts2)
tr2=tr2.drop_duplicates(subset=None,keep=False,inplace=False)#train

ts3=pool.sample(113)#test
pool=pool.append(ts3)
pool=pool.drop_duplicates(subset=None,keep=False,inplace=False)#remaining pool
tr3=bc.append(ts3)
tr3=tr3.drop_duplicates(subset=None,keep=False,inplace=False)#train

ts4=pool.sample(113)#test
pool=pool.append(ts4)
pool=pool.drop_duplicates(subset=None,keep=False,inplace=False)#remaining pool
tr4=bc.append(ts4)
tr4=tr4.drop_duplicates(subset=None,keep=False,inplace=False)#train

ts5=pool.sample(113)#test
tr5=bc.append(ts5)
tr5=tr5.drop_duplicates(subset=None,keep=False,inplace=False)#train

 
tr1=np.array(tr1) 
tr2=np.array(tr2) 
tr3=np.array(tr3) 
tr4=np.array(tr4) 
tr5=np.array(tr5) 

ts1=np.array(ts1) 
ts2=np.array(ts2) 
ts3=np.array(ts3) 
ts4=np.array(ts4) 
ts5=np.array(ts5) 

# build an SVM classifier
classifier = svm.SVC(gamma=0.001)
# learn the model parameters from the training set
classifier.fit(tr1[:,:-1], tr1[:,-1]) 
# predict the test set with the trained model
pre1 = classifier.predict(ts1[:,:-1]) 

classifier = svm.SVC(gamma=0.001)
classifier.fit(tr2[:,:-1], tr2[:,-1])
pre2 = classifier.predict(ts2[:,:-1])

classifier = svm.SVC(gamma=0.001)     
classifier.fit(tr3[:,:-1], tr3[:,-1])
pre3 = classifier.predict(ts3[:,:-1])

classifier = svm.SVC(gamma=0.001)    
classifier.fit(tr4[:,:-1], tr4[:,-1])
pre4 = classifier.predict(ts4[:,:-1])

classifier = svm.SVC(gamma=0.001)
classifier.fit(tr5[:,:-1], tr5[:,-1])
pre5 = classifier.predict(ts5[:,:-1])

acc(pre1,ts1[:,-1])
acc(pre2,ts2[:,-1])
acc(pre3,ts3[:,-1])
acc(pre4,ts4[:,-1])
acc(pre5,ts5[:,-1])
am=acc(pre1,ts1[:,-1])+acc(pre2,ts2[:,-1])+acc(pre3,ts3[:,-1])+acc(pre4,ts4[:,-1])+acc(pre5,ts5[:,-1])
am=am/5
print('am',am)
ff1=f1(pre1,ts1[:,-1])
ff2=f1(pre2,ts2[:,-1])
ff3=f1(pre3,ts3[:,-1])
ff4=f1(pre4,ts4[:,-1])
ff5=f1(pre5,ts5[:,-1])
ffm=ff1+ ff2+ ff3+ ff4+ ff5
ffm=ffm/5
print('ffm',ffm)


# sampling with replacement: draw each test set as a bootstrap-style sample
ts1=bc.sample(113,replace=True)
ts2=bc.sample(113,replace=True) 
ts3=bc.sample(113,replace=True) 
ts4=bc.sample(113,replace=True) 
ts5=bc.sample(113,replace=True) 


pool=bc.append(ts1)
tr1=pool.drop_duplicates(subset=None,keep=False,inplace=False)
pool=bc.append(ts2)
tr2=pool.drop_duplicates(subset=None,keep=False,inplace=False) 
pool=bc.append(ts3)
tr3=pool.drop_duplicates(subset=None,keep=False,inplace=False) 
pool=bc.append(ts4)
tr4=pool.drop_duplicates(subset=None,keep=False,inplace=False) 
pool=bc.append(ts5)
tr5=pool.drop_duplicates(subset=None,keep=False,inplace=False) 



 
tr1=np.array(tr1) 
tr2=np.array(tr2) 
tr3=np.array(tr3) 
tr4=np.array(tr4) 
tr5=np.array(tr5) 

ts1=np.array(ts1) 
ts2=np.array(ts2) 
ts3=np.array(ts3) 
ts4=np.array(ts4) 
ts5=np.array(ts5) 

# build an SVM classifier
classifier = svm.SVC(gamma=0.001)
# learn the model parameters from the training set
classifier.fit(tr1[:,:-1], tr1[:,-1]) 
# predict the test set with the trained model
pre1 = classifier.predict(ts1[:,:-1]) 

classifier = svm.SVC(gamma=0.001)
classifier.fit(tr2[:,:-1], tr2[:,-1])
pre2 = classifier.predict(ts2[:,:-1])

classifier = svm.SVC(gamma=0.001)     
classifier.fit(tr3[:,:-1], tr3[:,-1])
pre3 = classifier.predict(ts3[:,:-1])

classifier = svm.SVC(gamma=0.001)    
classifier.fit(tr4[:,:-1], tr4[:,-1])
pre4 = classifier.predict(ts4[:,:-1])

classifier = svm.SVC(gamma=0.001)
classifier.fit(tr5[:,:-1], tr5[:,-1])
pre5 = classifier.predict(ts5[:,:-1])

acc(pre1,ts1[:,-1])
acc(pre2,ts2[:,-1])
acc(pre3,ts3[:,-1])
acc(pre4,ts4[:,-1])
acc(pre5,ts5[:,-1])
am=acc(pre1,ts1[:,-1])+acc(pre2,ts2[:,-1])+acc(pre3,ts3[:,-1])+acc(pre4,ts4[:,-1])+acc(pre5,ts5[:,-1])
am=am/5
print('am',am)
ff1=f1(pre1,ts1[:,-1])
ff2=f1(pre2,ts2[:,-1])
ff3=f1(pre3,ts3[:,-1])
ff4=f1(pre4,ts4[:,-1])
ff5=f1(pre5,ts5[:,-1])
ffm=ff1+ ff2+ ff3+ ff4+ ff5
ffm=ffm/5
print('ffm',ffm)
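As a sanity check of the hand-rolled acc() and f1() above, the same numbers can be obtained from sklearn.metrics (a sketch, shown for the first split; the other splits work the same way):

from sklearn.metrics import accuracy_score, f1_score

print(accuracy_score(ts1[:,-1], pre1))
print(f1_score(ts1[:,-1], pre1, pos_label=1))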

Decision tree construction and post-pruning

# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""
from sklearn.impute import SimpleImputer
from math import log2
import pandas as pd
import numpy as np

x=pd.read_csv('breast_cancer2.csv',header=None)
x.columns = ['ANSWER','AGE', 'MENOPAUSE','SIZE',
              'INV-NODES',
              'NODE-CAPS','DEG-MALIG','BREAST',
              'QUAD','IRRADIAT']
imp = SimpleImputer(missing_values='?', strategy='most_frequent')
x['QUAD']=imp.fit_transform(x['QUAD'].values.reshape(-1,1))
x['NODE-CAPS']=imp.fit_transform(x['NODE-CAPS'].values.reshape(-1,1))
k=x.columns

for col in x.columns:
    if col=='DEG-MALIG':
        continue
    classlist=x[col].value_counts().index
    j=1
    for i in classlist:
        x[col]=x[col].replace(i,j)
        j+=1
test=x.sample(frac=0.2,replace=True)

temp=x.append(test)
train=temp.drop_duplicates(subset=None,keep=False,inplace=False)

##### build a decision tree on the selected features
from sklearn import tree
clf = tree.DecisionTreeClassifier(criterion='gini') 
train_x=np.array(train.iloc[:,1:])
train_y=np.array(train.iloc[:,0])
clf=clf.fit(train_x,train_y)

print(str(clf))

predictY=clf.predict(test.iloc[:,1:].values)
print(predictY)

import graphviz
dot_data = tree.export_graphviz(clf, out_file=None)  
graph = graphviz.Source(dot_data)  
graph.render(r'bc1')

print("测试分类的准确度1:",clf.score(test.iloc[:,1:].values, test.iloc[:,0].values))##测试检验分类的准确度

 
clf = tree.DecisionTreeClassifier(criterion='entropy') 
train_x=np.array(train.iloc[:,1:])
train_y=np.array(train.iloc[:,0])
clf=clf.fit(train_x,train_y)

print(str(clf))

predictY=clf.predict(test.iloc[:,1:].values)
print(predictY)

 
print("测试分类的准确度2:",clf.score(test.iloc[:,1:].values, test.iloc[:,0].values))##测试检验分类的准确度


dot_data = tree.export_graphviz(clf, out_file=None)  
graph = graphviz.Source(dot_data)  
graph.render(r'bc2')

# # compute the information entropy
# def calEnt(dataset):
#     n=dataset.shape[0]
#     iset = dataset.iloc[:,0].value_counts()
#     p=iset/n
#     ent = (-p*np.log2(p)).sum()
#     return ent

# def bestsplit(dataset):
#     Ent=calEnt(dataset)
#     Gain=0
#     axis=0
#     for i in range(1,dataset.shape[1]):# iterate over the candidate attribute columns (column 0 is the label)
#         levels=dataset.iloc[:,i].value_counts().index
#         ents=0
              
#         for j in levels:
#             cs=dataset[dataset.iloc[:,i]==j]
#             ent=calEnt(cs)
#             ents += (cs.shape[0]/dataset.shape[0])*ent
#         thisgain=Ent-ents
#         if(thisgain>Gain):
#             Gain=thisgain
#             axis=i
#     return axis
# def mySplit(dataset,axis,value):
#     col = dataset.columns[axis]
#     re = dataset.loc[dataset[col]==value,:].drop(col,axis=1)
#     return re


# def createTree(dataset):
#     feat = list(dataset.columns)  # all column names of the dataset

#     classlist=dataset.iloc[:,0].value_counts()# class label counts
#     if classlist[0]==dataset.shape[0] :
#         return classlist.index[0]# stopping condition: the node is pure (every row has the same class)
#     if feat == None:# stopping condition: no attributes left to split on
#         return classlist.index[0]

#     # if classlist.shape[0]==1 or dataset.shape[1] <=1:
#     #     return classlist.index[0]
    
#     axis =  bestsplit(dataset)# index of the attribute with the largest information gain
#     bestfeat=feat[axis]# name of that attribute
#     myTree = {bestfeat:{}}
#     del feat[axis]
    
#     valuelist=set(dataset.iloc[:,axis])
#     for v in valuelist:
#         temp=mySplit(dataset,axis,v)

#         if(temp.empty):
#             myTree[bestfeat][v]=classlist.index[0]# empty branch: stop and label with the majority class
#             return myTree
#         myTree[bestfeat][v]=createTree(temp)
#     return myTree
# def classify(inputTree,labels,testVec):
#     classLabel='recurrence-events'  # default prediction
#     first = next(iter(inputTree))
#     second = inputTree[first]
#     fest=labels.index(first)-1
#     for key in second.keys():
#         if testVec[fest]==key:
#             if type(second[key]) == dict :
#                 classLabel = classify(second[key], labels, testVec)
#             else: 
#                 classLabel = second[key]
#                 return classLabel
#     return classLabel

# def acc(train,test):
#     inputTree=createTree(train)
#     labels=list(train.columns)
#     res=[]
#     for i in range(test.shape[0]):
#         testVec=test.iloc[i,1:]
#         classLabel = classify(inputTree,labels,testVec)
#         res.append(classLabel)
#     test['res']=res
#     ac=(test.iloc[:,-1]==test.iloc[:,0]).mean()	
#     print(inputTree)
#     print(f'acc:{ac}')
#     return test
# train = dataset
# test = testset
# acc(train,test)
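The sklearn trees above are grown without any pruning. A sketch of post-pruning via sklearn's cost-complexity pruning, reusing train_x, train_y and test from the script above (note: tuning ccp_alpha on the test set is only for illustration; properly it would be tuned on a separate validation split):

from sklearn import tree

clf_full = tree.DecisionTreeClassifier(criterion='gini')
path = clf_full.cost_complexity_pruning_path(train_x, train_y)   # candidate pruning strengths
best_alpha, best_acc = 0.0, 0.0
for alpha in path.ccp_alphas:
    pruned = tree.DecisionTreeClassifier(criterion='gini', ccp_alpha=alpha)
    pruned.fit(train_x, train_y)
    score = pruned.score(test.iloc[:, 1:].values, test.iloc[:, 0].values)
    if score > best_acc:
        best_alpha, best_acc = alpha, score
print("best ccp_alpha:", best_alpha, "pruned-tree test accuracy:", best_acc)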

Naive Bayes; sensitivity of k-nearest neighbors to the parameter k; parameter tuning via cross-validation

# -*- coding: utf-8 -*-
"""
Created on Sat Apr 25 14:10:47 2020

@author: FRANP
"""
import math
import numpy as np
import pandas as pd
from math import log10
from sklearn.impute import SimpleImputer

from sklearn.model_selection import KFold


x=pd.read_csv('breast_cancer2.csv',header=None)
x.columns = ['ANSWER','AGE', 'MENOPAUSE','SIZE',
              'INV-NODES',
              'NODE-CAPS','DEG-MALIG','BREAST',
              'QUAD','IRRADIAT']
imp = SimpleImputer(missing_values='?', strategy='most_frequent')
x['QUAD']=imp.fit_transform(x['QUAD'].values.reshape(-1,1))
x['NODE-CAPS']=imp.fit_transform(x['NODE-CAPS'].values.reshape(-1,1))


for col in x.columns:
    if col=='DEG-MALIG':
        continue
    classlist=x[col].value_counts().index
    j=1
    for i in classlist:
        x[col]=x[col].replace(i,j)
        j+=1
x.columns=[0,1,2,3,4,5,6,7,8,9]
te=x.sample(frac=0.2,replace=True)
#1 no recur 2 recur
temp=x.append(te)
tr=temp.drop_duplicates(subset=None,keep=False,inplace=False)

tes=x.iloc[[2, 17, 21, 30, 68, 
77, 81, 89, 98, 111, 121, 126, 144, 145, 158, 160, 172, 176]]
tra=x.append(tes)
tra=tra.drop_duplicates(subset=None,keep=False,inplace=False)

def calNormalDistribution(x_value, var_value, mean_value):
    '''
    :param x_value: target feature value
    :param var_value: standard deviation of attribute i within class C (used as the std in the density below)
    :param mean_value: mean of attribute i within class C
    :return: probability density value
    '''
    x= math.exp(-(x_value - mean_value) ** 2.0 / (2.0*(var_value**2.0))) / (math.sqrt(2.0*math.pi) * var_value)
    if x_value==mean_value or x==0:
        return 1
    else :
        return x
def NB(data,test):
    '''
    data=tr
    test=te
    '''
    data1=data[data.iloc[:,0]==1]

    data2=data[data.iloc[:,0]==2]

    ld1=(data1.shape[0]+1)/(data.shape[0]+2)  # class prior P(c=1), Laplace-corrected

    ld2=(data2.shape[0]+1)/(data.shape[0]+2)  # class prior P(c=2), Laplace-corrected

    lres=[]
    for i in range(test.shape[0]):
        sam=test.iloc[i,:]
        # work in log space and restart from the (log) priors for every test sample
        lp1=log10(ld1)
        lp2=log10(ld2)
        for j in range(1,10):
            # a Gaussian (continuous) variant would use calNormalDistribution:
            # lp1 += log10(calNormalDistribution(sam[j], data1[j].std(), data1[j].mean()))
            # lp2 += log10(calNormalDistribution(sam[j], data2[j].std(), data2[j].mean()))

            # without Laplace correction (breaks when a count is zero):
            # lp2 += log10(data2[data2[j]==sam[j]].shape[0]/float(data2.shape[0]))
            # lp1 += log10((data1[data1[j]==sam[j]].shape[0])/float(data1.shape[0]))

            # with Laplace correction
            lp2+=log10((data2[data2[j]==sam[j]].shape[0]+1)/(data2.shape[0]+len(data[j].value_counts())))
            lp1+=log10((data1[data1[j]==sam[j]].shape[0]+1)/(data1.shape[0]+len(data[j].value_counts())))

        if lp1>=lp2:
            lres.append(int(1))
        else:
            lres.append(int(2))

    print(lres)
    acc(lres,test.iloc[:,0])

    return lres
        

# for i in range(0,3):
#     print("p{i} *= calNormalDistribution(sam[j], data{i}[j].std(), data{i}[j].mean())".format(i=i))


def acc(pre,data):
    s=data.shape[0]
    err=0
    for i,j in zip(pre,data):
        
        if(i!=j):
            err=err+1
    print("acc=",1-err/s)
    return 1-err/s
  

NB(tr,te)


def knn(trainData, testData, labels, k):
    trainData=np.array(trainData)
    testData=np.array(testData)
    labels=np.array(labels)
    rowSize = trainData.shape[0]
    
    diff = np.tile(testData, (rowSize, 1)) - trainData

    sqrDiff = diff ** 2
    sqrDiffSum = sqrDiff.sum(axis=1)
    distances = sqrDiffSum ** 0.5
    sortDistance = distances.argsort()
    
    count = {}
    
    for i in range(0,k):
        vote = labels[sortDistance[i]]
        count[vote] = count.get(vote, 0) + 1

    sortCount = sorted(count.items(), key=lambda item: item[1], reverse=True)  # sort by vote count, descending
    
    return sortCount[0][0]
kf=KFold(n_splits=10)
sum1=0
sum3=0
sum5=0
sum7=0
sum9=0
for tr,te in kf.split(x):
    train=x.iloc[tr,:]
    test=x.iloc[te,:]
    res1=[]
    res3=[]
    res5=[]
    res7=[]
    res9=[]

    for i in range(0,test.shape[0]):
        res1.append(knn(train.iloc[:,1:],test.iloc[i,1:],train.iloc[:,0],1))
        res3.append(knn(train.iloc[:,1:],test.iloc[i,1:],train.iloc[:,0],3))
        res5.append(knn(train.iloc[:,1:],test.iloc[i,1:],train.iloc[:,0],5))
        res7.append(knn(train.iloc[:,1:],test.iloc[i,1:],train.iloc[:,0],7))
        res9.append(knn(train.iloc[:,1:],test.iloc[i,1:],train.iloc[:,0],9))

    sum1+=acc(res1,test.iloc[:,0])
    sum3+=acc(res3,test.iloc[:,0])
    sum5+=acc(res5,test.iloc[:,0])
    sum7+=acc(res7,test.iloc[:,0])
    sum9+=acc(res9,test.iloc[:,0])

# mean 10-fold cross-validation accuracy for k = 1, 3, 5, 7, 9
print('mean acc for k=1,3,5,7,9:', sum1/10, sum3/10, sum5/10, sum7/10, sum9/10)


for z in range(1):
    train=tra
    test=tes
    res1=[]
    res3=[]
    res5=[]
    res7=[]
    res9=[]

    for i in range(0,test.shape[0]):
        res1.append(knn(train.iloc[:,1:],test.iloc[i,1:],train.iloc[:,0],1))
        res3.append(knn(train.iloc[:,1:],test.iloc[i,1:],train.iloc[:,0],3))
        res5.append(knn(train.iloc[:,1:],test.iloc[i,1:],train.iloc[:,0],5))
        res7.append(knn(train.iloc[:,1:],test.iloc[i,1:],train.iloc[:,0],7))
        res9.append(knn(train.iloc[:,1:],test.iloc[i,1:],train.iloc[:,0],9))
    print('accuracy for k=1,3,5,7,9 on the fixed split:\n')
    acc(res1,test.iloc[:,0])
    acc(res3,test.iloc[:,0])
    acc(res5,test.iloc[:,0])
    acc(res7,test.iloc[:,0])
    acc(res9,test.iloc[:,0])
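As a cross-check of the manual 10-fold tuning above, sklearn's built-in k-NN and cross-validation give comparable numbers (a sketch; results differ slightly because of fold assignment and tie handling):

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

X_feat, y_lab = x.iloc[:, 1:], x.iloc[:, 0]
for k in [1, 3, 5, 7, 9]:
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X_feat, y_lab, cv=10)
    print(k, scores.mean())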

K-means, agglomerative hierarchical clustering, DBSCAN, and evaluation of clustering results

# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""

#1
from sklearn.datasets import load_wine
from sklearn import metrics
import numpy as np

import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics 

# Euclidean distance
# def distEclud(x,y):
#     return np.sqrt(np.sum((x-y)**2))  # compute the Euclidean distance
def distEclud(A,B):
    return np.sqrt(sum(np.power((A - B), 2)))
# build a set of k random centroids for the given dataset
def randCent(dataSet,k):

    
    centroidlist = np.random.choice(dataSet.shape[0], k, replace=False)  # k distinct random row indices
    centroids=dataSet[centroidlist]

    return centroids
 
# k-means clustering
def KMeans(dataSet,k):

    m = dataSet.shape[0]  # number of rows (samples)
    # clusterAssment would store, per sample, the cluster it belongs to
    # and its distance to that cluster's centroid

    # clusterAssment = np.mat(np.zeros((m,2)))
    clusterChange = True
    data=np.array(dataSet)
    dataSet['labels']=0
    # step 1: initialize the centroids
    centroids = randCent(data,k)
    while clusterChange:

 
        # loop over all samples
        for i in range(m):

            distance=[]
            # step 2: loop over all centroids and find the nearest one
            for j in range(k):
                # Euclidean distance from this sample to centroid j
                distance.append(distEclud(centroids[j,:],data[i,:]))
                # distance.append(distEclud(centroids[j,:],dataSet[i,:]))
            p=distance.index(min(distance))
            dataSet.iloc[i,-1]=p

        # step 3: update the centroids
        new_centroids=centroids.copy()
        
        for x in range(k):
            d=dataSet[dataSet['labels']==x]
            d=d.iloc[:,:-1].mean()
            d=np.array(d)
            d=d.reshape(1,13)
            new_centroids[x]=d
            # print(new_centroids)
        if np.array_equal(new_centroids,centroids):
            # print(new_centroids)
            clusterChange = False
        else:
            print(new_centroids,end='\n')
            centroids = new_centroids
            

        
    return dataSet
 


data = load_wine()
dataSet=pd.DataFrame(data.data)
k = 3
clusterAssment = KMeans(dataSet,k)

labels=clusterAssment['labels']
labels_true=data.target
l=np.array(labels)
NMI_score= metrics.normalized_mutual_info_score(labels_true, labels)
print("normalized Mutual Information: %0.3f" % NMI_score)
# -*- coding: utf-8 -*-
"""
Created on Sat May 30 13:53:02 2020

@author: FRANP
"""
import pandas as pd
import numpy as np

from sklearn.cluster import DBSCAN 
 
import matplotlib.pyplot as plt
 
X=pd.read_csv('noisy_moons.csv')

db = DBSCAN(eps=0.1, min_samples=10).fit(X) 
core_samples_mask = np.zeros_like(db.labels_, dtype=bool) 
core_samples_mask[db.core_sample_indices_] = True 
labels = db.labels_ 

plt.scatter(X.iloc[:, 0], X.iloc[:, 1],c = labels, cmap='viridis', marker='o')
plt.show()
# -*- coding: utf-8 -*-
"""
Created on Sat May 30 13:32:13 2020

@author: FRANP
"""
from sklearn.datasets import load_wine
from sklearn import metrics 
from sklearn.cluster import AgglomerativeClustering 
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import linkage
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram 
from scipy.spatial.distance import pdist, squareform

# def manhattan_distance_1(inX, dataSet):
    # # Manhattan distance from a point to every row of a sample set
    # sub = inX - dataSet
    # abs_sub = np.abs(sub)
    # distances = np.sum(abs_sub, axis=1)
    # return distances
data = load_wine()
df=data.data
m=df.shape[0]
model = AgglomerativeClustering(affinity='precomputed', distance_threshold=None, n_clusters=3, linkage='average') 
D=np.zeros(int(m*(m+1)/2))
import scipy.spatial.distance as dist
Y = pdist(df, 'cityblock')
# pdist returns a condensed distance vector of length n*(n-1)/2
# s=0
# e=m
# for i in range(m):    
#     # D[s:e]=manhattan_distance_1(df[i],df[i+1:])

#     s=e
#     e=e+m-i-1

dist_matrix = dist.squareform(Y)
# model = AgglomerativeClustering(n_clusters=3,affinity='euclidean' , linkage='average',distance_threshold=None) 
# D=D.reshape(-1, 1)
model = model.fit(dist_matrix) 
labels=model.labels_ 
labels_true=data.target
NMI_score= metrics.normalized_mutual_info_score(labels_true, labels) 
print("normalized Mutual Information: %0.3f" % NMI_score)

Association rule mining: frequent itemset mining with the Apriori algorithm

import pandas as pd
import numpy as np
c1=[]

df=pd.read_csv('transaction.csv',header=None)
f = open('transaction.csv')
line=f.readline().strip()
while(line!=''):
    c1.append(line.split(","))
    line=f.readline().strip()

for i in c1:
    while '' in i:
        i.remove('')

 

def createC1(dataSet):
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item]) # store each distinct item once

    C1.sort()
    #return map(frozenset, C1)#frozen set, user can't change it.
    return list(map(frozenset, C1))

def scanD(D,Ck,minSupport):
# arguments: the dataset D, the list of candidate itemsets Ck, and the minimum support threshold minSupport
    ssCnt={}
    for tid in D:# iterate over the transactions
        for can in Ck:# iterate over the candidate itemsets
            if can.issubset(tid):# does this transaction contain the candidate?
                #if not ssCnt.has_key(can): # python3 can not support
                if not can in ssCnt:
                    ssCnt[can]=1 # first occurrence
                else: ssCnt[can]+=1# otherwise increment the count
    numItems=float(len(D))# number of transactions
    retList = []# initialize L1
    supportData = {}# support of every candidate itemset
    for key in ssCnt:
        support = ssCnt[key]/numItems# compute the support
        if support >= minSupport:
            retList.insert(0,key)# keep itemsets that meet the minimum support
        supportData[key] = support
    return retList, supportData

def aprioriGen(Lk, k): # merge itemsets upward to create Ck
    # arguments: the frequent itemset list Lk and the itemset size k
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i+1, lenLk): # iterate over all pairs of itemsets
            L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2]
            L1.sort(); L2.sort()
            if L1==L2: # merge two itemsets only if their first k-2 items are identical
                retList.append(Lk[i] | Lk[j]) #set union
    return retList


def apriori(dataSet, minSupport = 0.5):
    C1 = createC1(dataSet)
    D = list(map(set, dataSet)) #python3
    L1, supportData = scanD(D, C1, minSupport)# filter single items by minimum support to form L1
    L = [L1]
    k = 2
    while (len(L[k-2]) > 0):# keep building larger itemsets until the next level is empty
        Ck = aprioriGen(L[k-2], k)# candidate set Ck
        Lk, supK = scanD(D, Ck, minSupport)#get Lk
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData

def generateRules(L, supportData, minConf=0.7):
    # arguments: the frequent itemset list, the dict of their supports, and the minimum confidence threshold
    bigRuleList = [] # holds all the association rules
    for i in range(1, len(L)):  # start from L[1]: only itemsets with two or more items can yield rules;
        # a single-item itemset (L[0]) cannot express an association
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            # for each frequent itemset, build the list H1 of single-item consequents
            if (i > 1):
            # itemsets with more than two items: consider merging the consequents further
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:# first level: the consequent has exactly one item
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList

def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    # compute confidence for rules with single-item consequents
    prunedH = []# rules that satisfy the minimum confidence
    for conseq in H:# iterate over the candidate consequents and compute their confidence
        conf = supportData[freqSet]/supportData[freqSet-conseq] # confidence from the support values
        if conf >= minConf:
            print (freqSet-conseq,'-->',conseq,'conf:',conf)
            # print every rule that meets the minimum confidence
            brl.append((freqSet-conseq, conseq, conf))# brl is the bigRuleList collected so far
            prunedH.append(conseq)# keep the consequent for possible further merging
    return prunedH


def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    # arguments: a frequent itemset and the list H of items that may appear on the right-hand side of a rule
    m = len(H[0])
    if (len(freqSet) > (m + 1)): # the itemset has more items than the current consequent size
        Hmp1 = aprioriGen(H, m+1)# merge consequents that share all but one item
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)# keep only the confident rules
        if (len(Hmp1) > 1):    
        # if more than one rule survives, recurse to try even larger consequents
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
L,suppData=apriori(c1,0.5)
a=sorted(suppData.items(), key = lambda kv:kv[1],reverse=True)
rules=generateRules(L,suppData,0.5)
b=sorted(rules, key = lambda kv:kv[2],reverse=True)
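A tiny self-contained check of the functions above on a hypothetical toy transaction list (not from transaction.csv), handy for verifying the support and confidence values by hand:

toy = [['milk', 'bread'],
       ['milk', 'diaper', 'beer'],
       ['bread', 'diaper', 'beer'],
       ['milk', 'bread', 'diaper', 'beer']]
toyL, toySupp = apriori(toy, minSupport=0.5)
print(toyL)                                          # frequent itemsets, level by level
rulesToy = generateRules(toyL, toySupp, minConf=0.7) # prints rules such as {beer} --> {diaper}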