1、算法
本文主要是论文《Two-stage instance selection and adaptive bag mapping algorithm for multi-instance learning》中算法代码的复现。具体算法原理见原论文。本文与SMDP有类似之处,都使用了DP聚类。
2、代码
2.1 DIP.py
这一部分代码是创建实例原型池,即:找出所有包的代表实例聚集在一起。此阶段为在包内选取实例原型。
import math
import numpy as np
from scipy.spatial.distance import cdist
from MIL_MIDIE.MIL_Frame.MILTool import get_cosine
class DIP:
    """Discriminative instance pool (DIP) builder.

    First stage of the MIDIE algorithm: for every bag, score its instances
    inside the bag and collect the most discriminative ones into a shared
    instance prototype pool.
    """

    def __init__(self, bags, scale_num):
        """
        :param bags: array of bags; bags[i][0] is an (n_ins, d + 1) matrix
            whose last column is the instance label.
        :param scale_num: ratio of instances to select from each bag.
        """
        self.bags = bags
        self.scale_num = scale_num
        self.inner_bag_distance = []  # pool of selected instances (shared by all bags)
        self.discriminative_instance = self.__get_discriminative_ins()

    def __get_discriminative_ins(self):
        """Collect the discriminative instances of every bag into one array.

        :return: np.ndarray holding the pooled discriminative instances.
        """
        for bag_i in range(self.bags.shape[0]):
            self.__get_instance_in_bag(self.bags[bag_i])
        return np.array(self.inner_bag_distance)

    def __get_instance_in_bag(self, bag):
        """Select the discriminative instances of one bag and append them to the pool.

        :param bag: a single bag; bag[0] is its instance matrix (label in the
            last column).
        """
        # Step 1. Pairwise Euclidean distances between the bag's instances
        # (cdist's default metric); the label column is dropped first.
        ins_space_to_bag = np.array([ins for ins in bag[0][:, :-1]])
        distance_ins_to_ins = cdist(ins_space_to_bag, ins_space_to_bag)
        num_ins = len(distance_ins_to_ins)
        # Step 2. Affinity score: for each instance, count how many instances
        # have a cosine similarity not above the average similarity.
        affinity_ins_to_ins = get_cosine(ins_space_to_bag)
        affinity_ins = np.zeros((num_ins, num_ins)).astype("int32")
        affinity_ins_score = []
        ave_dis_ins = affinity_ins_to_ins.mean()
        for i in range(num_ins):
            for j in range(num_ins):
                if affinity_ins_to_ins[i, j] <= ave_dis_ins:
                    affinity_ins[i, j] = 1
            affinity_ins_score.append(sum(affinity_ins[i]))
        # Step 3. Gaussian-kernel density of each instance; the cutoff distance
        # is 40% of the largest pairwise distance.
        dis_cut = 0.4 * distance_ins_to_ins.max()
        density_ins_score = np.zeros(num_ins).astype("float64")
        for i in range(num_ins):
            if dis_cut == 0:
                # All instances coincide; give every instance the same density.
                density_ins_score[i] = 1
            else:
                density_ins_score[i] = sum(np.exp(-(distance_ins_to_ins[i] / dis_cut) ** 2))
        # Step 4. Lambda score of each instance = affinity * density.
        lambda_ins_score = np.multiply(affinity_ins_score, density_ins_score).tolist()
        # Step 5. Keep the ceil(scale_num * n) instances with the smallest lambda.
        # BUG FIX: a consumed score is now disabled with +inf. The old code wrote
        # -1, which became the new minimum, so every following iteration selected
        # the same instance again instead of the next-smallest one.
        for _ in range(math.ceil(self.scale_num * bag[0].shape[0])):
            index_min = lambda_ins_score.index(min(lambda_ins_score))
            self.inner_bag_distance.append(ins_space_to_bag[index_min])
            lambda_ins_score[index_min] = float('inf')
if __name__ == '__main__':
    # Quick sanity check for cdist: distances from one query point to four points.
    query = np.array([0, 1, 3])
    points = np.array([[12, 3, 4], [0, 1, 2], [2, 5, 6], [1, 2, 3]])
    dist_row = cdist([query], points)
    # max(dist_row) yields the single row; min then picks the smallest distance.
    print(min(max(dist_row)))
2.2 SDI.py
这段代码主要是进行第二阶段:在实例原型池中选出代表实例。先要通过DIP得到实例原型池,再通过DP来选出代表实例。在这个过程中,需要计算实例原型的密度以及实例到比他实例密度更大的实例的距离。最后计算来找出代表实例。
import numpy as np
from scipy.spatial.distance import cdist
import MIL_MIDIE.MIDIE.DIP as DIP
from MIL_MIDIE.MIL_Frame.MILTool import MILTool
class SDI:
    """Second stage: select the final representative instances.

    The instance prototype pool produced by DIP is clustered with the
    density-peaks (DP) idea: for every prototype compute its density and the
    distance to the nearest denser prototype; the products of the two rank
    the prototypes, and the top ``num_SDI`` are kept.
    """

    def __init__(self, bags, ratio_instance_to_bag, num_SDI):
        self.bags = bags
        self.ratio_ins = ratio_instance_to_bag
        self.num_SDI = num_SDI
        self.discriminative_instance = self.__print_SDI()
        self.final_discriminative_instance = self.__select_discriminative_instance()

    def __print_SDI(self):
        """Run the first stage (DIP) and return the instance prototype pool."""
        return DIP.DIP(self.bags, self.ratio_ins).discriminative_instance

    def __select_discriminative_instance(self):
        """Pick ``num_SDI`` representatives from the pool via density peaks."""
        pool = self.discriminative_instance
        # Step 1. Pairwise distances between the pooled prototypes.
        dist_matrix = cdist(pool, pool)
        n = len(dist_matrix)
        # Step 2. Cutoff distance (40% of the largest distance) and densities.
        dis_cut = 0.4 * dist_matrix.max()
        density = np.zeros(n).astype("float64")
        for idx in range(n):
            if dis_cut == 0:
                density[idx] = 1
            else:
                density[idx] = sum(np.exp(-(dist_matrix[idx] / dis_cut) ** 2))
        # Step 3. Distance to the closest prototype that is denser than itself;
        # the densest prototype gets +inf so it is always ranked first.
        distance_closest = []
        for idx in range(n):
            to_denser = [dist_matrix[idx][other]
                         for other in range(n) if density[other] > density[idx]]
            distance_closest.append(min(to_denser) if to_denser else float('inf'))
        # Step 4. Rank by lambda = distance-to-denser * density and keep the top.
        lambda_scores = np.multiply(distance_closest, density).tolist()
        representatives = []
        for _ in range(self.num_SDI):
            best = lambda_scores.index(max(lambda_scores))
            representatives.append(pool[best])
            lambda_scores[best] = -1  # disable the consumed score
        return np.array(representatives)
if __name__ == '__main__':
    # Demo: pick 2 representative instances from the MUSK1 data set.
    demo_path = "MUSK1_1.arff"
    mil_tool = MILTool(demo_path)
    sdi_demo = SDI(mil_tool.bags, 0.01, 2)
    selected = sdi_demo.final_discriminative_instance
    print(type(selected))
2.3 DIE.py
这一部分的代码主要做的就是包映射相关工作。可以对不同状态的包进行包获取,包状态主要有三种:global、positive、negative。也可以通过相加(add)或拼接(con)两种方式来对包进行映射。最后对向量进行归一化。
import warnings
import numpy as np
from MIL_MIDIE.MIDIE.SDI import SDI
from MIL_MIDIE.MIL_Frame.MILTool import MILTool, dis_euclidean, get_ten_fold_index
warnings.filterwarnings('ignore')
class DIE:
    """Discriminative instance embedding: map every bag to a single vector."""

    def __init__(self, all_bag, tr_index, bags_status, embed_status, ratio_instance_to_bag,
                 num_discriminative_instance):
        """
        :param all_bag: all bags of the data set.
        :param tr_index: indices of the training bags.
        :param bags_status: source of training bags: 'g' (global), 'p' (positive)
            or 'n' (negative).
        :param embed_status: mapping mode: 'add' (sum) or 'con' (concatenate).
        :param ratio_instance_to_bag: ratio of instance prototypes selected per bag.
        :param num_discriminative_instance: number of representative instances.
        """
        self.bags = all_bag
        self.bags_status = bags_status
        self.embed_status = embed_status
        self.tr_index = tr_index
        self.ra_ins = ratio_instance_to_bag
        self.num_dis_ins = num_discriminative_instance
        self.train_final_bag = self.__get_bags()
        self.embedding_vector = self.__embedding()

    def __get_bags(self):
        """Return the training bags selected by ``bags_status``.

        g: all training bags; p: positive training bags; n: negative training bags.
        :raises ValueError: for an unknown ``bags_status``.
        """
        if self.bags_status == 'g':  # all bags indexed by the training fold
            return self.bags[self.tr_index]
        elif self.bags_status == 'p':  # bags whose label is 1
            positive_bags_index = []
            for i in range(len(self.tr_index)):
                if self.bags[self.tr_index[i], -1] == 1:
                    positive_bags_index.append(self.tr_index[i])
            return self.bags[positive_bags_index]
        elif self.bags_status == 'n':  # bags whose label is not 1
            negative_bags_index = []
            for i in range(len(self.tr_index)):
                if not self.bags[self.tr_index[i], -1] == 1:
                    negative_bags_index.append(self.tr_index[i])
            return self.bags[negative_bags_index]
        # BUG FIX: an unknown status previously fell through and returned None,
        # which only crashed much later inside the embedding step.
        raise ValueError("Unknown bags_status: %r (expected 'g', 'p' or 'n')" % self.bags_status)

    def __embedding(self):
        """Embed every bag into a single vector.

        :return: np.ndarray with one embedded vector per bag.
        """
        # Step 1. Representative instances via the SDI (two-stage) selection.
        discriminative_instance = SDI(self.train_final_bag, self.ra_ins,
                                      self.num_dis_ins).final_discriminative_instance
        # Step 2. Map each bag to one vector.
        bag_to_vector = []
        for bag_i in range(self.bags.shape[0]):
            dim = self.bags[bag_i][0].shape[1] - 1  # feature dimension (last column is the label)
            # Step 2.1 Choose the embedding model.
            if self.embed_status == 'add':  # sum of residual vectors
                temp_single_vector = np.zeros(dim).astype("float64")
            elif self.embed_status == 'con':  # per-representative concatenation
                temp_single_vector = np.zeros(self.num_dis_ins * dim).astype("float64")
            else:
                print('Your input model is not exist!\n')
                break
            # Step 2.2 Accumulate each instance's residual to its nearest representative.
            for ins_i in self.bags[bag_i][0][:, :-1]:
                temp_distance_dis_to_ins = []
                for dis_ins_i in range(self.num_dis_ins):
                    temp_distance_dis_to_ins.append(
                        dis_euclidean(ins_i, discriminative_instance[dis_ins_i]))
                # Index of the nearest representative instance.
                temp_index = temp_distance_dis_to_ins.index(min(temp_distance_dis_to_ins))
                # Residual between the instance and its nearest representative.
                temp_dis_to_ins_vector = ins_i - discriminative_instance[temp_index]
                if self.embed_status == 'add':
                    temp_single_vector += temp_dis_to_ins_vector
                elif self.embed_status == 'con':
                    temp_single_vector[temp_index * dim:(temp_index + 1) * dim] += \
                        temp_dis_to_ins_vector
            # Step 3. Power normalization followed by L2 normalization.
            temp_single_vector = np.sign(temp_single_vector) * np.sqrt(np.abs(temp_single_vector))
            temp_norm = np.linalg.norm(temp_single_vector)
            # BUG FIX: guard against a zero norm, which previously produced NaNs
            # (0/0) for all-zero embedded vectors.
            if temp_norm > 0:
                temp_single_vector = temp_single_vector / temp_norm
            bag_to_vector.append(temp_single_vector)
        return np.array(bag_to_vector)
if __name__ == '__main__':
    # Demo: embed every bag of the data set using fold 1's training indices.
    demo_file = "rec_sport_hockey.mat"
    mil_tool = MILTool(demo_file)
    all_bags = mil_tool.bags
    tr_folds, te_folds = get_ten_fold_index(all_bags)
    miDie_demo = DIE(all_bags, tr_folds[1], 'g', 'con', 0.01, 2).embedding_vector
    print(miDie_demo)
    miDie_demo = np.array(miDie_demo)
2.4 DIEPredict.py
这一部分主要是进行十折交叉验证:先在训练集上训练,再对测试集进行预测,并且分别使用KNN、决策树、线性支持向量机、高斯径向基函数支持向量机对处理后的数据集进行分类预测,最后输出预测精度。
import warnings
import numpy as np
from sklearn.metrics import f1_score,accuracy_score,roc_auc_score
from MIL_MIDIE.MIDIE.DIE import DIE
from MIL_MIDIE.MIL_Frame.MILTool import MILTool
from MIL_MIDIE.MIL_Frame.MILTool import get_ten_fold_index
warnings.filterwarnings('ignore')
class DIEPredict:
    """Evaluate the DIE bag mapping with four classifiers.

    Runs 10 repetitions of 10-fold cross validation; in every fold the bags
    are embedded with DIE and classified with kNN, a decision tree, a linear
    SVM and an RBF SVM.  Mean accuracy, F1 and ROC-AUC (with std) are printed
    as LaTeX table cells.
    """

    def __init__(self, file_path, bags_status, embed_status, ratio_instance_to_bag,
                 num_discriminative_instance, bags=None):
        """
        :param file_path: path of the data file.
        :param bags_status: bag source: 'g' (all), 'p' (positive) or 'n' (negative).
        :param embed_status: mapping mode: 'add' (sum) or 'con' (concatenate).
        :param ratio_instance_to_bag: ratio of instance prototypes selected per bag.
        :param num_discriminative_instance: number of representative instances.
        :param bags: optionally pre-loaded bags (loaded from file_path otherwise).
        """
        self.file_path = file_path
        self.bags_status = bags_status
        self.embed_status = embed_status
        self.ra_ins = ratio_instance_to_bag
        self.num_dis_ins = num_discriminative_instance
        self.bags = bags
        self.__DIEPredict()

    @staticmethod
    def __format_result(score, std_values):
        """Format a mean score and its std as a LaTeX table cell."""
        return "&$%.1f" % (score * 10) + "_{\pm%.2f" % (np.std(std_values)) + "}$"

    def __DIEPredict(self):
        """Run the cross-validated evaluation and print the formatted results."""
        MIL = MILTool(para_file_name=self.file_path, bags=self.bags)  # load the data set
        bags = MIL.bags
        bags_label = MIL.bags_label
        num_fold = 10  # repetitions of the 10-fold cross validation
        # Index 0..3 = kNN, decision tree, linear SVM, RBF SVM throughout.
        accuracy_res_score = np.zeros(4).astype('float64')
        f1_res_score = np.zeros(4).astype('float64')
        roc_aoc_res_score = np.zeros(4).astype('float64')
        accuracy_std = [[], [], [], []]
        f1_res_std = [[], [], [], []]
        roc_aoc_res_std = [[], [], [], []]
        for i in range(num_fold):
            # The four classifiers, rebuilt fresh for every repetition.
            estimators = [MIL.get_classifier('knn'),
                          MIL.get_classifier('DTree'),
                          MIL.get_classifier('linear_svm'),
                          MIL.get_classifier('rbf_svm')]
            accuracy_measure_score = np.zeros(4).astype('float64')
            fl_measure_score = np.zeros(4).astype('float64')
            roc_measure_score = np.zeros(4).astype('float64')
            tr_index, te_index = get_ten_fold_index(bags)
            temp_f1_score = 10  # fold count, used to average the per-fold sums
            temp_acc_score = 10
            temp_roc_score = 10
            for j in range(10):
                tr_label_true = bags_label[[tr_index[j]]]  # training labels
                te_label_true = bags_label[[te_index[j]]]  # test labels
                # Step 1. Embed every bag into a single vector with DIE.
                test_Demo = DIE(bags, tr_index[j], self.bags_status, self.embed_status,
                                self.ra_ins, self.num_dis_ins).embedding_vector
                tr_embedding_vector = test_Demo[tr_index[j]]
                te_embedding_vector = test_Demo[te_index[j]]
                # Step 2. Fit and score each classifier on this fold.
                for k, estimator in enumerate(estimators):
                    model = estimator.fit(tr_embedding_vector, tr_label_true)
                    te_label_predict = model.predict(te_embedding_vector)
                    fl_measure_score[k] += f1_score(te_label_true, te_label_predict)
                    accuracy_measure_score[k] += accuracy_score(te_label_true, te_label_predict)
                    try:
                        # BUG FIX: each classifier is now scored on its OWN
                        # predictions; the tree/SVM branches previously reused
                        # the kNN predictions for ROC-AUC.
                        roc_measure_score[k] += roc_auc_score(te_label_true, te_label_predict)
                    except ValueError:
                        # ROC-AUC is undefined when only one class appears in the fold.
                        pass
            # Accumulate the averaged fold scores and the values used for the std.
            for k in range(4):
                accuracy_res_score[k] += accuracy_measure_score[k] / temp_acc_score
                accuracy_std[k].append(accuracy_measure_score[k] * temp_acc_score)
                f1_res_score[k] += fl_measure_score[k] / temp_f1_score
                f1_res_std[k].append(fl_measure_score[k] * temp_f1_score)
                roc_aoc_res_score[k] += roc_measure_score[k] / temp_roc_score
                roc_aoc_res_std[k].append(roc_measure_score[k] * temp_roc_score)
        knn_acc_res = self.__format_result(accuracy_res_score[0], accuracy_std[0])
        knn_f1_res = self.__format_result(f1_res_score[0], f1_res_std[0])
        knn_roc_res = self.__format_result(roc_aoc_res_score[0], roc_aoc_res_std[0])
        DTree_acc_res = self.__format_result(accuracy_res_score[1], accuracy_std[1])
        DTree_f1_res = self.__format_result(f1_res_score[1], f1_res_std[1])
        DTree_roc_res = self.__format_result(roc_aoc_res_score[1], roc_aoc_res_std[1])
        LSVM_acc_res = self.__format_result(accuracy_res_score[2], accuracy_std[2])
        LSVM_f1_res = self.__format_result(f1_res_score[2], f1_res_std[2])
        LSVM_roc_res = self.__format_result(roc_aoc_res_score[2], roc_aoc_res_std[2])
        RSVM_acc_res = self.__format_result(accuracy_res_score[3], accuracy_std[3])
        RSVM_f1_res = self.__format_result(f1_res_score[3], f1_res_std[3])
        RSVM_roc_res = self.__format_result(roc_aoc_res_score[3], roc_aoc_res_std[3])
        # Print the results row.
        print('\t\t\t\t\t', knn_acc_res, '\t', knn_f1_res, '\t', knn_roc_res, '\t',
              DTree_acc_res, '\t', DTree_f1_res, '\t', DTree_roc_res, '\t',
              LSVM_acc_res, '\t', LSVM_f1_res, '\t', LSVM_roc_res, '\t',
              RSVM_acc_res, '\t', RSVM_f1_res, '\t', RSVM_roc_res)
if __name__ == '__main__':
    file_name = "mutagenesis1.mat"
    # BUG FIX: DIEPredict requires bags_status, embed_status, the prototype
    # ratio and the number of representative instances. The old call
    # DIEPredict(file_name, file_name[30:]) passed only two positional
    # arguments (the second being an empty string, since the file name is
    # shorter than 30 characters) and raised a TypeError.
    DIEPredict(file_name, 'g', 'con', 0.01, 2)