1、算法

本文主要是论文《Two-stage instance selection and adaptive bag mapping algorithm for multi-instance learning》中算法代码的复现。具体算法原理见此文。本文与SMDP有类似之处,都使用了DP聚类。

2、代码

2.1 DIP.py

这一部分代码用于创建实例原型池,即:找出每个包的代表实例并将它们聚集在一起。此阶段在包内选取实例原型。

import math
import numpy as np
from scipy.spatial.distance import cdist
from MIL_MIDIE.MIL_Frame.MILTool import get_cosine


class DIP:
    """Discriminative instance pool (DIP) based mapping.

    Stage one of the MIDIE algorithm: for every bag, select the
    instances with the smallest affinity-density score (lambda) and
    gather them into a shared discriminative instance pool.
    """

    def __init__(self, bags, scale_num):
        """
        :param bags:      array of bags; bag[0] is an (n_ins, d+1) matrix
                          whose last column is the instance label.
        :param scale_num: ratio of instances selected from each bag.
        """
        self.bags = bags
        self.scale_num = scale_num
        # Pool of selected discriminative instances.  NOTE(review): the
        # name is kept for backward compatibility although it stores
        # instances, not distances.
        self.inner_bag_distance = []
        self.discriminative_instance = self.__get_discriminative_ins()

    def __get_discriminative_ins(self):
        """Build the discriminative instance pool over all bags.

        :return: (n_selected, d) array of pooled instances.
        """
        for bag_i in range(self.bags.shape[0]):
            self.__get_instance_in_bag(self.bags[bag_i])
        return np.array(self.inner_bag_distance)

    def __get_instance_in_bag(self, bag):
        """Select the discriminative instances of one bag.

        Appends ceil(scale_num * n_instances) instances — those with the
        smallest lambda score — to ``self.inner_bag_distance``.
        """
        # Step 1. Pairwise Euclidean distances between the bag's
        # instances (label column dropped).  cdist's default metric is
        # Euclidean, not Mahalanobis as the original comment claimed.
        ins_space_to_bag = np.array(bag[0][:, :-1])
        distance_ins_to_ins = cdist(ins_space_to_bag, ins_space_to_bag)
        # Step 2. Affinity score: for each instance, count how many
        # instances (including itself) have cosine similarity at or
        # below the mean similarity.
        affinity_ins_to_ins = get_cosine(ins_space_to_bag)
        num_ins = len(distance_ins_to_ins)
        ave_dis_ins = affinity_ins_to_ins.mean()
        affinity_ins_score = [
            int(np.sum(affinity_ins_to_ins[i] <= ave_dis_ins))
            for i in range(num_ins)
        ]
        # Step 3. Density score with a Gaussian kernel; the cutoff is
        # 40% of the largest pairwise distance.
        dis_cut = 0.4 * distance_ins_to_ins.max()
        density_ins_score = np.zeros(num_ins).astype("float64")
        for i in range(num_ins):
            if dis_cut == 0:
                # All instances coincide; density defaults to 1.
                density_ins_score[i] = 1
            else:
                density_ins_score[i] = sum(np.exp(-(distance_ins_to_ins[i] / dis_cut) ** 2))
        # Step 4. lambda = affinity * density for every instance.
        lambda_ins_score = np.multiply(affinity_ins_score, density_ins_score).tolist()
        # Step 5. Keep the instances with the smallest lambda scores.
        for _ in range(math.ceil(self.scale_num * bag[0].shape[0])):
            # Compute the arg-min once instead of calling index(min(...))
            # twice per iteration as the original did.
            min_index = lambda_ins_score.index(min(lambda_ins_score))
            self.inner_bag_distance.append(ins_space_to_bag[min_index])
            lambda_ins_score[min_index] = -1  # sentinel: real scores are >= 0


if __name__ == '__main__':
    # Quick sanity demo: distance from one point to a small matrix.
    point = np.array([0, 1, 3])
    matrix = np.array([[12, 3, 4], [0, 1, 2], [2, 5, 6], [1, 2, 3]])
    distances = cdist([point], matrix)
    print(min(max(distances)))
2.2 SDI.py

这段代码主要是进行第二阶段:在实例原型池中选出代表实例。先要通过DIP得到实例原型池,再通过DP来选出代表实例。在这个过程中,需要计算实例原型的密度 $\rho_i$,以及实例到比它密度更大的实例的最近距离 $\delta_i$。最后计算 $\lambda_i = \rho_i \times \delta_i$ 来找出代表实例。

import numpy as np
from scipy.spatial.distance import cdist
import MIL_MIDIE.MIDIE.DIP as DIP
from MIL_MIDIE.MIL_Frame.MILTool import MILTool


class SDI:
    """Select discriminative instances (SDI): stage two of MIDIE.

    Runs density-peak (DP) clustering over the discriminative instance
    pool produced by DIP and keeps the ``num_SDI`` instances with the
    largest lambda = density * distance-to-nearest-denser-instance.
    """

    def __init__(self, bags, ratio_instance_to_bag, num_SDI):
        """
        :param bags: bags array forwarded to DIP.
        :param ratio_instance_to_bag: per-bag selection ratio for DIP.
        :param num_SDI: number of representative instances to keep.
        """
        self.bags = bags
        self.ratio_ins = ratio_instance_to_bag
        self.num_SDI = num_SDI
        self.discriminative_instance = self.__print_SDI()
        self.final_discriminative_instance = self.__select_discriminative_instance()

    def __print_SDI(self):
        """Build the instance pool via DIP.

        NOTE(review): the method builds (not prints) the pool; the name
        is kept for backward compatibility.
        :return: the pooled discriminative instances.
        """
        DIP_demo = DIP.DIP(self.bags, self.ratio_ins)
        return DIP_demo.discriminative_instance

    def __select_discriminative_instance(self):
        """Pick the representative instances by DP clustering.

        :return: (num_SDI, d) array of representative instances.
        """
        # Step 1. Pairwise distances between pooled instances.
        discriminative_instance_distance = cdist(self.discriminative_instance, self.discriminative_instance)
        # Step 2. Cutoff distance: 40% of the largest pairwise distance.
        dis_cut = 0.4 * discriminative_instance_distance.max()
        num_ins = len(discriminative_instance_distance)
        # Gaussian-kernel density of each pooled instance.
        density_discriminative_ins = np.zeros(num_ins).astype("float64")
        for i in range(num_ins):
            if dis_cut == 0:
                density_discriminative_ins[i] = 1
            else:
                density_discriminative_ins[i] = sum(np.exp(-(discriminative_instance_distance[i] / dis_cut) ** 2))
        # Step 3. delta_i: distance to the closest instance denser than
        # instance i; the densest instance gets +inf so it is always
        # selected first.
        distance_closest = []
        for i in range(num_ins):
            denser_distances = [
                discriminative_instance_distance[i][j]
                for j in range(num_ins)
                if density_discriminative_ins[j] > density_discriminative_ins[i]
            ]
            # min() replaces the previous sort-then-take-first; same result.
            distance_closest.append(min(denser_distances) if denser_distances else float('inf'))
        # Step 4. lambda = delta * density; keep the num_SDI largest.
        lambda_discriminative_instance = np.multiply(distance_closest, density_discriminative_ins).tolist()
        final_discriminative_instance = []
        for _ in range(self.num_SDI):
            index_most = lambda_discriminative_instance.index(max(lambda_discriminative_instance))
            final_discriminative_instance.append(self.discriminative_instance[index_most])
            lambda_discriminative_instance[index_most] = -1  # exclude from further picks

        return np.array(final_discriminative_instance)


if __name__ == '__main__':
    # Demo: select two representative instances from the MUSK1 data set.
    data_file = "MUSK1_1.arff"
    tool = MILTool(data_file)
    demo = SDI(tool.bags, 0.01, 2)
    selected = demo.final_discriminative_instance
    print(type(selected))
2.3 DIE.py

这一部分的代码主要做的就是包映射相关工作。可以对不同状态的包进行包获取,包状态主要有三种:global、positive、negative。也可以通过相加(add)或拼接(con)两种方式来对包进行映射。最后对向量进行归一化。

import warnings
import numpy as np
from MIL_MIDIE.MIDIE.SDI import SDI
from MIL_MIDIE.MIL_Frame.MILTool import MILTool, dis_euclidean, get_ten_fold_index

warnings.filterwarnings('ignore')


class DIE:
    """Discriminative instance embedding (DIE): map each bag to a vector.

    Bags are embedded relative to the representative instances found by
    SDI, either by summing ('add') or concatenating ('con') the residual
    vectors, then power- and L2-normalizing the result.
    """

    def __init__(self, all_bag, tr_index, bags_status, embed_status, ratio_instance_to_bag,
                 num_discriminative_instance):
        """
        :param all_bag: array of all bags.
        :param tr_index: indices of the training bags.
        :param bags_status: which training bags feed instance selection:
                            'g' (global), 'p' (positive) or 'n' (negative).
        :param embed_status: embedding model, 'add' (sum) or 'con' (concat).
        :param ratio_instance_to_bag: per-bag instance selection ratio.
        :param num_discriminative_instance: number of representative instances.
        """
        self.bags = all_bag
        self.bags_status = bags_status
        self.embed_status = embed_status
        self.tr_index = tr_index
        self.ra_ins = ratio_instance_to_bag
        self.num_dis_ins = num_discriminative_instance
        self.train_final_bag = self.__get_bags()
        self.embedding_vector = self.__embedding()

    def __get_bags(self):
        """Return the training bags selected by ``bags_status``.

        g: all training bags; p: positive training bags (label == 1);
        n: negative training bags (label != 1).
        :return: the selected bags (implicitly None for an unknown status).
        """
        if self.bags_status == 'g':  # global bags
            return self.bags[self.tr_index]
        elif self.bags_status == 'p':  # positive bags
            positive_bags_index = [
                index for index in self.tr_index if self.bags[index, -1] == 1
            ]
            return self.bags[positive_bags_index]
        elif self.bags_status == 'n':  # negative bags
            negative_bags_index = [
                index for index in self.tr_index if not self.bags[index, -1] == 1
            ]
            return self.bags[negative_bags_index]

    def __embedding(self):
        """Embed every bag into a single vector.

        :return: (n_bags, dim) array of embedded vectors.
        """
        # Step 1. Representative instances via SDI.
        discriminative_instance = SDI(self.train_final_bag, self.ra_ins, self.num_dis_ins).final_discriminative_instance

        # Step 2. Map each bag to one vector.
        bag_to_vector = []
        for bag_i in range(self.bags.shape[0]):
            # Step 2.1 Allocate the vector for the chosen embedding model.
            dim = self.bags[bag_i][0].shape[1] - 1  # feature dim (label column dropped)
            if self.embed_status == 'add':  # sum of residuals
                temp_single_vector = np.zeros(dim).astype("float64")
            elif self.embed_status == 'con':  # one slot per representative
                temp_single_vector = np.zeros(self.num_dis_ins * dim).astype("float64")
            else:
                # Unknown model: keep the original best-effort behavior of
                # stopping and returning whatever was embedded so far.
                print('Your input model is not exist!\n')
                break
            # Step 2.2 Accumulate each instance's residual against its
            # nearest representative instance.
            for ins_i in self.bags[bag_i][0][:, :-1]:
                temp_distance_dis_to_ins = [
                    dis_euclidean(ins_i, discriminative_instance[dis_ins_i])
                    for dis_ins_i in range(self.num_dis_ins)
                ]
                temp_index = temp_distance_dis_to_ins.index(min(temp_distance_dis_to_ins))
                # Residual between the instance and its nearest representative.
                temp_dis_to_ins_vector = ins_i - discriminative_instance[temp_index]
                if self.embed_status == 'add':
                    temp_single_vector += temp_dis_to_ins_vector
                elif self.embed_status == 'con':
                    # Write the residual into slot temp_index (the original
                    # redundant start_index offset of 0 was removed).
                    temp_single_vector[temp_index * dim:(temp_index + 1) * dim] += temp_dis_to_ins_vector

            # Step 3. Power normalization followed by L2 normalization.
            temp_single_vector = np.sign(temp_single_vector) * np.sqrt(np.abs(temp_single_vector))
            temp_norm = np.linalg.norm(temp_single_vector)
            if temp_norm > 0:  # guard: an all-zero vector previously became NaN
                temp_single_vector = temp_single_vector / temp_norm

            bag_to_vector.append(temp_single_vector)

        return np.array(bag_to_vector)


if __name__ == '__main__':
    # Demo: embed the bags of one data set and print the vectors.
    data_file = "rec_sport_hockey.mat"
    tool = MILTool(data_file)
    all_bags = tool.bags
    fold_train_index, fold_test_index = get_ten_fold_index(all_bags)
    vectors = DIE(all_bags, fold_train_index[1], 'g', 'con', 0.01, 2).embedding_vector
    print(vectors)
    vectors = np.array(vectors)
2.4 DIEPredict.py

这一部分主要是对测试集进行预测,并且分别使用KNN、决策树、线性支持向量机、高斯径向基函数支持向量机对映射后的数据集进行分类预测,最后输出预测精度。

import warnings
import numpy as np
from sklearn.metrics import f1_score,accuracy_score,roc_auc_score
from MIL_MIDIE.MIDIE.DIE import DIE
from MIL_MIDIE.MIL_Frame.MILTool import MILTool
from MIL_MIDIE.MIL_Frame.MILTool import get_ten_fold_index

warnings.filterwarnings('ignore')


class DIEPredict:
    """Run repeated ten-fold evaluation of the DIE embedding.

    The embedded bags are classified with KNN, a decision tree, a linear
    SVM and an RBF SVM; accuracy, F1 and ROC-AUC are accumulated and
    printed as LaTeX table cells.
    """

    def __init__(self, file_path, bags_status, embed_status, ratio_instance_to_bag,
                 num_discriminative_instance, bags=None):
        """
        :param file_path: path of the data file.
        :param bags_status: bag source: 'g' (all), 'p' (positive) or 'n' (negative).
        :param embed_status: embedding model, 'add' (sum) or 'con' (concat).
        :param ratio_instance_to_bag: per-bag instance prototype ratio.
        :param num_discriminative_instance: number of representative instances.
        :param bags: optional pre-loaded bags.
        """
        self.file_path = file_path
        self.bags_status = bags_status
        self.embed_status = embed_status
        self.ra_ins = ratio_instance_to_bag
        self.num_dis_ins = num_discriminative_instance
        self.bags = bags
        self.__DIEPredict()

    def __DIEPredict(self):
        """Evaluate the embedding over ``num_fold`` rounds of 10-fold CV."""
        MIL = MILTool(para_file_name=self.file_path, bags=self.bags)  # load the data set
        bags = MIL.bags          # the bags
        bags_label = MIL.bags_label  # the bag labels
        num_fold = 10

        accuracy_res_score = np.zeros(4).astype('float64')  # accumulated accuracy
        f1_res_score = np.zeros(4).astype('float64')        # accumulated F1
        roc_aoc_res_score = np.zeros(4).astype('float64')   # accumulated ROC-AUC
        accuracy_std = [[], [], [], []]
        f1_res_std = [[], [], [], []]
        roc_aoc_res_std = [[], [], [], []]
        for i in range(num_fold):
            # The four classifiers.
            knn_estimator = MIL.get_classifier('knn')          # k-nearest neighbours
            DTree_estimator = MIL.get_classifier('DTree')      # decision tree
            LSVM_estimator = MIL.get_classifier('linear_svm')  # linear SVM
            RSVM_estimator = MIL.get_classifier('rbf_svm')     # RBF-kernel SVM
            accuracy_measure_score = np.zeros(4).astype('float64')
            fl_measure_score = np.zeros(4).astype('float64')
            roc_measure_score = np.zeros(4).astype('float64')

            tr_index, te_index = get_ten_fold_index(bags)
            # Divisor of 10 used to average the ten folds below.
            temp_f1_score = 10
            temp_acc_score = 10
            temp_roc_score = 10
            for j in range(10):
                tr_label_true = bags_label[[tr_index[j]]]  # training labels
                te_label_true = bags_label[[te_index[j]]]  # test labels

                # Step 1. Embed every bag with the DIE algorithm.
                test_Demo = DIE(bags, tr_index[j], self.bags_status, self.embed_status, self.ra_ins,
                                self.num_dis_ins).embedding_vector

                tr_embedding_vector = test_Demo[tr_index[j]]
                te_embedding_vector = test_Demo[te_index[j]]

                # Step 2.1.1 Classify with KNN.
                knn_model = knn_estimator.fit(tr_embedding_vector, tr_label_true)
                te_label_predict_knn = knn_model.predict(te_embedding_vector)
                # 2.1.2 The three evaluation scores.
                fl_measure_score[0] += f1_score(te_label_true, te_label_predict_knn)
                accuracy_measure_score[0] += accuracy_score(te_label_true, te_label_predict_knn)
                try:
                    temp_aoc = roc_auc_score(te_label_true, te_label_predict_knn)
                    roc_measure_score[0] += temp_aoc
                except ValueError:
                    # Single-class fold: ROC-AUC is undefined; skip it.
                    pass

                # Step 2.2.1 Classify with the decision tree.
                DTree_model = DTree_estimator.fit(tr_embedding_vector, tr_label_true)
                te_label_predict_tree = DTree_model.predict(te_embedding_vector)
                # 2.2.2 The three evaluation scores.
                fl_measure_score[1] += f1_score(te_label_true, te_label_predict_tree)
                accuracy_measure_score[1] += accuracy_score(te_label_true, te_label_predict_tree)
                try:
                    # Bug fix: was computed from the KNN predictions.
                    temp_aoc = roc_auc_score(te_label_true, te_label_predict_tree)
                    roc_measure_score[1] += temp_aoc
                except ValueError:
                    pass

                # Step 2.3.1 Classify with the linear SVM.
                LSVM_model = LSVM_estimator.fit(tr_embedding_vector, tr_label_true)
                te_label_predict_LSVM = LSVM_model.predict(te_embedding_vector)
                # 2.3.2 The three evaluation scores.
                fl_measure_score[2] += f1_score(te_label_true, te_label_predict_LSVM)
                accuracy_measure_score[2] += accuracy_score(te_label_true, te_label_predict_LSVM)
                try:
                    # Bug fix: was computed from the KNN predictions.
                    temp_aoc = roc_auc_score(te_label_true, te_label_predict_LSVM)
                    roc_measure_score[2] += temp_aoc
                except ValueError:
                    pass

                # Step 2.4.1 Classify with the RBF-kernel SVM.
                RSVM_model = RSVM_estimator.fit(tr_embedding_vector, tr_label_true)
                te_label_predict_RSVM = RSVM_model.predict(te_embedding_vector)
                # 2.4.2 The three evaluation scores.
                fl_measure_score[3] += f1_score(te_label_true, te_label_predict_RSVM)
                accuracy_measure_score[3] += accuracy_score(te_label_true, te_label_predict_RSVM)
                try:
                    # Bug fix: was computed from the KNN predictions.
                    temp_aoc = roc_auc_score(te_label_true, te_label_predict_RSVM)
                    roc_measure_score[3] += temp_aoc
                except ValueError:
                    pass

                # NOTE(review): the accumulation and printing below run inside
                # the per-fold loop while *_measure_score also accumulates
                # across folds, so intermediate results are over-counted;
                # they probably belong after the j loop — confirm intent
                # before restructuring.
                # Accumulated accuracy of this round.
                accuracy_res_score[0] += accuracy_measure_score[0] / temp_acc_score  # Knn   acc
                accuracy_res_score[1] += accuracy_measure_score[1] / temp_acc_score  # DTree acc
                accuracy_res_score[2] += accuracy_measure_score[2] / temp_acc_score  # LSVM  acc
                accuracy_res_score[3] += accuracy_measure_score[3] / temp_acc_score  # RSVM  acc

                # Samples for the accuracy standard deviation.
                accuracy_std[0].append(accuracy_measure_score[0] * temp_acc_score)  # Knn   std
                accuracy_std[1].append(accuracy_measure_score[1] * temp_acc_score)  # DTree std
                accuracy_std[2].append(accuracy_measure_score[2] * temp_acc_score)  # LSVM  std
                accuracy_std[3].append(accuracy_measure_score[3] * temp_acc_score)  # RSVM  std

                # Accumulated F1 of this round.
                f1_res_score[0] += fl_measure_score[0] / temp_f1_score  # Knn   f1
                f1_res_score[1] += fl_measure_score[1] / temp_f1_score  # DTree f1
                f1_res_score[2] += fl_measure_score[2] / temp_f1_score  # LSVM  f1
                f1_res_score[3] += fl_measure_score[3] / temp_f1_score  # RSVM  f1

                # Samples for the F1 standard deviation.
                f1_res_std[0].append(fl_measure_score[0] * temp_f1_score)  # Knn   std
                f1_res_std[1].append(fl_measure_score[1] * temp_f1_score)  # DTree std
                f1_res_std[2].append(fl_measure_score[2] * temp_f1_score)  # LSVM  std
                f1_res_std[3].append(fl_measure_score[3] * temp_f1_score)  # RSVM  std

                # Accumulated ROC-AUC of this round.
                roc_aoc_res_score[0] += roc_measure_score[0] / temp_roc_score  # Knn   roc
                roc_aoc_res_score[1] += roc_measure_score[1] / temp_roc_score  # DTree roc
                roc_aoc_res_score[2] += roc_measure_score[2] / temp_roc_score  # LSVM  roc
                roc_aoc_res_score[3] += roc_measure_score[3] / temp_roc_score  # RSVM  roc

                # Samples for the ROC-AUC standard deviation.
                roc_aoc_res_std[0].append(roc_measure_score[0] * temp_roc_score)  # Knn   std
                roc_aoc_res_std[1].append(roc_measure_score[1] * temp_roc_score)  # DTree std
                roc_aoc_res_std[2].append(roc_measure_score[2] * temp_roc_score)  # LSVM  std
                roc_aoc_res_std[3].append(roc_measure_score[3] * temp_roc_score)  # RSVM  std

                # LaTeX table cells "&$score_{\pm std}$" (raw strings keep
                # the backslash literal, as the old non-raw strings did).
                knn_acc_res = "&$%.1f" % (accuracy_res_score[0] * 10) + r"_{\pm%.2f" % (np.std(accuracy_std[0])) + "}$"
                knn_f1_res = "&$%.1f" % (f1_res_score[0] * 10) + r"_{\pm%.2f" % (np.std(f1_res_std[0])) + "}$"
                knn_roc_res = "&$%.1f" % (roc_aoc_res_score[0] * 10) + r"_{\pm%.2f" % (np.std(roc_aoc_res_std[0])) + "}$"

                DTree_acc_res = "&$%.1f" % (accuracy_res_score[1] * 10) + r"_{\pm%.2f" % (np.std(accuracy_std[1])) + "}$"
                DTree_f1_res = "&$%.1f" % (f1_res_score[1] * 10) + r"_{\pm%.2f" % (np.std(f1_res_std[1])) + "}$"
                DTree_roc_res = "&$%.1f" % (roc_aoc_res_score[1] * 10) + r"_{\pm%.2f" % (
                    np.std(roc_aoc_res_std[1])) + "}$"

                LSVM_acc_res = "&$%.1f" % (accuracy_res_score[2] * 10) + r"_{\pm%.2f" % (np.std(accuracy_std[2])) + "}$"
                LSVM_f1_res = "&$%.1f" % (f1_res_score[2] * 10) + r"_{\pm%.2f" % (np.std(f1_res_std[2])) + "}$"
                LSVM_roc_res = "&$%.1f" % (roc_aoc_res_score[2] * 10) + r"_{\pm%.2f" % (
                    np.std(roc_aoc_res_std[2])) + "}$"

                RSVM_acc_res = "&$%.1f" % (accuracy_res_score[3] * 10) + r"_{\pm%.2f" % (np.std(accuracy_std[3])) + "}$"
                RSVM_f1_res = "&$%.1f" % (f1_res_score[3] * 10) + r"_{\pm%.2f" % (np.std(f1_res_std[3])) + "}$"
                RSVM_roc_res = "&$%.1f" % (roc_aoc_res_score[3] * 10) + r"_{\pm%.2f" % (
                    np.std(roc_aoc_res_std[3])) + "}$"

                # Print the current results.
                print('\t\t\t\t\t', knn_acc_res, '\t', knn_f1_res, '\t', knn_roc_res, '\t',
                      DTree_acc_res, '\t', DTree_f1_res, '\t', DTree_roc_res, '\t',
                      LSVM_acc_res, '\t', LSVM_f1_res, '\t', LSVM_roc_res, '\t',
                      RSVM_acc_res, '\t', RSVM_f1_res, '\t', RSVM_roc_res)


if __name__ == '__main__':
    file_name = "mutagenesis1.mat"
    # Bug fix: the original sliced file_name[30:] (an empty string for
    # this 16-character name) and passed only two of the five required
    # positional arguments, which raised a TypeError.  Call with a full,
    # valid parameter set instead.
    DIEPredict(file_name, 'g', 'con', 0.01, 2)