一、问题

医院礼品店已完成5项交易,购买记录清单如表8-3所示,请使用Apriori算法进行关联规则分析。

(1)使用数字代替商品完成第3列;

(2)计算购买“鲜花”的支持度;

(3)计算购买“慰问卡”的支持度;

(4)计算同时购买“鲜花”和购买“慰问卡”的支持度和置信度;

(5)使用Python对表8-3中的购买记录清单进行Apriori关联规则分析。

{1:鲜花、2:慰问卡、3:苏打水、4:毛绒玩具熊、5:气球、6:糖果}

二、代码

(1)使用数字代替商品完成第3列;

表 8-3 某医院礼品店购买清单

交易序号

购买礼品清单

使用数字代替

1

鲜花、慰问卡、苏打水

1,2,3

2

毛绒玩具熊、鲜花、气球、糖果

4,1,5,6

3

慰问卡、糖果、鲜花

2,6,1

4

毛绒玩具熊、气球、苏打水

4,5,3

5

鲜花、慰问卡、苏打水

1,2,3

# (1)使用数字代替商品完成第3列
# Print the item-ID legend: the number that stands for each gift in the transactions.
print('{1:鲜花、2:慰问卡、3:苏打水、4:毛绒玩具熊、5:气球、6:糖果}')
def loadDataSet():
    """Return the five gift-shop transactions as lists of numeric item IDs.

    A new list is built on every call, so callers may modify the result
    without affecting later calls.
    """
    transactions = [
        [1, 2, 3],
        [4, 1, 5, 6],
        [2, 6, 1],
        [4, 5, 3],
        [1, 2, 3],
    ]
    return transactions
# Load the transactions into the module-level name D and display them.
D=loadDataSet()
print(D)

(2)计算购买“鲜花”的支持度;

# (2)计算购买“鲜花”的支持度
# 1-项集C1
def createC1(dataSet):
    """Build the candidate 1-itemsets for a collection of transactions.

    Args:
        dataSet: iterable of transactions, each an iterable of items.

    Returns:
        A list with one single-element frozenset per distinct item,
        ordered by ascending item value.
    """
    distinct_items = {item for transaction in dataSet for item in transaction}
    return [frozenset([item]) for item in sorted(distinct_items)]

# Candidate 1-itemsets for the transactions in D (one frozenset per distinct item).
C1=createC1(D)
# print('C1:',C1)

# 输出频繁1-项集ret1、所有1-项集的支持度suD
def scanD(D,Ck,minSupport):
    """Compute the support of each candidate itemset and keep the frequent ones.

    Args:
        D: sequence of transactions (each an iterable of items).
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum fraction of transactions an itemset must occur in.

    Returns:
        (retList, supportData). supportData maps every candidate that occurs
        in at least one transaction to its support. retList holds the
        candidates whose support is >= minSupport, in reverse order of their
        first occurrence during the scan.
    """
    hit_counts = {}
    for transaction in D:
        for candidate in Ck:
            if not candidate.issubset(transaction):
                continue
            hit_counts[candidate] = hit_counts.get(candidate, 0) + 1

    total = float(len(D))
    supportData = {cand: count / total for cand, count in hit_counts.items()}

    frequent = [cand for cand, sup in supportData.items() if sup >= minSupport]
    # Reversing reproduces the "prepend each new hit" ordering of the result list.
    frequent.reverse()
    return frequent, supportData

# Scan freshly loaded transactions with minimum support 0.22:
# ret1 receives the frequent 1-itemsets, suD the support of each 1-itemset.
ret1,suD=scanD(loadDataSet(),createC1(loadDataSet()),0.22)
# print('ret1:',ret1)
# print('suD:',suD)
# frozenset({1}) is the itemset containing only item 1 (flowers in the legend).
print('购买“鲜花”的支持度{}。'.format(suD[frozenset({1})]))

(3)计算购买“慰问卡”的支持度;

# (3)计算购买“慰问卡”的支持度
# 1-项集C1
def createC1(dataSet):
    """Build the candidate 1-itemsets for a collection of transactions.

    Args:
        dataSet: iterable of transactions, each an iterable of items.

    Returns:
        A list with one single-element frozenset per distinct item,
        ordered by ascending item value.
    """
    distinct_items = {item for transaction in dataSet for item in transaction}
    return [frozenset([item]) for item in sorted(distinct_items)]

# Candidate 1-itemsets for the transactions in D (one frozenset per distinct item).
C1=createC1(D)
# print('C1:',C1)

# 输出频繁1-项集ret1、所有1-项集的支持度suD
def scanD(D,Ck,minSupport):
    """Compute the support of each candidate itemset and keep the frequent ones.

    Args:
        D: sequence of transactions (each an iterable of items).
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum fraction of transactions an itemset must occur in.

    Returns:
        (retList, supportData). supportData maps every candidate that occurs
        in at least one transaction to its support. retList holds the
        candidates whose support is >= minSupport, in reverse order of their
        first occurrence during the scan.
    """
    hit_counts = {}
    for transaction in D:
        for candidate in Ck:
            if not candidate.issubset(transaction):
                continue
            hit_counts[candidate] = hit_counts.get(candidate, 0) + 1

    total = float(len(D))
    supportData = {cand: count / total for cand, count in hit_counts.items()}

    frequent = [cand for cand, sup in supportData.items() if sup >= minSupport]
    # Reversing reproduces the "prepend each new hit" ordering of the result list.
    frequent.reverse()
    return frequent, supportData

# Scan freshly loaded transactions with minimum support 0.22:
# ret1 receives the frequent 1-itemsets, suD the support of each 1-itemset.
ret1,suD=scanD(loadDataSet(),createC1(loadDataSet()),0.22)
# print('ret1:',ret1)
# print('suD:',suD)
# frozenset({2}) is the itemset containing only item 2 (the get-well card in the legend).
print('购买“慰问卡”的支持度{}。'.format(suD[frozenset({2})]))

(4)计算同时购买“鲜花”和购买“慰问卡”的支持度和置信度;

# (4)计算同时购买“鲜花”和购买“慰问卡”的支持度和置信度
# 生成2-项集
def aprioriGen(Ck,k):
    """Generate candidate k-itemsets by joining (k-1)-itemsets.

    Two itemsets are joined (set union) when their first k-2 items, taken in
    sorted order, are identical. For k == 2 the prefix is empty, so every
    pair of 1-itemsets is joined.

    Args:
        Ck: list of frozensets, each of size k-1.
        k: size of the itemsets to generate.

    Returns:
        List of frozensets produced by the join, in pair-enumeration order.
    """
    # Sort BEFORE slicing: a frozenset iterates in arbitrary order, so taking
    # the first k-2 items of the raw iteration order could compare the wrong
    # prefixes and silently drop or duplicate candidates.
    prefixes = [sorted(itemset)[:k-2] for itemset in Ck]
    retList = []
    for i, left in enumerate(Ck):
        for j in range(i + 1, len(Ck)):
            if prefixes[i] == prefixes[j]:
                retList.append(left | Ck[j])
    return retList

# Candidate 2-itemsets: with k=2 aprioriGen joins every pair of 1-itemsets in C1.
ret2=aprioriGen(C1,2)
# print('ret2:',ret2)

# 所有2-项集的支持度suD2
def apriori(D,minSupport):
    """Find all frequent itemsets level by level.

    Args:
        D: sequence of transactions.
        minSupport: minimum support an itemset needs to count as frequent.

    Returns:
        (L, suppData). L[i] is the list of frequent itemsets of size i+1;
        the last entry of L is always an empty list, because the search stops
        at the first level that has no frequent itemsets. suppData collects
        the support of every itemset that scanD reported at any level.
    """
    frequent_1, suppData = scanD(D, createC1(D), minSupport)
    L = [frequent_1]
    # Keep extending while the most recent level still has frequent itemsets.
    while L[-1]:
        k = len(L) + 1
        candidates = aprioriGen(L[-1], k)
        frequent_k, support_k = scanD(D, candidates, minSupport)
        suppData.update(support_k)
        L.append(frequent_k)
    return L, suppData

# Run Apriori on D with minimum support 0.22.
# NOTE: despite its name, L1 is apriori's full first return value (the list of
# frequent-itemset lists for every size); suD2 is the itemset -> support table.
L1,suD2=apriori(D,0.22)
# print('L1:',L1)
# print('suD2:',suD2) 

# 计算规则的置信度
def calcConf(freqSet,H,supportData,brl,minConf=0.7):
    """Score the rules (freqSet - conseq) --> conseq for each consequent in H.

    Confidence is support(freqSet) / support(freqSet - conseq).

    Args:
        freqSet: the frequent itemset the rules are derived from.
        H: iterable of candidate consequents (frozensets).
        supportData: mapping from itemset to support.
        brl: list that receives an (antecedent, consequent, confidence)
            tuple for every rule meeting minConf; it is modified in place.
        minConf: minimum confidence a rule must reach to be kept.

    Returns:
        The consequents from H whose rule met minConf, in input order.
    """
    kept = []
    for consequent in H:
        antecedent = freqSet - consequent
        confidence = supportData[freqSet] / supportData[antecedent]
        if confidence >= minConf:
            brl.append((antecedent, consequent, confidence))
            kept.append(consequent)
    return kept

#对频繁项集中元素超过2的项集进行合并
def rulesFromConseq(freqSet,H,supportData,brl,minConf=0.7):
    """Grow the consequents in H by one item and record the rules that qualify.

    The consequents are merged into candidates one item larger, scored with
    calcConf (which appends qualifying rules to brl), and the process repeats
    recursively while more than one consequent survives.

    Args:
        freqSet: the frequent itemset the rules are derived from.
        H: non-empty list of equally sized consequents (frozensets).
        supportData: mapping from itemset to support.
        brl: list of rule tuples, extended in place.
        minConf: minimum confidence a rule must reach to be kept.

    Returns:
        None.
    """
    conseq_size = len(H[0])
    # A larger consequent must still leave at least one item for the antecedent.
    if len(freqSet) <= conseq_size + 1:
        return
    merged = aprioriGen(H, conseq_size + 1)
    surviving = calcConf(freqSet, merged, supportData, brl, minConf)
    if len(surviving) > 1:
        rulesFromConseq(freqSet, surviving, supportData, brl, minConf)
            
# 满足最小置信度要求的规则
def generateRules(L,supportData,minConf=0.7):
    """Derive association rules from the frequent itemsets.

    Args:
        L: list of frequent-itemset lists, where L[i] holds the itemsets of
            size i+1 (the first value returned by apriori).
        supportData: mapping from itemset to support.
        minConf: minimum confidence a rule must reach to be reported.

    Returns:
        List of (antecedent, consequent, confidence) tuples.
    """
    bigRuleList=[]
    # L[0] holds 1-itemsets, which cannot be split into a rule, so start at 1.
    for i in range(1,len(L)):
        for freqSet in L[i]:
            H1=[frozenset([item]) for item in freqSet]
            # Always score the single-item consequents first. Previously,
            # itemsets with three or more items went straight to
            # rulesFromConseq, which begins with 2-item consequents, so rules
            # such as {a, b} --> {c} were never produced.
            H1=calcConf(freqSet,H1,supportData,bigRuleList,minConf)
            # Larger consequents are built only from surviving ones; the
            # len check also keeps rulesFromConseq from seeing an empty list.
            if i>1 and len(H1)>1:
                rulesFromConseq(freqSet,H1,supportData,bigRuleList,minConf)
    return bigRuleList

# Mine rules with a low confidence threshold (0.1) so that both directions of
# the rule between item 1 (flowers) and item 2 (get-well card) are retained.
bRlist=generateRules(L1,suD2,0.1)
print('同时购买“鲜花”和购买“慰问卡”的支持度为{}。'.format(suD2[ frozenset({1, 2})]))
# The lines that follow report confidence values, so the heading says
# "confidence" (it previously repeated the "support" label).
print('同时购买“鲜花”和购买“慰问卡”的置信度为:')
# Print the computed rules over exactly items {1, 2} rather than hard-coded
# text, so the report always reflects the data actually mined.
for antecedent, consequent, conf in bRlist:
    if antecedent | consequent == frozenset({1, 2}):
        print(antecedent, '-->', consequent, 'conf:', conf)

(5)使用Python对表8-3中的购买记录清单进行Apriori关联规则分析。

# (5)使用Python对表8-3中的购买记录清单进行Apriori关联规则分析
# 生成2-项集
def aprioriGen(Ck,k):
    """Generate candidate k-itemsets by joining (k-1)-itemsets.

    Two itemsets are joined (set union) when their first k-2 items, taken in
    sorted order, are identical. For k == 2 the prefix is empty, so every
    pair of 1-itemsets is joined.

    Args:
        Ck: list of frozensets, each of size k-1.
        k: size of the itemsets to generate.

    Returns:
        List of frozensets produced by the join, in pair-enumeration order.
    """
    # Sort BEFORE slicing: a frozenset iterates in arbitrary order, so taking
    # the first k-2 items of the raw iteration order could compare the wrong
    # prefixes and silently drop or duplicate candidates.
    prefixes = [sorted(itemset)[:k-2] for itemset in Ck]
    retList = []
    for i, left in enumerate(Ck):
        for j in range(i + 1, len(Ck)):
            if prefixes[i] == prefixes[j]:
                retList.append(left | Ck[j])
    return retList

# Candidate 2-itemsets: with k=2 aprioriGen joins every pair of 1-itemsets in C1.
ret2=aprioriGen(C1,2)
# print('ret2:',ret2)

# 所有2-项集的支持度suD2
def apriori(D,minSupport):
    """Find all frequent itemsets level by level.

    Args:
        D: sequence of transactions.
        minSupport: minimum support an itemset needs to count as frequent.

    Returns:
        (L, suppData). L[i] is the list of frequent itemsets of size i+1;
        the last entry of L is always an empty list, because the search stops
        at the first level that has no frequent itemsets. suppData collects
        the support of every itemset that scanD reported at any level.
    """
    frequent_1, suppData = scanD(D, createC1(D), minSupport)
    L = [frequent_1]
    # Keep extending while the most recent level still has frequent itemsets.
    while L[-1]:
        k = len(L) + 1
        candidates = aprioriGen(L[-1], k)
        frequent_k, support_k = scanD(D, candidates, minSupport)
        suppData.update(support_k)
        L.append(frequent_k)
    return L, suppData

# Run Apriori on D with minimum support 0.22.
# NOTE: despite its name, L1 is apriori's full first return value (the list of
# frequent-itemset lists for every size); suD2 is the itemset -> support table.
L1,suD2=apriori(D,0.22)
# print('L1:',L1)
# print('suD2:',suD2) 

# 计算规则的置信度
def calcConf(freqSet,H,supportData,brl,minConf=0.7):
    """Score the rules (freqSet - conseq) --> conseq for each consequent in H.

    Confidence is support(freqSet) / support(freqSet - conseq).

    Args:
        freqSet: the frequent itemset the rules are derived from.
        H: iterable of candidate consequents (frozensets).
        supportData: mapping from itemset to support.
        brl: list that receives an (antecedent, consequent, confidence)
            tuple for every rule meeting minConf; it is modified in place.
        minConf: minimum confidence a rule must reach to be kept.

    Returns:
        The consequents from H whose rule met minConf, in input order.
    """
    kept = []
    for consequent in H:
        antecedent = freqSet - consequent
        confidence = supportData[freqSet] / supportData[antecedent]
        if confidence >= minConf:
            brl.append((antecedent, consequent, confidence))
            kept.append(consequent)
    return kept

#对频繁项集中元素超过2的项集进行合并
def rulesFromConseq(freqSet,H,supportData,brl,minConf=0.7):
    """Grow the consequents in H by one item and record the rules that qualify.

    The consequents are merged into candidates one item larger, scored with
    calcConf (which appends qualifying rules to brl), and the process repeats
    recursively while more than one consequent survives.

    Args:
        freqSet: the frequent itemset the rules are derived from.
        H: non-empty list of equally sized consequents (frozensets).
        supportData: mapping from itemset to support.
        brl: list of rule tuples, extended in place.
        minConf: minimum confidence a rule must reach to be kept.

    Returns:
        None.
    """
    conseq_size = len(H[0])
    # A larger consequent must still leave at least one item for the antecedent.
    if len(freqSet) <= conseq_size + 1:
        return
    merged = aprioriGen(H, conseq_size + 1)
    surviving = calcConf(freqSet, merged, supportData, brl, minConf)
    if len(surviving) > 1:
        rulesFromConseq(freqSet, surviving, supportData, brl, minConf)
            
# 满足最小置信度要求的规则
def generateRules(L,supportData,minConf=0.7):
    """Derive association rules from the frequent itemsets.

    Args:
        L: list of frequent-itemset lists, where L[i] holds the itemsets of
            size i+1 (the first value returned by apriori).
        supportData: mapping from itemset to support.
        minConf: minimum confidence a rule must reach to be reported.

    Returns:
        List of (antecedent, consequent, confidence) tuples.
    """
    bigRuleList=[]
    # L[0] holds 1-itemsets, which cannot be split into a rule, so start at 1.
    for i in range(1,len(L)):
        for freqSet in L[i]:
            H1=[frozenset([item]) for item in freqSet]
            # Always score the single-item consequents first. Previously,
            # itemsets with three or more items went straight to
            # rulesFromConseq, which begins with 2-item consequents, so rules
            # such as {a, b} --> {c} were never produced.
            H1=calcConf(freqSet,H1,supportData,bigRuleList,minConf)
            # Larger consequents are built only from surviving ones; the
            # len check also keeps rulesFromConseq from seeing an empty list.
            if i>1 and len(H1)>1:
                rulesFromConseq(freqSet,H1,supportData,bigRuleList,minConf)
    return bigRuleList

# Generate rules with minimum confidence 0.1 and print them; each entry is an
# (antecedent, consequent, confidence) tuple.
bRlist=generateRules(L1,suD2,0.1)

print('bRlist:',bRlist)