以购物记录分析为例,给定最小支持度(很多人买的商品),最小置信度(买A商品同时很可能会买B商品,也就是关联规则):
Python
声明:所有频繁集的子集一定是频繁集,“{苹果,梨子}是频繁集,也就是大家都在买,那么{苹果}和{梨子}显然都是频繁集,它们被一个大的频繁集包含了”
步骤(Apriori算法):
找出购买记录的所有商品,作为1项候选集;
计算1项集支持度,找到频繁1项集;
1项集两两合并为2项候选集;
计算2项集支持度,找到频繁2项集;
2项集,对元素进行sort排序,最后一个元素不同的2项集合并为3项候选集;
计算3项集支持度,找到频繁3项集;
1项频繁集元素X1分别和2项频繁集、3项频繁集元素X2进行X1.issubset(X2)判断,计算X1-->(X2-X1)的置信度,得到关联规则
主程序(4条购物记录,5种商品):
if __name__ == '__main__':
    # Four transactions over five distinct items.
    data = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
    L, support_data = apriori(data, min_support=0.5, max_len=3)
    # BUG FIX: the original assigned the result to the name
    # ``association_rules``, shadowing the function itself; any later
    # call would raise ``TypeError: 'list' object is not callable``.
    rules = association_rules(L, support_data, min_conf=0.6)
    print('满足min_support=0.5, max_len=3, min_conf=0.6的关联规则:', rules)
    print('满足min_support=0.5, max_len=3的频繁集及支持度:', support_data)
首先需要用data和min_support找到1项频繁集:
def generate_support_first(data, min_support):
    """Build the frequent 1-itemsets of ``data`` and their supports.

    Returns ``(freq_set, support)``: a set of frozensets (the frequent
    1-itemsets) and a dict mapping each of them to its support, rounded
    to two decimal places.
    """
    # Candidate 1-itemsets: one frozenset per distinct item seen in data.
    candidates = {frozenset([item]) for record in data for item in record}
    # Count in how many transactions each candidate appears.
    counts = {}
    for record in data:
        for candidate in candidates:
            if candidate.issubset(record):
                counts[candidate] = counts.get(candidate, 0) + 1
    # Keep only candidates whose support reaches the threshold.
    n_records = float(len(data))
    freq_set = set()
    support = {}
    for candidate, count in counts.items():
        if count / n_records >= min_support:
            freq_set.add(candidate)
            support[candidate] = round(count / n_records, 2)
    return freq_set, support
生成新的候选集(在本例中,需要生成2项集和3项集):
def new_set(freq_set, item_len):
    """Generate candidate ``item_len``-itemsets from the frequent
    (item_len-1)-itemsets in ``freq_set``.

    Standard Apriori join step: two itemsets are merged when their
    sorted elements agree everywhere except in the last position.
    """
    candidates = set()
    members = list(freq_set)
    # A single frequent itemset has nothing to be joined with.
    if len(members) == 1:
        return candidates
    for i, first in enumerate(members):
        sorted_first = sorted(first)
        for second in members[i + 1:]:
            sorted_second = sorted(second)
            # Merge only pairs that differ in their final element.
            if sorted_first[:item_len - 2] == sorted_second[:item_len - 2]:
                candidates.add(first | second)
    return candidates
继续计算新的候选集的支持度,我们需要用到原始数据列表data来计算候选集出现次数:
def generate_support(set1, min_support, data=None):
    """Compute supports for candidate itemsets and keep the frequent ones.

    Parameters
    ----------
    set1 : set of frozenset
        Candidate itemsets.
    min_support : float
        Minimum support threshold.
    data : list of list, optional
        Transaction records. Defaults to the module-level ``data`` for
        backward compatibility -- the original version silently read
        that global, which made the function untestable in isolation.

    Returns
    -------
    tuple
        ``(freq_set, support)``: the frequent itemsets and a dict of
        their supports rounded to two decimals.
    """
    if data is None:
        # Backward-compatible fallback to the global the original relied on.
        data = globals()['data']
    # Count in how many transactions each candidate appears.
    item_count = {}
    for items in data:
        for item in set1:
            if item.issubset(items):
                item_count[item] = item_count.get(item, 0) + 1
    # Filter by the support threshold.
    data_len = float(len(data))
    freq_set = set()
    support = {}
    for item, count in item_count.items():
        if count / data_len >= min_support:
            freq_set.add(item)
            support[item] = round(count / data_len, 2)
    return freq_set, support
调用上面的函数,我们可以得到全部的频繁集及其支持度:
def apriori(data, min_support, max_len=None):
    """Find all frequent itemsets of ``data`` with at most ``max_len`` items.

    Returns ``(freq_sets, supports)`` where ``freq_sets[k]`` holds the
    frequent (k+1)-itemsets and ``supports`` maps every frequent itemset
    to its support.

    BUG FIX: the original placed ``return freq_sets, supports`` inside
    the ``max_len is None`` branch, so calling ``apriori`` without an
    explicit ``max_len`` immediately returned empty results.
    """
    freq_sets = []   # one set of frequent itemsets per size level
    supports = {}    # itemset -> support
    if max_len is None:
        # No explicit limit: keep joining until no itemset is frequent.
        max_len = float('inf')
    l1, support1 = generate_support_first(data, min_support)
    freq_sets.append(l1)
    supports.update(support1)
    item_len = 2  # size of the itemsets produced by the next join
    while item_len and item_len <= max_len:
        ci = new_set(freq_sets[-1], item_len)  # candidate itemsets
        # NOTE(review): generate_support reads the transactions from the
        # module-level ``data`` in the original code; call kept unchanged
        # for compatibility.
        li, support = generate_support(ci, min_support)
        if li:
            # Frequent itemsets found at this size -> try the next size.
            freq_sets.append(li)
            supports.update(support)
            item_len += 1
        else:
            item_len = 0  # nothing frequent at this size -> stop the loop
    return freq_sets, supports
最后生成关联规则:
def association_rules(freq_sets, supports, min_conf):
    """Derive association rules X -> (Y - X) with confidence >= min_conf.

    ``freq_sets`` is the list of frequent k-itemset collections produced
    by ``apriori``; ``supports`` maps each frequent itemset to its
    support. Returns a list of ``(antecedent, consequent, confidence)``
    tuples, confidence rounded to two decimals.
    """
    rules = []
    levels = len(freq_sets)
    # Pair each frequent itemset with every strictly larger one containing it.
    for small_idx in range(levels - 1):
        for antecedent in freq_sets[small_idx]:
            for big_idx in range(small_idx + 1, levels):
                for larger in freq_sets[big_idx]:
                    if not antecedent.issubset(larger):
                        continue
                    confidence = supports[larger] / supports[antecedent]
                    if confidence >= min_conf:
                        rules.append((antecedent, larger - antecedent,
                                      round(confidence, 2)))
    return rules
输出:
满足min_support=0.5, max_len=3, min_conf=0.6的关联规则: [(frozenset({3}), frozenset({1}), 0.67), (frozenset({3}), frozenset({2}), 0.67), (frozenset({3}), frozenset({5}), 0.67), (frozenset({3}), frozenset({2, 5}), 0.67), (frozenset({2}), frozenset({5}), 1.0), (frozenset({2}), frozenset({3}), 0.67), (frozenset({2}), frozenset({3, 5}), 0.67), (frozenset({1}), frozenset({3}), 1.0), (frozenset({5}), frozenset({2}), 1.0), (frozenset({5}), frozenset({3}), 0.67), (frozenset({5}), frozenset({2, 3}), 0.67), (frozenset({2, 5}), frozenset({3}), 0.67), (frozenset({2, 3}), frozenset({5}), 1.0), (frozenset({3, 5}), frozenset({2}), 1.0)]
满足min_support=0.5, max_len=3的频繁集及支持度: {frozenset({3}): 0.75, frozenset({1}): 0.5, frozenset({2}): 0.75, frozenset({5}): 0.75, frozenset({1, 3}): 0.5, frozenset({2, 3}): 0.5, frozenset({2, 5}): 0.75, frozenset({3, 5}): 0.5, frozenset({2, 3, 5}): 0.5}
Python的mlxtend模块
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# The same four transactions as in the hand-written example.
data = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

# One-hot encode the transactions into a boolean DataFrame.
encoder = TransactionEncoder()
onehot = encoder.fit(data).transform(data)
df = pd.DataFrame(onehot, columns=encoder.columns_)
# df.astype('int') would display the table as 0/1 instead of booleans.
frequent_itemsets = apriori(df, min_support=0.5, use_colnames=True)
# Default metric is confidence; keep rules with confidence >= 0.6.
rules = association_rules(frequent_itemsets, min_threshold=0.6)
Python的mlxtend模块的筛选功能
## Itemset lengths of the antecedent / consequent of every rule.
rules["antecedent_len"] = rules["antecedents"].apply(len)
rules["consequent_len"] = rules["consequents"].apply(len)

# Rules whose consequent has at least two items and antecedent at least one.
rule1 = rules[(rules["consequent_len"] >= 2) & (rules["antecedent_len"] >= 1)]

# Rules whose antecedent contains item 2 and whose consequent contains item 3.
def findset(set1, set2):
    # True when ``set2`` is contained in ``set1``.
    return set2.issubset(set1)

set2 = {2}
set3 = {3}
rule3 = rules[rules["antecedents"].apply(findset, set2=set2)
              & rules["consequents"].apply(findset, set2=set3)]
Python的mlxtend模块的可视化
## Silence library warnings before plotting.
import warnings
warnings.filterwarnings("ignore")
from mlxtend.plotting import *
import matplotlib.pyplot as plt

## Scatter-plot matrix visualising the relationship between the
## support, confidence and lift of the mined rules.
colnames = ["support", "confidence", "lift"]
plotdata = rules[colnames].values
scatterplotmatrix(plotdata, figsize=(14, 10), names=colnames, color="red")
plt.tight_layout()
plt.show()
import networkx as nx

# Rules whose antecedent and consequent both contain exactly one item
# and whose confidence is at least 0.5.
rule4 = rules[(rules["consequent_len"] == 1)
              & (rules["antecedent_len"] == 1)
              & (rules["confidence"] >= 0.5)]

# Edge endpoints: antecedent items as sources, consequent items as targets.
antecedents = []
consequents = []
for row in range(len(rule4)):
    antecedents.append(list(rule4.antecedents.values[row]))
    consequents.append(list(rule4.consequents.values[row]))

# Build a directed graph with one edge per rule, weighted by confidence.
plt.figure(figsize=(12, 10))
G = nx.DiGraph()
for source, target, conf in zip(antecedents, consequents, rule4.confidence.values):
    G.add_edge(source[0], target[0], weight=conf)

# Partition the edges by weight for two drawing styles.
elarge = [(u, v) for (u, v, attrs) in G.edges(data=True) if attrs['weight'] > 0.5]
esmall = [(u, v) for (u, v, attrs) in G.edges(data=True) if attrs['weight'] <= 0.5]

# Node layout and size.
pos = nx.circular_layout(G)
nx.draw_networkx_nodes(G, pos, alpha=0.4, node_size=500)
# Heavier rules in red, lighter ones in blue.
nx.draw_networkx_edges(G, pos, edgelist=elarge, edge_color='r', width=2, alpha=0.7, arrowsize=20)
nx.draw_networkx_edges(G, pos, edgelist=esmall, edge_color='b', width=2, alpha=0.7, arrowsize=20)
# Node labels.
nx.draw_networkx_labels(G, pos, font_size=12)
plt.axis('off')
plt.title('前项和后项长度均为1的关联规则')
plt.show()

# For antecedents of length 2, a stringified list such as
# str(antecedents[i][:]) can serve as the node label instead.
for source, target, conf in zip(antecedents, consequents, rule4.confidence.values):
    G.add_edge(str(source[:]), str(target[:]), weight=conf)