FP-Growth是最常见的关联分析算法之一,其基本步骤是:
(1)对事务数据采用一棵FP树进行压缩存储
(2)FP树构造完成后,再使用一种递归的分而治之的方法,在该树上挖掘频繁项集
fp_growth.py
import csv
from collections import defaultdict, namedtuple
from optparse import OptionParser
from fp_node import FPNode
from fp_tree import FPTree
def conditional_tree_from_paths(paths, minimum_support):
    """Build a conditional FP-tree from the given prefix paths.

    Every path is expected to end with a node of the same item, the
    "condition item".  The paths are copied into a fresh tree, the counts of
    the inner nodes are rebuilt from the counts of the condition-item nodes,
    items whose support falls below ``minimum_support`` are pruned, and
    finally the condition-item nodes themselves are removed.

    Args:
        paths: Iterable of prefix paths.  Each path is a non-empty sequence
            of nodes exposing ``item`` and ``count``; the last node of the
            first path determines the condition item.
        minimum_support: Items whose summed node count in the new tree is
            below this value are removed from it.

    Returns:
        The new ``FPTree``.  When ``paths`` is empty an empty tree is
        returned.
    """
    tree = FPTree()
    condition_item = None  # item the conditional tree is built for
    items = set()          # every item that received a node in the new tree

    # Copy the paths into the new tree.  Only condition-item nodes keep the
    # count of the node they were copied from; every other node starts at 0
    # and is recomputed below.
    for path in paths:
        if condition_item is None:
            condition_item = path[-1].item
        point = tree.root
        for node in path:
            next_point = point.search(node.item)
            if not next_point:
                items.add(node.item)
                count = node.count if node.item == condition_item else 0
                next_point = FPNode(tree, node.item, count)
                point.add(next_point)
                tree._update_route(next_point)
            point = next_point

    if condition_item is None:
        # No paths were supplied, so there is nothing to condition on.
        return tree

    # Rebuild the inner counts: each node is passed by as many transactions
    # as the condition-item nodes below it account for.
    for path in tree.prefix_paths(condition_item):
        count = path[-1].count
        for node in path[:-1]:
            node._count += count

    # Prune items that are no longer frequent.  The node list is snapshotted
    # first because remove() relinks the very route being walked.
    for item in items:
        support = sum(n.count for n in tree.nodes(item))
        if support < minimum_support:
            for node in list(tree.nodes(item)):
                if node.parent is not None:
                    node.parent.remove(node)

    # Drop the condition-item nodes; what is left is the conditional tree.
    for node in list(tree.nodes(condition_item)):
        if node.parent is not None:
            node.parent.remove(node)

    return tree
def find_frequent_itemsets(transactions, minimum_support, include_support=False):
    """Lazily mine the frequent itemsets of ``transactions`` with FP-growth.

    This is a generator: no work is done until it is iterated.

    Args:
        transactions: Iterable of transactions, each an iterable of hashable
            items.  Every occurrence of an item counts towards its support.
        minimum_support: Minimum support count an itemset needs in order to
            be reported.
        include_support: When true, yield ``(itemset, support)`` tuples
            instead of bare itemsets.

    Yields:
        Each frequent itemset as a list of items, or an
        ``(itemset, support)`` tuple when ``include_support`` is true.
    """
    # Support of every single item; the transactions are materialised so
    # they can be traversed a second time when the tree is built.
    items = defaultdict(int)
    processed_transactions = []
    for transaction in transactions:
        processed = list(transaction)
        for item in processed:
            items[item] += 1
        processed_transactions.append(processed)

    # Keep only the frequent 1-itemsets.
    items = {item: support for item, support in items.items()
             if support >= minimum_support}

    # One global order shared by all transactions: decreasing support, ties
    # broken by first appearance.  Sorting each transaction by support alone
    # lets equally frequent items appear in different orders in different
    # transactions, which scatters one itemset over several branches and
    # makes the search under-count it.
    ranking = sorted(items, key=items.get, reverse=True)
    rank = {item: position for position, item in enumerate(ranking)}

    def clean_transaction(transaction):
        """Drop infrequent items and put the rest into the global order."""
        return sorted((v for v in transaction if v in rank),
                      key=rank.__getitem__)

    # Build the FP-tree from the cleaned transactions.
    master = FPTree()
    for transaction in map(clean_transaction, processed_transactions):
        master.add(transaction)

    def find_with_suffix(tree, suffix):
        """Yield every frequent itemset of ``tree`` that ends in ``suffix``.

        The first call works on the full tree; recursive calls work on
        conditional trees built for the growing suffix.
        """
        for item, nodes in tree.items():
            support = sum(n.count for n in nodes)
            # An item that is already part of the suffix cannot extend it;
            # the guard keeps a stale route entry for the condition item
            # from producing an itemset with a repeated item.
            if support >= minimum_support and item not in suffix:
                found_set = [item] + suffix
                yield (found_set, support) if include_support else found_set

                # Follow the item's nodes up to the root, build the
                # conditional tree from those paths and search it for
                # longer itemsets.
                cond_tree = conditional_tree_from_paths(
                    tree.prefix_paths(item), minimum_support)
                yield from find_with_suffix(cond_tree, found_set)

    yield from find_with_suffix(master, [])
if __name__ == '__main__':
    # Each CSV row is one transaction.  The csv module documentation asks
    # for newline='' so that the reader does its own newline handling.
    with open('data/transaction.csv', 'r', newline='') as csvf:
        data = list(csv.reader(csvf))

    minsup = 2  # minimum support count
    for itemset, support in find_frequent_itemsets(data, minsup, True):
        print('{' + ', '.join(itemset) + '} ' + str(support))
fp_node.py
class FPNode(object):
    """A node in an FP-tree.

    A node stores the item it stands for, a count, a link to its parent, its
    children keyed by item, and a ``neighbour`` link that chains together
    the nodes of one item.
    """

    def __init__(self, tree, item, count=1):
        """Create a detached node.

        Args:
            tree: The tree the node belongs to.
            item: The item the node stands for (``None`` for a root node).
            count: Initial count (``None`` for a root node).
        """
        self._tree = tree
        self._item = item
        self._count = count
        self._parent = None     # set when the node is attached to a parent
        self._children = {}     # item -> child node
        self._neighbour = None  # next node in the chain of this item

    def __repr__(self):
        if self.root:
            return '<%s (root)>' % type(self).__name__
        return '<%s %r (%r)>' % (type(self).__name__, self.item, self.count)

    def add(self, child):
        """Add ``child`` as a child of this node.

        Nothing happens when a child for the same item already exists.

        Raises:
            TypeError: If ``child`` is not an ``FPNode``.
        """
        if not isinstance(child, FPNode):
            raise TypeError("ERROR: CHILD TO BE ADDED MUST BE FPNode")
        if child.item not in self._children:
            self._children[child.item] = child
            child.parent = self

    def search(self, item):
        """Return the child node for ``item``, or ``None`` if there is none."""
        return self._children.get(item)

    def remove(self, child):
        """Remove ``child`` and hand its children over to this node.

        ``child`` is detached and reported to the tree through
        ``tree._removed``.  Each of its children is then re-attached below
        this node; where this node already has a child for the same item the
        two subtrees are merged (counts are added up) and every node that is
        merged away is reported to the tree as removed as well.

        Raises:
            ValueError: If ``child`` is not a child of this node.
        """
        if (not isinstance(child, FPNode)
                or self._children.get(child.item) is not child):
            raise ValueError('ERROR: CHILD TO BE REMOVED IS NOT THE CHILD OF THIS NODE')

        del self._children[child.item]
        child.parent = None
        self._tree._removed(child)

        orphans = child.children
        child._children = {}
        for orphan in orphans:
            self._adopt(orphan)

    def _adopt(self, node):
        """Attach ``node`` below this node, merging it into a same-item child."""
        # Explicit stack instead of recursion so deep trees cannot hit the
        # interpreter's recursion limit.
        pending = [(self, node)]
        while pending:
            target, orphan = pending.pop()
            twin = target._children.get(orphan.item)
            if twin is None:
                # No clash: the whole subtree moves over unchanged.
                target._children[orphan.item] = orphan
                orphan.parent = target
                continue
            # Clash: fold the orphan into its twin, take it out of the
            # tree's bookkeeping and continue with its children.
            twin._count += orphan.count
            orphan.parent = None
            target._tree._removed(orphan)
            grandchildren = orphan.children
            orphan._children = {}
            pending.extend((twin, grandchild) for grandchild in grandchildren)

    def __contains__(self, item):
        """Return whether this node has a child for ``item``."""
        return item in self._children

    @property
    def tree(self):
        """The tree this node belongs to."""
        return self._tree

    @property
    def item(self):
        """The item this node stands for."""
        return self._item

    @property
    def count(self):
        """The count stored in this node."""
        return self._count

    def increment(self):
        """Increase the count by one.

        Raises:
            ValueError: If the node has no count (its count is ``None``).
        """
        if self._count is None:
            raise ValueError('ERROR: ROOT NODE HAS NO COUNT')
        self._count += 1

    @property
    def root(self):
        """Whether this is a root node (both item and count are ``None``)."""
        return self._item is None and self._count is None

    @property
    def leaf(self):
        """Whether this node is a leaf, i.e. has no children."""
        return len(self._children) == 0

    @property
    def parent(self):
        """The parent node, or ``None`` while the node is detached."""
        return self._parent

    @parent.setter
    def parent(self, value):
        if value is not None and not isinstance(value, FPNode):
            raise TypeError('ERROR: A NODE MUST HAVE AN FP NODE AS A PARENT')
        if value and value.tree is not self.tree:
            raise ValueError('ERROR: NODE OF ONE TREE CANNOT HAVE PARENT FROM ANOTHER TREE')
        self._parent = value

    @property
    def neighbour(self):
        """The next node in the chain of nodes for the same item, if any."""
        return self._neighbour

    @neighbour.setter
    def neighbour(self, value):
        if value is not None and not isinstance(value, FPNode):
            raise TypeError('ERROR: A NODE MUST HAVE AN FP NODE AS A NEIGHBOUR')
        if value and value.tree is not self.tree:
            raise ValueError('ERROR: NODE OF ONE TREE CANNOT HAVE NEIGHBOUR FROM ANOTHER TREE')
        self._neighbour = value

    @property
    def children(self):
        """The child nodes as a tuple."""
        return tuple(self._children.values())

    def inspect(self, depth=0):
        """Print this node and its subtree, indented by ``depth``."""
        print(' ' * depth + repr(self))
        for child in self.children:
            child.inspect(depth + 1)
fp_tree.py
from collections import namedtuple
from fp_node import FPNode
class FPTree(object):
    """An FP-tree.

    Besides the node hierarchy hanging off the root, the tree keeps one
    "route" per item: a linked list (threaded through the nodes'
    ``neighbour`` attribute) of all nodes for that item, remembered by its
    first and last node.
    """

    # First and last node of an item's linked list.
    Route = namedtuple('Route', 'head tail')

    def __init__(self):
        """Create an empty tree consisting of a root node only."""
        self._root = FPNode(self, None, None)
        self._routes = {}  # item -> Route

    @property
    def root(self):
        """The root node of the tree."""
        return self._root

    def add(self, transaction):
        """Insert a transaction as a path that starts at the root.

        For each item the matching child is reused and its count increased;
        where no such child exists a new node is created and appended to
        the item's route.
        """
        current = self._root
        for item in transaction:
            child = current.search(item)
            if child:
                child.increment()
            else:
                child = FPNode(self, item)
                current.add(child)
                self._update_route(child)
            current = child

    def _update_route(self, point):
        """Append ``point`` to the route of its item, creating it if needed."""
        assert self is point.tree
        try:
            route = self._routes[point.item]
        except KeyError:
            # First node for this item: it is both head and tail.
            self._routes[point.item] = self.Route(point, point)
        else:
            route.tail.neighbour = point
            self._routes[point.item] = self.Route(route.head, point)

    def items(self):
        """Yield ``(item, nodes)`` pairs, ``nodes`` being a node generator."""
        for key in self._routes:
            yield (key, self.nodes(key))

    def nodes(self, item):
        """Yield the nodes of ``item`` in route order (nothing if unknown)."""
        route = self._routes.get(item)
        if route is None:
            return
        current = route.head
        while current:
            yield current
            current = current.neighbour

    def prefix_paths(self, item):
        """Return a generator of the paths that end in a node of ``item``.

        Each path is a list of nodes ordered from just below the root down
        to (and including) the node of ``item``.
        """
        def trace_to_root(node):
            # Climb until the root (or a missing parent) is reached, then
            # flip the trail so it reads top-down.
            trail = []
            while node and not node.root:
                trail.append(node)
                node = node.parent
            return trail[::-1]

        return (trace_to_root(node) for node in self.nodes(item))

    def inspect(self):
        """Print the tree structure followed by every item's route."""
        print('\nTREE:')
        self.root.inspect(1)
        print('\nROUTES:')
        for key, route_nodes in self.items():
            print('%r' % key)
            for route_node in route_nodes:
                print('%r' % route_node)

    def _removed(self, node_to_remove):
        """Unlink ``node_to_remove`` from the route of its item."""
        key = node_to_remove.item
        head, tail = self._routes[key]

        if node_to_remove is head:
            if node_to_remove is tail or not node_to_remove.neighbour:
                # It was the only node: the route disappears altogether.
                del self._routes[key]
            else:
                self._routes[key] = self.Route(node_to_remove.neighbour, tail)
            return

        # Otherwise find the predecessor and let it skip the removed node.
        for candidate in self.nodes(key):
            if candidate.neighbour is node_to_remove:
                candidate.neighbour = node_to_remove.neighbour
                if node_to_remove is tail:
                    self._routes[key] = self.Route(head, candidate)
                break