FP-Growth是最常见的关联分析算法之一,其基本步骤是:
(1)对事务数据采用一棵FP树进行压缩存储
(2)FP树被构造出来,再使用一种递归的分而治之的方法来挖掘频繁项集

fp_growth.py

import csv
from collections import defaultdict, namedtuple
from optparse import OptionParser

from fp_node import FPNode
from fp_tree import FPTree


def conditional_tree_from_paths(paths, minimum_support):
    """Builds a conditional FP-tree from the given prefix paths."""
    ''' 使用一个项标号对应的所有追溯路径建立该项标号的一个条件FP树'''
    tree = FPTree() #初始化
    condition_item = None #要为其建立条件FP树的项标号
    items = set() #路径(树)中涉及到的所有项标号

    for path in paths: #将每个路径添加到FP树中
        if condition_item is None:
            condition_item = path[-1].item #将路径中的最后一个项标号作为要为其建立条件FP树的项标号

        point = tree.root #从根节点开始(其是起始父节点)
        for node in path:
            next_point = point.search(node.item) 
            if not next_point: #若父节点中不存在该标号的子节点
                items.add(node.item)  #记录新出现的项标号
                count = node.count if node.item == condition_item else 0 #除了要为其建立条件FP数的项标号对应的节点(其Count来自原始FP树,其他节点Count都先设置为0,下面再计算)
                next_point = FPNode(tree, node.item, count) #建立一个新的FPNode
                point.add(next_point) #将其添加为父节点的子节点
                tree._update_route(next_point) #将其添加到对应项标号的链表中
            point = next_point #下移一层,继续处理路径的下一个节点

    assert condition_item is not None

    ''' 计算除了condition_item标号以外节点的Count(通过路径数)'''
    for path in tree.prefix_paths(condition_item): #在刚建立的条件FP树上,找出以condition_item结尾的所有路径
        count = path[-1].count #对每条路径,取出其最后一个节点(其实就是路径最后的condition_item标号的节点)的count
        for node in reversed(path[:-1]):
            node._count += count  #将路径上所有节点的count用最后一个节点(condition_item标号的节点)的count修正

    ''' 计算每个涉及的项标号的支持度,如果小于阈值,将其所有节点从树中剪掉(注意要处理被剪掉节点子树上的count)'''
    for item in items:
        support = sum(n.count for n in tree.nodes(item))
        if support < minimum_support:
            # Doesn't make the cut anymore
            for node in tree.nodes(item):
                if node.parent is not None:
                    node.parent.remove(node)
                    
    ''' 删除每条路径的最后一个节点(也就是condition_item标号的节点),这样以其为最后节点的所有路径的条件子树就构建起来了'''
    for node in tree.nodes(condition_item):
        if node.parent is not None:
            node.parent.remove(node)

    return tree

def find_frequent_itemsets(transactions, minimum_support, include_support=False):
    '''FINDS FREQUENT ITEMSETS IN THE GIVEN TRANSACTIONS'''
    ''' 计算频繁集'''
    
    items = defaultdict(lambda:0) #每个(经过预处理后)项标号对应的支持度字典(1-项集的支持度字典)
    processed_transactions = [] #保存预处理后获取的事务集

    ''' 可以预处理下事务集,比如删除某些项编号,排除某些事务等,这里未做任何处理'''
    for transaction in transactions:        
        transaction = transaction #transaction[0].split()                   
        processed = []
        for item in transaction:
            items[item] += 1
            processed.append(item)
        processed_transactions.append(processed)
  
    items = dict((item, support) for item, support in items.items()
                  if support >= minimum_support) #用阈值筛选出频繁的1-项集

    def clean_transaction(transaction):
        '''STRIPS TRANSACTIONS OF INFREQUENT ITEMS AND SURVIVING ITEMS ARE SORTED IN DECREASING ORDER OF FREQUENCY'''
        ''' 按照预处理的结果,自每个事务中删除已经排除的项标号,同时将事务中的每个项标号按照其支持度大小排序(这样构造FP树后,支持度低的都在树的底层,便于修剪'''
        transaction = list(filter(lambda v: v in items, transaction))         
        transaction.sort(key=lambda v: items[v], reverse=True)        
        return transaction

    ''' 构建FP树'''
    master = FPTree()
    for transaction in map(clean_transaction, processed_transactions): #对数据进行预处理
        master.add(transaction) #将事务(路径)逐个添加到FP树
    # master.inspect()

    def find_with_suffix(tree, suffix):
        ''' 后缀法沿树逐层向上寻找频繁项集(注意,这里第一次用的是原始FP树,下层递归就都是用的重新构建的条件FP树了)'''
        for item, nodes in tree.items(): #对每个项标号处理(注意由于建立FP树时,事务中都是支持度高的项在前面,会优先处理支持度高的项标号)
            support = sum(n.count for n in nodes) #计算该项标号的支持度
            if support >= minimum_support: #如果满足支持度阈值,找到一个频繁集
                found_set = [item] + suffix
                yield (found_set, support) if include_support else found_set

                '''Build a conditional tree and recursively search for frequent itemsets within it.'''
                ''' 从该标号对应的节点继续向上追溯,找出所有路径,构造条件FP树'''
                cond_tree = conditional_tree_from_paths(tree.prefix_paths(item),
                    minimum_support)
                ''' 在条件FP树上继续寻找频繁的前缀路径'''
                for found_suffix in find_with_suffix(cond_tree, found_set):
                    yield found_suffix

    '''Search for frequent itemsets, and yield the results we find.'''
    ''' 搜索频繁集并返回迭代器'''
    for itemset in find_with_suffix(master, []):
        yield itemset

if __name__ == '__main__':
    data = []
    with open('data/transaction.csv','r') as csvf:
        lines = csv.reader(csvf)
        data = list(lines)    
    minsup = 2
    ffi = find_frequent_itemsets(data, minsup, True)    
    for itemset, support in ffi:
        print('{' + ', '.join(itemset) + '} ' + str(support))

fp_node.py

class FPNode(object):
    '''A NODE IN AN FP TREE'''
    ''' FP树的节点类'''    

    def __init__(self, tree, item, count=1):
        self._tree = tree #节点所属的树
        self._item = item #节点对应的item的标号
        self._count = count #通过节点的路径的个数(对树的根节点,总为None)
        self._parent = None #节点的父节点(对树的根节点,总为None)
        self._children = {} # 包含各子节点的字典
        self._neighbour = None # 链接的下一个节点(用于每个item节点链表的跟踪指针)

    def __repr__(self):
        if self.root:
            return '<%s (root)>' % type(self).__name__
        return '<%s %r (%r)>' % (type(self).__name__, self.item, self.count)


    def add(self, child):
        '''ADDS GIVEN NODE AS A CHILD OF THE CURRENT NODE'''
        ''' 添加一个子节点 '''
        
        if not isinstance(child, FPNode):
            raise TypeError("ERROR: CHILD TO BE ADDED MUST BE FPNode")
        
        if not child.item in self._children: #如果尚不存在该标号的子节点才添加
            self._children[child.item] = child #创建新节点
            child.parent = self #设置新节点的父节点为本节点

    def search(self, item):
        '''CHECKS IF CURRENT NODE HAS A CHILD NODE FOR THE GIVEN ITEM'''
        ''' 在子节点字典中搜索一个特定项标号的节点并返回'''
        
        try:
            return self._children[item]
        except KeyError:
            return None

    def remove(self, child):
        '''REMOVES CHILD NODE FROM CHILDREN OF CURRENT NODE'''
        ''' 删除指定的子节点 '''
        
        try:
            if self._children[child.item] is child:
                del self._children[child.item] #从字典中删除
                child.parent = None #设置被删除的子节点父节点为空
                self._tree._removed(child) #同时从item链中删除该节点
                for sub_child in child.children: #处理被删除子节点的各个子节点,将其转移到当前节点中
                    try:
                        self._children[sub_child.item]._count += sub_child.count #如果该节点标号在当前节点的子节点中存在,直接将其Count加到当前节点的子节点里
                        sub_child.parent = None #节点父节点置空
                    except KeyError:
                        self.add(sub_child) #否则,将该节点添加为当前节点的子节点
                child._children = {} #清空被删除子节点的子节点字典
            else:
                raise ValueError('ERROR: CHILD TO BE REMOVED IS NOT THE CHILD OF THIS NODE')
        except:
            raise ValueError('ERROR: CHILD TO BE REMOVED IS NOT THE CHILD OF THIS NODE')

    def __contains__(self, item):
        ''' 是否包含特定项标号的子节点'''
        return item in self._children 

    @property
    def tree(self):
        '''RETURNS THE TREE TO WHICH CURRENT NODE BELONGS'''
        ''' 返回所属的树'''
        return self._tree 

    @property
    def item(self):
        '''RETURNS ITEM CONTAINED IN CURRENT NODE'''
        ''' 返回对应的项标号'''
        return self._item 

    @property
    def count(self):
        '''RETURNS THE COUNT OF CURRENT NODE\'S ITEM'''
        ''' 返回经过节点的路径条数'''
        return self._count 

    def increment(self):
        '''INCREMENTS THE COUNT OF CURRENT NODE\'S ITEM'''
        ''' 路径条数+1'''
        if self._count is None:
            raise ValueError('ERROR: ROOT NODE HAS NO COUNT')
        self._count += 1 
    
    @property
    def root(self):
        '''CHECKS IF CURRENT NODE IS ROOT OF THE FP TREE'''
        ''' 是否是一个根节点'''
        return self._item is None and self._count is None

    @property
    def leaf(self):
        '''CHECKS IF CURRENT NODE IS NODE OF THE FP TREE'''
        ''' 是否是一个叶子节点'''
        return len(self._children) == 0 

    def parent():
        ''' 父节点获取或设置'''
        def fget(self):
            return self._parent
        def fset(self, value):
            if value is not None and not isinstance(value, FPNode):
                raise TypeError('ERROR: A NODE MUST HAVE AN FP NODE AS A PARENT')
            if value and value.tree is not self.tree:
                raise ValueError('ERROR: NODE OF ONE TREE CANNOT HAVE PARENT FROM ANOTHER TREE')
            self._parent = value
        return locals()
    parent = property(**parent()) 

    def neighbour():
        ''' 链接的下一个节点的获取和设置'''
        def fget(self):
            return self._neighbour
        def fset(self, value):
            if value is not None and not isinstance(value, FPNode):
                raise TypeError('ERROR: A NODE MUST HAVE AN FP NODE AS A NEIGHBOUR')
            if value and value.tree is not self.tree:
                raise ValueError('ERROR: NODE OF ONE TREE CANNOT HAVE NEIGHBOUR FROM ANOTHER TREE')
            self._neighbour = value
        return locals()
    neighbour = property(**neighbour())
                
    @property
    def children(self):
        '''RETURNS CHILDREN OF CURRENT NODE'''
        ''' 返回所有子节点构成的元组'''
        return tuple(self._children.values())

    def inspect(self, depth=0):
        ''' 节点输出(控制台)'''
        print('   ' * depth + repr(self))
        for child in self.children:
            child.inspect(depth + 1)

fp_tree.py

from collections import namedtuple
from fp_node import FPNode

class FPTree(object):
    '''FP TREE STRUCTURE'''
    ''' FP树结构类'''
    Route = namedtuple('Route', 'head tail') #一个简化类(命名元组),用于描述每个项标号所有的FPNode的链表,其实同时包含头尾两个FPNode的引用,通过FPNode的neibour指针遍历获取所有FPNode

    def __init__(self):
        ''' 初始化根节点和各项标号链表使用的字典 '''
        self._root = FPNode(self, None, None) #根节点
        self._routes = {} #以项标号为key,包含每个项的FP节点列表(其实是上面的命名元组,仅包含头尾两个FPNode)

    @property
    def root(self):
        '''RETURNS ROOT OF THE FP TREE'''
        ''' 返回根节点'''
        return self._root

    def add(self, transaction):
        '''ADDS A TRANSACTION TO THE TREE'''
        ''' 向树中添加一个事务(或一条路径)'''
        point = self._root #从根节点开始
        for item in transaction: #按事务中的项排序沿树逐层向下查找
            next_point = point.search(item) 
            if next_point: #父节点的子节点字典中以前已经建立了该项标号的子节点 
                next_point.increment() #直接给找到的子节点计数+1
            else:
                next_point = FPNode(self, item) #否则新建一个节点(count默认为1)
                point.add(next_point) #添加为父节点的一个子节点
                self._update_route(next_point) #添加到相应项标号的节点链表中
            point = next_point #向下移动到当前节点,继续处理下一层的节点

    def _update_route(self, point):
        '''ADD THE NODE POINT TO THE ROUTE THROUGH ALL NODES FOR ITS ITEM'''
        ''' 将新建节点添加到对应项标号的节点列表中'''
        assert self is point.tree

        try:
            route = self._routes[point.item] #如果项标号对应的列表已经存在(建立过)
            route[1].neighbour = point #将新节点链接到尾节点(作为尾节点的下一个)
            self._routes[point.item] = self.Route(route[0], point) #修正尾节点为刚添加的节点 
        except KeyError:
            self._routes[point.item] = self.Route(point, point) #如果没有建立过,新建项编号的链表,首尾节点引用都是当前添加的节点

    def items(self):
        '''GENERATE 2-TUPLES FOR EACH ITEM OF THE FORM (ITEM, GENERATOR)'''
        ''' 返回一个迭代器,提供项标号和其所有FPNode的迭代器'''
        for item in self._routes:
            yield(item, self.nodes(item))

    def nodes(self, item):
        '''GENERATES THE SEQUENCE OF NODES THAT CONTAIN THE GIVEN ITEM'''
        ''' 对于给定的项标号,返回一个枚举其所有FPNode的迭代器'''
        try:
            node = self._routes[item][0]
        except KeyError:
            return
        while node:
            yield node
            node = node.neighbour

    def prefix_paths(self, item):
        '''GENERATES PREFIX PATHS ENDING WITH CURRENT ITEM'''
        ''' 给定一个类标号,返回所有以该标号结尾的路径(该操作用于获取构造针对特定项标号的条件FP树所需要的所有路径)'''

        def collect_path(node):
            ''' 从给定的node向上追溯到树的根,返回追溯获取的路径'''
            path = []
            while node and not node.root:
                path.append(node)
                node = node.parent
            path.reverse()
            return path

        return (collect_path(node) for node in self.nodes(item))

    def inspect(self):
        ''' 输出树(控制台),包括树结构和各项标号及其所有FPNode'''
        print('\nTREE:')
        self.root.inspect(1)
        print('\nROUTES:')
        for item, nodes in self.items():
            print('%r' % item)
            for node in nodes:
                print('%r' % node)

    def _removed(self, node_to_remove):
        '''PERFORMS CLEANUP DURING REMOVAL OF A NODE'''
        ''' 自项标号链表中删除一个节点'''
        head, tail = self._routes[node_to_remove.item] #获取链表的首节点和尾节点
        if node_to_remove is head: #如果要删除的节点是首节点
            if node_to_remove is tail or not node_to_remove.neighbour: #如果只有一个节点
                # It was the sole node.
                del self._routes[node_to_remove.item] #直接从字典中删除该项标号及其链表
            else:
                self._routes[node_to_remove.item] = self.Route(node_to_remove.neighbour, tail) #否则修正命名元组中的首节点
        else: #如果要删除的不是首节点
            for node in self.nodes(node_to_remove.item):
                if node.neighbour is node_to_remove: #找到其前一个节点
                    node.neighbour = node_to_remove.neighbour # skip over #修改前一个节点的neibour指针
                    if node_to_remove is tail: 
                        self._routes[node_to_remove.item] = self.Route(head, node) #如果删除的是尾节点,修正命名元组中的尾节点
                    break