# coding=utf-8
import numpy as np
class treeNode:
    """A single node of an FP-tree.

    Attributes:
        name: the item stored at this node.
        count: occurrence count accumulated at this node.
        nodeLink: next node in a linked chain of nodes, or None.
        parent: the parent node, or None when the node has no parent.
        children: dict mapping an item to the child node holding it.
    """

    def __init__(self, nameVale, numOccur, parentNode):
        """Create a node named `nameVale` with count `numOccur` under `parentNode`."""
        self.name = nameVale
        self.count = numOccur
        self.nodeLink = None
        self.parent = parentNode
        self.children = {}

    def inc(self, numOccur):
        """Add `numOccur` to this node's count."""
        self.count += numOccur

    def disp(self, ind=1):
        """Print this node and its whole subtree, one node per line.

        Each line is `ind` spaces, one space, the name, three spaces and the
        count; every level below is indented by one more space.
        """
        # A single pre-formatted argument prints the same text whether
        # `print` is the Python 2 statement or the Python 3 function.
        print('%s %s   %s' % (' ' * ind, self.name, self.count))
        for child in self.children.values():
            child.disp(ind + 1)
def laodSimpDat():
    """Return the built-in sample data: six transactions of one-letter items.

    A fresh list of lists is built on every call. The function name keeps its
    original spelling because callers refer to it by that name.
    """
    # One string per transaction; each character is one item.
    transactions = (
        'rzhjp',
        'zyxwvuts',
        'z',
        'rxnos',
        'yrxzqtq',
        'yzxeqstm',
    )
    return [list(letters) for letters in transactions]
def createInitSet(dataList):
    """Convert a list of transactions into a {frozenset(items): count} dict.

    Args:
        dataList: iterable of transactions, each an iterable of hashable items.

    Returns:
        dict mapping the frozenset of each transaction's items to the number
        of transactions in `dataList` that have exactly that item set.
    """
    retDict = {}
    for transaction in dataList:
        key = frozenset(transaction)
        # Accumulate instead of assigning 1: identical transactions must each
        # contribute to the support count rather than collapse into one.
        retDict[key] = retDict.get(key, 0) + 1
    return retDict
# Build the FP-tree.
def createTree(dataSet, minSup=1):
    """Build an FP-tree and its header table from counted transactions.

    Args:
        dataSet: dict whose keys are transactions (iterables of hashable
            items) and whose values are the counts of those transactions.
        minSup: minimum total count an item needs in order to be kept.

    Returns:
        (root, headerTable). `headerTable` maps every kept item to a
        two-element list [total count, first tree node of that item].
        Returns (None, None) when no item reaches `minSup`.
    """
    # First pass: total count of every single item.
    support = {}
    for tranSet, count in dataSet.items():
        for item in tranSet:
            support[item] = support.get(item, 0) + count

    # Build the filtered table as a new dict; deleting entries from a dict
    # while iterating over its keys raises RuntimeError on Python 3.
    headerTable = {}
    for item, total in support.items():
        if total >= minSup:
            headerTable[item] = [total, None]
    if not headerTable:
        return None, None

    # Fix ONE global order (descending count) for the kept items. Sorting
    # each transaction on its own lets items with equal counts come out in
    # different relative orders, which splits their shared support across
    # separate branches of the tree.
    ranked = sorted(headerTable, key=lambda item: headerTable[item][0],
                    reverse=True)
    rank = {}
    for position, item in enumerate(ranked):
        rank[item] = position

    reTree = treeNode('Null set', 1, None)
    # Second pass: insert every transaction, reduced to its kept items and
    # arranged in the global order.
    for tranSet, count in dataSet.items():
        orderedItems = sorted((item for item in tranSet if item in rank),
                              key=rank.__getitem__)
        if orderedItems:
            updateTree(orderedItems, reTree, headerTable, count)
    return reTree, headerTable
# Insert one transaction into the tree.
def updateTree(items, inTree, headerTable, count):
    """Insert one ordered list of items as a path below `inTree`.

    Args:
        items: the items of one transaction, already in insertion order.
            An empty list leaves the tree unchanged.
        inTree: the node below which the path is inserted.
        headerTable: dict mapping an item to [total count, first node];
            slot 1 is filled in or its node-link chain is extended whenever
            a new node is created for the item.
        count: amount added to every node along the path.
    """
    # Walk down iteratively: no recursion depth limit to hit and no list
    # copy per level, which the recursive slice-based form incurred.
    node = inTree
    for item in items:
        child = node.children.get(item)
        if child is None:
            # Item not yet on this branch: create the node ...
            child = treeNode(item, count, node)
            node.children[item] = child
            # ... and make it reachable from the header table.
            if headerTable[item][1] is None:
                headerTable[item][1] = child
            else:
                updateHeader(headerTable[item][1], child)
        else:
            # Item already on this branch: only its count grows.
            child.inc(count)
        node = child
# Extend the node-link list.
def updateHeader(nodeToTest, targetNode):
    """Append `targetNode` to the end of the node-link chain.

    Args:
        nodeToTest: a node of the chain; the chain is followed from here
            through `nodeLink` until a node whose `nodeLink` is None.
        targetNode: the node to attach to that last node.
    """
    tail = nodeToTest
    # Advance to the last node of the chain.
    while tail.nodeLink is not None:
        tail = tail.nodeLink
    tail.nodeLink = targetNode
# Walk back up the tree, recording the nodes passed on the way.
def ascendTree(leafNode, prefixPath):
    """Append to `prefixPath` the names on the way from `leafNode` upward.

    The name of `leafNode` comes first, followed by each ancestor in turn.
    A node whose `parent` is None is not recorded, so the walk stops just
    before it.

    Args:
        leafNode: the node to start from.
        prefixPath: list that is extended in place.
    """
    current = leafNode
    while current.parent is not None:
        prefixPath.append(current.name)
        current = current.parent
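# Given the first node of an item's node-link list, collect every prefix path in the FP-tree that leads to a node of that item.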
def findPrefixPath(basePat, treeNode):
    """Collect the conditional pattern base reachable from a node-link chain.

    Args:
        basePat: not used by this function; kept so existing calls work.
        treeNode: first node of a node-link chain, or None for an empty
            chain. (The parameter name shadows the class of the same name
            inside this function only.)

    Returns:
        dict mapping frozenset(ancestor names) to a count, for every node of
        the chain that has at least one ancestor other than the parentless
        top node. Nodes sharing the same ancestor set have their counts
        added together.
    """
    condPats = {}
    node = treeNode
    while node is not None:
        # Names of the ancestors strictly above this node, leaving out the
        # top node (the one whose parent is None).
        ancestors = []
        above = node.parent
        while above is not None and above.parent is not None:
            ancestors.append(above.name)
            above = above.parent
        if ancestors:
            key = frozenset(ancestors)
            # Accumulate: two branches can hold the same set of items, and
            # plain assignment would keep only the count of the last one.
            condPats[key] = condPats.get(key, 0) + node.count
        node = node.nodeLink
    return condPats
# Build a conditional FP-tree for every frequent item.
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    """Recursively collect frequent item sets from a header table.

    Args:
        inTree: not used by this function; kept so existing calls work.
        headerTable: dict mapping an item to [total count, first node].
        minSup: minimum count passed on when conditional trees are built.
        preFix: set of items already fixed for this level; it is copied,
            never modified.
        freqItemList: list that receives one new set per frequent item set
            found (extended in place).

    Prints the processing order of the items at every level.
    """
    # Order by count only. Using the whole [count, node] list as the key
    # makes ties fall through to comparing node objects, which raises
    # TypeError on Python 3 and is arbitrary on Python 2.
    bigL = sorted(headerTable, key=lambda item: headerTable[item][0])
    print(bigL)
    for basePat in bigL:
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        freqItemList.append(newFreqSet)
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        # Conditional FP-tree of basePat, built from its prefix paths.
        myCondTree, myHead = createTree(condPattBases, minSup)
        if myHead is not None:
            # Items survived in the conditional tree: mine it with the
            # extended prefix.
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
def main():
    """Run the miner on the built-in sample data and print each stage.

    Prints the counted transactions, the FP-tree built with a minimum
    support of 3, the debug output of the mining step, and finally the list
    of frequent item sets.
    """
    minSup = 3
    dataArr = laodSimpDat()
    dataList = createInitSet(dataArr)
    print(dataList)
    myFPtree, myheaderTab = createTree(dataList, minSup)
    # disp() prints the tree itself and returns None, so its result must not
    # be printed as well (that would add a stray "None" line).
    myFPtree.disp()
    freqItems = []
    mineTree(myFPtree, myheaderTab, minSup, set(), freqItems)
    print(freqItems)


if __name__ == "__main__":
    main()