步步為營KDD:FP-Growth算法

Apriori算法的缺陷

雖然Apriori算法使用先驗性質去除了很多的候選項集,但Apriori算法還是有兩個大問題:

1.可能產生大量的候選項集。例如,當頻繁1項集的個數達到1000個時,產生的候選2項集的個數達到C(2,1000)。C(2,1000) = 1000! / (2! * 9998!) = (1000 * 999 * 998!) / (2 * 998!) = (1000 * 999) / 2 = 499500。這還只是由頻繁1項集獲取候選2項集產生的項集個數。

2.可能需要多次迭代數據庫。在每次獲取候選項集後都需要迭代數據庫,獲取支持度計數。

FP-Growth

為此,Han JiaWei 提出了一個不需要產生候選項集的算法:FP-Growth。

FP-Growth算法主要分2步:先將所有數據庫事務壓縮到一個樹結構(FP-Tree)中,該結構保留了項與項之間的關聯信息;然後對樹結構進行挖掘。

以下是詳細過程:

1.創建FP-Tree

1.1遍歷數據庫(第1次),獲取所有頻繁1項集及其支持度計數,將頻繁1項集按遞減支持度計數排列。

1.2首先創建值為null的根節點,然後遍歷數據庫(第2次),將每個事務中的項集按頻繁1項集的順序排列(按遞減支持度計數排列),將排序後的最大支持度的項作為null根節點的子節點,將第2項作為第1項的子節點,以此類推。如果是創建新節點則設置新節點的SupportCount等於1,同時讓項頭表(ItemHeaderTable)記住該節點的地址;如果路徑中的節點已經存在,則讓該節點的SupportCount遞增1。最終形成具有項關聯信息的FP-Tree和完整的項頭表。

2.挖掘FP-Tree

2.1挖掘FP-Tree的主要思想就是讓項頭表中的每一個項與後綴模式合併形成新的模式,創建該項的子樹和與子樹對應的子項頭表,然後讓子項頭表中的每一個項與後綴模式(即上層遞歸形成的新模式)合併形成新的模式,創建該項的子子樹和與子子樹對應的子子項頭表。如此遞歸下去,直到無法形成樹為止。每層遞歸都會讓項頭表中的項與後綴模式(suffixpattern)合併成新的模式,新的模式將作為下層遞歸的後綴模式,這就是算法名稱中的“Growth”。針對只有單一路徑的樹,則可以對樹中的所有節點進行組合形成新的模式,提升算法性能。

代碼實現

以下是FP-Growth的C++實現,只貼出FP-Growth算法的核心部分:

FPGrowth.h:

<code>#ifndef ILANEVER_PATTERN_FPGROWTH_H
#define ILANEVER_PATTERN_FPGROWTH_H

#include <iostream>
#include <memory>
#include
#include <vector>

using namespace std;

namespace ilanever

{
namespace pattern
{
class ItemSet;
class ItemSetList;
class TransactionList;
class ItemHeader;
class ItemHeaderTable;
class TreeNode;
class TreeNodePtrList;
class FPGrowth
{
public:
FPGrowth(TransactionList& trans, int min_sup);

public:
/*
get the map of frequent itemsets.
key: k
value: frequent itemsets.
*/
map& get_frequent_itemsets();

private:
/*
get frequent 1 itemsets.
*/
ItemSetList* get_frequent_1_itemsets();

/*
get table of ItemHeader from frequents 1 itemsets.
*/
ItemHeaderTable* get_item_header_table(ItemSetList* p_frequents_1_itemsets);
/*
get the fp tree.
*/
TreeNode* get_FPTree(ItemHeaderTable* headerTable,ItemSetList* sortedFrequents);
/*
@<tree> the entire tree or the conditional tree.
@<headertable> the item header table cosrespond to tree.
@<suffixpattern> suffixpattern is null when the tree is entire tree,
pattern is suffixpattern when the tree is conditional tree.
*/
void fp_growth(TreeNode* tree,ItemHeaderTable* headerTable, ItemSet* suffixpattern);
/*
get patterns from the tree when the tree has only one path.
*/
ItemSetList* get_patterns_from_singlepath_tree(TreeNode* tree, ItemSet* suffixpattern);
/*

get condition pattern base.
*/
vector<treenodeptrlist> get_condition_patternbase(ItemHeader* itemHeader);
/*
get condition item header table.
*/
ItemHeaderTable* get_condition_headertable(vector<treenodeptrlist> patternbase);
/*
get condition FPTree.
*/
TreeNode* get_condition_FPTree
(vector<treenodeptrlist>& conditionPatternBase, ItemHeaderTable& headerTable);
/*
prune the tree branches and table items.
*/
void prune_tree_and_table(TreeNode* tree, ItemHeaderTable& headerTable);

private:
// all transactions in database.
TransactionList& tranList;
// absolute minimum support.
int min_sup;
// map of frequents
map map_frequents;

};
}
}

#endif
/<treenodeptrlist>/<treenodeptrlist>/<treenodeptrlist>/<suffixpattern>/<headertable>/<tree>
/<vector>
/<memory>/<iostream>/<code>


FPGrowth.cpp:


<code>#include "FPGrowth.h"
#include <memory>
#include
#include <iostream>
#include <vector>
#include "models/ItemSet.h"

#include "models/ItemSetList.h"
#include "models/TransactionList.h"
#include "models/ItemHeader.h"
#include "models/ItemHeaderTable.h"
#include "models/ItemSetSorter.h"
#include "models/TreeNode.h"
#include "models/TreeNodePtrList.h"
#include "../utils/set_utils.h"

namespace ilanever
{
namespace pattern
{
FPGrowth::FPGrowth(TransactionList& trans, int min_sup):
tranList(trans),min_sup(min_sup)
{}

map& FPGrowth::get_frequent_itemsets()
{
// get frequent 1 itemsets.
ItemSetList* frequent_1_itemsets = this->get_frequent_1_itemsets();
// sort by support count.
ItemSetSorterBySupportCount<:iterator> sorter;
sorter.sort(frequent_1_itemsets->begin(), frequent_1_itemsets->end());
// init map
for(int i = 1; i <= frequent_1_itemsets->size(); ++i)
{
this->map_frequents.insert(pair(i, new ItemSetList()));
}
// get item header table.
ItemHeaderTable* headerTable = this->get_item_header_table(frequent_1_itemsets);
// get FP tree.
TreeNode* fptree = this->get_FPTree(headerTable, frequent_1_itemsets);
// execute FP-Growth mining.
this->fp_growth(fptree, headerTable ,nullptr);
return this->map_frequents;
}

void FPGrowth::fp_growth(TreeNode* tree, ItemHeaderTable* headerTable, ItemSet* suffixpattern)
{
if(isSinglePath(tree))
{
ItemSetList* patterns = this->get_patterns_from_singlepath_tree(tree, suffixpattern);
for(ItemSet set : *patterns)
{
this->map_frequents[set.size()]->push_back(set);
}

}
else
{
for(auto begin = headerTable->rbegin(); begin != headerTable->rend(); ++begin)
{
ItemHeader* header = *begin;
ItemSet* pattern = new ItemSet({header->item});
if(suffixpattern != nullptr)
{
for(auto begin = suffixpattern->begin(); begin != suffixpattern->end(); ++begin)
{
pattern->push_back(*begin);
}
}
pattern->support_count = header->supportCount;
this->map_frequents[pattern->size()]->push_back(*pattern);

// get condition pattern base.
vector<treenodeptrlist> conditionPatternBase = this->get_condition_patternbase(header);
// get condition header table.
ItemHeaderTable* conditionHeaderTable = this->get_condition_headertable(conditionPatternBase);
// get condition FPTree.
TreeNode* conditionFPTree = this->get_condition_FPTree(conditionPatternBase, *conditionHeaderTable);
// prune tree and table.
this->prune_tree_and_table(conditionFPTree, *conditionHeaderTable);

if(conditionFPTree->children->size() > 0)
{
this->fp_growth(conditionFPTree, conditionHeaderTable, pattern);
}
}
}
}

void FPGrowth::prune_tree_and_table(TreeNode* tree, ItemHeaderTable& headerTable)
{
vector<:iterator> toDelHeaders;
for(auto begin = headerTable.begin(); begin != headerTable.end(); ++begin)
{
ItemHeader* header = *begin;
if(header->supportCount < this->min_sup)
{
toDelHeaders.push_back(begin);
for(TreeNode* node : *(header->nodes))
{
if(node->parent != nullptr)
{
node->parent->children->remove(node->value);
}

}
}
}
for(auto& iter : toDelHeaders)
{
headerTable.erase(iter);
}
}

TreeNode* FPGrowth::get_condition_FPTree
(vector<treenodeptrlist>& conditionPatternBase, ItemHeaderTable& headerTable)
{
TreeNode* root = new TreeNode("");
for(TreeNodePtrList* treeNodes : conditionPatternBase)
{
TreeNode* parent = root;
for(TreeNode* treeNode : *treeNodes)
{
TreeNode* found = nullptr;
if(parent->children->contains(treeNode->value))
{
found = parent->children->get(treeNode->value);
found->passedtimes += treeNode->passedtimes;
}
else
{
found = new TreeNode(treeNode->value, treeNode->passedtimes, parent);
parent->children->push_back(found);
}
headerTable.get(treeNode->value)->nodes->push_back(found);
parent = found;
}
}
return root;
}

ItemHeaderTable* FPGrowth::get_condition_headertable(vector<treenodeptrlist> patternbase)
{
ItemHeaderTable* headerTable = new ItemHeaderTable();
for(TreeNodePtrList* nodes : patternbase)
{
for(TreeNode* node : *nodes)
{
if(!headerTable->contains(node->value))
{
ItemHeader* header = new ItemHeader(node->value, node->passedtimes);
headerTable->push_back(header);
}
else
{

ItemHeader* header = headerTable->get(node->value);
header->supportCount = header->supportCount + node->passedtimes;
}
}
}
return headerTable;
}

vector<treenodeptrlist> FPGrowth::get_condition_patternbase(ItemHeader* itemHeader)
{
vector<treenodeptrlist> conditionPatternBase;
for(TreeNode* node : *(itemHeader->nodes))
{
TreeNodePtrList* conditionPattern = new TreeNodePtrList();
TreeNode* parent = node->parent;
while(parent != nullptr && !parent->isRoot())
{
TreeNode* ptnNode = new TreeNode(parent->value, node->passedtimes);
conditionPattern->insert(conditionPattern->begin(), ptnNode);
parent = parent->parent;
}
conditionPatternBase.push_back(conditionPattern);
}
return conditionPatternBase;
}

ItemSetList* FPGrowth::get_patterns_from_singlepath_tree(TreeNode* tree, ItemSet* suffixpattern)
{
ItemSetList* patterns = new ItemSetList();

// get all nodes in the tree.
TreeNodePtrList* allnodes = new TreeNodePtrList();
TreeNode* node = tree;
while(node->children->size() == 1)
{
TreeNode* child = (*node->children)[0];
if(child->passedtimes >= this->min_sup)
{
allnodes->push_back(child);
node = child;
}
else
{
break;
}
}
// get subsets.
vector<vector>*> subsets = ilanever::utils::get_subsets<treenode>(*allnodes);
// combine patterns.
for(vector<treenode>* subset : subsets)

{
if(subset->size() != 0)
{
ItemSet* pattern = new ItemSet();
int supportcount = -1;
for(TreeNode* node : *subset)
{
if(supportcount == -1)
{
supportcount = node->passedtimes;
}
else if(node->passedtimes < supportcount)
{
supportcount = node->passedtimes;
}
pattern->push_back(node->value);
}
pattern->support_count = supportcount;
// add suffix pattern.
for(auto beg = suffixpattern->begin(); beg != suffixpattern->end(); ++beg)
{
pattern->push_back(*beg);
}
patterns->push_back(*pattern);
}
}
return patterns;
}

TreeNode* FPGrowth::get_FPTree(ItemHeaderTable* headerTable,ItemSetList* sortedFrequents)
{
TreeNode* root = new TreeNode("");
for(auto begin = this->tranList.begin(); begin != this->tranList.end(); ++begin)
{
// get pruned and sorted set of the transaction.
ItemSet sortedSet;
ItemSet setOfTran = begin->getItemSet();
for(auto beg = sortedFrequents->begin(); beg != sortedFrequents->end(); ++beg)
{
string item = (*beg)[0];
for(auto b = setOfTran.begin(); b != setOfTran.end(); ++b)
{
if(*b == item)
{
sortedSet.push_back(*b);
break;
}
}
}

// build tree.
TreeNode* parent = root;
for(string& item : sortedSet)
{
if(parent->children != nullptr && parent->children->contains(item))
{
TreeNode* child = parent->children->get(item);
++(child->passedtimes);
parent = child;
}
else
{
TreeNode* child = new TreeNode(item,1,parent);
parent->children->push_back(child);
ItemHeader* itemHeader = headerTable->get(item);
itemHeader->nodes->push_back(child);
parent = child;
}
}
}
return root;
}

ItemHeaderTable* FPGrowth::get_item_header_table(ItemSetList* p_frequents_1_itemsets)
{
ItemHeaderTable* headerTab = new ItemHeaderTable();
for(auto beg = p_frequents_1_itemsets->begin(); beg != p_frequents_1_itemsets->end(); ++beg)
{
ItemHeader* header = new ItemHeader((*beg)[0], beg->support_count);
headerTab->push_back(header);
}
return headerTab;
}

ItemSetList* FPGrowth::get_frequent_1_itemsets()
{
// get candidate 1 itemsets.
ItemSetList* p_c_1_itemsets = new ItemSetList();
for(Transaction& t : this->tranList)
{
for(string& s : t.getItemSet())
{
ItemSet c_1_itemset(1);
c_1_itemset.add(s);
ItemSet* found_itemset = p_c_1_itemsets->find_equal_set(c_1_itemset);
if(found_itemset == nullptr)
{
c_1_itemset.support_count = 1;
p_c_1_itemsets->add(c_1_itemset);
}

else
{
++(found_itemset->support_count);
}
}
}

// remove the itemsets of unreach threshold.
vector<:iterator> toDels;
for(auto begin = p_c_1_itemsets->begin(); begin != p_c_1_itemsets->end(); ++begin)
{
if(begin->support_count < this->min_sup)
{
toDels.push_back(begin);
}
}
for(auto iter : toDels)
{
p_c_1_itemsets->erase(iter);
}
return p_c_1_itemsets;
}
}
}/<treenode>/<treenode>/<vector>/<treenodeptrlist>/<treenodeptrlist>/<treenodeptrlist>/<treenodeptrlist>/<treenodeptrlist>
/<vector>/<iostream>
/<memory>/<code>




分享到:


相關文章: