FPTreeApplication类
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.*;
import java.util.stream.Collectors;public class FPTreeApplication {/*** 数据源*/private static final String FILE_PATH = "E:\\UniCourses\\buct\\juniorI\\DataMining\\retail.txt";/*** 数据项分隔符*/private static final String SEPARATOR = " ";/*** 最小支持度*/private double minSupport;/*** 最小支持度项数*/private int minSupportItem;/*** 结果, Key为项集数,Value为频繁n项集*/private static final Map<Integer,List<List<String>>> resultMap = new HashMap<>();public FPTreeApplication(double minSupport){this.minSupport = minSupport;}public Map<Integer,List<List<String>>> getResultMap(){return resultMap;}/*** 读取源数据* @return*/public List<List<String>> readSrcData(){List<List<String>> srcData = new ArrayList<>();try(FileInputStream fileInputStream = new FileInputStream(FILE_PATH);InputStreamReader inputStreamReader = new InputStreamReader(fileInputStream);BufferedReader bufferedReader = new BufferedReader(inputStreamReader);){String dataLine;while((dataLine=bufferedReader.readLine())!=null){List<String> itemList = new ArrayList<>(Arrays.asList(dataLine.split(SEPARATOR)));srcData.add(itemList);}}catch (Exception e){e.printStackTrace();}// 得到最小支持度项数minSupportItem = (int)(minSupport*srcData.size());return srcData;}/*** 构建头表* @param itemListList* @return*/public List<FPTreeNode> buildHeadTableList(List<List<String>> itemListList){// 统计所有项出现次数Map<String,Integer> itemCountMap = new HashMap<>();for (List<String> itemList : itemListList) {for (String item : itemList) {if(itemCountMap.containsKey(item)){itemCountMap.put(item,itemCountMap.get(item)+1);}else{itemCountMap.put(item,1);}}}// 统计频繁项List<FPTreeNode> frequentItemList = new ArrayList<>();for(Map.Entry<String,Integer> entry:itemCountMap.entrySet()){if(entry.getValue()>= minSupportItem){frequentItemList.add(new FPTreeNode(entry.getKey(),entry.getValue()));}}// 对频繁项降序排序Collections.sort(frequentItemList);return frequentItemList;}/*** 构建FPTree* @param headTableList* @param itemListList* @return*/public FPTreeNode buildFPTree(List<FPTreeNode> headTableList,List<List<String>> itemListList){List<List<String>> frequentItemListList = new ArrayList<>();// 筛选出每行数据的频繁项for (List<String> itemList : itemListList) {List<String> frequentItemList = new ArrayList<>();for (FPTreeNode fpTreeNode : headTableList) {if(itemList.contains(fpTreeNode.getData())){frequentItemList.add(fpTreeNode.getData());}}frequentItemListList.add(frequentItemList);}FPTreeNode root = new FPTreeNode();// 对每行数据的频繁项去构造一颗子树for (List<String> frequentItemList : frequentItemListList) {buildSonFPTree(root,headTableList,frequentItemList);}return root;}/*** 为每行数据的频繁项构造一颗子树* @param rootNode 根节点* @param headTableList 头表* @param frequentItemList 每一行数据的频繁项集(按头表序)*/public void buildSonFPTree(FPTreeNode rootNode,List<FPTreeNode> headTableList,List<String> frequentItemList){FPTreeNode tempNode = rootNode;for (String frequentItem : frequentItemList) {List<FPTreeNode> sonNodeList = tempNode.getSonNodeList();// 当前节点没有子节点if(null==sonNodeList || sonNodeList.isEmpty()){tempNode.setSonNodeList(new ArrayList<>());FPTreeNode sonNode = new FPTreeNode(frequentItem,1);sonNode.setParentNode(tempNode);tempNode.getSonNodeList().add(sonNode);// 链接头表linkNode(headTableList,sonNode);// 移动当前节点,准备插入下一个项tempNode = sonNode;continue;}// 当前节点的子节点包含目标项boolean existItem = false;for (FPTreeNode sonNode : sonNodeList) {if(sonNode.getData().equals(frequentItem)){existItem = true;// 累加该子节点的count值sonNode.setCount(sonNode.getCount()+1);// 移动当前节点,准备插入下一个项tempNode = sonNode;break;}}// 当前节点的子节点不包含目标项if(!existItem){FPTreeNode sonNode = new FPTreeNode(frequentItem,1);sonNode.setParentNode(tempNode);// 将目标项插入当前节点的子节点tempNode.getSonNodeList().add(sonNode);// 链接头表linkNode(headTableList,sonNode);// 移动当前节点,准备插入下一个项tempNode = sonNode;}}}/*** 对新插入FPTree的节点进行与头表或同名节点的链接* @param headTableList* @param newNode*/public void linkNode(List<FPTreeNode> headTableList,FPTreeNode newNode){for (FPTreeNode headNode : headTableList) {if(headNode.getData().equals(newNode.getData())){// 如果头节点尚未链接任何节点if(headNode.getNextNode()==null){headNode.setNextNode(newNode);}else{// 如果头节点已经链接过同名节点,则将新节点链接到同名节点尾部FPTreeNode tempNode = headNode;while(tempNode.getNextNode()!=null){tempNode = tempNode.getNextNode();}tempNode.setNextNode(newNode);}}}}/*** 总执行函数* @param srcData 待分析数据* @param postItem 当前的后缀项*/public void FPGrow(List<List<String>> srcData,List<String> postItem){// 根据数据源构建头表List<FPTreeNode> headTableList = buildHeadTableList(srcData);// 若头表为空,则结束递归if(headTableList.isEmpty()){return;}// 如果当前后缀项为null,说明此时的头表就是所有的频繁一项集,加入最终结果if(null==postItem){List<List<String>> itemListList = new ArrayList<>();for (FPTreeNode headNode : headTableList) {List<String> itemList = new ArrayList<>();itemList.add(headNode.getData());itemListList.add(itemList);}resultMap.put(1,itemListList);}else{/*** 如果当前后缀项不为null,说明已经进入递归* 递归时,所谓的头表其实是后缀项的头表* 而头表的构建过程,保证了头表的每个头节点值都是频繁项* 因此,若后缀项有k项,则此时头表的每一项都能分别与后缀项组成新的频繁(k+1)项集*/Integer itemNum = postItem.size()+1;List<List<String>> newListList = new ArrayList<>();for (FPTreeNode headNode : headTableList) {List<String> newList = new ArrayList<>();newList.add(headNode.getData());newList.addAll(postItem);newListList.add(newList);}// 加入最终结果List<List<String>> originListList = resultMap.get(itemNum);if(originListList == null){resultMap.put(itemNum,newListList);}else{originListList.addAll(newListList);}}// 根据头表构建FPTreeFPTreeNode rootNode = buildFPTree(headTableList, srcData);// 对头表每个头节点,得到其所有同值节点的条件模式库,进入递归for (FPTreeNode headNode : headTableList) {// 当前的条件模式库,是 当前头节点+原后缀=新后缀 的条件模式库List<String> newPostItem = new ArrayList<>();newPostItem.add(headNode.getData());if(null!=postItem){newPostItem.addAll(postItem);}List<List<String>> conditionalPatternBase = new ArrayList<>();FPTreeNode bottomNode = headNode.getNextNode();// 遍历当前头节点的所有同值节点while(null!=bottomNode){// 当前条件模式的计数值,为底节点计数值int count = bottomNode.getCount();LinkedList<String> conditionalPattern = new LinkedList<>();FPTreeNode tempNode = bottomNode.getParentNode();while(tempNode != (rootNode)) {conditionalPattern.addFirst(tempNode.getData());tempNode = tempNode.getParentNode();}// 对计数值>1且不为空的条件模式,重复入库while(count>0 && !conditionalPattern.isEmpty()){conditionalPatternBase.add(conditionalPattern);count--;}// 继续查找下一个同值节点bottomNode = bottomNode.getNextNode();}// 带着新后缀与新条件模式库进入递归FPGrow(conditionalPatternBase,newPostItem);}}public static void main(String[] args) {FPTreeApplication fpTreeApplication = new FPTreeApplication(0.01);long start = System.currentTimeMillis();// 读取数据集List<List<String>> srcData = fpTreeApplication.readSrcData();// 执行,初始后缀项为nullfpTreeApplication.FPGrow(srcData,null);// 输出结果Map<Integer, List<List<String>>> resultMap = fpTreeApplication.getResultMap();for(Map.Entry<Integer,List<List<String>>> entry:resultMap.entrySet()){System.out.println("------频繁"+entry.getKey()+"项集------");for (List<String> itemList : entry.getValue()) {System.out.println(itemList);}System.out.println("共"+entry.getValue().size()+"个");}long end = System.currentTimeMillis();System.out.println("共耗时"+(end-start)+"ms");}
}
FPTreeNode类
import lombok.Data;
import lombok.NoArgsConstructor;import java.util.List;/*** 树节点*/
@Data
@NoArgsConstructor
public class FPTreeNode implements Comparable<FPTreeNode>{/*** 节点值*/private String data;/*** 节点值计数*/private Integer count;/*** 父节点*/private FPTreeNode parentNode;/*** 下一个同值节点*/private FPTreeNode nextNode;/*** 子节点*/private List<FPTreeNode> sonNodeList;public FPTreeNode(String data,Integer count){this.data = data;this.count = count;}@Overridepublic String toString() {return "FPTreeNode{" +"data='" + data + '\'' +", count=" + count +'}';}/*** 按出现次数降序排序* @param node the object to be compared.* @return*/public int compareTo(FPTreeNode node){return node.count-count;}
}