注册 登录  
 加关注
   显示下一条  |  关闭
温馨提示!由于新浪微博认证机制调整,您的新浪微博帐号绑定已过期,请重新绑定!立即重新绑定新浪微博》  |  关闭

Koala++'s blog

计算广告学 RTB

 
 
 

日志

 
 

Weka开发[24]——Apriori源代码分析(1)  

2009-08-04 20:13:38|  分类: 机器学习 |  标签: |举报 |字号 订阅

  下载LOFTER 我的照片书  |

     曾经卖过一个Apriori的程序,那个程序大约有50%的正确率(当然结果是正确的,是只实现上很不一样),数据挖掘课上写了一个Apriori,一部分懒地按书上的算法,大约对了80% (当然结果仍然是正确的),记得邱强有一次要用Apriori算法时说:weka的太慢了,还好上次数据挖掘课实现了一下,还挺快的,注意的一点是关联规则不属于机器学习,这里我不想再分出来一个数据挖掘的组了。

         buildAssociations函数开始:

double[] confidences, supports;

int[] indices;

FastVector[] sortedRuleSet;

int necSupport = 0;

 

instances = new Instances(instances);

 

if (m_removeMissingCols) {

    instances = removeMissingColumns(instances);

}

         看一下removeMissingColumns,虽然它是如此的不重要:

protected Instances removeMissingColumns(Instances instances)

       throws Exception {

 

    int numInstances = instances.numInstances();

    StringBuffer deleteString = new StringBuffer();

    int removeCount = 0;

    boolean first = true;

    int maxCount = 0;

 

    for (int i = 0; i < instances.numAttributes(); i++) {

       AttributeStats as = instances.attributeStats(i);

       if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) {

           // see if we can decrease this by looking for the most frequent

           // value

           int[] counts = as.nominalCounts;

           if (counts[Utils.maxIndex(counts)] > maxCount) {

              maxCount = counts[Utils.maxIndex(counts)];

           }

       }

       if (as.missingCount == numInstances) {

           if (first) {

              deleteString.append((i + 1));

              first = false;

           } else {

              deleteString.append("," + (i + 1));

           }

           removeCount++;

       }

    }

    if (m_verbose) {

       System.err.println("Removed : " + removeCount

              + " columns with all missing " + "values.");

    }

    if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) {

       m_upperBoundMinSupport = (double) maxCount

/ (double) numInstances;

       if (m_verbose) {

           System.err.println("Setting upper bound min support to : "

                  + m_upperBoundMinSupport);

       }

    }

 

    if (deleteString.toString().length() > 0) {

       Remove af = new Remove();

       af.setAttributeIndices(deleteString.toString());

       af.setInvertSelection(false);

       af.setInputFormat(instances);

       Instances newInst = Filter.useFilter(instances, af);

 

       return newInst;

    }

    return instances;

}

         For循环中的第一个if不重要,不要理睬。它的作用是想在后面也就是maxCount / numInstances那看一下support的下界设多少。第二个if中是看是不是缺失值等于样本数,也就是在这个属性上所有的值都是缺失的,那么就用deleteString把这些都是缺失值的属性下标记下来,连成一个字符串,最后一个if就是标准的删除特征的代码,也就是把那么都是缺失值的特征给删除了。

if (m_car && m_metricType != CONFIDENCE)

    throw new Exception(

           "For CAR-Mining metric type has to be confidence!");

         如果想得到与类别有关的规则,又没有设成CONFIDENCE是不可以的。

// only set class index if CAR is requested

if (m_car) {

    if (m_classIndex == -1) {

       instances.setClassIndex(instances.numAttributes() - 1);

    } else if (m_classIndex <= instances.numAttributes()

           && m_classIndex > 0) {

       instances.setClassIndex(m_classIndex - 1);

    } else {

       throw new Exception("Invalid class index.");

    }

}

         如果用用户想得到与类别有关的规则又忘了设类别索引,就帮他设一下。

// can associator handle the data?

getCapabilities().testWithFail(instances);

         看能不能外理这种数据。

m_cycles = 0;

if (m_car) {

    // m_instances does not contain the class attribute

    m_instances = LabeledItemSet.divide(instances, false);

 

    // m_onlyClass contains only the class attribute

    m_onlyClass = LabeledItemSet.divide(instances, true);

} else

    m_instances = instances;

 

if (m_car && m_numRules == Integer.MAX_VALUE) {

    // Set desired minimum support

    m_minSupport = m_lowerBoundMinSupport;

} else {

    // Decrease minimum support until desired number of rules found.

    m_minSupport = m_upperBoundMinSupport - m_delta;

    m_minSupport = (m_minSupport < m_lowerBoundMinSupport) ?

    m_lowerBoundMinSupport : m_minSupport;

}

         不再看LabeledItemSet.devide,只看一下注释,m_instances不包含类别属性,而m_onlyClass只包含类别属性。下面的就不去管它了。

do {

    // Reserve space for variables

    m_Ls = new FastVector();

    m_hashtables = new FastVector();

    m_allTheRules = new FastVector[6];

    m_allTheRules[0] = new FastVector();

    m_allTheRules[1] = new FastVector();

    m_allTheRules[2] = new FastVector();

    if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {

       m_allTheRules[3] = new FastVector();

       m_allTheRules[4] = new FastVector();

       m_allTheRules[5] = new FastVector();

    }

    sortedRuleSet = new FastVector[6];

    sortedRuleSet[0] = new FastVector();

    sortedRuleSet[1] = new FastVector();

    sortedRuleSet[2] = new FastVector();

    if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {

       sortedRuleSet[3] = new FastVector();

        sortedRuleSet[4] = new FastVector();

       sortedRuleSet[5] = new FastVector();

    }

    if (!m_car) {

       // Find large itemsets and rules

       findLargeItemSets();

       if (m_significanceLevel != -1 || m_metricType != CONFIDENCE)

           findRulesBruteForce();

       else

           findRulesQuickly();

    } else {

       findLargeCarItemSets();

       findCarRulesQuickly();

    }

    上面都是一些初始化的代码,不讲了,如果想知道,可以看一下我以前写的那一篇Weka开发[17]——关联规则之Apriori。再下来,如果不是挖掘与类别相关的规则,那么先执行findLargeItermSets

private void findLargeItemSets() throws Exception {

 

    FastVector kMinusOneSets, kSets;

    Hashtable hashtable;

    int necSupport, necMaxSupport, i = 0;

 

    // Find large itemsets

 

    // minimum support

    necSupport = (int) (m_minSupport

* (double) m_instances.numInstances() + 0.5);

    necMaxSupport = (int) (m_upperBoundMinSupport

           * (double) m_instances.numInstances() + 0.5);

 

    kSets = AprioriItemSet.singletons(m_instances);

    AprioriItemSet.upDateCounters(kSets, m_instances);

    kSets = AprioriItemSet.deleteItemSets(kSets, necSupport,

       necMaxSupport);

    if (kSets.size() == 0)

       return;

    do {

       m_Ls.addElement(kSets);

       kMinusOneSets = kSets;

       kSets = AprioriItemSet.mergeAllItemSets(kMinusOneSets, i,

              m_instances.numInstances());

       hashtable = AprioriItemSet.getHashtable(kMinusOneSets,

              kMinusOneSets.size());

       m_hashtables.addElement(hashtable);

       kSets = AprioriItemSet.pruneItemSets(kSets, hashtable);

       AprioriItemSet.upDateCounters(kSets, m_instances);

       kSets = AprioriItemSet.deleteItemSets(kSets, necSupport,

              necMaxSupport);

       i++;

    } while (kSets.size() > 0);

}

         NecSupprot是最小support的另一种衡量,就是计数,原来的m_minSupport是用百分比逄的,+0.5当然就是四舍五入了。necMaxSupport也是相同的意思。

         下面看AprioriItermSet.singletonsAprioriItermSet是继承自ItemSet的:

/**

 * Converts the header info of the given set of instances into a set of

 * item sets (singletons). The ordering of values in the header file

 * determines the lexicographic order.

 */

public static FastVector singletons(Instances instances)

throws Exception {

 

    FastVector setOfItemSets = new FastVector();

    ItemSet current;

 

    for (int i = 0; i < instances.numAttributes(); i++) {

       if (instances.attribute(i).isNumeric())

           throw new Exception("Can't handle numeric attributes!");

       for (int j = 0; j < instances.attribute(i).numValues(); j++) {

           current = new AprioriItemSet(instances.numInstances());

           current.m_items = new int[instances.numAttributes()];

           for (int k = 0; k < instances.numAttributes(); k++)

              current.m_items[k] = -1;

           current.m_items[i] = j;

           setOfItemSets.addElement(current);

       }

    }

    return setOfItemSets;

}

         这段代码的作用现在还看不出来,可以看一下注释,意思是:将给定数据集的头信息转换成一个项集的集合,头信息中的值的顺序是按字典序。如果你也是用weather.nominal数据集,那么它产生的结果应该是这个:

0,-1,-1,-1,-1

1,-1,-1,-1,-1

2,-1,-1,-1,-1

-1,0,-1,-1,-1

-1,1,-1,-1,-1

-1,2,-1,-1,-1

-1,-1,0,-1,-1

-1,-1,1,-1,-1

-1,-1,-1,0,-1

-1,-1,-1,1,-1

-1,-1,-1,-1,0

-1,-1,-1,-1,1

         一共有12组,因为有2个属性有3种属性值,3个属性有2种属性值,2*3+3*2=12。而且m_iterm的第i个元素值设为j

         接下来看AprioriItemSet.upDateCounters

public static void upDateCounters(FastVector itemSets, Instances instances) {

 

    for (int i = 0; i < instances.numInstances(); i++) {

       Enumeration enu = itemSets.elements();

       while (enu.hasMoreElements())

           ((ItemSet) enu.nextElement()).upDateCounter(instances

                  .instance(i));

    }

}

         外层循环是样本的个数,内层循环是itermSets的数量。里面的upDateCounter

public void upDateCounter(Instance instance) {

    if (containedBy(instance))

       m_counter++;

}

         也就是如果满足containBy那么就计数加1,看一下containBy

public boolean containedBy(Instance instance) {

    for (int i = 0; i < instance.numAttributes(); i++)

       if (m_items[i] > -1) {

           if (instance.isMissing(i))

              return false;

           if (m_items[i] != (int) instance.value(i))

              return false;

       }

    return true;

}

         这段代码就相对容易一点了,如果在相应的特征值上为缺失值,或是与我们要找的特征词不相同,那么就返回false

         那么在upDateCounters执行完之后,结果大概是这样的,最后的是记数:

0,-1,-1,-1,-1 : 5

1,-1,-1,-1,-1 : 4

2,-1,-1,-1,-1 : 5

-1,0,-1,-1,-1 : 4

-1,1,-1,-1,-1 : 6

-1,2,-1,-1,-1 : 4

-1,-1,0,-1,-1 : 7

-1,-1,1,-1,-1 : 7

-1,-1,-1,0,-1 : 6

-1,-1,-1,1,-1 : 8

-1,-1,-1,-1,0 : 9

-1,-1,-1,-1,1 : 5

         这样也就理解了,m_item的意义,也就是现在是1项集。

         接下来看deleteItemSets

public static FastVector deleteItemSets(FastVector itemSets,

       int minSupport, int maxSupport) {

 

    FastVector newVector = new FastVector(itemSets.size());

 

    for (int i = 0; i < itemSets.size(); i++) {

       ItemSet current = (ItemSet) itemSets.elementAt(i);

       if ((current.m_counter >= minSupport)

              && (current.m_counter <= maxSupport))

           newVector.addElement(current);

    }

    return newVector;

}

         这里可以看到,只有m_counterminSupportmaxSupport之间的项我们才要。

  评论这张
 
阅读(3460)| 评论(0)
推荐 转载

历史上的今天

评论

<#--最新日志,群博日志--> <#--推荐日志--> <#--引用记录--> <#--博主推荐--> <#--随机阅读--> <#--首页推荐--> <#--历史上的今天--> <#--被推荐日志--> <#--上一篇,下一篇--> <#-- 热度 --> <#-- 网易新闻广告 --> <#--右边模块结构--> <#--评论模块结构--> <#--引用模块结构--> <#--博主发起的投票-->
 
 
 
 
 
 
 
 
 
 
 
 
 
 

页脚

网易公司版权所有 ©1997-2017