注册 登录  
 加关注
   显示下一条  |  关闭
温馨提示!由于新浪微博认证机制调整,您的新浪微博帐号绑定已过期,请重新绑定!立即重新绑定新浪微博》  |  关闭

Koala++'s blog

计算广告学 RTB

 
 
 

日志

 
 

Weka开发[27]——SMO源代码分析[1]  

2009-09-19 21:44:28|  分类: 机器学习 |  标签: |举报 |字号 订阅

  下载LOFTER 我的照片书  |

       前两天思维混乱的状态下,把SMO的代码胡乱看了一下,里面的东西没有校对,错误是一定有的,现在请不要转载,把错误扩散,贴出来的目的是,如果有什么不清楚,或是不对的,希望大家能告诉我,如果我还有实力搞懂,我会努力纠正的。

还是从buildClassifier开始:

// Input sanity checks (skipped entirely when m_checksTurnedOff is set):
// SMO supports neither string attributes nor a numeric class.
if (!m_checksTurnedOff) {

    if (insts.checkForStringAttributes()) {

       throw new UnsupportedAttributeTypeException(

              "Cannot handle string attributes!");

    }

    if (insts.classAttribute().isNumeric()) {

       throw new UnsupportedClassTypeException(

              "SMO can't handle a numeric class! Use"

                     + "SMOreg for performing regression.");

    }

    // Work on a copy so the caller's dataset is untouched, then drop
    // instances whose class value is missing.
    insts = new Instances(insts);

    insts.deleteWithMissingClass();

    if (insts.numInstances() == 0) {

       throw new Exception(

              "No training instances without a missing class!");

    }

 

    /* Removes all the instances with weight equal to 0.

     MUST be done since condition (8) of Keerthi's paper

     is made with the assertion Ci > 0 (See equation (3a). */

    // Copy only the instances with strictly positive weight into 'data'.
    Instances data = new Instances(insts, insts.numInstances());

    for (int i = 0; i < insts.numInstances(); i++) {

       if (insts.instance(i).weight() > 0)

           data.add(insts.instance(i));

    }

    if (data.numInstances() == 0) {

       throw new Exception(

              "No training instances left after removing "

                     + "instance with either a weight "

                     + "null or a missing class!");

    }

    insts = data;

}

       都比较简单,第一个检查是不是有String类型的属性,第二个是检查类别是不是数值型的,数值型的类别要用SMOreg进行回归分析,下面是删除没有类别的样本,再看是不是样本数为0。下面的注释写到,删除所有样本权重为0的样本,因为Keerthi的论文中条件(8)中有要求。最后再判断一次是不是样本数为0。

// Determine whether every non-class attribute is numeric; the flag is
// consulted later to decide whether NominalToBinary must be applied.
m_onlyNumeric = true;

if (!m_checksTurnedOff) {

    for (int i = 0; i < insts.numAttributes(); i++) {

       if (i != insts.classIndex()) {

           if (!insts.attribute(i).isNumeric()) {

              // Found a non-numeric (e.g. nominal) attribute — no need
              // to look any further.
              m_onlyNumeric = false;

              break;

           }

       }

    }

}

       判断是否只有数值型属性。

// Replace missing values (nominal: mode, numeric: mean) unless checks
// are turned off, in which case no missing-value filter is kept.
if (!m_checksTurnedOff) {

    m_Missing = new ReplaceMissingValues();

    m_Missing.setInputFormat(insts);

    insts = Filter.useFilter(insts, m_Missing);

} else {

    m_Missing = null;

}

       ReplaceMissingValues的注释中写到,用数据集的训练集中的众数和平均值替换所有离散和数值型属性的缺失值(Replaces all missing values for nominal and numeric attributes in a dataset with the modes and means from the training data.)。

// If any attribute is nominal, convert nominal attributes to binary
// numeric ones (k values -> k indicator attributes); otherwise keep no
// filter so classifyInstance can skip this step.
if (!m_onlyNumeric) {

    m_NominalToBinary = new NominalToBinary();

    m_NominalToBinary.setInputFormat(insts);

    insts = Filter.useFilter(insts, m_NominalToBinary);

} else {

    m_NominalToBinary = null;

}

       NominalToBinary的注释中写到,转换所有的离散属性到二值的数值属性,一个属性有K个值会被转换成为K个二值属性。Converts all nominal attributes into binary numeric attributes. An attribute with k values is transformed into k binary attributes (using the one-attribute-per-value approach). Binary attributes are left binary.

// Optional rescaling of the numeric attributes, chosen by m_filterType:
// standardize (zero mean, unit variance), normalize (into [0,1]),
// or no filtering at all.
if (m_filterType == FILTER_STANDARDIZE) {

    m_Filter = new Standardize();

    m_Filter.setInputFormat(insts);

    insts = Filter.useFilter(insts, m_Filter);

} else if (m_filterType == FILTER_NORMALIZE) {

    m_Filter = new Normalize();

    m_Filter.setInputFormat(insts);

    insts = Filter.useFilter(insts, m_Filter);

} else {

    m_Filter = null;

}

       Standardize的注释写到,将指定数据集中的所有数值属性都标准化为有均值0,单位方差(Standardizes all numeric attributes in the given dataset to have zero mean and unit variance)。Normalize的注释写到,正规化所有的数值型值,结果值在[0,1]区间。(Normalizes all numeric values in the given dataset. The resulting values are in [0,1] for the data used to compute the normalization intervals.)

// Remember the class index/attribute of the (now fully filtered) data.
m_classIndex = insts.classIndex();

m_classAttribute = insts.classAttribute();

 

// Generate subsets representing each class

// Each subset is pre-sized to the full dataset; compactify() below
// trims the unused capacity after the split.
Instances[] subsets = new Instances[insts.numClasses()];

for (int i = 0; i < insts.numClasses(); i++) {

    subsets[i] = new Instances(insts, insts.numInstances());

}

// Route every instance into the subset matching its class value.
for (int j = 0; j < insts.numInstances(); j++) {

    Instance inst = insts.instance(j);

    subsets[(int) inst.classValue()].add(inst);

}

for (int i = 0; i < insts.numClasses(); i++) {

    subsets[i].compactify();

}

       得到类别索引m_classIndex和类别属性m_classAttribute。Subsets数组是将不同类别值的样本分开,add函数是把样本加到相应的类别值子集中。最后的compactify只是为了节约空间,因为开始分配空间的时候是分配的insts.numInstances()的大小。

// Build the binary classifiers

// One-vs-one decomposition: train a BinarySMO for every unordered pair
// of classes (i, j) with j > i; only the upper triangle of the matrix
// is filled.
Random rand = new Random(m_randomSeed);

m_classifiers = new BinarySMO[insts.numClasses()][insts.numClasses()];

for (int i = 0; i < insts.numClasses(); i++) {

    for (int j = i + 1; j < insts.numClasses(); j++) {

       m_classifiers[i][j] = new BinarySMO();

       // Training data for this pair = all instances of class i
       // followed by all instances of class j, then shuffled.
       Instances data = new Instances(insts, insts.numInstances());

       for (int k = 0; k < subsets[i].numInstances(); k++) {

           data.add(subsets[i].instance(k));

       }

       for (int k = 0; k < subsets[j].numInstances(); k++) {

           data.add(subsets[j].instance(k));

       }

       data.compactify();

       data.randomize(rand);

       m_classifiers[i][j].buildClassifier(data, i, j,

              m_fitLogisticModels, m_numFolds, m_randomSeed);

    }

}

       m_classifiers是一个两维的BinarySMO分类器数组,这里我们可以看到,它所用的方式很落后,这也是它速度慢的一个原因。

// NOTE(review): this fragment appears to come from BinarySMO's own
// buildClassifier — cl1/cl2 are the two class indices passed in by the
// pairwise loop above; confirm against the full source.
// Store the sum of weights

m_sumOfWeights = insts.sumOfWeights();

 

// Set class values

// Map the two nominal classes onto {-1, +1}: cl1 -> -1 (tracking the
// last such index in m_iLow), cl2 -> +1 (last index in m_iUp).
m_class = new double[insts.numInstances()];

m_iUp = -1;

m_iLow = -1;

for (int i = 0; i < m_class.length; i++) {

    if ((int) insts.instance(i).classValue() == cl1) {

       m_class[i] = -1;

       m_iLow = i;

    } else if ((int) insts.instance(i).classValue() == cl2) {

       m_class[i] = 1;

       m_iUp = i;

    } else {

       // The caller only feeds instances of cl1/cl2, so this branch is
       // unreachable by construction.
       throw new Exception("This should never happen!");

    }

}

       计算权重总和m_sumOfWeights,将两个离散的类别值转换成{-1,1}。并且记录两个类别样本的最大索引值。

// Check whether one or both classes are missing

// Degenerate case: at least one of the two classes has no training
// instances. m_iUp == -1 means no cl2 instances were seen, m_iLow == -1
// means no cl1 instances (see the mapping loop above).
if ((m_iUp == -1) || (m_iLow == -1)) {

    if (m_iUp != -1) {

       // Only cl2 present: always predict the +1 side via bias.
       m_b = -1;

    } else if (m_iLow != -1) {

       // Only cl1 present: always predict the -1 side via bias.
       m_b = 1;

    } else {

       // Neither class present: nothing to learn at all.
       m_class = null;

       return;

    }

    // Allocate empty model structures; the linear case (polynomial
    // kernel of exponent 1, no RBF) stores sparse weights instead of
    // alphas/support vectors.
    if (!m_useRBF && m_exponent == 1.0) {

       m_sparseWeights = new double[0];

       m_sparseIndices = new int[0];

       m_class = null;

    } else {

       m_supportVectors = new SMOset(0);

       m_alpha = new double[0];

       m_class = new double[0];

    }

 

    // Fit sigmoid if requested

    if (fitLogistic) {

       fitLogistic(insts, cl1, cl2, numFolds, new Random(

              randomSeed));

    }

    return;

}

       如果m_iUp==-1表示第2种类别(cl2)没有相应的样本,m_iLow==-1表示第1种类别(cl1)没有相应的样本(对照上面循环中m_iLow在cl1分支、m_iUp在cl2分支被赋值),m_b就是公式中的b,如果两个类别都没有相应的样本那m_class=null,最后的fitLogistic是用logistic regression model拟合sigmoid,这个就有点远了,略过。

// Set the reference to the data

m_data = insts;

 

// If machine is linear, reserve space for weights

// "Linear" here means: polynomial kernel with exponent 1.0 and RBF
// disabled — only then can the model be collapsed to a weight vector.
if (!m_useRBF && m_exponent == 1.0) {

    m_weights = new double[m_data.numAttributes()];

} else {

    m_weights = null;

}

 

// Initialize alpha array to zero

// (Java zero-initializes new double[], so all alphas start at 0.)
m_alpha = new double[m_data.numInstances()];

       m_data指向insts,如果是线性的支持向量机,为weights保留空间,初始化m_alpha

// Initialize sets

// Index sets used by Keerthi's SMO variant: m_I0..m_I4 partition the
// training indices by their alpha/class status, m_supportVectors
// tracks instances with non-zero alpha. All sized to the dataset.
m_supportVectors = new SMOset(m_data.numInstances());

m_I0 = new SMOset(m_data.numInstances());

m_I1 = new SMOset(m_data.numInstances());

m_I2 = new SMOset(m_data.numInstances());

m_I3 = new SMOset(m_data.numInstances());

m_I4 = new SMOset(m_data.numInstances());

       SMOset的构造函数如下:

public SMOset(int size) {
    // Start from an empty set: no elements stored and no head element.
    m_number = 0;
    m_first = -1;
    // Fixed-capacity backing arrays: membership flags plus the
    // next/previous links of the underlying doubly linked list.
    m_indicators = new boolean[size];
    m_next = new int[size];
    m_previous = new int[size];
}

       也没什么特别的。

// Clean out some instance variables

m_sparseWeights = null;

m_sparseIndices = null;

 

// Initialize error cache

// Seed the error cache at the two representative indices: +1 for the
// cl1 (-1 class) representative, -1 for the cl2 (+1 class) one.
m_errors = new double[m_data.numInstances()];

m_errors[m_iLow] = 1;

m_errors[m_iUp] = -1;

 

// Initialize kernel

// Kernel choice: RBF if requested; otherwise a polynomial kernel,
// normalized in feature space when m_featureSpaceNormalization is set.
if (m_useRBF) {

    m_kernel = new RBFKernel(m_data, m_cacheSize, m_gamma);

} else {

    if (m_featureSpaceNormalization) {

       m_kernel = new NormalizedPolyKernel(m_data, m_cacheSize,

              m_exponent, m_lowerOrder);

    } else {

       m_kernel = new PolyKernel(m_data, m_cacheSize, m_exponent,

              m_lowerOrder);

    }

}

  评论这张
 
阅读(2105)| 评论(0)
推荐 转载

历史上的今天

评论

<#--最新日志,群博日志--> <#--推荐日志--> <#--引用记录--> <#--博主推荐--> <#--随机阅读--> <#--首页推荐--> <#--历史上的今天--> <#--被推荐日志--> <#--上一篇,下一篇--> <#-- 热度 --> <#-- 网易新闻广告 --> <#--右边模块结构--> <#--评论模块结构--> <#--引用模块结构--> <#--博主发起的投票-->
 
 
 
 
 
 
 
 
 
 
 
 
 
 

页脚

网易公司版权所有 ©1997-2017