Mahout決定木アルゴリズムのソースコード分析(3-1):決定木構築の実践


前編では、主に Partial Implementation におけるツリー構築の主要な処理を分析しました。本編では、Mahout のソースコードを使って実際にツリーを構築してみます。
(注意:作成した MR プロジェクトには、以下のパッケージを導入する必要があります: http://download.csdn.net/detail/fansy1990/5030740 。導入しないと、コンソールにデバッグ出力が表示されません。)
次のクラスファイルを新規作成します.
package org.fansy.forest.test;

import java.io.*;
import java.util.List;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.classifier.df.Bagging;
import org.apache.mahout.classifier.df.builder.DecisionTreeBuilder;
import org.apache.mahout.classifier.df.builder.TreeBuilder;
import org.apache.mahout.classifier.df.data.Data;
import org.apache.mahout.classifier.df.data.DataConverter;
import org.apache.mahout.classifier.df.data.Dataset;
import org.apache.mahout.classifier.df.data.Instance;
import org.apache.mahout.classifier.df.node.Node;
import org.apache.mahout.common.RandomUtils;

import com.google.common.collect.Lists;

/**
 * Small stand-alone driver that builds one decision tree with Mahout's
 * decision-forest classes from a local text file and prints the result.
 */
public class TestBuildTree {

	/**
	 * Loads the dataset descriptor and the training data from hard-coded local
	 * paths, builds a single tree through {@link Bagging} and prints it.
	 *
	 * @param args command-line arguments (not used)
	 * @throws IOException if the dataset descriptor or the data file cannot be read
	 */
	public static void main(String[] args) throws IOException {
		Path dsPath=new Path("/home/fansy/workspace/MahTestDemo/car_small.info");
		String dataPath="/home/fansy/mahout/data/forest/car_test_small.txt";
		// fixed seed (555); presumably chosen so that repeated runs give the same bag/tree
		Random rng=RandomUtils.getRandom(555);

		// load the dataset descriptor
		Dataset ds=Dataset.load(new Configuration(), dsPath);
		// create the converter that turns text lines into Instance objects
		DataConverter converter=new DataConverter(ds);
		// load the training data
		Data data=loadData(ds,converter,dataPath);

		// create the tree builder and build one tree from the data
		TreeBuilder treeBuilder=new DecisionTreeBuilder();
		Bagging bag=new Bagging(treeBuilder,data);
		Node tree=bag.build(rng);

		System.out.println("the tree is builded"+tree);

	}

	/**
	 * Reads the file at {@code dataPath} line by line, converts every line to an
	 * {@link Instance} with {@code converter} and wraps the result in a {@link Data}.
	 *
	 * <p>I/O failures are propagated to the caller instead of being printed and
	 * ignored: a missing file used to end in a {@code NullPointerException}, and a
	 * read error used to return a silently truncated data set.
	 *
	 * @param ds        dataset descriptor the returned {@link Data} is bound to
	 * @param converter converter applied to each line of the file
	 * @param dataPath  path of the local text file to read
	 * @return a {@link Data} holding one instance per line of the file
	 * @throws IOException if the file cannot be opened (including
	 *                     {@link FileNotFoundException}) or a read/close fails
	 */
	public static Data loadData(Dataset ds,DataConverter converter,String dataPath) throws IOException{
		List<Instance> instances=Lists.newArrayList();

		// If the file is missing, FileNotFoundException (an IOException) propagates
		// from here; nothing has been opened yet, so there is nothing to close.
		BufferedReader reader=new BufferedReader(new FileReader(dataPath));
		try {
			String line;
			while((line=reader.readLine())!=null){
				instances.add(converter.convert(line));
			}
		} finally {
			// try/finally rather than try-with-resources: the file uses no Java 7+ syntax.
			// Closing the BufferedReader also closes the wrapped FileReader.
			reader.close();
		}

		System.out.println("load  file to Data done ...");

		return new Data(ds,instances);
	}

}
car_small.info は http://blog.csdn.net/fansy1990/article/details/8443342 を参考にして取得できます。元のデータファイルは car_test_small.txt と同じもので、前編( http://blog.csdn.net/fansy1990/article/details/8544344 )の元のデータファイルを参照してください。
そのまま実行すると、次の出力が表示されます。
load  file to Data done ...
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/fansy/hadoop-1.0.2/lib/slf4j-log4j12-1.6.1.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/fansy/mahout-0.7-pure/examples/target/mahout-examples-0.7-job.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/fansy/mahout-0.7-pure/core/target/mahout-core-0.7-job.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
__________________________ bag data which is changed
-----------------------
{1:2.0,2:1.0,3:2.0,4:1.0,5:1.0,6:1.0}
{0:1.0,1:2.0,2:3.0,4:1.0}
{0:1.0,1:1.0,3:2.0,4:1.0,5:2.0}
{0:2.0,2:3.0}
{0:1.0,2:3.0,3:1.0,4:2.0,5:1.0,6:1.0}
{1:2.0,2:1.0,3:1.0,5:2.0}
{0:3.0,1:2.0,2:2.0,3:1.0,4:2.0,5:1.0,6:1.0}
{0:1.0,1:1.0,2:2.0,4:1.0}
{0:3.0,1:1.0,2:1.0,3:2.0,4:1.0,5:1.0,6:1.0}
{0:3.0,1:1.0,2:1.0,3:2.0}
{0:2.0,2:3.0,5:2.0}
{1:2.0,2:1.0,3:2.0,4:1.0,5:1.0,6:1.0}
{1:2.0,2:3.0,4:2.0}
{0:1.0,1:3.0,2:1.0,4:2.0,5:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{0:2.0,1:1.0,4:2.0}
{0:1.0,1:1.0,2:2.0,4:1.0}
{0:2.0,1:2.0,2:3.0,3:1.0,4:1.0,5:1.0,6:1.0}
{0:2.0,2:3.0}
{0:1.0,1:1.0,3:2.0,4:1.0,5:2.0}
{0:3.0,2:2.0,4:2.0,5:1.0}
{0:3.0,1:3.0,3:1.0,5:2.0}
{2:3.0,4:2.0,5:1.0}
{0:1.0,1:1.0,2:2.0,4:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{0:1.0,1:2.0,2:3.0,4:1.0}
{1:1.0,2:3.0,3:1.0,4:1.0,5:1.0,6:1.0}
{1:1.0,2:3.0,4:2.0,5:1.0}
{0:2.0,1:1.0,2:1.0,3:1.0,4:2.0,5:2.0}
{0:2.0,1:2.0,2:3.0,3:1.0,4:1.0,5:1.0,6:1.0}
{0:3.0,2:2.0,4:2.0,5:1.0}
{0:1.0,1:2.0,3:2.0,4:2.0}
{0:3.0,2:2.0,4:2.0,5:1.0}
{0:3.0,1:2.0,2:1.0}
{1:2.0,2:1.0,4:1.0,5:1.0}
{0:2.0,1:1.0,2:1.0,4:1.0}
{0:1.0,1:3.0,2:1.0,3:2.0}
{0:1.0,1:2.0,2:2.0,3:2.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{0:1.0,1:1.0,2:2.0,3:2.0,5:1.0}
{0:2.0,2:1.0,4:2.0,5:1.0}
{0:2.0,1:2.0,2:2.0,3:1.0,4:2.0}
{1:2.0,2:1.0,4:1.0,5:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{0:3.0,1:3.0,3:1.0,5:2.0}
{0:3.0,2:3.0,5:2.0}
{1:1.0,2:3.0,4:2.0,5:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{0:3.0,1:1.0,2:1.0,3:2.0,4:1.0,5:1.0,6:1.0}
{0:1.0,1:1.0,3:2.0,4:1.0,5:2.0}
{1:2.0,2:1.0,3:1.0,5:2.0}
{0:3.0,1:2.0,2:2.0,3:1.0,4:2.0,5:1.0,6:1.0}
{0:1.0,1:2.0,2:3.0,4:1.0}
{0:2.0,1:2.0,2:2.0,3:1.0,4:2.0}
-----------------------


goes down*************+time:1359169561154
the igSplit is:null
%%%%%%%%%%% the attributes
2,5,3,
the best ig is:0.3791769206396574,attribute:5,split:NaN
the attributes%%%%%%%%%%%%%
****************** not return but goes down1359169561157
if(complemented) before................
0.0,2.0,1.0,
complemented:true
0.0,2.0,1.0,
if(complemented) after................
subset[0] size:18
subset[1] size:10
subset[2] size:26
******************* cnt:3,---------------->end subsets size
__________________________subsets[0]
-----------------------
{0:1.0,1:2.0,2:3.0,4:1.0}
{0:2.0,2:3.0}
{0:1.0,1:1.0,2:2.0,4:1.0}
{0:3.0,1:1.0,2:1.0,3:2.0}
{1:2.0,2:3.0,4:2.0}
{0:2.0,1:1.0,4:2.0}
{0:1.0,1:1.0,2:2.0,4:1.0}
{0:2.0,2:3.0}
{0:1.0,1:1.0,2:2.0,4:1.0}
{0:1.0,1:2.0,2:3.0,4:1.0}
{0:1.0,1:2.0,3:2.0,4:2.0}
{0:3.0,1:2.0,2:1.0}
{0:2.0,1:1.0,2:1.0,4:1.0}
{0:1.0,1:3.0,2:1.0,3:2.0}
{0:1.0,1:2.0,2:2.0,3:2.0}
{0:2.0,1:2.0,2:2.0,3:1.0,4:2.0}
{0:1.0,1:2.0,2:3.0,4:1.0}
{0:2.0,1:2.0,2:2.0,3:1.0,4:2.0}
-----------------------


XXXXXXXXXXXXXXXXXXXXXXXXXdata.isIdenticalLabel() in DecisionTreeBuilder it should not be here,time is :1359169561167,data.getDataset.getLabel():0.0
__________________________subsets[1]
-----------------------
{0:1.0,1:1.0,3:2.0,4:1.0,5:2.0}
{1:2.0,2:1.0,3:1.0,5:2.0}
{0:2.0,2:3.0,5:2.0}
{0:1.0,1:1.0,3:2.0,4:1.0,5:2.0}
{0:3.0,1:3.0,3:1.0,5:2.0}
{0:2.0,1:1.0,2:1.0,3:1.0,4:2.0,5:2.0}
{0:3.0,1:3.0,3:1.0,5:2.0}
{0:3.0,2:3.0,5:2.0}
{0:1.0,1:1.0,3:2.0,4:1.0,5:2.0}
{1:2.0,2:1.0,3:1.0,5:2.0}
-----------------------


XXXXXXXXXXXXXXXXXXXXXXXXXdata.isIdenticalLabel() in DecisionTreeBuilder it should not be here,time is :1359169561168,data.getDataset.getLabel():0.0
__________________________subsets[2]
-----------------------
{1:2.0,2:1.0,3:2.0,4:1.0,5:1.0,6:1.0}
{0:1.0,2:3.0,3:1.0,4:2.0,5:1.0,6:1.0}
{0:3.0,1:2.0,2:2.0,3:1.0,4:2.0,5:1.0,6:1.0}
{0:3.0,1:1.0,2:1.0,3:2.0,4:1.0,5:1.0,6:1.0}
{1:2.0,2:1.0,3:2.0,4:1.0,5:1.0,6:1.0}
{0:1.0,1:3.0,2:1.0,4:2.0,5:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{0:2.0,1:2.0,2:3.0,3:1.0,4:1.0,5:1.0,6:1.0}
{0:3.0,2:2.0,4:2.0,5:1.0}
{2:3.0,4:2.0,5:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{1:1.0,2:3.0,3:1.0,4:1.0,5:1.0,6:1.0}
{1:1.0,2:3.0,4:2.0,5:1.0}
{0:2.0,1:2.0,2:3.0,3:1.0,4:1.0,5:1.0,6:1.0}
{0:3.0,2:2.0,4:2.0,5:1.0}
{0:3.0,2:2.0,4:2.0,5:1.0}
{1:2.0,2:1.0,4:1.0,5:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{0:1.0,1:1.0,2:2.0,3:2.0,5:1.0}
{0:2.0,2:1.0,4:2.0,5:1.0}
{1:2.0,2:1.0,4:1.0,5:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{1:1.0,2:3.0,4:2.0,5:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{0:3.0,1:1.0,2:1.0,3:2.0,4:1.0,5:1.0,6:1.0}
{0:3.0,1:2.0,2:2.0,3:1.0,4:2.0,5:1.0,6:1.0}
-----------------------


goes down*************+time:1359169561170
%%%%%%%%%%% the attributes
3,0,2,
the best ig is:0.8024757691014436,attribute:3,split:NaN
the attributes%%%%%%%%%%%%%
****************** not return but goes down1359169561170
if(complemented) before................
0.0,2.0,1.0,
complemented:true
0.0,2.0,1.0,
if(complemented) after................
subset[0] size:10
subset[1] size:10
subset[2] size:6
******************* cnt:3,---------------->end subsets size
__________________________subsets[0]
-----------------------
{0:1.0,1:3.0,2:1.0,4:2.0,5:1.0}
{0:3.0,2:2.0,4:2.0,5:1.0}
{2:3.0,4:2.0,5:1.0}
{1:1.0,2:3.0,4:2.0,5:1.0}
{0:3.0,2:2.0,4:2.0,5:1.0}
{0:3.0,2:2.0,4:2.0,5:1.0}
{1:2.0,2:1.0,4:1.0,5:1.0}
{0:2.0,2:1.0,4:2.0,5:1.0}
{1:2.0,2:1.0,4:1.0,5:1.0}
{1:1.0,2:3.0,4:2.0,5:1.0}
-----------------------


XXXXXXXXXXXXXXXXXXXXXXXXXdata.isIdenticalLabel() in DecisionTreeBuilder it should not be here,time is :1359169561172,data.getDataset.getLabel():0.0
__________________________subsets[1]
-----------------------
{1:2.0,2:1.0,3:2.0,4:1.0,5:1.0,6:1.0}
{0:3.0,1:1.0,2:1.0,3:2.0,4:1.0,5:1.0,6:1.0}
{1:2.0,2:1.0,3:2.0,4:1.0,5:1.0,6:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{0:1.0,1:1.0,2:2.0,3:2.0,5:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{0:3.0,1:1.0,2:1.0,3:2.0,4:1.0,5:1.0,6:1.0}
-----------------------


goes down*************+time:1359169561173
%%%%%%%%%%% the attributes
4,0,2,
the best ig is:0.4689955935892812,attribute:4,split:NaN
the attributes%%%%%%%%%%%%%
****************** not return but goes down1359169561173
if(complemented) before................
0.0,2.0,1.0,
complemented:true
0.0,2.0,1.0,
if(complemented) after................
subset[0] size:1
subset[1] size:5
subset[2] size:4
******************* cnt:2,---------------->end subsets size
__________________________subsets[0]
-----------------------
{0:1.0,1:1.0,2:2.0,3:2.0,5:1.0}
-----------------------


isIdentical(data) in DecisionTreeBuilder,time is :1359169561174,data.majorityLabel:0
__________________________subsets[1]
-----------------------
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
{1:2.0,3:2.0,4:2.0,5:1.0,6:1.0}
-----------------------


isIdentical(data) in DecisionTreeBuilder,time is :1359169561175,data.majorityLabel:1
__________________________subsets[2]
-----------------------
{1:2.0,2:1.0,3:2.0,4:1.0,5:1.0,6:1.0}
{0:3.0,1:1.0,2:1.0,3:2.0,4:1.0,5:1.0,6:1.0}
{1:2.0,2:1.0,3:2.0,4:1.0,5:1.0,6:1.0}
{0:3.0,1:1.0,2:1.0,3:2.0,4:1.0,5:1.0,6:1.0}
-----------------------


XXXXXXXXXXXXXXXXXXXXXXXXXdata.isIdenticalLabel() in DecisionTreeBuilder it should not be here,time is :1359169561175,data.getDataset.getLabel():1.0
__________________________subsets[2]
-----------------------
{0:1.0,2:3.0,3:1.0,4:2.0,5:1.0,6:1.0}
{0:3.0,1:2.0,2:2.0,3:1.0,4:2.0,5:1.0,6:1.0}
{0:2.0,1:2.0,2:3.0,3:1.0,4:1.0,5:1.0,6:1.0}
{1:1.0,2:3.0,3:1.0,4:1.0,5:1.0,6:1.0}
{0:2.0,1:2.0,2:3.0,3:1.0,4:1.0,5:1.0,6:1.0}
{0:3.0,1:2.0,2:2.0,3:1.0,4:2.0,5:1.0,6:1.0}
-----------------------


XXXXXXXXXXXXXXXXXXXXXXXXXdata.isIdenticalLabel() in DecisionTreeBuilder it should not be here,time is :1359169561176,data.getDataset.getLabel():1.0
the tree is buildedCATEGORICAL:LEAF:0.0;,LEAF:0.0;,CATEGORICAL:LEAF:0.0;,CATEGORICAL:LEAF:0.0;,LEAF:1.0;,LEAF:1.0;,;,LEAF:1.0;,;,;
この出力結果を前編の原理分析の <2.1>~<2.6> と照らし合わせれば、意味が分かります。
最後の行の出力から、最終的に得られたツリーは次のようになることが分かります。
(図1:Mahout决策树算法源码分析(3-1)建树实战_第1张图片)
元のデータに対応するツリーに変換すると、次のようになります:(図2:Mahout决策树算法源码分析(3-1)建树实战_第2张图片)
分かち合う
転載する場合は出典を明記してください: http://blog.csdn.net/fansy1990