/ 中存储网

hadoop实现自定义的数据类型

2013-04-08 00:00:00 来源:中存储网
hadoop实现自定义的数据类型
    博客分类: hadoop hadoopmrunit自定义数据类型 

    关于自定义数据类型,http://book.douban.com/annotation/17067489/ 一文中给出了一个比较清晰的说明和解释。

    以wordCount为例子

    定义自己的数据类型Http类

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    
    import org.apache.hadoop.io.WritableComparable;
    
    /**
     * Custom Hadoop data type that wraps a single string value so it can be used
     * as a MapReduce key.
     *
     * <p>The value is serialized with {@link DataOutput#writeUTF(String)} and read
     * back with {@link DataInput#readUTF()}. Ordering, equality and hashing are all
     * derived from the wrapped string; a {@code null} value is tolerated by
     * {@code equals}, {@code hashCode} and {@code compareTo} (it sorts before any
     * non-null value), but no null handling is done on serialization.
     */
    public class Http implements WritableComparable<Http>
    {
        /** The wrapped string; {@code null} until set or read from a stream. */
        private String value;

        /**
         * Creates an empty instance whose value is {@code null}. Used when the
         * object is created first and populated afterwards through
         * {@link #setValue(String)} or {@link #readFields(DataInput)}.
         */
        public Http(){ }

        /**
         * Creates an instance wrapping the given string.
         *
         * @param value the string to wrap
         */
        public Http(String value)
        {
            // Assign directly instead of calling the overridable setter from a constructor.
            this.value = value;
        }

        /**
         * @return the wrapped string, or {@code null} if none has been set
         */
        public String getValue()
        {
            return value;
        }

        /**
         * Replaces the wrapped string.
         *
         * @param value the new string to wrap
         */
        public void setValue(String value)
        {
            this.value = value;
        }

        /**
         * Restores the wrapped string from the given input, overwriting any
         * previous value.
         *
         * @param in the input to read from
         * @throws IOException if reading fails
         */
        public void readFields(DataInput in) throws IOException
        {
            value = in.readUTF();
        }

        /**
         * Serializes the wrapped string to the given output. The value must have
         * been set beforehand; a {@code null} value is passed to
         * {@code writeUTF} unchanged.
         *
         * @param out the output to write to
         * @throws IOException if writing fails
         */
        public void write(DataOutput out) throws IOException
        {
            out.writeUTF(value);
        }

        /**
         * Orders instances by their wrapped strings using
         * {@link String#compareTo(String)}. An instance whose value is
         * {@code null} sorts before any instance with a non-null value, and two
         * null-valued instances compare as equal, keeping the ordering
         * consistent with {@link #equals(Object)}.
         *
         * @param http the instance to compare against; must not be {@code null}
         * @return a negative, zero or positive number as this instance sorts
         *         before, equal to or after {@code http}
         */
        public int compareTo(Http http)
        {
            if (value == null)
            {
                return (http.value == null) ? 0 : -1;
            }
            if (http.value == null)
            {
                return 1;
            }
            return value.compareTo(http.value);
        }

        /**
         * Hashes the wrapped string; a {@code null} value contributes 0.
         */
        @Override
        public int hashCode()
        {
            final int prime = 31;
            int result = 1;
            result = prime * result + ((value == null) ? 0 : value.hashCode());
            return result;
        }

        /**
         * Two instances are equal when their wrapped strings are equal; two
         * null-valued instances are equal to each other.
         */
        @Override
        public boolean equals(Object obj)
        {
            if (this == obj)
            {
                return true;
            }
            if (!(obj instanceof Http))
            {
                return false;
            }
            Http other = (Http)obj;
            // Null-safe comparison so that equals agrees with hashCode for unset values.
            return (value == null) ? (other.value == null) : value.equals(other.value);
        }

        /**
         * @return the wrapped string itself (which may be {@code null})
         */
        @Override
        public String toString()
        {
            return value;
        }
    }

    编写wordcount程序

    import java.io.IOException;
    import java.util.StringTokenizer;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;
    
    /**
     * Word-count MapReduce job that uses the custom {@code Http} type for its
     * keys and for the mapper's input value.
     */
    public class WordCountEntry
    {
        /**
         * Splits every input value into tokens and emits {@code (token, 1)} for
         * each one.
         *
         * <p>NOTE(review): {@code main} does not configure an input format, and
         * the input value type declared here is {@code Http}. Confirm that the
         * input format actually used by the job produces {@code Http} values.
         */
        public static class TokenizerMapper extends
                Mapper<LongWritable, Http, Http, IntWritable>
        {

            /** Constant count emitted for every token. */
            private final static IntWritable one = new IntWritable(1);

            /** Reusable output key, overwritten for each token. */
            private final Http word = new Http();

            /**
             * Tokenizes the string form of {@code value} with the default
             * {@link StringTokenizer} delimiters and writes one
             * {@code (token, 1)} pair per token.
             *
             * @param key     the input key (not used)
             * @param value   the input record whose string form is tokenized
             * @param context the context the pairs are written to
             * @throws IOException          if writing to the context fails
             * @throws InterruptedException if the write is interrupted
             */
            public void map(LongWritable key, Http value, Context context)
                    throws IOException, InterruptedException
            {
                StringTokenizer itr = new StringTokenizer(value.toString());
                while (itr.hasMoreTokens())
                {
                    word.setValue(itr.nextToken());
                    context.write(word, one);
                }
            }
        }

        /**
         * Sums the counts received for a key. Registered in {@code main} as both
         * the combiner and the reducer.
         */
        public static class IntSumReducer extends
                Reducer<Http, IntWritable, Http, IntWritable>
        {
            /** Reusable output value holding the sum for the current key. */
            private final IntWritable result = new IntWritable();

            /**
             * Adds up all {@code values} and writes {@code (key, sum)}.
             *
             * @param key     the key whose counts are summed
             * @param values  the counts for {@code key}
             * @param context the context the result is written to
             * @throws IOException          if writing to the context fails
             * @throws InterruptedException if the write is interrupted
             */
            public void reduce(Http key, Iterable<IntWritable> values,
                    Context context) throws IOException, InterruptedException
            {
                int sum = 0;
                for (IntWritable val : values)
                {
                    sum += val.get();
                }
                result.set(sum);
                context.write(key, result);
            }
        }

        /**
         * Configures and runs the job, then exits with 0 on success or 1 on
         * failure. Exits with 2 and prints a usage message unless exactly two
         * arguments (input path, output path) remain after the generic options
         * have been parsed.
         *
         * @param args command-line arguments: optional generic options followed
         *             by {@code <in> <out>}
         * @throws IOException            if the job cannot be set up or submitted
         * @throws InterruptedException   if waiting for the job is interrupted
         * @throws ClassNotFoundException if a job class cannot be found
         */
        public static void main(String[] args) 
                throws IOException, InterruptedException, ClassNotFoundException 
        {
            Configuration conf = new Configuration();
            String[] otherArgs = new GenericOptionsParser(conf, args)
                    .getRemainingArgs();
            if (otherArgs.length != 2)
            {
                System.err.println("Usage: wordcount <in> <out>");
                System.exit(2);
            }

            // Take the paths from the arguments left over after generic-option
            // parsing; the raw args array may start with generic options, in which
            // case args[0]/args[1] are not the paths validated above.
            Path input = new Path(otherArgs[0]);
            Path output = new Path(otherArgs[1]);
            Job job = new Job(conf, "word count");
            job.setJarByClass(WordCountEntry.class);
            job.setMapperClass(TokenizerMapper.class);
            job.setCombinerClass(IntSumReducer.class);
            job.setReducerClass(IntSumReducer.class);
            job.setOutputKeyClass(Http.class);
            job.setOutputValueClass(IntWritable.class);
            FileInputFormat.addInputPath(job, input);
            FileOutputFormat.setOutputPath(job, output);
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }

    编写mrUnit测试用例进行mapreduce程序测试

    import java.util.ArrayList;
    import java.util.List;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.mrunit.mapreduce.MapDriver;
    import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
    import org.junit.Before;
    import org.junit.Test;
    
    import com.geo.dmp.WordCountEntry.IntSumReducer;
    import com.geo.dmp.WordCountEntry.TokenizerMapper;
    
    /**
     * MRUnit tests for the mapper and reducer of {@code WordCountEntry}.
     */
    public class WordCountEntryTest
    {

        /** Driver that runs {@code TokenizerMapper} in isolation. */
        private MapDriver<LongWritable, Http, Http, IntWritable> mapDriver;
        /** Driver that runs {@code IntSumReducer} in isolation. */
        private ReduceDriver<Http, IntWritable, Http, IntWritable> reduceDriver;
        
        /**
         * Creates fresh drivers before each test so that inputs and expected
         * outputs registered by one test do not leak into another.
         */
        @Before
        public void setUpBeforeClass() throws Exception
        {
            TokenizerMapper tm = new TokenizerMapper();
            mapDriver = MapDriver.newMapDriver(tm);
            
            IntSumReducer isr = new IntSumReducer();
            reduceDriver = ReduceDriver.newReduceDriver(isr);
        }
    
        /**
         * Feeds the mapper one record containing two words and expects one
         * {@code (word, 1)} pair per word, in input order.
         */
        @Test
        public void TokenizerMapperTest() throws Exception
        {
            // The words must be separated by a delimiter the mapper's StringTokenizer
            // recognizes (a tab here); without it the input is a single token and the
            // two expected outputs below can never be produced.
            mapDriver.withInput(new LongWritable(), new Http("01a55\tablsd"));
            
            mapDriver.withOutput(new Http("01a55"), new IntWritable(1));
            mapDriver.withOutput(new Http("ablsd"), new IntWritable(1));
            
            mapDriver.runTest();
        }
        
        /**
         * Feeds the reducer two counts of 1 for the same key and expects their
         * sum, 2, as the single output.
         */
        @Test
        public void IntSumReducerTest() throws Exception
        {
            List<IntWritable> values = new ArrayList<IntWritable>();
            values.add(new IntWritable(1));
            values.add(new IntWritable(1));
            
            reduceDriver.withInput(new Http("01a55"), values);
            
            reduceDriver.withOutput(new Http("01a55"), new IntWritable(2));
            
            reduceDriver.runTest();
        }
    }

    ?