"单表关联"这个实例要求从给出的数据中寻找所关心的数据,它是对原始数据所包含信息的挖掘。

需求:实例中给出 child-parent(孩子—父母)表,要求输出 grandchild-grandparent(孙子—爷奶)表。

package test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; /**
* 输入:
* child parent
* 张三 张三的爸爸
* 张三的爸爸 张三的爷爷
*
* 输出:
* grandChiled grandFather
* 张三 张三的爷爷
*/
public class MySingle { public static void main(String[] args) throws Exception { //配置环境变量
System.setProperty("hadoop.home.dir", "F:\\JAVA\\hadoop-2.2.0");
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(MySingle.class); job.setMapperClass(STMapper.class);
job.setReducerClass(STReducer.class); job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1])); System.exit(job.waitForCompletion(true) ? 0 : -1);
} public static class STMapper extends Mapper<LongWritable, Text, Text, Text>{
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException { String[] splited = value.toString().split(" ");
if(splited.length >= 2){ //正向输出,value即 父亲前加符号"-"
context.write(new Text(splited[0]), new Text("-"+splited[1])); //反向输出
context.write(new Text(splited[1]), new Text(splited[0]));
}
}
} public static class STReducer extends Reducer<Text, Text, Text, Text>{
@Override
protected void reduce(Text key, Iterable<Text> v2s,Context context)
throws IOException, InterruptedException { List<String> grandChild=new ArrayList<String>();
List<String> grandParent=new ArrayList<String>(); for(Text text : v2s){ //以"-"开始则是key的父亲
if(text.toString().startsWith("-")){ //将可能成为爷爷的变量存储到grandParent集合中去
grandParent.add(text.toString().substring(1));
}else { grandChild.add(text.toString());
}
}
/**
* 【关键的判断】
* 当前输入的key既有儿子又有父亲
*/
if(grandChild.size()!=0 && grandParent.size()!=0){ for(int i=0;i<grandChild.size();i++){
for(int j=0;j<grandParent.size();j++){ //key:孙子 value:爷爷
context.write(new Text(grandChild.get(i)), new Text(grandParent.get(j)));
}
}
}
}
}
}
  • 在reduce阶段,将两种Value分别存储到grandchild和grandparent集合中
  • 对于reduce阶段的key,只有当他既有儿子又有父亲时,他才可以使得grandchild和grandparent两集合都不为空
05-11 14:58