Create three input files with the following contents:

file1.txt: MapReduce is simple
file2.txt: MapReduce is powerful is simple
file3.txt: Hello MapReduce bye MapReduce
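Before the job can run, the files have to be on HDFS. Below is a minimal upload sketch using the HDFS FileSystem API; the NameNode address hdfs://localhost:9000 and the input directory /user/hadoop/input are taken from the job log further down, and the class name UploadInput is illustrative only:

import java.io.OutputStream;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class UploadInput {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // NameNode address as seen in the job log; adjust for your cluster
        FileSystem fs = FileSystem.get(URI.create("hdfs://localhost:9000"), conf);
        String[][] files = {
                {"file1.txt", "MapReduce is simple\n"},
                {"file2.txt", "MapReduce is powerful is simple\n"},
                {"file3.txt", "Hello MapReduce bye MapReduce\n"},
        };
        for (String[] f : files) {
            Path p = new Path("/user/hadoop/input/" + f[0]);
            // create(path, overwrite=true) keeps the example repeatable
            try (OutputStream out = fs.create(p, true)) {
                out.write(f[1].getBytes(StandardCharsets.UTF_8));
            }
        }
        fs.close();
    }
}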
The inverted-index job itself. The mapper tags each word with the name of the file it came from, the combiner folds duplicate (word, file) pairs into per-file counts, and the reducer concatenates those counts into one posting list per word:

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class InvertedIndex {

    public static class Map extends Mapper<Object, Text, Text, Text> {
        private Text keyInfo = new Text();   // holds "word:filename"
        private Text valueInfo = new Text(); // holds the term frequency
        private FileSplit split;             // the input split being processed

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            split = (FileSplit) context.getInputSplit();
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                // Keep only the file name: assumes the inputs are named file*.txt
                int splitIndex = split.getPath().toString().indexOf("file");
                keyInfo.set(itr.nextToken() + ":" + split.getPath().toString().substring(splitIndex));
                valueInfo.set("1");
                context.write(keyInfo, valueInfo);
            }
        }
    }

    // Combiner: sums the 1s emitted for identical "word:filename" keys
    public static class Combine extends Reducer<Text, Text, Text, Text> {
        private Text info = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (Text value : values) {
                sum += Integer.parseInt(value.toString());
            }
            int splitIndex = key.toString().indexOf(":");
            // Rewrite the pair as word -> "filename:sum". This line must stay
            // before the next one, which overwrites key in place.
            info.set(key.toString().substring(splitIndex + 1) + ":" + sum);
            key.set(key.toString().substring(0, splitIndex));
            context.write(key, info);
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        private Text result = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Concatenate every "filename:count" posting for this word
            StringBuilder file = new StringBuilder();
            for (Text value : values) {
                file.append(value.toString()).append(";");
            }
            result.set(file.toString());
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "invertedIndex");
        job.setJarByClass(InvertedIndex.class);
        job.setMapperClass(Map.class);
        job.setCombinerClass(Combine.class);
        job.setReducerClass(Reduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
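To see what each phase contributes, follow the word MapReduce from file3.txt: the mapper emits ("MapReduce:file3.txt", "1") twice, the combiner collapses them into ("MapReduce", "file3.txt:2"), and the reducer concatenates all per-file postings for "MapReduce" into one line. Here is a stand-alone sketch of the same key/value reshaping in plain Java; it runs without Hadoop, the names are illustrative only, and the posting order within a word may differ from the real job (Hadoop does not guarantee any ordering of reduce values):

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.TreeMap;

// Stand-alone trace of the map -> combine -> reduce reshaping above.
public class InvertedIndexTrace {
    public static void main(String[] args) {
        Map<String, String> docs = new LinkedHashMap<String, String>();
        docs.put("file1.txt", "MapReduce is simple");
        docs.put("file2.txt", "MapReduce is powerful is simple");
        docs.put("file3.txt", "Hello MapReduce bye MapReduce");

        // Map + Combine: count occurrences per "word:filename" key
        Map<String, Integer> perFile = new TreeMap<String, Integer>();
        for (Map.Entry<String, String> doc : docs.entrySet()) {
            for (String word : doc.getValue().split("\\s+")) {
                perFile.merge(word + ":" + doc.getKey(), 1, Integer::sum);
            }
        }

        // Reduce: strip the filename out of the key and collect
        // "filename:count" postings under the bare word
        Map<String, StringBuilder> index = new TreeMap<String, StringBuilder>();
        for (Map.Entry<String, Integer> e : perFile.entrySet()) {
            String[] parts = e.getKey().split(":", 2); // [word, filename]
            index.computeIfAbsent(parts[0], k -> new StringBuilder())
                 .append(parts[1]).append(":").append(e.getValue()).append(";");
        }
        for (Map.Entry<String, StringBuilder> e : index.entrySet()) {
            System.out.println(e.getKey() + "\t" + e.getValue());
        }
    }
}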
Running the job directly from the IDE in local mode produces log output like the following (hence the "No job jar file set" warning):

2017-03-15 22:16:27,071 WARN [org.apache.hadoop.util.NativeCodeLoader] - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2017-03-15 22:16:27,748 INFO [org.apache.hadoop.conf.Configuration.deprecation] - session.id is deprecated. Instead, use dfs.metrics.session-id
2017-03-15 22:16:27,749 INFO [org.apache.hadoop.metrics.jvm.JvmMetrics] - Initializing JVM Metrics with processName=JobTracker, sessionId=
2017-03-15 22:16:28,058 WARN [org.apache.hadoop.mapreduce.JobResourceUploader] - Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
2017-03-15 22:16:28,061 WARN [org.apache.hadoop.mapreduce.JobResourceUploader] - No job jar file set. User classes may not be found. See Job or Job#setJar(String).
2017-03-15 22:16:28,124 INFO [org.apache.hadoop.mapreduce.lib.input.FileInputFormat] - Total input paths to process : 3
2017-03-15 22:16:28,171 INFO [org.apache.hadoop.mapreduce.JobSubmitter] - number of splits:3
2017-03-15 22:16:28,289 INFO [org.apache.hadoop.mapreduce.JobSubmitter] - Submitting tokens for job: job_local1466554694_0001
2017-03-15 22:16:28,463 INFO [org.apache.hadoop.mapreduce.Job] - The url to track the job: http://localhost:8080/
2017-03-15 22:16:28,468 INFO [org.apache.hadoop.mapreduce.Job] - Running job: job_local1466554694_0001
2017-03-15 22:16:28,473 INFO [org.apache.hadoop.mapred.LocalJobRunner] - OutputCommitter set in config null
2017-03-15 22:16:28,479 INFO [org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter] - File Output Committer Algorithm version is 1
2017-03-15 22:16:28,482 INFO [org.apache.hadoop.mapred.LocalJobRunner] - OutputCommitter is org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
2017-03-15 22:16:28,571 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Waiting for map tasks
2017-03-15 22:16:28,571 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Starting task: attempt_local1466554694_0001_m_000000_0
2017-03-15 22:16:28,609 INFO [org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter] - File Output Committer Algorithm version is 1
2017-03-15 22:16:28,621 INFO [org.apache.hadoop.mapred.Task] - Using ResourceCalculatorProcessTree : [ ]
2017-03-15 22:16:28,624 INFO [org.apache.hadoop.mapred.MapTask] - Processing split: hdfs://localhost:9000/user/hadoop/input/file2.txt:0+32
2017-03-15 22:16:28,679 INFO [org.apache.hadoop.mapred.MapTask] - (EQUATOR) 0 kvi 26214396(104857584)
2017-03-15 22:16:28,679 INFO [org.apache.hadoop.mapred.MapTask] - mapreduce.task.io.sort.mb: 100
2017-03-15 22:16:28,679 INFO [org.apache.hadoop.mapred.MapTask] - soft limit at 83886080
2017-03-15 22:16:28,679 INFO [org.apache.hadoop.mapred.MapTask] - bufstart = 0; bufvoid = 104857600
2017-03-15 22:16:28,679 INFO [org.apache.hadoop.mapred.MapTask] - kvstart = 26214396; length = 6553600
2017-03-15 22:16:28,683 INFO [org.apache.hadoop.mapred.MapTask] - Map output collector class = org.apache.hadoop.mapred.MapTask$MapOutputBuffer
2017-03-15 22:16:28,754 INFO [org.apache.hadoop.mapred.LocalJobRunner] -
2017-03-15 22:16:28,756 INFO [org.apache.hadoop.mapred.MapTask] - Starting flush of map output
2017-03-15 22:16:28,757 INFO [org.apache.hadoop.mapred.MapTask] - Spilling map output
2017-03-15 22:16:28,757 INFO [org.apache.hadoop.mapred.MapTask] - bufstart = 0; bufend = 92; bufvoid = 104857600
2017-03-15 22:16:28,757 INFO [org.apache.hadoop.mapred.MapTask] - kvstart = 26214396(104857584); kvend = 26214380(104857520); length = 17/6553600
2017-03-15 22:16:28,770 INFO [org.apache.hadoop.mapred.MapTask] - Finished spill 0
2017-03-15 22:16:28,774 INFO [org.apache.hadoop.mapred.Task] - Task:attempt_local1466554694_0001_m_000000_0 is done. And is in the process of committing
2017-03-15 22:16:28,785 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map
2017-03-15 22:16:28,785 INFO [org.apache.hadoop.mapred.Task] - Task 'attempt_local1466554694_0001_m_000000_0' done.
2017-03-15 22:16:28,785 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Finishing task: attempt_local1466554694_0001_m_000000_0
2017-03-15 22:16:28,786 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Starting task: attempt_local1466554694_0001_m_000001_0
2017-03-15 22:16:28,791 INFO [org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter] - File Output Committer Algorithm version is 1
2017-03-15 22:16:28,792 INFO [org.apache.hadoop.mapred.Task] - Using ResourceCalculatorProcessTree : [ ]
2017-03-15 22:16:28,793 INFO [org.apache.hadoop.mapred.MapTask] - Processing split: hdfs://localhost:9000/user/hadoop/input/file3.txt:0+30
2017-03-15 22:16:28,823 INFO [org.apache.hadoop.mapred.MapTask] - (EQUATOR) 0 kvi 26214396(104857584)
2017-03-15 22:16:28,823 INFO [org.apache.hadoop.mapred.MapTask] - mapreduce.task.io.sort.mb: 100
2017-03-15 22:16:28,823 INFO [org.apache.hadoop.mapred.MapTask] - soft limit at 83886080
2017-03-15 22:16:28,823 INFO [org.apache.hadoop.mapred.MapTask] - bufstart = 0; bufvoid = 104857600
2017-03-15 22:16:28,823 INFO [org.apache.hadoop.mapred.MapTask] - kvstart = 26214396; length = 6553600
2017-03-15 22:16:28,824 INFO [org.apache.hadoop.mapred.MapTask] - Map output collector class = org.apache.hadoop.mapred.MapTask$MapOutputBuffer
2017-03-15 22:16:28,831 INFO [org.apache.hadoop.mapred.LocalJobRunner] -
2017-03-15 22:16:28,832 INFO [org.apache.hadoop.mapred.MapTask] - Starting flush of map output
2017-03-15 22:16:28,832 INFO [org.apache.hadoop.mapred.MapTask] - Spilling map output
2017-03-15 22:16:28,832 INFO [org.apache.hadoop.mapred.MapTask] - bufstart = 0; bufend = 78; bufvoid = 104857600
2017-03-15 22:16:28,832 INFO [org.apache.hadoop.mapred.MapTask] - kvstart = 26214396(104857584); kvend = 26214384(104857536); length = 13/6553600
2017-03-15 22:16:28,834 INFO [org.apache.hadoop.mapred.MapTask] - Finished spill 0
2017-03-15 22:16:28,835 INFO [org.apache.hadoop.mapred.Task] - Task:attempt_local1466554694_0001_m_000001_0 is done. And is in the process of committing
2017-03-15 22:16:28,839 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map
2017-03-15 22:16:28,839 INFO [org.apache.hadoop.mapred.Task] - Task 'attempt_local1466554694_0001_m_000001_0' done.
2017-03-15 22:16:28,839 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Finishing task: attempt_local1466554694_0001_m_000001_0
2017-03-15 22:16:28,839 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Starting task: attempt_local1466554694_0001_m_000002_0
2017-03-15 22:16:28,842 INFO [org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter] - File Output Committer Algorithm version is 1
2017-03-15 22:16:28,843 INFO [org.apache.hadoop.mapred.Task] - Using ResourceCalculatorProcessTree : [ ]
2017-03-15 22:16:28,844 INFO [org.apache.hadoop.mapred.MapTask] - Processing split: hdfs://localhost:9000/user/hadoop/input/file1.txt:0+20
2017-03-15 22:16:28,888 INFO [org.apache.hadoop.mapred.MapTask] - (EQUATOR) 0 kvi 26214396(104857584)
2017-03-15 22:16:28,888 INFO [org.apache.hadoop.mapred.MapTask] - mapreduce.task.io.sort.mb: 100
2017-03-15 22:16:28,888 INFO [org.apache.hadoop.mapred.MapTask] - soft limit at 83886080
2017-03-15 22:16:28,888 INFO [org.apache.hadoop.mapred.MapTask] - bufstart = 0; bufvoid = 104857600
2017-03-15 22:16:28,888 INFO [org.apache.hadoop.mapred.MapTask] - kvstart = 26214396; length = 6553600
2017-03-15 22:16:28,889 INFO [org.apache.hadoop.mapred.MapTask] - Map output collector class = org.apache.hadoop.mapred.MapTask$MapOutputBuffer
2017-03-15 22:16:28,893 INFO [org.apache.hadoop.mapred.LocalJobRunner] -
2017-03-15 22:16:28,894 INFO [org.apache.hadoop.mapred.MapTask] - Starting flush of map output
2017-03-15 22:16:28,894 INFO [org.apache.hadoop.mapred.MapTask] - Spilling map output
2017-03-15 22:16:28,894 INFO [org.apache.hadoop.mapred.MapTask] - bufstart = 0; bufend = 56; bufvoid = 104857600
2017-03-15 22:16:28,894 INFO [org.apache.hadoop.mapred.MapTask] - kvstart = 26214396(104857584); kvend = 26214388(104857552); length = 9/6553600
2017-03-15 22:16:28,895 INFO [org.apache.hadoop.mapred.MapTask] - Finished spill 0
2017-03-15 22:16:28,896 INFO [org.apache.hadoop.mapred.Task] - Task:attempt_local1466554694_0001_m_000002_0 is done. And is in the process of committing
2017-03-15 22:16:28,899 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map
2017-03-15 22:16:28,899 INFO [org.apache.hadoop.mapred.Task] - Task 'attempt_local1466554694_0001_m_000002_0' done.
2017-03-15 22:16:28,899 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Finishing task: attempt_local1466554694_0001_m_000002_0
2017-03-15 22:16:28,899 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map task executor complete.
2017-03-15 22:16:28,901 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Waiting for reduce tasks
2017-03-15 22:16:28,902 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Starting task: attempt_local1466554694_0001_r_000000_0
2017-03-15 22:16:28,914 INFO [org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter] - File Output Committer Algorithm version is 1
2017-03-15 22:16:28,915 INFO [org.apache.hadoop.mapred.Task] - Using ResourceCalculatorProcessTree : [ ]
2017-03-15 22:16:28,919 INFO [org.apache.hadoop.mapred.ReduceTask] - Using ShuffleConsumerPlugin: org.apache.hadoop.mapreduce.task.reduce.Shuffle@4fb755c9
2017-03-15 22:16:28,932 INFO [org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl] - MergerManager: memoryLimit=1945842432, maxSingleShuffleLimit=486460608, mergeThreshold=1284256000, ioSortFactor=10, memToMemMergeOutputsThreshold=10
2017-03-15 22:16:28,934 INFO [org.apache.hadoop.mapreduce.task.reduce.EventFetcher] - attempt_local1466554694_0001_r_000000_0 Thread started: EventFetcher for fetching Map Completion Events
2017-03-15 22:16:28,980 INFO [org.apache.hadoop.mapreduce.task.reduce.LocalFetcher] - localfetcher#1 about to shuffle output of map attempt_local1466554694_0001_m_000002_0 decomp: 64 len: 68 to MEMORY
2017-03-15 22:16:28,984 INFO [org.apache.hadoop.mapreduce.task.reduce.InMemoryMapOutput] - Read 64 bytes from map-output for attempt_local1466554694_0001_m_000002_0
2017-03-15 22:16:28,986 INFO [org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl] - closeInMemoryFile -> map-output of size: 64, inMemoryMapOutputs.size() -> 1, commitMemory -> 0, usedMemory ->64
2017-03-15 22:16:28,991 INFO [org.apache.hadoop.mapreduce.task.reduce.LocalFetcher] - localfetcher#1 about to shuffle output of map attempt_local1466554694_0001_m_000001_0 decomp: 64 len: 68 to MEMORY
2017-03-15 22:16:28,992 INFO [org.apache.hadoop.mapreduce.task.reduce.InMemoryMapOutput] - Read 64 bytes from map-output for attempt_local1466554694_0001_m_000001_0
2017-03-15 22:16:28,992 INFO [org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl] - closeInMemoryFile -> map-output of size: 64, inMemoryMapOutputs.size() -> 2, commitMemory -> 64, usedMemory ->128
2017-03-15 22:16:28,993 INFO [org.apache.hadoop.mapreduce.task.reduce.LocalFetcher] - localfetcher#1 about to shuffle output of map attempt_local1466554694_0001_m_000000_0 decomp: 87 len: 91 to MEMORY
2017-03-15 22:16:28,994 INFO [org.apache.hadoop.mapreduce.task.reduce.InMemoryMapOutput] - Read 87 bytes from map-output for attempt_local1466554694_0001_m_000000_0
2017-03-15 22:16:28,994 INFO [org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl] - closeInMemoryFile -> map-output of size: 87, inMemoryMapOutputs.size() -> 3, commitMemory -> 128, usedMemory ->215
2017-03-15 22:16:28,994 INFO [org.apache.hadoop.mapreduce.task.reduce.EventFetcher] - EventFetcher is interrupted.. Returning
2017-03-15 22:16:28,995 INFO [org.apache.hadoop.mapred.LocalJobRunner] - 3 / 3 copied.
2017-03-15 22:16:28,995 INFO [org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl] - finalMerge called with 3 in-memory map-outputs and 0 on-disk map-outputs
2017-03-15 22:16:29,004 INFO [org.apache.hadoop.mapred.Merger] - Merging 3 sorted segments
2017-03-15 22:16:29,005 INFO [org.apache.hadoop.mapred.Merger] - Down to the last merge-pass, with 3 segments left of total size: 183 bytes
2017-03-15 22:16:29,006 INFO [org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl] - Merged 3 segments, 215 bytes to disk to satisfy reduce memory limit
2017-03-15 22:16:29,006 INFO [org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl] - Merging 1 files, 215 bytes from disk
2017-03-15 22:16:29,007 INFO [org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl] - Merging 0 segments, 0 bytes from memory into reduce
2017-03-15 22:16:29,007 INFO [org.apache.hadoop.mapred.Merger] - Merging 1 sorted segments
2017-03-15 22:16:29,008 INFO [org.apache.hadoop.mapred.Merger] - Down to the last merge-pass, with 1 segments left of total size: 203 bytes
2017-03-15 22:16:29,008 INFO [org.apache.hadoop.mapred.LocalJobRunner] - 3 / 3 copied.
2017-03-15 22:16:29,049 INFO [org.apache.hadoop.conf.Configuration.deprecation] - mapred.skip.on is deprecated. Instead, use mapreduce.job.skiprecords
2017-03-15 22:16:29,186 INFO [org.apache.hadoop.mapred.Task] - Task:attempt_local1466554694_0001_r_000000_0 is done. And is in the process of committing
2017-03-15 22:16:29,190 INFO [org.apache.hadoop.mapred.LocalJobRunner] - 3 / 3 copied.
2017-03-15 22:16:29,190 INFO [org.apache.hadoop.mapred.Task] - Task attempt_local1466554694_0001_r_000000_0 is allowed to commit now
2017-03-15 22:16:29,209 INFO [org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter] - Saved output of task 'attempt_local1466554694_0001_r_000000_0' to hdfs://localhost:9000/user/hadoop/output/_temporary/0/task_local1466554694_0001_r_000000
2017-03-15 22:16:29,210 INFO [org.apache.hadoop.mapred.LocalJobRunner] - reduce > reduce
2017-03-15 22:16:29,210 INFO [org.apache.hadoop.mapred.Task] - Task 'attempt_local1466554694_0001_r_000000_0' done.
2017-03-15 22:16:29,210 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Finishing task: attempt_local1466554694_0001_r_000000_0
2017-03-15 22:16:29,210 INFO [org.apache.hadoop.mapred.LocalJobRunner] - reduce task executor complete.
2017-03-15 22:16:29,473 INFO [org.apache.hadoop.mapreduce.Job] - Job job_local1466554694_0001 running in uber mode : false
2017-03-15 22:16:29,474 INFO [org.apache.hadoop.mapreduce.Job] - map 100% reduce 100%
2017-03-15 22:16:29,475 INFO [org.apache.hadoop.mapreduce.Job] - Job job_local1466554694_0001 completed successfully
2017-03-15 22:16:29,487 INFO [org.apache.hadoop.mapreduce.Job] - Counters: 35
    File System Counters
        FILE: Number of bytes read=4131
        FILE: Number of bytes written=1128147
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=258
        HDFS: Number of bytes written=165
        HDFS: Number of read operations=33
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=6
    Map-Reduce Framework
        Map input records=3
        Map output records=12
        Map output bytes=226
        Map output materialized bytes=227
        Input split bytes=342
        Combine input records=12
        Combine output records=10
        Reduce input groups=6
        Reduce shuffle bytes=227
        Reduce input records=10
        Reduce output records=6
        Spilled Records=20
        Shuffled Maps =3
        Failed Shuffles=0
        Merged Map outputs=3
        GC time elapsed (ms)=0
        Total committed heap usage (bytes)=1592262656
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters
        Bytes Read=82
    File Output Format Counters
        Bytes Written=165
The counters confirm the combiner's work: Map output records=12 (twelve words in total across the three files), Combine output records=10 (the duplicated pairs is:file2.txt and MapReduce:file3.txt were each collapsed into one record), and Reduce output records=6 (six distinct words).

Result (contents of part-r-00000 in the output directory):
Hello file3.txt:1;
MapReduce file3.txt:2;file1.txt:1;file2.txt:1;
bye file3.txt:1;
is file1.txt:1;file2.txt:2;
powerful file2.txt:1;
simple file2.txt:1;file1.txt:1;