Requirement: read in a text file, determine how often every word is used, sort the words by frequency from high to low, and print the sorted list of all words together with their frequencies.
First, a solution in the traditional style:
package cn._1.wordfrequency;

import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/*
 * Functional Thinking by Neal Ford (O'Reilly).
 */
public class Word {

    @SuppressWarnings("serial")
    // count the usage frequency of every word except the stop words below
    private Set<String> NON_WORDS = new HashSet<String>() {{
        // initialized via an anonymous inner class plus an instance initializer block
        add("the");add("and");add("of");add("to");add("a");
        add("i");add("it");add("in");add("or");add("is");
        add("as");add("so");add("but");add("be");
    }};

    public Map<String, Integer> wordFreq(String words) {
        TreeMap<String, Integer> wordMap = new TreeMap<>();
        Matcher m = Pattern.compile("\\w+").matcher(words);
        while (m.find()) {
            String word = m.group().toLowerCase();
            if (!NON_WORDS.contains(word)) {
                if (wordMap.get(word) == null) {
                    wordMap.put(word, 1);
                } else {
                    wordMap.put(word, wordMap.get(word) + 1);
                }
            }
        }
        return wordMap;
    }
}
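The null check before put is the classic way to count occurrences; since Java 8 the same update can also be written with Map.merge. A minimal sketch of just the counting loop, assuming the same wordMap, NON_WORDS set and "\w+" pattern as above:

    // Sketch: counting with Map.merge instead of the explicit null check.
    // Assumes the same wordMap, NON_WORDS and pattern as in Word.wordFreq.
    Matcher m = Pattern.compile("\\w+").matcher(words);
    while (m.find()) {
        String word = m.group().toLowerCase();
        if (!NON_WORDS.contains(word)) {
            wordMap.merge(word, 1, Integer::sum); // put 1 if absent, otherwise add 1 to the current count
        }
    }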
Now the same problem solved with the new features of Java 8:
package cn._1.wordfrequency;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/*
 * Functional Thinking by Neal Ford (O'Reilly).
 */
public class Word2 {

    @SuppressWarnings("serial")
    private Set<String> NON_WORDS = new HashSet<String>() {{
        // initialized via an anonymous inner class plus an instance initializer block
        add("the");add("and");add("of");add("to");add("a");
        add("i");add("it");add("in");add("or");add("is");
        add("as");add("so");add("but");add("be");
    }};

    /*
     * Use a regular expression to collect all words into a List.
     */
    private List<String> regexToList(String words, String regex) {
        List<String> wordList = new ArrayList<>();
        Matcher m = Pattern.compile(regex).matcher(words);
        while (m.find())
            wordList.add(m.group());
        return wordList;
    }

    public Map<String, Integer> wordFreq(String words) {
        TreeMap<String, Integer> wordMap = new TreeMap<>(); // TreeMap keeps the output in natural (alphabetical) key order
        /*
         * java.util.stream.Stream: a sequence of elements supporting sequential and parallel aggregate operations.
         * map: returns a stream consisting of the results of applying the given function to the elements of this stream.
         * filter: returns a stream consisting of the elements of this stream that match the given predicate.
         * forEach: performs an action for each element of this stream.
         */
        regexToList(words, "\\w+").stream()                // turn the collection into a stream
                .map(w -> w.toLowerCase())                 // map each word to its lower-case form
                .filter(w -> !NON_WORDS.contains(w))       // keep only words not contained in NON_WORDS
                .forEach(w -> wordMap.put(w, wordMap.getOrDefault(w, 0) + 1)); // count each word
        return wordMap;
    }
}
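The forEach at the end of wordFreq mutates a map that lives outside the stream; the same grouping can also be expressed entirely with a collector. A minimal sketch of such a variant (wordFreqCollect is a hypothetical method name, the counts come back as Long, and java.util.stream.Collectors would need to be imported), assuming the same regexToList and NON_WORDS as above:

    // Sketch: the same frequency count expressed with Collectors.groupingBy.
    public Map<String, Long> wordFreqCollect(String words) {
        return regexToList(words, "\\w+").stream()
                .map(String::toLowerCase)
                .filter(w -> !NON_WORDS.contains(w))
                .collect(Collectors.groupingBy(
                        w -> w,                  // group by the word itself
                        TreeMap::new,            // keep the natural (alphabetical) key order
                        Collectors.counting())); // count occurrences per word
    }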
Test class:
package cn._1.wordfrequency;

import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

public class Mmain {

    public static void main(String[] args) throws IOException {
        String str = readText("/home/yanshaochen/workspace/Functional_Thinking_Examples/mflie/sucai.txt");
        // call the traditional version
        /*Map<String, Integer> map = new Word().wordFreq(str);*/
        // call the Java 8 version:
        Map<String, Integer> map = new Word2().wordFreq(str);
        // natural (alphabetical) order:
        for (Entry<String, Integer> item : map.entrySet()) {
            System.out.println(item.getKey() + "," + item.getValue());
        }
        // sort by value (adapted from the web):
        /*List<Map.Entry<String, Integer>> infoIds = new ArrayList<>(map.entrySet());
        Collections.sort(infoIds, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                return (o2.getValue() - o1.getValue());
                //return (o1.getKey()).toString().compareTo(o2.getKey());
            }
        });
        for (Entry<String, Integer> item : infoIds) {
            System.out.println(item.getKey() + "," + item.getValue());
        }*/
    }

    /*
     * Read the whole file into a String via an I/O stream.
     */
    private static String readText(String path) throws IOException {
        FileInputStream fis = new FileInputStream(path);
        byte[] bytes = new byte[1024]; // read buffer (1 KB)
        int data;
        String str = "";
        while ((data = fis.read(bytes)) != -1) {
            str += new String(bytes, 0, data);
        }
        fis.close();
        return str;
    }
}
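The requirement asks for the words ordered by frequency from high to low; with Java 8 the commented-out Collections.sort block can also be written as a stream over the entry set. A minimal sketch, assuming the map built in main above:

    // Sketch: print the frequencies sorted from high to low using a stream,
    // equivalent in effect to the commented-out Collections.sort block.
    map.entrySet().stream()
            .sorted(Map.Entry.<String, Integer>comparingByValue().reversed()) // highest count first
            .forEach(e -> System.out.println(e.getKey() + "," + e.getValue()));

Similarly, readText can be shortened with the NIO file API (java.nio.file.Files and java.nio.file.Paths, available since Java 7). A sketch, assuming the platform default charset is acceptable:

    // Sketch: read the whole file in one call instead of looping over a byte buffer.
    private static String readText(String path) throws IOException {
        return new String(Files.readAllBytes(Paths.get(path)));
    }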