package search; import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;
import java.util.TreeMap;

/**
 * Counts how often each word occurs in a text and prints the most frequent ones.
 *
 * <p>Typical use is {@code new UpdateWordSearch().run()}, which asks for a file name on the
 * console. For programmatic use, build the object from a string and call
 * {@link #sWord()} followed by {@link #countWordFreq()}.
 *
 * <p>Words are compared case-sensitively, so {@code "The"} and {@code "the"} are counted
 * separately.
 */
public class UpdateWordSearch {
    /**
     * Characters replaced by a space before the text is split into words. Other punctuation
     * (for example {@code ?} or {@code !}) is left attached to the neighbouring word.
     */
    private static final char[] PUNCTUATION = {'"', ',', '.', '\'', '(', ')', '-'};

    /** Maximum number of entries printed by {@link #run()}. */
    private static final int TOP_N = 10;

    /** Text being analysed; {@code null} when the requested file did not exist. */
    String article;
    /** Words of {@link #article} in order of appearance; filled in by {@link #sWord()}. */
    String[] rWords;
    /** Intended for the counted words; never assigned by this class. */
    String[] words;
    /** Intended for the frequency of each word; never assigned by this class. */
    int[] wordFreqs;
    /** Name of the input file as typed by the user. */
    String filename;
    /** Number of distinct words found by the last {@link #countWordFreq()} call. */
    int total = 0;

    /**
     * Prompts for a file name, reads it from standard input and loads that file as the
     * article. The lines of the file are joined with a single space after each line.
     *
     * <p>If the file does not exist, a message is printed and {@link #article} stays
     * {@code null}; {@link #run()} then returns without printing any statistics.
     *
     * @throws IOException if the file exists but cannot be opened or read
     */
    public UpdateWordSearch() throws IOException {
        // Deliberately not closed: closing the Scanner would also close System.in.
        Scanner sc = new Scanner(System.in);
        System.out.println("请输入文件名:");
        filename = sc.nextLine();
        File file = new File(filename);
        if (!file.exists()) {
            System.out.println("文件不存在!");
            return;
        }
        this.article = readArticle(file);
    }

    /**
     * Creates a counter for text that is already in memory, without any console or file I/O.
     *
     * @param article the text to analyse; {@code null} is treated as "no text"
     */
    public UpdateWordSearch(String article) {
        this.article = article;
    }

    /**
     * Reads a whole file into one string, appending a space after every line so that words
     * on adjacent lines stay separated. The reader is always closed, even on failure.
     */
    private static String readArticle(File file) throws IOException {
        // FileReader decodes with the default charset, exactly as the original code did.
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            StringBuilder text = new StringBuilder();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                text.append(line).append(' ');
            }
            return text.toString();
        } finally {
            reader.close();
        }
    }

    /**
     * Splits {@link #article} into words and stores them in {@link #rWords}.
     *
     * <p>The characters in {@link #PUNCTUATION} are first replaced by spaces (the article field
     * is overwritten with that cleaned text), then the text is split on runs of whitespace.
     * Because the apostrophe is replaced too, {@code I've} becomes the two words {@code I}
     * and {@code ve}. Empty tokens are discarded, so blank or punctuation-only text yields
     * an empty array. A {@code null} article also yields an empty array.
     */
    public void sWord() {
        if (article == null) {
            rWords = new String[0];
            return;
        }
        for (char symbol : PUNCTUATION) {
            article = article.replace(symbol, ' ');
        }
        List<String> kept = new ArrayList<String>();
        for (String token : article.split("\\s+")) {
            // split() yields a leading "" when the text starts with whitespace; skip it.
            if (token.length() > 0) {
                kept.add(token);
            }
        }
        rWords = kept.toArray(new String[kept.size()]);
    }

    /**
     * Returns all words, duplicates included, in natural {@link String} order.
     * Calls {@link #sWord()} first if the text has not been split yet.
     *
     * @return a new, sorted, modifiable list of the words
     */
    public List<String> sort() {
        if (rWords == null) {
            sWord();
        }
        List<String> sorted = new ArrayList<String>(rWords.length);
        Collections.addAll(sorted, rWords);
        Collections.sort(sorted);
        return sorted;
    }

    /**
     * Counts the occurrences of every distinct word and stores the number of distinct words
     * in {@link #total}. Calls {@link #sWord()} first if the text has not been split yet.
     *
     * @return one entry per distinct word (key = word, value = occurrences), ordered by
     *         descending count; words with equal counts stay in natural string order. The
     *         list is empty when there are no words.
     */
    public List<Map.Entry<String, Integer>> countWordFreq() {
        if (rWords == null) {
            sWord();
        }
        // A TreeMap keeps keys alphabetical, which fixes the order of equal counts below.
        Map<String, Integer> counts = new TreeMap<String, Integer>();
        for (String word : rWords) {
            Integer seen = counts.get(word);
            counts.put(word, seen == null ? 1 : seen + 1);
        }

        List<Map.Entry<String, Integer>> byFrequency =
                new ArrayList<Map.Entry<String, Integer>>(counts.entrySet());
        // Collections.sort is stable, so ties keep the TreeMap's alphabetical order.
        Collections.sort(byFrequency, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                return o2.getValue().compareTo(o1.getValue());
            }
        });

        this.total = counts.size();
        return byFrequency;
    }

    /**
     * Splits the article, counts the words and prints the number of distinct words followed
     * by at most {@value #TOP_N} of the most frequent words. Does nothing when no article
     * was loaded.
     */
    public void run() {
        if (article == null) {
            // The constructor has already reported the missing file.
            return;
        }
        sWord();
        List<Map.Entry<String, Integer>> list = countWordFreq();

        System.out.println("词频总数:");
        System.out.println("total:" + this.total);
        System.out.println("词频统计信息:");

        int limit = Math.min(TOP_N, list.size());
        for (Map.Entry<String, Integer> mapping : list.subList(0, limit)) {
            System.out.println(mapping.getKey() + " : " + mapping.getValue());
        }
    }

    /**
     * Command-line entry point: asks for a file name and prints its word statistics.
     *
     * @param args ignored
     * @throws IOException if the chosen file cannot be read
     */
    public static void main(String[] args) throws IOException {
        UpdateWordSearch w = new UpdateWordSearch();
        w.run();
    }
}
// 下图是词频统计所做的 JUnit 测试: