1.自定义TokenFilter
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import java.io.IOException; import java.util.HashMap; import java.util.Map; public class CourtesyTokenFilter extends TokenFilter { private Map<String, String> courtesyMap = new HashMap<>(); private CharTermAttribute charTermAttribute; public CourtesyTokenFilter(TokenStream input) { super(input); this.charTermAttribute = this.addAttribute(CharTermAttribute.class); courtesyMap.put("dr", "doctor"); courtesyMap.put("mr", "mister"); courtesyMap.put("mrs", "miss"); } @Override public final boolean incrementToken() throws IOException { if (!this.input.incrementToken()) { return false; } String term = this.charTermAttribute.toString(); if (courtesyMap.containsKey(term)) { this.charTermAttribute.setEmpty().append(this.courtesyMap.get(term)); } return true; } }
2.应用TokenFilter:
String text = "Hi, Dr Wang, Mr Liu asks if you stay with Mrs Liu yesterday!"; StandardAnalyzer standardAnalyzer = new StandardAnalyzer(); CourtesyTokenFilter courtesyTokenFilter = new CourtesyTokenFilter(standardAnalyzer.tokenStream("text", text)); CharTermAttribute charTermAttribute = courtesyTokenFilter.addAttribute(CharTermAttribute.class); courtesyTokenFilter.reset(); while (courtesyTokenFilter.incrementToken()) { System.out.print(charTermAttribute + " "); }
3.场景解析
"Hi, Dr Wang, Mr Liu asks if you stay with Mrs Liu yesterday!" 这段文本中,有 Dr、Mr、Mrs 这三个缩写词,读者可能看不懂,因此需要把它们替换成全称来显示。