测试数据

["美科学家发现人体新器官","科学家发现新器官"],["曝高云翔悉尼被捕","高云翔涉性侵被捕"],["走失柯基犬被摔死事件","柯基犬摔死后续"],["张继科景甜公布恋情","张继科公布恋情"]

string-similarity  (gem "string-similarity")   https://github.com/mhutter/string-similarity

 String::Similarity.cosine '美科学家发现人体新器官', '科学家发现新器官'
=> 0.8528028654224417
String::Similarity.cosine '曝高云翔悉尼被捕', '高云翔涉性侵被捕'
=> 0.6249999999999999
String::Similarity.cosine '走失柯基犬被摔死事件', '柯基犬摔死后续'
=> 0.5976143046671968
String::Similarity.cosine '张继科景甜公布恋情', '张继科公布恋情'
=> 0.8819171036881969

text  (gem "text")   https://github.com/threedaymonk/text

 white = Text::WhiteSimilarity.new
=> #<Text::WhiteSimilarity:0x00007fb358148e90 @word_letter_pairs={}>
white.similarity('美科学家发现人体新器官', '科学家发现新器官')
=> 0.7058823529411765
white.similarity('曝高云翔悉尼被捕', '高云翔涉性侵被捕')
=> 0.42857142857142855
white.similarity('走失柯基犬被摔死事件', '柯基犬摔死后续')
=> 0.4
white.similarity('张继科景甜公布恋情', '张继科公布恋情')
=> 0.7142857142857143

fuzzy_match  (gem "fuzzy_match", gem "amatch";下面使用的 levenshtein_similar 方法由 amatch 提供)   https://github.com/seamusabshere/fuzzy_match

 '美科学家发现人体新器官'.levenshtein_similar '科学家发现新器官'
=> 0.7272727272727273
'曝高云翔悉尼被捕'.levenshtein_similar '高云翔涉性侵被捕'
=> 0.5833333333333333
'走失柯基犬被摔死事件'.levenshtein_similar '柯基犬摔死后续'
=> 0.5333333333333333
'张继科景甜公布恋情'.levenshtein_similar '张继科公布恋情'
=> 0.7777777777777778

  

得出结论:在以上四组中文测试数据上,相似度得分由高到低依次为 string-similarity(cosine)> fuzzy_match(levenshtein_similar)> text(WhiteSimilarity)



05-21 06:17