我有一个包含许多字段的索引,其中一个字段“ServiceCategories”的数据类似于以下内容:
|Case Management|Developmental Disabilities
我需要用分隔符“|”来拆分这些数据。我尝试这样做:
// Index definition: ProviderContent mappings plus a custom analyzer
// ("tab_delim_analyzer") built from a pattern tokenizer and a lowercase filter.
var descriptor = new CreateIndexDescriptor(_DataSource.ToLower())
    .Mappings(mappings => mappings
        .Map<ProviderContent>(typeMap => typeMap
            .AutoMap()
            .Properties(props => props
                // OrganizationName also gets a not-analyzed "raw" sub-field.
                .String(org => org
                    .Name(doc => doc.OrganizationName)
                    .Fields(sub => sub
                        .String(raw => raw.Name("raw").NotAnalyzed())
                    )
                )
                // ServiceCategories is analyzed with the custom analyzer defined below.
                .String(cats => cats
                    .Name(doc => doc.ServiceCategories)
                    .Analyzer("tab_delim_analyzer")
                )
                .GeoPoint(geo => geo
                    .Name(doc => doc.Location)
                    .LatLon(true)
                )
            )
        )
    )
    .Settings(settings => settings
        .Analysis(analysis => analysis
            .Analyzers(analyzers => analyzers
                .Custom("tab_delim_analyzer", custom => custom
                    .Filters("lowercase")
                    .Tokenizer("tab_delim_tokenizer")
                )
            )
            .Tokenizers(tokenizers => tokenizers
                // Pattern tokenizer configured with the literal string "|" as its regex.
                .Pattern("tab_delim_tokenizer", pattern => pattern
                    .Pattern("|")
                )
            )
        )
    );

_elasticClientWrapper.CreateIndex(descriptor);
我对ServiceCategories(ES的serviceCategories)的搜索代码使用一个简单的TermQuery,其值设置为小写。
使用此搜索参数无法获得结果(其他参数可以正常工作)。预期结果是至少要从上述一项获得完全匹配。
我也尝试过改用 classic 分词器(tokenizer)来实现:
// Second attempt: index ServiceCategories with a custom analyzer that wraps the
// built-in "classic" tokenizer, and search it with the "standard" analyzer.
var descriptor = new CreateIndexDescriptor(_DataSource.ToLower())
    .Mappings(mappings => mappings
        .Map<ProviderContent>(typeMap => typeMap
            .AutoMap()
            .Properties(props => props
                // OrganizationName also gets a not-analyzed "raw" sub-field.
                .String(org => org
                    .Name(doc => doc.OrganizationName)
                    .Fields(sub => sub
                        .String(raw => raw.Name("raw").NotAnalyzed())
                    )
                )
                // Different analyzers are used at index time and at search time.
                .String(cats => cats
                    .Name(doc => doc.ServiceCategories)
                    .Analyzer("classic_tokenizer")
                    .SearchAnalyzer("standard")
                )
                .GeoPoint(geo => geo
                    .Name(doc => doc.Location)
                    .LatLon(true)
                )
            )
        )
    )
    .Settings(settings => settings
        .Analysis(analysis => analysis
            .Analyzers(analyzers => analyzers
                // Custom analyzer named "classic_tokenizer" using the "classic" tokenizer.
                .Custom("classic_tokenizer", custom => custom
                    .Tokenizer("classic")
                )
            )
        )
    );
这也不起作用。谁能帮我找出我哪里做错了?
这是搜索请求:
### ES REQUEST ###
{
"from": 0,
"size": 10,
"sort": [
{
"organizationName": {
"order": "asc"
}
}
],
"query": {
"bool": {
"must": [
{
"match_all": {}
},
{
"term": {
"serviceCategories": {
"value": "developmental disabilities"
}
}
}
]
}
}
}
最佳答案
您的 tab_delim_tokenizer 的模式已经很接近了,但还不完全正确 :) 要看清这一点,最简单的方法是使用 Analyze API 来了解分析器会如何对一段文本进行分词。使用您的第一个映射,我们可以检查自定义分析器的实际行为:
// Run the custom analyzer against a sample value to see which tokens it emits.
client.Analyze(analyze => analyze
    .Index(_DataSource.ToLower())
    .Analyzer("tab_delim_analyzer")
    .Text("|Case Management|Developmental Disabilities")
);
返回如下结果(为简洁起见,有所删减):
{
"tokens" : [ {
"token" : "|",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
}, {
"token" : "c",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 1
}, {
"token" : "a",
"start_offset" : 2,
"end_offset" : 3,
"type" : "word",
"position" : 2
}, {
"token" : "s",
"start_offset" : 3,
"end_offset" : 4,
"type" : "word",
"position" : 3
}, ... ]
}
这表明 tab_delim_tokenizer 并没有按我们的预期进行分词。原因在于 | 在正则表达式中表示“或”(alternation),未转义的模式 "|" 会匹配每个位置上的空字符串,因此文本被拆成了一个个单独的字符。只需一个小改动即可解决此问题:用 \ 对模式中的 | 进行转义,并加上 @ 前缀使该模式成为逐字字符串(verbatim string)。下面是一个完整的示例:
/// <summary>
/// End-to-end example: recreates the index with a pattern tokenizer that splits
/// on a literal pipe character, indexes one document, and runs a term query
/// against the tokenized <c>ServiceCategories</c> field.
/// </summary>
/// <exception cref="InvalidOperationException">
/// Thrown when creating the index or indexing the sample document fails.
/// </exception>
void Main()
{
    var pool = new SingleNodeConnectionPool(new Uri("http://localhost:9200"));
    var defaultIndex = "default-index";
    var connectionSettings = new ConnectionSettings(pool)
        .DefaultIndex(defaultIndex);
    var client = new ElasticClient(connectionSettings);

    // Drop any existing index so the mapping and analysis settings below are applied.
    if (client.IndexExists(defaultIndex).Exists)
    {
        client.DeleteIndex(defaultIndex);
    }

    var descriptor = new CreateIndexDescriptor(defaultIndex)
        .Mappings(ms => ms
            .Map<ProviderContent>(m => m
                .AutoMap()
                .Properties(p => p
                    .String(s => s
                        .Name(n => n.OrganizationName)
                        .Fields(f => f
                            .String(ss => ss.Name("raw").NotAnalyzed())
                        )
                    )
                    .String(s => s
                        .Name(n => n.ServiceCategories)
                        .Analyzer("tab_delim_analyzer")
                    )
                    .GeoPoint(g => g
                        .Name(n => n.Location)
                        .LatLon(true)
                    )
                )
            )
        )
        .Settings(st => st
            .Analysis(an => an
                .Analyzers(anz => anz
                    .Custom("tab_delim_analyzer", td => td
                        .Filters("lowercase")
                        .Tokenizer("tab_delim_tokenizer")
                    )
                )
                .Tokenizers(t => t
                    .Pattern("tab_delim_tokenizer", tdt => tdt
                        // The pipe must be escaped: an unescaped | is regex alternation.
                        // The @ prefix makes this a verbatim string so the backslash is kept.
                        .Pattern(@"\|")
                    )
                )
            )
        );

    // Fail fast: if index creation is rejected, the search below would just
    // come back empty and hide the real problem.
    var createIndexResponse = client.CreateIndex(descriptor);
    if (!createIndexResponse.IsValid)
    {
        throw new InvalidOperationException(
            "Failed to create index '" + defaultIndex + "': " + createIndexResponse.DebugInformation);
    }

    // check our custom analyzer does what we think it should
    client.Analyze(a => a
        .Index(defaultIndex)
        .Analyzer("tab_delim_analyzer")
        .Text("|Case Management|Developmental Disabilities")
    );

    // index a document and make it immediately available for search
    var indexResponse = client.Index(new ProviderContent
    {
        OrganizationName = "Elastic",
        ServiceCategories = "|Case Management|Developmental Disabilities"
    }, i => i.Refresh());
    if (!indexResponse.IsValid)
    {
        throw new InvalidOperationException(
            "Failed to index the sample document: " + indexResponse.DebugInformation);
    }

    // search for our document. Use a term query in a bool filter clause
    // as we don't need scoring (probably)
    client.Search<ProviderContent>(s => s
        .From(0)
        .Size(10)
        .Sort(so => so
            .Ascending(f => f.OrganizationName)
        )
        .Query(q => +q
            .Term(f => f.ServiceCategories, "developmental disabilities")
        )
    );
}
/// <summary>
/// Document type that is mapped and indexed in this example.
/// </summary>
public class ProviderContent
{
    /// <summary>Organization name; the example sorts search results by this field.</summary>
    public string OrganizationName { get; set; }

    /// <summary>
    /// Pipe-delimited category list, e.g. "|Case Management|Developmental Disabilities".
    /// </summary>
    public string ServiceCategories { get; set; }

    /// <summary>Location of the provider; mapped as a geo point in the index mapping.</summary>
    public GeoLocation Location { get; set; }
}
搜索结果返回
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : null,
"hits" : [ {
"_index" : "default-index",
"_type" : "providercontent",
"_id" : "AVqNNqlQpAW_5iHrnIDQ",
"_score" : null,
"_source" : {
"organizationName" : "Elastic",
"serviceCategories" : "|Case Management|Developmental Disabilities"
},
"sort" : [ "elastic" ]
} ]
}
}