c# - ElasticSearch NEST 2.4 中的自定义 “tab” 分词器(tokenizer)

我有一个包含许多字段的索引，其中一个字段“ServiceCategories”的数据是以“|”分隔的类别列表，例如 `|Case Management|Developmental Disabilities`。

我需要用分隔符“|”来拆分这些数据。我尝试这样做:

    // Index definition for the first attempt: ProviderContent mappings plus a
    // custom analyzer that is meant to split ServiceCategories on the "|" delimiter.
    var descriptor = new CreateIndexDescriptor(_DataSource.ToLower())
        .Mappings(ms => ms
            .Map<ProviderContent>(m => m
                .AutoMap()
                .Properties(p => p
                    // OrganizationName additionally gets a not-analyzed "raw" sub-field.
                    .String(s => s
                        .Name(n => n.OrganizationName)
                        .Fields(f => f
                            .String(ss => ss.Name("raw").NotAnalyzed())))
                    // ServiceCategories uses the custom analyzer registered under Settings below.
                    .String(s => s
                        .Name(n => n.ServiceCategories)
                        .Analyzer("tab_delim_analyzer"))
                    .GeoPoint(g => g.Name(n => n.Location).LatLon(true)))))
        .Settings(st => st
            .Analysis(an => an
                .Analyzers(anz => anz
                    // Custom analyzer = "lowercase" token filter + the pattern tokenizer below.
                    .Custom("tab_delim_analyzer", td => td
                        .Filters("lowercase")
                    .Tokenizer("tab_delim_tokenizer")))
                .Tokenizers(t => t
                    .Pattern("tab_delim_tokenizer", tdt => tdt
                        // NOTE(review): pattern tokenizer patterns are presumably regular
                        // expressions, so an unescaped "|" is alternation rather than a
                        // literal pipe; @"\|" would match the delimiter itself. Confirm.
                        .Pattern("|")))));
    // Send the create-index request through the project's client wrapper.
    _elasticClientWrapper.CreateIndex(descriptor);

我对ServiceCategories(ES的serviceCategories)的搜索代码使用一个简单的TermQuery，其值设置为小写。

使用此搜索参数无法获得任何结果(其他参数可以正常工作)。预期结果是至少能与上述数据中的一项完全匹配。

我也尝试过使用 classic 分词器(tokenizer)来实现:

    // Index definition for the second attempt: ServiceCategories is indexed with a
    // custom analyzer built on the "classic" tokenizer instead of a pattern tokenizer.
    var descriptor = new CreateIndexDescriptor(_DataSource.ToLower())
        .Mappings(ms => ms
            .Map<ProviderContent>(m => m
                .AutoMap()
                .Properties(p => p
                    // OrganizationName additionally gets a not-analyzed "raw" sub-field.
                    .String(s => s
                        .Name(n => n.OrganizationName)
                        .Fields(f => f
                            .String(ss => ss.Name("raw").NotAnalyzed())))
                    // Index-time analyzer is the custom one; search-time analyzer is "standard".
                    .String(s => s
                        .Name(n => n.ServiceCategories)
                        .Analyzer("classic_tokenizer")
                        .SearchAnalyzer("standard"))
                    .GeoPoint(g => g.Name(n => n.Location).LatLon(true)))))
        .Settings(s => s
            .Analysis(an => an
                // Custom analyzer consisting of the "classic" tokenizer only; no token
                // filters are configured here.
                // NOTE(review): without a lowercase filter the indexed tokens presumably
                // keep their original casing, while the search value is lowercased. Verify.
                .Analyzers(a => a.Custom("classic_tokenizer", ca => ca
                    .Tokenizer("classic")))));

这也不起作用。谁能帮我找出哪里出了问题？

这是搜索请求:

### ES REQUEST ###
{
  "from": 0,
  "size": 10,
  "sort": [
    {
      "organizationName": {
        "order": "asc"
      }
    }
  ],
  "query": {
    "bool": {
      "must": [
        {
          "match_all": {}
        },
        {
          "term": {
            "serviceCategories": {
              "value": "developmental disabilities"
            }
          }
        }
      ]
    }
  }
}

最佳答案

您的 tab_delim_tokenizer 的模式已经很接近了，但还不完全正确 :) 要看清这一点，最简单的方法是使用 Analyze API 查看分析器会如何对一段文本进行分词。基于您的第一个映射，我们可以检查自定义分析器的实际行为:

// Ask the Analyze API how "tab_delim_analyzer" tokenizes a sample field value.
client.Analyze(request => request
    .Text("|Case Management|Developmental Disabilities")
    .Analyzer("tab_delim_analyzer")
    .Index(_DataSource.ToLower()));

返回(为简洁起见)

{
  "tokens" : [ {
    "token" : "|",
    "start_offset" : 0,
    "end_offset" : 1,
    "type" : "word",
    "position" : 0
  }, {
    "token" : "c",
    "start_offset" : 1,
    "end_offset" : 2,
    "type" : "word",
    "position" : 1
  }, {
    "token" : "a",
    "start_offset" : 2,
    "end_offset" : 3,
    "type" : "word",
    "position" : 2
  }, {
    "token" : "s",
    "start_offset" : 3,
    "end_offset" : 4,
    "type" : "word",
    "position" : 3
  }, ... ]
}

这表明 tab_delim_tokenizer 并没有按我们期望的方式进行分词。原因在于该模式是一个正则表达式，未转义的 | 表示“或”，它会匹配空字符串，于是文本被逐字符拆开。只需一个小改动即可解决此问题:用 \ 对模式中的 | 进行转义，并加上 @ 前缀使该模式成为逐字字符串(verbatim string)。

这是一个完整的例子

/// <summary>
/// End-to-end example: recreates "default-index" with a pipe-delimited analyzer,
/// indexes one <c>ProviderContent</c> document and searches it with a term query.
/// </summary>
void Main()
{
    // Connect to a single local node and make the example index the default target.
    var indexName = "default-index";
    var nodePool = new SingleNodeConnectionPool(new Uri("http://localhost:9200"));
    var connection = new ConnectionSettings(nodePool).DefaultIndex(indexName);
    var client = new ElasticClient(connection);

    // Drop any index left over from a previous run so the example starts clean.
    var indexAlreadyExists = client.IndexExists(indexName).Exists;
    if (indexAlreadyExists)
    {
        client.DeleteIndex(indexName);
    }

    var createIndex = new CreateIndexDescriptor(indexName)
        .Settings(ix => ix
            .Analysis(analysis => analysis
                // The pipe is escaped (verbatim string) so the tokenizer splits on a literal '|'.
                .Tokenizers(tokenizers => tokenizers
                    .Pattern("tab_delim_tokenizer", pattern => pattern.Pattern(@"\|")))
                // Custom analyzer: split on '|' and lowercase each resulting token.
                .Analyzers(analyzers => analyzers
                    .Custom("tab_delim_analyzer", custom => custom
                        .Tokenizer("tab_delim_tokenizer")
                        .Filters("lowercase")))))
        .Mappings(mappings => mappings
            .Map<ProviderContent>(map => map
                .AutoMap()
                .Properties(props => props
                    // Analyzed name plus a not-analyzed "raw" sub-field.
                    .String(org => org
                        .Name(doc => doc.OrganizationName)
                        .Fields(sub => sub
                            .String(raw => raw.Name("raw").NotAnalyzed())))
                    .String(categories => categories
                        .Name(doc => doc.ServiceCategories)
                        .Analyzer("tab_delim_analyzer"))
                    .GeoPoint(geo => geo
                        .Name(doc => doc.Location)
                        .LatLon(true)))));

    client.CreateIndex(createIndex);

    // Verify that the custom analyzer tokenizes the way we expect.
    client.Analyze(analyze => analyze
        .Index(indexName)
        .Analyzer("tab_delim_analyzer")
        .Text("|Case Management|Developmental Disabilities"));

    // Index a single document and refresh so it is immediately searchable.
    var sample = new ProviderContent
    {
        OrganizationName = "Elastic",
        ServiceCategories = "|Case Management|Developmental Disabilities"
    };
    client.Index(sample, idx => idx.Refresh());

    // Search for the document. The unary + puts the term query in a bool
    // filter clause, since scoring is (probably) not needed here.
    client.Search<ProviderContent>(search => search
        .From(0)
        .Size(10)
        .Sort(sort => sort.Ascending(doc => doc.OrganizationName))
        .Query(query => +query
            .Term(doc => doc.ServiceCategories, "developmental disabilities")));
}

/// <summary>
/// Document type that the example maps, indexes and searches.
/// </summary>
public class ProviderContent
{
    /// <summary>Organization name; the example search sorts on this field.</summary>
    public string OrganizationName { get; set; }

    /// <summary>
    /// Pipe-delimited category list, e.g. "|Case Management|Developmental Disabilities".
    /// </summary>
    public string ServiceCategories { get; set; }

    /// <summary>Geographic location; mapped with GeoPoint in the example mappings.</summary>
    public GeoLocation Location { get; set; }
}

搜索结果返回

{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 1,
    "max_score" : null,
    "hits" : [ {
      "_index" : "default-index",
      "_type" : "providercontent",
      "_id" : "AVqNNqlQpAW_5iHrnIDQ",
      "_score" : null,
      "_source" : {
        "organizationName" : "Elastic",
        "serviceCategories" : "|Case Management|Developmental Disabilities"
      },
      "sort" : [ "elastic" ]
    } ]
  }
}