elasticsearch - 在 Elasticsearch 中将“asciifolding”过滤器与“pattern”分词器（tokenizer）一起使用

谁能告诉我，为什么在下面的映射中使用“pattern”分词器时，“asciifolding”没有生效？
我需要使用“pattern”分词器，但同时也希望借助“asciifolding”功能，不区分带重音符号和不带重音符号的单词。
我需要“televisão”等同于“televisao”，但是“asciifolding”在我的“analyzer_customizado”上不起作用，该“analyzer_customizado”同时配置了“asciifolding”过滤器和“pattern”分词器。

{
  "settings": {
    "index": {
      "number_of_shards": "5",
      "number_of_replicas": "0",
      "analysis": {
        "filter": {
          "stemmer_plural_portugues": {
            "name": "minimal_portuguese",
            "stopwords" : ["http", "https", "ftp", "www"],
            "type": "stemmer"
          },


            "synonym_filter": {
            "type": "synonym",
            "lenient": true,
            "synonyms_path": "analysis/synonym.txt",
            "updateable" : true

          },


          "shingle_filter": {
            "type": "shingle",
            "min_shingle_size": 2,
            "max_shingle_size": 3
          }

        },

        "analyzer": {
          "analyzer_customizado": {
            "filter": [
              "lowercase",
              "stemmer_plural_portugues",
              "asciifolding",
              "synonym_filter",
              "shingle_filter"

            ],
            "tokenizer": "pattern"
          }
        }

      }
    }
  },
  "mappings": {
      "properties": {

        "id": {
         "type": "long"
        },
         "data": {
          "type": "date"
        },
         "quebrado": {
          "type": "byte"

        },
         "pgrk": {
           "type":  "integer"
        },
         "url_length": {
           "type":  "integer"
        },
        "titulo": {
          "analyzer": "analyzer_customizado",
          "type": "text",
          "fields": {
            "keyword": {
              "ignore_above": 256,
              "type": "keyword"
            }
          }
        },
        "descricao": {
        "analyzer": "analyzer_customizado",
          "type": "text",
          "fields": {
            "keyword": {
              "ignore_above": 256,
              "type": "keyword"
            }
          }
        },
        "url": {
          "analyzer": "analyzer_customizado",
          "type": "text",
          "fields": {
            "keyword": {
              "ignore_above": 256,
              "type": "keyword"
            }
          }
        }
      }
    }
  }

有人可以告诉我应该如何修改我的映射，使“asciifolding”能够在使用“pattern”分词器的“analyzer_customizado”中正常工作吗？

最佳答案

问题出在 official doc of pattern tokenizer 中提到的 pattern 分词器的默认模式 \W+：它会在遇到非单词字符时切分文本。

您可以使用 Analyze API 自行验证：由于默认模式把 ã 视为非单词字符，televisão 会在 ã 处被切分成两个 token。

{
    "tokenizer": "pattern",
    "text": "televisão"
}

{
    "tokens": [
        {
            "token": "televis",
            "start_offset": 0,
            "end_offset": 7,
            "type": "word",
            "position": 0
        },
        {
            "token": "o",
            "start_offset": 8,
            "end_offset": 9,
            "type": "word",
            "position": 1
        }
    ]
}

解决方案：不幸的是，没有 ASCII folding 的 char filter 可以在分词之前把它转换成相应的 ASCII 字符，从而避免它被 pattern 分词器拆分成多个 token。您可以参考 this discuss post 中的讨论，其中建议使用自定义插件。
编辑：正如 @Val 在评论中所建议的，您还可以使用 mapping char filter 并定义自己的字符映射（例如把 ã 映射为 a），这样在分析的第一阶段（即 char filter 阶段）就完成字符转换。