我搭建了一个包含 3 个 Elasticsearch 实例的集群,由 Logstash 持续向其写入文档(每分钟约 165,000 个)。我有 3 台各配备 16GB RAM 的机器,每个实例都以 8GB 内存启动。
索引工作得很好,我也能够执行所需的全部搜索操作。现在的问题是,我想把它开放给大家普遍使用,但不幸的是,某些查询(例如对所有索引执行范围 facet)会冻结整个集群,并最终导致脑裂(split-brain)状态。
我已经对一些设置做了限制,例如:
indices.memory.index_buffer_size: 30%
indices.memory.min_shard_index_buffer_size: 12mb
indices.memory.min_index_buffer_size: 96mb
indices.fielddata.cache.size: 15%
indices.fielddata.cache.expire: 6h
indices.cache.filter.size: 15%
indices.cache.filter.expire: 6h
我的整个配置文件如下所示:
index.number_of_shards: 10
index.number_of_replicas: 0
bootstrap.mlockall: true
# Indices settings
indices.memory.index_buffer_size: 30%
indices.memory.min_shard_index_buffer_size: 12mb
indices.memory.min_index_buffer_size: 96mb
# Cache Sizes
indices.fielddata.cache.size: 15%
indices.fielddata.cache.expire: 6h
indices.cache.filter.size: 15%
indices.cache.filter.expire: 6h
# Indexing Settings for Writes
index.refresh_interval: 30s
index.translog.flush_threshold_ops: 50000
为了避免这种冻结和脑裂状态,我还有什么可以改善的吗?
我的节点信息输出:
{
"cluster_name" : "elasticsearch",
"nodes" : {
"7i5sZj_jT_qe6HNESfzO3A" : {
"name" : "Captain Fate",
"transport_address" : "inet[/192.168.0.83:9300]",
"host" : "esserver02",
"ip" : "192.168.0.83",
"version" : "1.1.1",
"build" : "f1585f0",
"http_address" : "inet[/192.168.0.83:9200]",
"settings" : {
"index" : {
"number_of_replicas" : "0",
"translog" : {
"flush_threshold_ops" : "50000"
},
"number_of_shards" : "40",
"refresh_interval" : "30s"
},
"bootstrap" : {
"mlockall" : "true"
},
"transport" : {
"tcp" : {
"port" : "9300"
}
},
"http" : {
"port" : "9200"
},
"name" : "Captain Fate",
"path" : {
"logs" : "/opt/as/es/logs",
"home" : "/opt/as/es"
},
"cluster" : {
"name" : "elasticsearch"
},
"indices" : {
"memory" : {
"index_buffer_size" : "50%",
"min_shard_index_buffer_size" : "12mb",
"min_index_buffer_size" : "96mb"
}
},
"discovery" : {
"zen" : {
"minimum_master_nodes" : "1",
"ping" : {
"unicast" : {
"hosts" : [ "esserver02", "esserver03", "esserver04" ]
},
"multicast" : {
"enabled" : "false"
},
"timeout" : "30s"
}
}
}
},
"os" : {
"refresh_interval" : 1000,
"available_processors" : 16
},
"process" : {
"refresh_interval" : 1000,
"id" : 8482,
"max_file_descriptors" : 128000,
"mlockall" : false
},
"jvm" : {
"pid" : 8482,
"version" : "1.7.0_45",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "24.45-b08",
"vm_vendor" : "Oracle Corporation",
"start_time" : 1411976625093,
"mem" : {
"heap_init_in_bytes" : 2147483648,
"heap_max_in_bytes" : 12771524608,
"non_heap_init_in_bytes" : 24313856,
"non_heap_max_in_bytes" : 136314880,
"direct_max_in_bytes" : 12771524608
},
"gc_collectors" : [ "ParNew", "ConcurrentMarkSweep" ],
"memory_pools" : [ "Code Cache", "Par Eden Space", "Par Survivor Space", "CMS Old Gen", "CMS Perm Gen" ]
},
"thread_pool" : {
"generic" : {
"type" : "cached",
"keep_alive" : "30s"
},
"index" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "200"
},
"get" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"snapshot" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"merge" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"suggest" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"bulk" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "50"
},
"optimize" : {
"type" : "fixed",
"min" : 1,
"max" : 1
},
"warmer" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"flush" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"search" : {
"type" : "fixed",
"min" : 48,
"max" : 48,
"queue_size" : "1k"
},
"percolate" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"management" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"refresh" : {
"type" : "scaling",
"min" : 1,
"max" : 8,
"keep_alive" : "5m"
}
},
"network" : {
"refresh_interval" : 5000
},
"transport" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0:9300]",
"publish_address" : "inet[/192.168.0.83:9300]"
},
"http" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0:9200]",
"publish_address" : "inet[/192.168.0.83:9200]",
"max_content_length_in_bytes" : 104857600
},
"plugins" : [ {
"name" : "head",
"version" : "NA",
"description" : "No description found.",
"url" : "/_plugin/head/",
"jvm" : false,
"site" : true
} ]
},
"0OaMqY6IR1SYeL6rd6P61Q" : {
"name" : "Blonde Phantom",
"transport_address" : "inet[/192.168.0.100:9300]",
"host" : "esserver03",
"ip" : "192.168.0.100",
"version" : "1.1.1",
"build" : "f1585f0",
"http_address" : "inet[/192.168.0.100:9200]",
"settings" : {
"index" : {
"number_of_replicas" : "0",
"translog" : {
"flush_threshold_ops" : "50000"
},
"number_of_shards" : "40",
"refresh_interval" : "30s"
},
"bootstrap" : {
"mlockall" : "true"
},
"transport" : {
"tcp" : {
"port" : "9300"
}
},
"http" : {
"port" : "9200"
},
"name" : "Blonde Phantom",
"path" : {
"logs" : "/opt/as/es/logs",
"home" : "/opt/as/es"
},
"cluster" : {
"name" : "elasticsearch"
},
"indices" : {
"memory" : {
"index_buffer_size" : "50%",
"min_shard_index_buffer_size" : "12mb",
"min_index_buffer_size" : "96mb"
}
},
"discovery" : {
"zen" : {
"minimum_master_nodes" : "1",
"ping" : {
"unicast" : {
"hosts" : [ "esserver02", "esserver03", "esserver04" ]
},
"multicast" : {
"enabled" : "false"
},
"timeout" : "30s"
}
}
}
},
"os" : {
"refresh_interval" : 1000,
"available_processors" : 16
},
"process" : {
"refresh_interval" : 1000,
"id" : 98772,
"max_file_descriptors" : 128000,
"mlockall" : false
},
"jvm" : {
"pid" : 98772,
"version" : "1.7.0_45",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "24.45-b08",
"vm_vendor" : "Oracle Corporation",
"start_time" : 1414657551806,
"mem" : {
"heap_init_in_bytes" : 2147483648,
"heap_max_in_bytes" : 12771524608,
"non_heap_init_in_bytes" : 24313856,
"non_heap_max_in_bytes" : 136314880,
"direct_max_in_bytes" : 12771524608
},
"gc_collectors" : [ "ParNew", "ConcurrentMarkSweep" ],
"memory_pools" : [ "Code Cache", "Par Eden Space", "Par Survivor Space", "CMS Old Gen", "CMS Perm Gen" ]
},
"thread_pool" : {
"generic" : {
"type" : "cached",
"keep_alive" : "30s"
},
"index" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "200"
},
"get" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"snapshot" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"merge" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"suggest" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"bulk" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "50"
},
"optimize" : {
"type" : "fixed",
"min" : 1,
"max" : 1
},
"warmer" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"flush" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"search" : {
"type" : "fixed",
"min" : 48,
"max" : 48,
"queue_size" : "1k"
},
"percolate" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"management" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"refresh" : {
"type" : "scaling",
"min" : 1,
"max" : 8,
"keep_alive" : "5m"
}
},
"network" : {
"refresh_interval" : 5000,
"primary_interface" : {
"address" : "",
"name" : "",
"mac_address" : ""
}
},
"transport" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0%0:9300]",
"publish_address" : "inet[/192.168.0.100:9300]"
},
"http" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0%0:9200]",
"publish_address" : "inet[/192.168.0.100:9200]",
"max_content_length_in_bytes" : 104857600
},
"plugins" : [ {
"name" : "head",
"version" : "NA",
"description" : "No description found.",
"url" : "/_plugin/head/",
"jvm" : false,
"site" : true
} ]
},
"H2h01oNGSuCL0uu8J3SF6w" : {
"name" : "Dakimh the Enchanter",
"transport_address" : "inet[/192.168.0.101:9300]",
"host" : "esserver04",
"ip" : "192.168.0.101",
"version" : "1.1.1",
"build" : "f1585f0",
"http_address" : "inet[/192.168.0.101:9200]",
"settings" : {
"index" : {
"number_of_replicas" : "0",
"translog" : {
"flush_threshold_ops" : "50000"
},
"number_of_shards" : "40",
"refresh_interval" : "30s"
},
"bootstrap" : {
"mlockall" : "true"
},
"transport" : {
"tcp" : {
"port" : "9300"
}
},
"http" : {
"port" : "9200"
},
"name" : "Dakimh the Enchanter",
"path" : {
"logs" : "/opt/as/es/logs",
"home" : "/opt/as/es"
},
"cluster" : {
"name" : "elasticsearch"
},
"indices" : {
"memory" : {
"index_buffer_size" : "50%",
"min_shard_index_buffer_size" : "12mb",
"min_index_buffer_size" : "96mb"
}
},
"discovery" : {
"zen" : {
"minimum_master_nodes" : "1",
"ping" : {
"unicast" : {
"hosts" : [ "esserver02", "esserver03", "esserver04" ]
},
"multicast" : {
"enabled" : "false"
},
"timeout" : "30s"
}
}
}
},
"os" : {
"refresh_interval" : 1000,
"available_processors" : 16
},
"process" : {
"refresh_interval" : 1000,
"id" : 88019,
"max_file_descriptors" : 128000,
"mlockall" : false
},
"jvm" : {
"pid" : 88019,
"version" : "1.7.0_45",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "24.45-b08",
"vm_vendor" : "Oracle Corporation",
"start_time" : 1414657560829,
"mem" : {
"heap_init_in_bytes" : 2147483648,
"heap_max_in_bytes" : 12771524608,
"non_heap_init_in_bytes" : 24313856,
"non_heap_max_in_bytes" : 136314880,
"direct_max_in_bytes" : 12771524608
},
"gc_collectors" : [ "ParNew", "ConcurrentMarkSweep" ],
"memory_pools" : [ "Code Cache", "Par Eden Space", "Par Survivor Space", "CMS Old Gen", "CMS Perm Gen" ]
},
"thread_pool" : {
"generic" : {
"type" : "cached",
"keep_alive" : "30s"
},
"index" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "200"
},
"get" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"snapshot" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"merge" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"suggest" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"bulk" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "50"
},
"optimize" : {
"type" : "fixed",
"min" : 1,
"max" : 1
},
"warmer" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"flush" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"search" : {
"type" : "fixed",
"min" : 48,
"max" : 48,
"queue_size" : "1k"
},
"percolate" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"management" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"refresh" : {
"type" : "scaling",
"min" : 1,
"max" : 8,
"keep_alive" : "5m"
}
},
"network" : {
"refresh_interval" : 5000,
"primary_interface" : {
"address" : "",
"name" : "",
"mac_address" : ""
}
},
"transport" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0%0:9300]",
"publish_address" : "inet[/192.168.0.101:9300]"
},
"http" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0%0:9200]",
"publish_address" : "inet[/192.168.0.101:9200]",
"max_content_length_in_bytes" : 104857600
},
"plugins" : [ {
"name" : "head",
"version" : "NA",
"description" : "No description found.",
"url" : "/_plugin/head/",
"jvm" : false,
"site" : true
} ]
}
}
}
我的旧配置:
index.number_of_shards: 40
index.number_of_replicas: 0
bootstrap.mlockall: true
## Threadpool Settings ##
# Indices settings
indices.memory.index_buffer_size: 50%
indices.memory.min_shard_index_buffer_size: 12mb
indices.memory.min_index_buffer_size: 96mb
# Indexing Settings for Writes
index.refresh_interval: 30s
index.translog.flush_threshold_ops: 50000
最佳答案
您的 indices.fielddata.cache.size 仅设置为 15%。为什么?
这些数据用于聚合(aggregation)/facet,因此可能与您的问题有关。您应该尽快删除 indices.fielddata.cache.expire:完全不建议使用此设置,因为逐出(eviction)的开销非常大,而且即使字段数据值正在被使用,它也会为其安排逐出。您能给我们提供节点统计(node stats)API 的结果吗?
更新1:
我看到 minimum_master_nodes 设置为 1,但是您说您有 3 个节点。应根据常用公式(节点数 / 2 + 1,参见官方文档)将其设置为 2。
更新2:
使用更新后的配置,您仍然会遇到脑裂吗?
鉴于您的集群 ES 版本(> 1.0),您可能需要调整 fielddata 断路器(circuit breaker),以阻止开销最大的请求被执行:详见原文链接(here)。