Elasticsearch 节点运维的那些套路,这篇是易懂的
一、节点相关命令
1、查看节点基本信息
GET /_nodes/<node_id>
GET /_nodes/<node_id>/process
返回 Response:
{
"_nodes" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"cluster_name" : "es-xxxx",
"nodes" : {
"fdaOV16OQPq6-AUVihmq4A" : {
"name" : "1626803143000145632",
"transport_address" : "xx.0.96.22:9300",
"host" : "xx.0.96.22",
"ip" : "xx.0.96.22",
"version" : "7.10.1",
"build_flavor" : "default",
"build_type" : "tar",
"build_hash" : "119c2106d7bd8d206aa3b65dc43c87b8aa590b2b",
"roles" : [
"master",
"ml",
"remote_cluster_client"
],
"attributes" : {
"ml.machine_memory" : "16478932992",
"rack" : "cvm_4_200003",
"xpack.installed" : "true",
"set" : "200003",
"transform.node" : "false",
"ip" : "xx.20.58.221",
"temperature" : "hot",
"ml.max_open_jobs" : "20",
"region" : "4"
},
"process" : {
"refresh_interval_in_millis" : 1000,
"id" : 24918,
"mlockall" : false
}
}
}
}
2、查看节点统计信息
GET /_nodes/stats
GET /_nodes/<node_id>/stats
GET /_nodes/stats/<metric>
总之,通过该 API,我们能够全方位获取到节点维度相关的各种指标信息。对于我们排查集群问题非常有帮助,我们还了解到腾讯云 ES 的部分大客户,通过定期去请求该API,将返回信息输出到对应的监控系统,来自己做更加细粒度的集群监控。
该API默认是返回节点所有的统计指标信息,如果我们需要查看部分指标或者特定指标统计信息,也可以在 API 中进行指定,如我们想查看特定节点的 JVM 使用情况:
GET /_nodes/1626803143000145632/stats/jvm
返回 Response 如下:
{
"_nodes" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"cluster_name" : "es-xxx",
"nodes" : {
"fdaOV16OQPq6-AUVihmq4A" : {
"timestamp" : 1639878427713,
"name" : "1626803143000145632",
"transport_address" : "xx.0.96.22:9300",
"host" : "xx.0.96.22",
"ip" : "xx.0.96.22:9300",
"roles" : [
"master",
"ml",
"remote_cluster_client"
],
"attributes" : {
"ml.machine_memory" : "16478932992",
"rack" : "cvm_4_200003",
"xpack.installed" : "true",
"set" : "200003",
"transform.node" : "false",
"ip" : "xx.20.58.221",
"temperature" : "hot",
"ml.max_open_jobs" : "20",
"region" : "4"
},
"jvm" : {
"timestamp" : 1639878427713,
"uptime_in_millis" : 13037976328,
"mem" : {
"heap_used_in_bytes" : 214161760,
"heap_used_percent" : 2,
"heap_committed_in_bytes" : 8555069440,
"heap_max_in_bytes" : 8555069440,
"non_heap_used_in_bytes" : 178981224,
"non_heap_committed_in_bytes" : 195989504,
"pools" : {
"young" : {
"used_in_bytes" : 104047328,
"max_in_bytes" : 279183360,
"peak_used_in_bytes" : 279183360,
"peak_max_in_bytes" : 279183360
},
"survivor" : {
"used_in_bytes" : 545616,
"max_in_bytes" : 34865152,
"peak_used_in_bytes" : 34865144,
"peak_max_in_bytes" : 34865152
},
"old" : {
"used_in_bytes" : 109568816,
"max_in_bytes" : 8241020928,
"peak_used_in_bytes" : 127845408,
"peak_max_in_bytes" : 8241020928
}
}
},
"threads" : {
"count" : 41,
"peak_count" : 43
},
"gc" : {
"collectors" : {
"young" : {
"collection_count" : 4718,
"collection_time_in_millis" : 162825
},
"old" : {
"collection_count" : 3,
"collection_time_in_millis" : 359
}
}
},
"buffer_pools" : {
"direct" : {
"count" : 16,
"used_in_bytes" : 4300808,
"total_capacity_in_bytes" : 4300807
},
"mapped" : {
"count" : 0,
"used_in_bytes" : 0,
"total_capacity_in_bytes" : 0
}
},
"classes" : {
"current_loaded_count" : 21255,
"total_loaded_count" : 21361,
"total_unloaded_count" : 106
}
}
}
}
}
GET /_nodes/1626803143000145632/stats/indices/merge
返回 Response 如下:
{
"_nodes" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"cluster_name" : "es-xxx",
"nodes" : {
"fdaOV16OQPq6-AUVihmq4A" : {
"timestamp" : 1639878633590,
"name" : "1626803143000145632",
"transport_address" : "xx.0.96.22:9300",
"host" : "xx.0.96.22",
"ip" : "xx.0.96.22:9300",
"roles" : [
"master",
"ml",
"remote_cluster_client"
],
"attributes" : {
"ml.machine_memory" : "16478932992",
"rack" : "cvm_4_200003",
"xpack.installed" : "true",
"set" : "200003",
"transform.node" : "false",
"ip" : "xx.20.58.221",
"temperature" : "hot",
"ml.max_open_jobs" : "20",
"region" : "4"
},
"indices" : {
"merges" : {
"current" : 0,
"current_docs" : 0,
"current_size_in_bytes" : 0,
"total" : 0,
"total_time_in_millis" : 0,
"total_docs" : 0,
"total_size_in_bytes" : 0,
"total_stopped_time_in_millis" : 0,
"total_throttled_time_in_millis" : 0,
"total_auto_throttle_in_bytes" : 0
}
}
}
}
}
以及查看节点索引 segment 和 translog 统计信息:
GET /_nodes/1626803143000145632/stats/indices/segments,translog
{
"_nodes" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"cluster_name" : "es-xxxx",
"nodes" : {
"fdaOV16OQPq6-AUVihmq4A" : {
"timestamp" : 1639878746911,
"name" : "1626803143000145632",
"transport_address" : "xx.0.96.22:9300",
"host" : "xx.0.96.22",
"ip" : "xx.0.96.22:9300",
"roles" : [
"master",
"ml",
"remote_cluster_client"
],
"attributes" : {
"ml.machine_memory" : "16478932992",
"rack" : "cvm_4_200003",
"xpack.installed" : "true",
"set" : "200003",
"transform.node" : "false",
"ip" : "xx.20.58.221",
"temperature" : "hot",
"ml.max_open_jobs" : "20",
"region" : "4"
},
"indices" : {
"segments" : {
"count" : 0,
"memory_in_bytes" : 0,
"terms_memory_in_bytes" : 0,
"stored_fields_memory_in_bytes" : 0,
"term_vectors_memory_in_bytes" : 0,
"norms_memory_in_bytes" : 0,
"points_memory_in_bytes" : 0,
"doc_values_memory_in_bytes" : 0,
"index_writer_memory_in_bytes" : 0,
"version_map_memory_in_bytes" : 0,
"fixed_bit_set_memory_in_bytes" : 0,
"max_unsafe_auto_id_timestamp" : -9223372036854775808,
"file_sizes" : { }
},
"translog" : {
"operations" : 0,
"size_in_bytes" : 0,
"uncommitted_operations" : 0,
"uncommitted_size_in_bytes" : 0,
"earliest_last_modified_age" : 0
}
}
}
}
}
也可以通过该API来查看每个节点上所分配的索引存储信息:
GET /_nodes/stats/indices/store
除了通过 GET _nodes/stats API 来查看节点的统计信息,ES 官方文档中还提供了另外一个 API,也可以获取到基本的统计信息:
GET /_cat/nodes
返回 Response:
ip heap.percent ram.percent cpu load_1m load_5m load_15m node.role master name
xx.0.96.9 30 70 0 0.01 0.05 0.05 cdhilrstw - 1626803143000145432
xx.0.96.24 2 99 1 0.11 0.09 0.07 lmr - 1626803143000145832
xx.0.96.49 14 99 1 0.87 0.24 0.12 lmr * 1626803143000145732
xx.0.96.13 14 70 0 0.16 0.08 0.06 cdhilrstw - 1626803143000145532
xx.0.96.22 3 99 2 0.13 0.12 0.13 lmr - 1626803143000145632
xx.0.96.20 cdhilrstw - 1626803143000145332
GET _cat/nodes?h=name,segments.memory,segments.index_writer_memory,heap.percent,fielddata.memory_size,query_cache.memory_size,request_cache.memory_size&v
name segments.memory segments.index_writer_memory heap.percent fielddata.memory_size query_cache.memory_size
1626803143000145832 0b 0b 3 0b 0b
1626803143000145532 15.9mb 58mb 17 3.7kb 3.9kb
1626803143000145332
1626803143000145432 15.7mb 50.7mb 28 3.4kb 10.2kb
1626803143000145732 0b 0b 13 0b 0b
1626803143000145632 0b 0b 3 0b 0b
3、查看节点线程池占用情况
GET /_cat/thread_pool
图1. 集群出现查询拒绝
图2. 集群节点查询队列被打满
GET /_cat/thread_pool/search,write?v
返回 Response:
node_name name active queue rejected
1626803143000145832 search 0 0 0
1626803143000145832 write
1626803143000145532 search 0 0 0
1626803143000145532 write
1626803143000145332 search 0 0 0
1626803143000145332 write
1626803143000145432 search 0 0 0
1626803143000145432 write
1626803143000145732 search 0 0 0
1626803143000145732 write
1626803143000145632 search 0 0 0
1626803143000145632 write
如果能从如上的返回中看到 queue 值和 rejected 值比较高,就说明该节点的读写处理能力快到瓶颈了,此时应该结合 CPU 使用率来综合评估。以我们的经验来看,读写拒绝通常是由于 CPU 使用率高引起:CPU 使用率高会导致节点读写请求处理不过来,从而导致查询或 bulk 队列被打满而出现拒绝。而读写熔断通常是由于 JVM 使用率高引起。因此这里面需要针对不同的指标来进行分析。
4、查看节点热线程
GET /_nodes/hot_threads
GET /_nodes/<node_id>/hot_threads
返回 Response:
::: {1626803143000145832}{vT4YRHWdRweoouLn2fGu0g}{Pq2mklvJTvCMPlhN7OY_KQ}{xx.0.96.24}{xx.0.96.24:9300}{lmr}{ml.machine_memory=16478932992, rack=cvm_4_200003, xpack.installed=true, set=200003, transform.node=false, ip=9.20.59.20, temperature=hot, ml.max_open_jobs=20, region=4}
Hot threads at 2021-12-19T02:21:12.334Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:
::: {1626803143000145532}{CaQnhaYpQw6vbabGwKaPTw}{X_yKVAz9RHCOUwnMEXKLvg}{xx.0.96.13}{xx.0.96.13:9300}{cdhilrstw}{ml.machine_memory=50299387904, rack=cvm_4_200003, xpack.installed=true, set=200003, transform.node=true, ip=9.20.57.70, temperature=hot, ml.max_open_jobs=20, region=4}
Hot threads at 2021-12-19T02:21:12.335Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:
::: {1626803143000145332}{yvtpqeypTke6aFxxwYSjjA}{ZQkJVy7zQGOOY5_hAP3z_w}{xx.0.96.20}{xx.0.96.20:9300}{cdhilrstw}{ml.machine_memory=50299125760, rack=cvm_4_200003, xpack.installed=true, set=200003, transform.node=true, ip=9.20.53.190, temperature=hot, ml.max_open_jobs=20, region=4}
Hot threads at 2021-12-19T02:21:12.337Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:
::: {1626803143000145432}{hz6BqoupSuOUuWykrX5c2g}{VoANc_CJQWylc2rxT6NJTg}{xx.0.96.9}{xx.0.96.9:9300}{cdhilrstw}{ml.machine_memory=50299387904, rack=cvm_4_200003, xpack.installed=true, set=200003, transform.node=true, ip=9.20.56.176, temperature=hot, ml.max_open_jobs=20, region=4}
Hot threads at 2021-12-19T02:21:12.334Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:
::: {1626803143000145732}{AexOqq25T7SRf6tzMYzO1Q}{t21NHAcyQwiJKEtV85Okzg}{xx.0.96.49}{xx.0.96.49:9300}{lmr}{ml.machine_memory=16478932992, rack=cvm_4_200003, xpack.installed=true, set=200003, transform.node=false, ip=9.20.58.203, temperature=hot, ml.max_open_jobs=20, region=4}
Hot threads at 2021-12-19T02:21:12.335Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:
::: {1626803143000145632}{fdaOV16OQPq6-AUVihmq4A}{pdyApXIKQbivHe57M5UCIA}{xx.0.96.22}{xx.0.96.22:9300}{lmr}{ml.machine_memory=16478932992, rack=cvm_4_200003, xpack.installed=true, set=200003, transform.node=false, ip=9.20.58.221, temperature=hot, ml.max_open_jobs=20, region=4}
Hot threads at 2021-12-19T02:21:12.335Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:
二、节点常用命令总结
来源:https://cloud.tencent.com/developer/article/1921434
相关文章