具体ceph状态如下
[root@ceph-01 ~]# ceph -s
cluster:
id: c8ae7537-8693-40df-8943-733f82049642
health: HEALTH_WARN
1 daemons have recently crashed
告警内容如下
Dashboard告警如下
但是实际上mgr节点已经恢复了,告警却并没有自动解除
[root@ceph-01 ~]# ceph -s |grep mgr
mgr: ceph-03(active, since 5h), standbys: ceph-02, ceph-01
接下来查看告警信息
[root@ceph-01 ~]# ceph crash ls-new
ID ENTITY NEW
2022-11-30_21:01:42.289594Z_53c1ca9d-148f-4804-883d-2f62b3592be1 mgr.ceph-01 *
这里可以看到告警来自mgr.ceph-01这个daemon,但是上面已经看到它目前处于standby状态,说明进程已经恢复运行
我们可以查看下具体告警信息,命令格式如下(其中<ID>替换为ceph crash ls-new输出中的ID)
ceph crash info <ID>
[root@ceph-01 ~]# ceph crash info 2022-11-30_21:01:42.289594Z_53c1ca9d-148f-4804-883d-2f62b3592be1
{
"os_version_id": "7",
"assert_condition": "pending_service_map.epoch > service_map.epoch",
"utsname_release": "3.10.0-693.el7.x86_64",
"os_name": "CentOS Linux",
"entity_name": "mgr.ceph-01",
"assert_file": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/gigantic/release/14.2.22/rpm/el7/BUILD/ceph-14.2.22/src/mgr/DaemonServer.cc",
"timestamp": "2022-11-30 21:01:42.289594Z",
"process_name": "ceph-mgr",
"utsname_machine": "x86_64",
"assert_line": 2883,
"utsname_sysname": "Linux",
"os_version": "7 (Core)",
"os_id": "centos",
"assert_thread_name": "ms_dispatch",
"utsname_version": "#1 SMP Tue Aug 22 21:09:27 UTC 2017",
"backtrace": [
"(()+0xf630) [0x7f6dbb781630]",
"(gsignal()+0x37) [0x7f6dba574387]",
"(abort()+0x148) [0x7f6dba575a78]",
"(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x199) [0x7f6dbdea8436]",
"(()+0x25d5af) [0x7f6dbdea85af]",
"(DaemonServer::got_service_map()+0x8bd) [0x5574ba4dafbd]",
"(Mgr::handle_service_map(MServiceMap*)+0x138) [0x5574ba50a178]",
"(Mgr::ms_dispatch(Message*)+0x31b) [0x5574ba50c38b]",
"(MgrStandby::ms_dispatch(Message*)+0x97) [0x5574ba513ba7]",
"(Dispatcher::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x26) [0x5574ba4fe196]",
"(DispatchQueue::entry()+0x1699) [0x7f6dbe0cd699]",
"(DispatchQueue::DispatchThread::entry()+0xd) [0x7f6dbe17b64d]",
"(()+0x7ea5) [0x7f6dbb779ea5]",
"(clone()+0x6d) [0x7f6dba63cb0d]"
],
"utsname_hostname": "ceph-01",
"assert_msg": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/gigantic/release/14.2.22/rpm/el7/BUILD/ceph-14.2.22/src/mgr/DaemonServer.cc: In function 'DaemonServer::got_service_map()::<lambda(const ServiceMap&)>' thread 7f6db2dfc700 time 2022-12-01 05:01:42.287318\n/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/gigantic/release/14.2.22/rpm/el7/BUILD/ceph-14.2.22/src/mgr/DaemonServer.cc: 2883: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch)\n",
"crash_id": "2022-11-30_21:01:42.289594Z_53c1ca9d-148f-4804-883d-2f62b3592be1",
"assert_func": "DaemonServer::got_service_map()::<lambda(const ServiceMap&)>",
"ceph_version": "14.2.22"
}
如果确认该daemon已经恢复的话,我们可以手工归档(archive)这条crash记录来消除告警;归档后的记录不会被删除,仍然可以通过ceph crash ls查看
[root@ceph-01 ~]# ceph crash ls-new
ID ENTITY NEW
2022-11-30_21:01:42.289594Z_53c1ca9d-148f-4804-883d-2f62b3592be1 mgr.ceph-01 *
[root@ceph-01 ~]# ceph crash archive 2022-11-30_21:01:42.289594Z_53c1ca9d-148f-4804-883d-2f62b3592be1
[root@ceph-01 ~]#
[root@ceph-01 ~]# ceph -s
cluster:
id: c8ae7537-8693-40df-8943-733f82049642
health: HEALTH_OK
services:
mon: 3 daemons, quorum ceph-01,ceph-02,ceph-03 (age 3h)
mgr: ceph-03(active, since 5h), standbys: ceph-02, ceph-01
mds: cephfs-abcdocker:1 cephfs:1 i4tfs:1 {cephfs-abcdocker:0=ceph-02=up:active,cephfs:0=ceph-03=up:active,i4tfs:0=ceph-01=up:active}
osd: 4 osds: 4 up (since 14h), 4 in (since 3d)
rgw: 2 daemons active (ceph-01, ceph-02)
task status:
data:
pools: 19 pools, 880 pgs
objects: 9.21k objects, 33 GiB
usage: 102 GiB used, 78 GiB / 180 GiB avail
pgs: 880 active+clean
io:
client: 416 KiB/s wr, 0 op/s rd, 1 op/s wr