【 使用环境 】生产环境
【 OB or 其他组件 】ob
【 使用版本 】4.2.1.2
【问题描述】oceanbase 1-1-1 3节点集群中有一个节点下线,无法正常恢复;杀掉进程重启后,直连可访问,但是状态为INACTIVE,且该节点上并无相关租户资源
直连可以访问数据
租户资源只分布在另外两个节点上
【 使用环境 】生产环境
【 OB or 其他组件 】ob
【 使用版本 】4.2.1.2
【问题描述】oceanbase 1-1-1 3节点集群中有一个节点下线,无法正常恢复;杀掉进程重启后,直连可访问,但是状态为INACTIVE,且该节点上并无相关租户资源
下线时执行的是 ALTER SYSTEM DELETE SERVER 么?
恢复操作具体是什么,请详细描述一下。是执行 ADD SERVER 的时候失败了,然后重启了进程么?
下线不是主动下线的,还未查出是什么原因导致下线。恢复操作是直接重启进程
重启进程的方法是用obd么
是在服务器上执行cd /home/admin/oceanbase && ./bin/observer
看看最近的事件
select * from oceanbase.__all_rootservice_event_history
where module in ('server','root_service')
order by gmt_create desc limit 100;
GMT_CREATE | MODULE | EVENT | NAME1 | VALUE1 | NAME2 | VALUE2 | NAME3 | VALUE3 | NAME4 | VALUE4 | NAME5 | VALUE5 | NAME6 | VALUE6 | EXTRA_INFO | RS_SVR_IP | RS_SVR_PORT |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2025-01-15 04:41:50 | root_service | root_minor_freeze | ret | 0 | arg | {tenant_ids:[1002], server_list:[], zone:, tablet_id:{id:0}, ls_id:{id:-1}} | 172.64.159.112 | 12882 | |||||||||
2025-01-15 02:13:26 | root_service | finish_wait_stop | cost | 1140 | 172.64.159.113 | 12882 | |||||||||||
2025-01-15 02:13:26 | root_service | finish_stop_thread | ret | 0 | ret | OB_SUCCESS | self_addr | 172.64.159.113:12882 | 172.64.159.113 | 12882 | |||||||
2025-01-15 02:13:16 | root_service | admin_refresh_io_calibration | ret | 0 | 172.64.159.112 | 12882 | |||||||||||
2025-01-15 02:13:13 | root_service | admin_set_config | ret | 0 | arg | {items:[{name:rootservice_list, value:172.64.159.112:12882:12881;172.64.159.113:12882:12881, comment:, zone:, server:0.0.0.0:0, tenant_name:, exec_tenant_id:1, tenant_ids:[], want_to_set_tenant_config:false}], is_inner:false} | 172.64.159.112 | 12882 | |||||||||
2025-01-15 02:13:13 | root_service | admin_set_config | ret | 0 | arg | {items:[{name:rootservice_list, value:172.64.159.112:12882:12881;172.64.159.113:12882:12881, comment:, zone:, server:0.0.0.0:0, tenant_name:, exec_tenant_id:1, tenant_ids:[], want_to_set_tenant_config:false}], is_inner:false} | 172.64.159.112 | 12882 | |||||||||
2025-01-15 02:13:13 | root_service | full_rootservice | result | 0 | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||
2025-01-15 02:13:11 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.112 | 12882 | |||||||||
2025-01-15 02:13:09 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.112 | 12882 | |||||||||
2025-01-15 02:13:09 | root_service | finish_start_rootservice | result | 0 | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||
2025-01-15 02:13:09 | root_service | start_rootservice | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||||
2025-01-15 02:13:08 | root_service | stop_rootservice | self_addr | 172.64.159.113:12882 | 172.64.159.113 | 12882 | |||||||||||
2025-01-15 02:06:11 | root_service | admin_refresh_io_calibration | ret | 0 | 172.64.159.113 | 12882 | |||||||||||
2025-01-15 02:06:06 | root_service | full_rootservice | result | 0 | self_addr | 172.64.159.113:12882 | 172.64.159.113 | 12882 | |||||||||
2025-01-15 02:06:06 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.113 | 12882 | |||||||||
2025-01-15 02:06:06 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.113 | 12882 | |||||||||
2025-01-15 02:06:06 | root_service | finish_start_rootservice | result | 0 | self_addr | 172.64.159.113:12882 | 172.64.159.113 | 12882 | |||||||||
2025-01-15 02:06:06 | root_service | start_rootservice | self_addr | 172.64.159.113:12882 | 172.64.159.113 | 12882 | |||||||||||
2025-01-15 02:06:05 | root_service | finish_wait_stop | cost | 991 | 172.64.159.113 | 12882 | |||||||||||
2025-01-15 02:06:05 | root_service | finish_stop_thread | ret | 0 | ret | OB_SUCCESS | self_addr | 172.64.159.113:12882 | 172.64.159.113 | 12882 | |||||||
2025-01-15 02:06:03 | root_service | stop_rootservice | self_addr | 172.64.159.113:12882 | 172.64.159.113 | 12882 | |||||||||||
2025-01-15 02:00:12 | root_service | admin_refresh_io_calibration | ret | 0 | 172.64.159.113 | 12882 | |||||||||||
2025-01-15 02:00:07 | root_service | full_rootservice | result | 0 | self_addr | 172.64.159.113:12882 | 172.64.159.113 | 12882 | |||||||||
2025-01-15 02:00:07 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.113 | 12882 | |||||||||
2025-01-15 02:00:07 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.113 | 12882 | |||||||||
2025-01-15 02:00:07 | root_service | finish_start_rootservice | result | 0 | self_addr | 172.64.159.113:12882 | 172.64.159.113 | 12882 | |||||||||
2025-01-15 02:00:07 | root_service | start_rootservice | self_addr | 172.64.159.113:12882 | 172.64.159.113 | 12882 | |||||||||||
2025-01-15 02:00:07 | root_service | finish_wait_stop | cost | 1085 | 172.64.159.113 | 12882 | |||||||||||
2025-01-15 02:00:07 | root_service | finish_stop_thread | ret | 0 | ret | OB_SUCCESS | self_addr | 172.64.159.113:12882 | 172.64.159.113 | 12882 | |||||||
2025-01-15 02:00:06 | root_service | root_major_freeze | tenant_id | 1 | ret | 0 | new_frozen_scn | 1736877606453491799 | 172.64.159.113 | 12882 | |||||||
2025-01-15 02:00:02 | root_service | root_major_freeze | tenant_id | 1002 | ret | 0 | new_frozen_scn | 1736877602479224587 | 172.64.159.114 | 12882 | |||||||
2025-01-15 02:00:01 | root_service | root_major_freeze | tenant_id | 1001 | ret | 0 | new_frozen_scn | 1736877601549560191 | 172.64.159.114 | 12882 | |||||||
2025-01-15 02:00:00 | root_service | stop_rootservice | self_addr | 172.64.159.113:12882 | 172.64.159.113 | 12882 | |||||||||||
2025-01-15 01:59:58 | root_service | finish_wait_stop | cost | 899 | 172.64.159.112 | 12882 | |||||||||||
2025-01-15 01:59:58 | root_service | finish_stop_thread | ret | 0 | ret | OB_SUCCESS | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||
2025-01-15 01:59:50 | root_service | admin_refresh_io_calibration | ret | 0 | 172.64.159.113 | 12882 | |||||||||||
2025-01-15 01:59:47 | root_service | admin_set_config | ret | 0 | arg | {items:[{name:rootservice_list, value:172.64.159.113:12882:12881, comment:, zone:, server:0.0.0.0:0, tenant_name:, exec_tenant_id:1, tenant_ids:[], want_to_set_tenant_config:false}], is_inner:false} | 172.64.159.113 | 12882 | |||||||||
2025-01-15 01:59:47 | root_service | admin_set_config | ret | 0 | arg | {items:[{name:rootservice_list, value:172.64.159.113:12882:12881, comment:, zone:, server:0.0.0.0:0, tenant_name:, exec_tenant_id:1, tenant_ids:[], want_to_set_tenant_config:false}], is_inner:false} | 172.64.159.113 | 12882 | |||||||||
2025-01-15 01:59:47 | root_service | full_rootservice | result | 0 | self_addr | 172.64.159.113:12882 | 172.64.159.113 | 12882 | |||||||||
2025-01-15 01:59:45 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.113 | 12882 | |||||||||
2025-01-15 01:59:43 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.113 | 12882 | |||||||||
2025-01-15 01:59:43 | root_service | finish_start_rootservice | result | 0 | self_addr | 172.64.159.113:12882 | 172.64.159.113 | 12882 | |||||||||
2025-01-15 01:59:43 | root_service | start_rootservice | self_addr | 172.64.159.113:12882 | 172.64.159.113 | 12882 | |||||||||||
2025-01-15 01:59:42 | root_service | stop_rootservice | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||||
2025-01-15 01:25:23 | root_service | admin_refresh_io_calibration | ret | 0 | 172.64.159.112 | 12882 | |||||||||||
2025-01-15 01:25:18 | root_service | full_rootservice | result | 0 | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||
2025-01-15 01:25:18 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.112 | 12882 | |||||||||
2025-01-15 01:25:18 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.112 | 12882 | |||||||||
2025-01-15 01:25:18 | root_service | finish_start_rootservice | result | 0 | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||
2025-01-15 01:25:18 | root_service | start_rootservice | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||||
2025-01-15 01:25:18 | root_service | finish_wait_stop | cost | 1220 | 172.64.159.112 | 12882 | |||||||||||
2025-01-15 01:25:18 | root_service | finish_stop_thread | ret | 0 | ret | OB_SUCCESS | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||
2025-01-15 01:25:09 | root_service | stop_rootservice | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||||
2025-01-15 00:06:50 | root_service | full_rootservice | result | 0 | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||
2025-01-15 00:06:48 | root_service | admin_refresh_io_calibration | ret | 0 | 172.64.159.112 | 12882 | |||||||||||
2025-01-15 00:06:43 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.112 | 12882 | |||||||||
2025-01-15 00:06:43 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.112 | 12882 | |||||||||
2025-01-15 00:06:43 | root_service | finish_start_rootservice | result | 0 | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||
2025-01-15 00:06:43 | root_service | start_rootservice | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||||
2025-01-15 00:06:43 | root_service | finish_wait_stop | cost | 1122 | 172.64.159.112 | 12882 | |||||||||||
2025-01-15 00:06:43 | root_service | finish_stop_thread | ret | 0 | ret | OB_SUCCESS | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||
2025-01-15 00:06:32 | root_service | stop_rootservice | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||||
2025-01-14 19:56:34 | root_service | root_minor_freeze | ret | 0 | arg | {tenant_ids:[1002], server_list:[], zone:, tablet_id:{id:0}, ls_id:{id:-1}} | 172.64.159.112 | 12882 | |||||||||
2025-01-14 19:40:55 | root_service | admin_refresh_io_calibration | ret | 0 | 172.64.159.112 | 12882 | |||||||||||
2025-01-14 19:40:51 | root_service | full_rootservice | result | 0 | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||
2025-01-14 19:40:50 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.112 | 12882 | |||||||||
2025-01-14 19:40:50 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.112 | 12882 | |||||||||
2025-01-14 19:40:50 | root_service | finish_start_rootservice | result | 0 | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||
2025-01-14 19:40:50 | root_service | start_rootservice | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||||
2025-01-14 19:40:50 | root_service | finish_wait_stop | cost | 907 | 172.64.159.112 | 12882 | |||||||||||
2025-01-14 19:40:50 | root_service | finish_stop_thread | ret | 0 | ret | OB_SUCCESS | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||
2025-01-14 19:40:34 | root_service | stop_rootservice | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||||
2025-01-14 19:40:29 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.112 | 12882 | |||||||||
2025-01-14 19:40:29 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.112 | 12882 | |||||||||
2025-01-14 19:40:29 | root_service | finish_start_rootservice | result | 0 | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||
2025-01-14 19:40:29 | root_service | start_rootservice | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||||
2025-01-14 19:40:29 | root_service | finish_wait_stop | cost | 1432 | 172.64.159.112 | 12882 | |||||||||||
2025-01-14 19:40:29 | root_service | finish_stop_thread | ret | 0 | ret | OB_SUCCESS | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||
2025-01-14 19:40:16 | root_service | stop_rootservice | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||||
2025-01-14 19:11:16 | root_service | admin_refresh_io_calibration | ret | 0 | 172.64.159.112 | 12882 | |||||||||||
2025-01-14 19:11:12 | root_service | full_rootservice | result | 0 | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||
2025-01-14 19:11:11 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.112 | 12882 | |||||||||
2025-01-14 19:11:11 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.112 | 12882 | |||||||||
2025-01-14 19:11:09 | root_service | finish_start_rootservice | result | 0 | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||
2025-01-14 19:11:09 | root_service | start_rootservice | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||||
2025-01-14 19:11:05 | root_service | finish_wait_stop | cost | 999 | 172.64.159.112 | 12882 | |||||||||||
2025-01-14 19:11:05 | root_service | finish_stop_thread | ret | 0 | ret | OB_SUCCESS | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||
2025-01-14 19:10:53 | root_service | stop_rootservice | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||||
2025-01-14 18:31:43 | root_service | admin_refresh_io_calibration | ret | 0 | 172.64.159.112 | 12882 | |||||||||||
2025-01-14 18:31:38 | root_service | full_rootservice | result | 0 | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||
2025-01-14 18:31:38 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.112 | 12882 | |||||||||
2025-01-14 18:31:38 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.112 | 12882 | |||||||||
2025-01-14 18:31:38 | root_service | finish_start_rootservice | result | 0 | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||
2025-01-14 18:31:38 | root_service | start_rootservice | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||||
2025-01-14 18:31:38 | root_service | finish_wait_stop | cost | 1112 | 172.64.159.112 | 12882 | |||||||||||
2025-01-14 18:31:38 | root_service | finish_stop_thread | ret | 0 | ret | OB_SUCCESS | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||
2025-01-14 18:31:22 | root_service | stop_rootservice | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||||
2025-01-14 18:09:42 | root_service | admin_refresh_io_calibration | ret | 0 | 172.64.159.112 | 12882 | |||||||||||
2025-01-14 18:09:39 | root_service | full_rootservice | result | 0 | self_addr | 172.64.159.112:12882 | 172.64.159.112 | 12882 | |||||||||
2025-01-14 18:09:37 | server | load_servers | ret | 0 | has_build | 1 | 172.64.159.112 | 12882 |
下线节点observer.log日志有这个报错
[2025-01-15 14:40:02.219487] WDIAG [SERVER] runTimerTask (ob_server.cpp:3214) [78568][ServerGTimer][T0][Y0-0000000000000000-0-0] [lt=9][errcode=-4000] ObRefreshNetworkSpeedTask reload bandwidth throttle limit failed(ret=-4000, ret="OB_ERROR")
[2025-01-15 14:40:02.451290] ERROR issue_dba_error (ob_log.cpp:1868) [78905][T1_HBService][T1][Y3252AC409F70-00062B91E17DB570-0-0] [lt=28][errcode=-4388] Unexpected internal error happen, please checkout the internal errcode(errcode=-4631, file="ob_heartbeat_service.cpp", line_no=836, info="[HEARTBEAT_SERVICE] server's zone does not match")
observer.zip (26.3 MB)
4631 报错表示服务器的集群信息不匹配。
即 [HEARTBEAT_SERVICE] 心跳服务检测时,发现某个服务器的 Zone 信息与集群中其他服务器的 Zone 信息不匹配(对应日志中的 "server's zone does not match")。这个异常服务器当前还是 inactive 状态么?异常节点是 113 么?
下线节点是112,但是112可以直连且可以访问数据
inactive 状态是112
查一下 __all_virtual_core_meta_table 这张表,看看是否存在数据,以及其中的 IP 是 112 还是 113。这边先看下日志。
帮忙再看一下 112节点的clog和sstable存储空间是否有core文件
执行下select * from __all_zone;
TENANT_ID | LS_ID | SVR_IP | SVR_PORT | SQL_PORT | ROLE | MEMBER_LIST | PROPOSAL_ID | REPLICA_TYPE | REPLICA_STATUS | RESTORE_STATUS | MEMSTORE_PERCENT | UNIT_ID | ZONE | PAXOS_REPLICA_NUMBER | DATA_SIZE | REQUIRED_SIZE | LEARNER_LIST | REBUILD |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1 | 172.64.159.112 | 12882 | 12881 | 1 | 172.64.159.112:12882:1733975167560729,172.64.159.113:12882:1 | 49124 | 0 | NORMAL | 0 | 100 | 3 | zone1 | 3 | 0 | 0 | 0 | |
1 | 1 | 172.64.159.113 | 12882 | 12881 | 2 | 172.64.159.112:12882:1733975167560729,172.64.159.113:12882:1 | 0 | 0 | NORMAL | 0 | 100 | 2 | zone1 | 3 | 0 | 0 | 0 |
GMT_CREATE | GMT_MODIFIED | ZONE | NAME | VALUE | INFO |
---|---|---|---|---|---|
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | cluster | 0 | obcluster | |
2024-03-08 20:44:17 | 2025-01-15 02:13:13 | config_version | 1736878393354742 | ||
2024-03-08 20:44:17 | 2025-01-15 02:13:13 | lease_info_version | 1736878393368335 | ||
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | privilege_version | 0 | ||
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | storage_format_version | 4 | ||
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | time_zone_info_version | 0 | ||
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone1 | idc | 0 | |
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone1 | recovery_status | 0 | NORMAL |
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone1 | region | 0 | sys_region |
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone1 | status | 2 | ACTIVE |
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone1 | storage_type | 0 | LOCAL |
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone1 | zone_type | 0 | ReadWrite |
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone2 | idc | 0 | |
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone2 | recovery_status | 0 | NORMAL |
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone2 | region | 0 | sys_region |
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone2 | status | 2 | ACTIVE |
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone2 | storage_type | 0 | LOCAL |
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone2 | zone_type | 0 | ReadWrite |
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone3 | idc | 0 | |
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone3 | recovery_status | 0 | NORMAL |
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone3 | region | 0 | sys_region |
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone3 | status | 2 | ACTIVE |
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone3 | storage_type | 0 | LOCAL |
2024-03-08 20:44:17 | 2024-03-08 20:44:17 | zone3 | zone_type | 0 | ReadWrite |
# | SVR_IP | SVR_PORT | STATUS | START_SERVICE_TIME | ZONE |
---|---|---|---|---|---|
1 | 172.64.159.112 | 12882 | INACTIVE | | zone3 |
2 | 172.64.159.113 | 12882 | ACTIVE | 2024-08-15 15:00:35 | zone2 |
3 | 172.64.159.114 | 12882 | ACTIVE | 2024-12-12 11:45:26 | zone1 |