使用场景说明:
1、使用obd部署的ob集群且没有接管到ocp。
2、有空闲的机器可以用来替换故障节点。
1、当前OB集群的拓扑
MySQL [oceanbase]> select * from dba_ob_servers order by zone;
+----------------+----------+----+-------+----------+-----------------+--------+----------------------------+-----------+-----------------------+----------------------------+----------------------------+--------------------------------------------------------------------------+-------------------+
| SVR_IP | SVR_PORT | ID | ZONE | SQL_PORT | WITH_ROOTSERVER | STATUS | START_SERVICE_TIME | STOP_TIME | BLOCK_MIGRATE_IN_TIME | CREATE_TIME | MODIFY_TIME | BUILD_VERSION | LAST_OFFLINE_TIME |
+----------------+----------+----+-------+----------+-----------------+--------+----------------------------+-----------+-----------------------+----------------------------+----------------------------+--------------------------------------------------------------------------+-------------------+
| 11.xxx.xxx.191 | 12882 | 1 | zone1 | 12881 | YES | ACTIVE | 2024-07-31 08:14:53.119695 | NULL | NULL | 2024-07-24 10:50:42.478616 | 2024-07-31 08:14:55.115175 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
| 11.xxx.xxx.191 | 22882 | 4 | zone1 | 22881 | NO | ACTIVE | 2024-07-31 08:14:07.606076 | NULL | NULL | 2024-07-24 10:51:01.744417 | 2024-07-31 08:14:10.278981 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
| 11.xxx.xxx.191 | 32882 | 2 | zone2 | 32881 | NO | ACTIVE | 2024-08-05 10:06:37.915423 | NULL | NULL | 2024-07-24 10:50:42.523764 | 2024-08-05 10:06:39.509455 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
| 11.xxx.xxx.192 | 42882 | 5 | zone2 | 42881 | NO | ACTIVE | 2024-07-31 08:14:03.193183 | NULL | NULL | 2024-07-24 10:51:01.792832 | 2024-07-31 08:14:06.075483 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
| 11.xxx.xxx.192 | 52882 | 3 | zone3 | 52881 | NO | ACTIVE | 2024-07-31 08:14:57.594518 | NULL | NULL | 2024-07-24 10:50:42.562099 | 2024-07-31 08:14:59.182698 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
| 11.xxx.xxx.192 | 62882 | 6 | zone3 | 62881 | NO | ACTIVE | 2024-07-31 08:14:03.155596 | NULL | NULL | 2024-07-24 10:51:01.849129 | 2024-07-31 08:14:06.104469 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
+----------------+----------+----+-------+----------+-----------------+--------+----------------------------+-----------+-----------------------+----------------------------+----------------------------+--------------------------------------------------------------------------+-------------------+
6 rows in set (0.01 sec)
2、当前obd的配置
user:
username: heshun.lxd
# password: xxx
key_file: /home/heshun.lxd/.ssh/id_rsa
oceanbase-ce:
package_hash: f3cd399aa780d23fbb60faed68e32a7dbd4e6a3c
servers:
- name: server1
ip: 11.xxx.xxx.191
- name: server2
ip: 11.xxx.xxx.191
- name: server3
ip: 11.xxx.xxx.191
- name: server4
ip: 11.xxx.xxx.192
- name: server5
ip: 11.xxx.xxx.192
- name: server6
ip: 11.xxx.xxx.192
global:
cluster_id: 1
memory_limit: 12G
system_memory: 2G
datafile_size: 10G
log_disk_size: 36G
cpu_count: 8
production_mode: false
enable_syslog_wf: false
enable_syslog_recycle: true
max_syslog_file_count: 30
appname: obcluster
root_password: AAaa11__
proxyro_password: AAaa11__
server1:
obshell_port: 15331
mysql_port: 12881
rpc_port: 12882
home_path: /home/heshun.lxd/observer1
data_dir: /obdata/data/data1
redo_dir: /obdata/log/log1
zone: zone1
server2:
obshell_port: 25331
mysql_port: 22881
rpc_port: 22882
home_path: /home/heshun.lxd/observer2
data_dir: /obdata/data/data2
redo_dir: /obdata/log/log2
zone: zone1
server3:
obshell_port: 35331
mysql_port: 32881
rpc_port: 32882
home_path: /home/heshun.lxd/observer3
data_dir: /obdata/data/data3
redo_dir: /obdata/log/log3
zone: zone2
server4:
obshell_port: 45331
mysql_port: 42881
rpc_port: 42882
home_path: /home/heshun.lxd/observer4
data_dir: /obdata/data/data4
redo_dir: /obdata/log/log4
zone: zone2
server5:
obshell_port: 55331
mysql_port: 52881
rpc_port: 52882
home_path: /home/heshun.lxd/observer5
data_dir: /obdata/data/data5
redo_dir: /obdata/log/log5
zone: zone3
server6:
obshell_port: 65331
mysql_port: 62881
rpc_port: 62882
home_path: /home/heshun.lxd/observer6
data_dir: /obdata/data/data6
redo_dir: /obdata/log/log6
zone: zone3
obproxy-ce:
depends:
- oceanbase-ce
servers:
- 11.xxx.xxx.191
global:
listen_port: 2883
prometheus_listen_port: 2884
home_path: /home/heshun.lxd/obproxy
enable_cluster_checkout: false
skip_proxy_sys_private_check: true
enable_strict_kernel_release: false
obproxy_sys_password: xxx
observer_sys_password: xxx
3、模拟故障(kill 掉 server6 的 observer 和 obshell server 进程)
[heshun.lxd@sqaobnoxdn011162217192.sa128 /home/heshun.lxd]
$ps -ef | grep observer6 | grep -v grep
heshun.+ 34803 1 87 Jul31 ? 4-16:13:19 /home/heshun.lxd/observer6/bin/observer -r 11.162.217.191:12882:12881;11.162.217.191:32882:32881;11.162.217.192:52882:52881 -p 62881 -P 62882 -z zone3 -n obcluster -c 1 -d /obdata/data/data6 -I 11.162.217.192 -o __min_full_resource_pool_memory=2147483648,memory_limit=12G,system_memory=2G,datafile_size=10G,log_disk_size=36G,cpu_count=8,enable_syslog_wf=False,enable_syslog_recycle=True,max_syslog_file_count=30
heshun.+ 36803 1 0 Jul31 ? 00:00:50 /home/heshun.lxd/observer6/bin/obshell daemon --ip 11.162.217.192 --port 65331
heshun.+ 36852 36803 0 Jul31 ? 00:12:20 /home/heshun.lxd/observer6/bin/obshell server --ip 11.162.217.192 --port 65331
[heshun.lxd@sqaobnoxdn011162217192.sa128 /home/heshun.lxd]
$
[heshun.lxd@sqaobnoxdn011162217192.sa128 /home/heshun.lxd]
$kill -9 36852 34803
MySQL [oceanbase]> select * from dba_ob_servers
where svr_ip='11.xxx.xxx.192' and svr_port=62882 \G
*************************** 1. row ***************************
SVR_IP: 11.xxx.xxx.192
SVR_PORT: 62882
ID: 6
ZONE: zone3
SQL_PORT: 62881
WITH_ROOTSERVER: NO
STATUS: INACTIVE
START_SERVICE_TIME: NULL
STOP_TIME: NULL
BLOCK_MIGRATE_IN_TIME: NULL
CREATE_TIME: 2024-07-24 10:51:01.849129
MODIFY_TIME: 2024-08-05 16:27:58.108298
BUILD_VERSION: 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03)
LAST_OFFLINE_TIME: 2024-08-05 16:27:58.097271
1 row in set (0.02 sec)
4、准备添加新节点的配置文件
在故障节点所在的 zone 上添加新节点的配置文件 add_zone3_server7.yaml
oceanbase-ce:
servers:
- name: server7
ip: 11.xxx.xxx.192
server7:
obshell_port: 33333
mysql_port: 33331
rpc_port: 33332
home_path: /home/heshun.lxd/observer7
data_dir: /obdata/data/data7
redo_dir: /obdata/log/log7
zone: zone3
5、关闭sys租户的enable_rebalance
alter system set enable_rebalance='false' tenant='sys';
6、修改obd已有的配置文件
之所以需要这一步修改,原因是:obd scale_out 在扩容前会做检查,要求当前所有节点都处于 running 状态,否则会报如下错误:
obd cluster scale_out ob431 -c add_zone3_server7.yaml -v
- Deploy status inconsistent
[ERROR] OBD-1005: Some of the servers in the cluster have been stopped
[ERROR] server status error: server6(11.xxx.xxx.192) oceanbase-ce is not STATUS_RUNNING
[ERROR] Some of the servers in the cluster is not running
obd cluster list   # 获取对应的 deploy_name
cd ~/.obd/cluster/${deploy_name}/
cp inner_config.yaml inner_config.yaml_bak
# 修改前:
$_deploy_install_mode: ln
oceanbase-ce:
server1: {}
server2: {}
server3: {}
server4: {}
server5: {}
server6: {}
obproxy-ce:
11.xxx.xxx.191: {}
# 修改后:
$_deploy_install_mode: ln
oceanbase-ce:
server1: {}
server2: {}
server3: {}
server4: {}
server5: {}
obproxy-ce:
11.xxx.xxx.191: {}
cp config.yaml config.yaml_bak
# 修改后(已删除 servers 列表中的 server6 以及 server6 的配置段):
user:
username: heshun.lxd
# password: xxx
key_file: /home/heshun.lxd/.ssh/id_rsa
oceanbase-ce:
package_hash: f3cd399aa780d23fbb60faed68e32a7dbd4e6a3c
servers:
- name: server1
ip: 11.xxx.xxx.191
- name: server2
ip: 11.xxx.xxx.191
- name: server3
ip: 11.xxx.xxx.191
- name: server4
ip: 11.xxx.xxx.192
- name: server5
ip: 11.xxx.xxx.192
global:
cluster_id: 1
memory_limit: 12G
system_memory: 2G
datafile_size: 10G
log_disk_size: 36G
cpu_count: 8
production_mode: false
enable_syslog_wf: false
enable_syslog_recycle: true
max_syslog_file_count: 30
appname: obcluster
root_password: AAaa11__
proxyro_password: AAaa11__
server1:
obshell_port: 15331
mysql_port: 12881
rpc_port: 12882
home_path: /home/heshun.lxd/observer1
data_dir: /obdata/data/data1
redo_dir: /obdata/log/log1
zone: zone1
server2:
obshell_port: 25331
mysql_port: 22881
rpc_port: 22882
home_path: /home/heshun.lxd/observer2
data_dir: /obdata/data/data2
redo_dir: /obdata/log/log2
zone: zone1
server3:
obshell_port: 35331
mysql_port: 32881
rpc_port: 32882
home_path: /home/heshun.lxd/observer3
data_dir: /obdata/data/data3
redo_dir: /obdata/log/log3
zone: zone2
server4:
obshell_port: 45331
mysql_port: 42881
rpc_port: 42882
home_path: /home/heshun.lxd/observer4
data_dir: /obdata/data/data4
redo_dir: /obdata/log/log4
zone: zone2
server5:
obshell_port: 55331
mysql_port: 52881
rpc_port: 52882
home_path: /home/heshun.lxd/observer5
data_dir: /obdata/data/data5
redo_dir: /obdata/log/log5
zone: zone3
obproxy-ce:
depends:
- oceanbase-ce
servers:
- 11.xxx.xxx.191
global:
listen_port: 2883
prometheus_listen_port: 2884
home_path: /home/heshun.lxd/obproxy
enable_cluster_checkout: false
skip_proxy_sys_private_check: true
enable_strict_kernel_release: false
obproxy_sys_password: xxx
observer_sys_password: xxx
7、obd scale_out 扩容新节点
obd cluster scale_out ob431 -c add_zone3_server7.yaml -v
确认扩容的新增节点状态
MySQL [oceanbase]> select * from dba_ob_servers
where svr_ip='11.xxx.xxx.192' and svr_port=33332 \G
*************************** 1. row ***************************
SVR_IP: 11.xxx.xxx.192
SVR_PORT: 33332
ID: 7
ZONE: zone3
SQL_PORT: 33331
WITH_ROOTSERVER: NO
STATUS: ACTIVE
START_SERVICE_TIME: 2024-08-05 17:23:09.258153
STOP_TIME: NULL
BLOCK_MIGRATE_IN_TIME: NULL
CREATE_TIME: 2024-08-05 17:22:59.938657
MODIFY_TIME: 2024-08-05 17:23:11.080485
BUILD_VERSION: 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03)
LAST_OFFLINE_TIME: NULL
1 row in set (0.00 sec)
从 obshell 的日志里可以看到,它仍然会去连接故障的节点。
目前obshell未适配该场景,后续迭代中会优化支持。
8、迁移unit
8.1 确认故障节点上的unit
MySQL [oceanbase]> select * from dba_ob_units
where svr_ip='11.xxx.xxx.192' and svr_port=62882 \G
*************************** 1. row ***************************
UNIT_ID: 1021
TENANT_ID: 1002
STATUS: ACTIVE
RESOURCE_POOL_ID: 1001
UNIT_GROUP_ID: 1008
CREATE_TIME: 2024-08-01 10:35:14.374258
MODIFY_TIME: 2024-08-01 10:35:14.374258
ZONE: zone3
SVR_IP: 11.xxx.xxx.192
SVR_PORT: 62882
MIGRATE_FROM_SVR_IP: NULL
MIGRATE_FROM_SVR_PORT: NULL
MANUAL_MIGRATE: NULL
UNIT_CONFIG_ID: 1001
MAX_CPU: 4
MIN_CPU: 4
MEMORY_SIZE: 12884901888
LOG_DISK_SIZE: 25769803776
MAX_IOPS: 10000
MIN_IOPS: 10000
IOPS_WEIGHT: 0
1 row in set (0.00 sec)
8.2 执行unit迁移
# alter system migrate unit ${unit_id} destination '${new_svr_ip}:${new_svr_port}';
alter system migrate unit 1021 destination '11.xxx.xxx.192:33332';
8.3 查看unit 迁移进度
select * from dba_ob_units where MIGRATE_FROM_SVR_IP<>"";
SELECT * FROM oceanbase.DBA_OB_UNIT_JOBS WHERE JOB_TYPE = 'MIGRATE_UNIT';
# 以上两个查询的结果都为空,说明 unit 迁移已经完成。
9、删除故障节点
MySQL [oceanbase]> select * from dba_ob_servers order by zone;
+----------------+----------+----+-------+----------+-----------------+----------+----------------------------+-----------+-----------------------+----------------------------+----------------------------+--------------------------------------------------------------------------+----------------------------+
| SVR_IP | SVR_PORT | ID | ZONE | SQL_PORT | WITH_ROOTSERVER | STATUS | START_SERVICE_TIME | STOP_TIME | BLOCK_MIGRATE_IN_TIME | CREATE_TIME | MODIFY_TIME | BUILD_VERSION | LAST_OFFLINE_TIME |
+----------------+----------+----+-------+----------+-----------------+----------+----------------------------+-----------+-----------------------+----------------------------+----------------------------+--------------------------------------------------------------------------+----------------------------+
| 11.xxx.xxx.191 | 12882 | 1 | zone1 | 12881 | YES | ACTIVE | 2024-07-31 08:14:53.119695 | NULL | NULL | 2024-07-24 10:50:42.478616 | 2024-07-31 08:14:55.115175 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
| 11.xxx.xxx.191 | 22882 | 4 | zone1 | 22881 | NO | ACTIVE | 2024-07-31 08:14:07.606076 | NULL | NULL | 2024-07-24 10:51:01.744417 | 2024-07-31 08:14:10.278981 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
| 11.xxx.xxx.191 | 32882 | 2 | zone2 | 32881 | NO | ACTIVE | 2024-08-05 10:06:37.915423 | NULL | NULL | 2024-07-24 10:50:42.523764 | 2024-08-05 10:06:39.509455 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
| 11.xxx.xxx.192 | 42882 | 5 | zone2 | 42881 | NO | ACTIVE | 2024-07-31 08:14:03.193183 | NULL | NULL | 2024-07-24 10:51:01.792832 | 2024-07-31 08:14:06.075483 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
| 11.xxx.xxx.192 | 33332 | 7 | zone3 | 33331 | NO | ACTIVE | 2024-08-05 17:23:09.258153 | NULL | NULL | 2024-08-05 17:22:59.938657 | 2024-08-05 17:23:11.080485 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
| 11.xxx.xxx.192 | 52882 | 3 | zone3 | 52881 | NO | ACTIVE | 2024-07-31 08:14:57.594518 | NULL | NULL | 2024-07-24 10:50:42.562099 | 2024-07-31 08:14:59.182698 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
| 11.xxx.xxx.192 | 62882 | 6 | zone3 | 62881 | NO | INACTIVE | NULL | NULL | NULL | 2024-07-24 10:51:01.849129 | 2024-08-05 16:27:58.108298 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | 2024-08-05 16:27:58.097271 |
+----------------+----------+----+-------+----------+-----------------+----------+----------------------------+-----------+-----------------------+----------------------------+----------------------------+--------------------------------------------------------------------------+----------------------------+
7 rows in set (0.02 sec)
MySQL [oceanbase]> alter system delete server '11.xxx.xxx.192:62882' zone='zone3';
Query OK, 0 rows affected (0.13 sec)
## dba_ob_servers 里已经没有故障节点的这条记录,说明 delete server 执行完成了
MySQL [oceanbase]> select * from dba_ob_servers order by zone;
+----------------+----------+----+-------+----------+-----------------+--------+----------------------------+-----------+-----------------------+----------------------------+----------------------------+--------------------------------------------------------------------------+-------------------+
| SVR_IP | SVR_PORT | ID | ZONE | SQL_PORT | WITH_ROOTSERVER | STATUS | START_SERVICE_TIME | STOP_TIME | BLOCK_MIGRATE_IN_TIME | CREATE_TIME | MODIFY_TIME | BUILD_VERSION | LAST_OFFLINE_TIME |
+----------------+----------+----+-------+----------+-----------------+--------+----------------------------+-----------+-----------------------+----------------------------+----------------------------+--------------------------------------------------------------------------+-------------------+
| 11.xxx.xxx.191 | 12882 | 1 | zone1 | 12881 | YES | ACTIVE | 2024-07-31 08:14:53.119695 | NULL | NULL | 2024-07-24 10:50:42.478616 | 2024-07-31 08:14:55.115175 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
| 11.xxx.xxx.191 | 22882 | 4 | zone1 | 22881 | NO | ACTIVE | 2024-07-31 08:14:07.606076 | NULL | NULL | 2024-07-24 10:51:01.744417 | 2024-07-31 08:14:10.278981 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
| 11.xxx.xxx.191 | 32882 | 2 | zone2 | 32881 | NO | ACTIVE | 2024-08-05 10:06:37.915423 | NULL | NULL | 2024-07-24 10:50:42.523764 | 2024-08-05 10:06:39.509455 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
| 11.xxx.xxx.192 | 42882 | 5 | zone2 | 42881 | NO | ACTIVE | 2024-07-31 08:14:03.193183 | NULL | NULL | 2024-07-24 10:51:01.792832 | 2024-07-31 08:14:06.075483 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
| 11.xxx.xxx.192 | 33332 | 7 | zone3 | 33331 | NO | ACTIVE | 2024-08-05 17:23:09.258153 | NULL | NULL | 2024-08-05 17:22:59.938657 | 2024-08-05 17:23:11.080485 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
| 11.xxx.xxx.192 | 52882 | 3 | zone3 | 52881 | NO | ACTIVE | 2024-07-31 08:14:57.594518 | NULL | NULL | 2024-07-24 10:50:42.562099 | 2024-07-31 08:14:59.182698 | 4.3.1.0_1-bad90e897a7f0f56b8ce5c43e186aa8f4bab03c4(Jun 28 2024 23:14:03) | NULL |
+----------------+----------+----+-------+----------+-----------------+--------+----------------------------+-----------+-----------------------+----------------------------+----------------------------+--------------------------------------------------------------------------+-------------------+
6 rows in set (0.01 sec)
10、开启sys租户的enable_rebalance
alter system set enable_rebalance='true' tenant='sys';