【 使用环境 】生产环境
【 OB or 其他组件 】
【 使用版本 】4.2.1.7
【问题描述】三节点部署,关闭其中一个observer,java项目就无法再连接集群
正常是可以的,你把连接数据库的日志也抛出来啊,你这只打印个成功失败看不出啥具体问题
你是不是observer停掉了之后新建的表
我把表删了重建几次。发现有时候是两副本,有时候是三副本。现在三个节点都是正常的。
可以多测几次,出现两副本的时候把trace_id捞出来然后看看有没有什么报错,大概就是下面的步骤
create table xxxx;
select last_trace_id();
grep trace_id observer.log //三台机器上多去搜搜
没捞到。有没有什么语句可以查看sql执行的日志
我上面提到的trace_id是每条SQL执行时的追踪标识,用它可以在日志里搜到这条SQL的执行记录,应该有才对。随便执行个语句试一下,能捞到一些信息吗?
[admin@10-10-10-59 log]$ grep YB420A92023B-000620411DE480DD-0-0 observer.log
[2024-08-22 16:05:02.669960] WDIAG resolve_table_option (ob_ddl_resolver.cpp:1823) [23148][T1004_L0_G0][T1004][YB420A92023B-000620411DE480DD-0-0] [lt=11][errcode=-5259] Unknown storage engine 'InnoDB'
[2024-08-22 16:05:02.669980] WDIAG [SQL.RESV] resolve_table_option (ob_ddl_resolver.cpp:1824) [23148][T1004_L0_G0][T1004][YB420A92023B-000620411DE480DD-0-0] [lt=14][errcode=0] unknown engine(engine_name=InnoDB, ret=0)
[2024-08-22 16:05:02.695776] INFO [STORAGE.TRANS] submit_multi_data_source_ (ob_trans_part_ctx.cpp:6471) [23149][T1004_L0_G0][T1004][YB420A92023B-000620411DE480DD-0-0] [lt=11] submit MDS redo with barrier or base_scn successfully(ret=0, trans_id_={txid:47001114}, ls_id_={id:1002}, log_cb={ObTxBaseLogCb:{base_ts:{val:0, v:0}, log_ts:{val:1724313902322356917, v:0}, lsn:{lsn:916375757}, submit_ts:1724313902695773}, this:0x2ba9e7a8ccf0, is_inited_:true, trans_id:{txid:47001114}, ls_id:{id:1002}, ctx:0x2ba9e7a8a450, tx_data_guard:{tx_data:NULL}, is_callbacked_:false, is_dynamic_:false, mds_range_:{range_array_.count():1, range_array_:[{register_no_:1, has_submitted_:true, has_synced_:false, type:3, data_.length():569}]}, cb_arg_array_:[{log_type:0x4, arg:null}], first_part_scn_:{val:18446744073709551615, v:3}}, mds_cache_={unsubmitted_size_:0, mds_list_.size():1, max_register_no_:1}, exec_info_.multi_data_source_=[], mds_base_scn={val:0, v:0}, barrier_type=2)
[2024-08-22 16:05:02.695830] INFO [MDS.EVENT] before_prepare_ (mds_node.ipp:174) [23149][T1004_L0_G0][T1004][YB420A92023B-000620411DE480DD-0-0] [lt=37] BEFORE_PREPARE(key={tenant_id:1004, ls_id:{id:1002}, tablet_id:{id:200019}}, event={alloc:null, timestamp:"2024-08-22 16:05:02.695829", event:"BEFORE_PREPARE", info_str:"{tablet_status:{val:1, str:"NORMAL"}, transfer_scn:{val:18446744073709551615, v:3}, transfer_ls_id:{id:-1}, data_type:1, create_commit_scn:{val:18446744073709551615, v:3}, create_commit_version:-1, delete_commit_scn:{val:18446744073709551615, v:3}, delete_commit_version:-1, transfer_out_commit_version:-1}", unit_id:0, key_str:"Dummy", writer_type:1, writer_id:47001114, seq_no:0, redo_scn:{val:4611686018427387903, v:0}, end_scn:{val:4611686018427387903, v:0}, trans_version:{val:0, v:0}, node_type:1, state:1})
[2024-08-22 16:05:02.695855] INFO [STORAGE.TRANS] notify_data_source_ (ob_trans_part_ctx.cpp:6606) [23149][T1004_L0_G0][T1004][YB420A92023B-000620411DE480DD-0-0] [lt=24] notify MDS(ret=0, trans_id_={txid:47001114}, ls_id_={id:1002}, notify_type=2, log_ts={val:18446744073709551615, v:3}, notify_array.count()=1, notify_array=[{register_no_:1, has_submitted_:true, has_synced_:false, type:3, data_.length():569}], total_time=39)
[2024-08-22 16:05:02.695880] INFO [RPC] process (ob_batch_processor.cpp:106) [23149][T1004_L0_G0][T1004][YB420A92023B-000620411DE480DD-0-0] [lt=9] batch rpc statistics(clog_batch_nodelay_cnt=0, clog_batch_cnt=0, trx_batch_cnt=1, sql_batch_cnt=0)
[2024-08-22 16:05:02.699737] WDIAG [SHARE.SCHEMA] try_check_parallel_ddl_schema_in_sync (ob_schema_utils.cpp:536) [23148][T1004_L0_G0][T1004][YB420A92023B-000620411DE480DD-0-0] [lt=9][errcode=0] schema version not sync(tenant_id=1004, consensus_timeout=30000000, refreshed_schema_version=1724313881502480, consensus_schema_version=1724313857150264, schema_version=1724313902682800)
[2024-08-22 16:05:02.740189] INFO [SQL.ENG] execute (ob_table_executor.cpp:581) [23148][T1004_L0_G0][T1004][YB420A92023B-000620411DE480DD-0-0] [lt=15] [parallel_create_table](ret=0, ret="OB_SUCCESS", cost=70121, execute_time=29672, wait_schema=40449, table_name="mytest")
[2024-08-22 16:05:02.740246] INFO [SHARE] add_event (ob_event_history_table_operator.h:290) [23148][T1004_L0_G0][T1004][YB420A92023B-000620411DE480DD-0-0] [lt=16] event table add task(ret=0, event_table_name="__all_server_event_history", sql=INSERT INTO __all_server_event_history (gmt_create, module, event, name1, value1, name2, value2, name3, value3, name4, value4, svr_ip, svr_port) VALUES (usec_to_time(1724313902740208), 'sql', 'execute_cmd', 'cmd_type', 20, 'sql_text', X'', 'return_code', 0, 'tenant_id', 1004, '10.10.10.59', 2882))
[2024-08-22 16:05:05.615138] INFO [MDS] try_gc_mds_table (mds_table_handler.cpp:128) [22972][T1004_Occam][T1004][YB420A92023B-0006204110D481C9-0-0] [lt=19] [GC]success to gc mds_table(ret=0, ret="OB_SUCCESS", valid_node_cnt=0, *this={mds_table_handle:{p_mds_table_base:{ctrl_ptr:{ref:1, p_data_block:{this:0x2baa78a08170, ls_id:{id:1002}, tablet_id:{id:200019}, flushing_scn:{val:18446744073709551615, v:3}, rec_scn:{val:4611686018427387903, v:0}, last_inner_recycled_scn:{val:1724313902884148280, v:0}, total_node_cnt:0, construct_sequence:5, debug_info:{do_init_tablet_pointer:0x2ba9fdc14d38, do_remove_tablet_pointer:null, init_ts:"2024-08-22 16:05:02.693469", last_reset_ts:"1970-01-01 08:00:00.0", remove_ts:"1970-01-01 08:00:00.0", last_flush_ts:"2024-08-22 16:05:02.994537", switch_to_empty_shell_ts:"1970-01-01 08:00:00.0", init_trace_id:YB420A92023B-000620411DE480DD-0-0, remove_trace_id:Y0-0000000000000000-0-0}}}}, mds_table_id:1}})
[2024-08-22 16:05:05.615163] INFO [MDS] unregister_from_mds_table_mgr (mds_table_mgr.cpp:125) [22972][T1004_Occam][T1004][YB420A92023B-0006204110D481C9-0-0] [lt=22] unregister success(ret=0, ret="OB_SUCCESS", p_mds_table={this:0x2baa78a08170, ls_id:{id:1002}, tablet_id:{id:200019}, flushing_scn:{val:18446744073709551615, v:3}, rec_scn:{val:4611686018427387903, v:0}, last_inner_recycled_scn:{val:1724313902884148280, v:0}, total_node_cnt:0, construct_sequence:5, debug_info:{do_init_tablet_pointer:0x2ba9fdc14d38, do_remove_tablet_pointer:null, init_ts:"2024-08-22 16:05:02.693469", last_reset_ts:"1970-01-01 08:00:00.0", remove_ts:"1970-01-01 08:00:00.0", last_flush_ts:"2024-08-22 16:05:02.994537", switch_to_empty_shell_ts:"1970-01-01 08:00:00.0", init_trace_id:YB420A92023B-000620411DE480DD-0-0, remove_trace_id:Y0-0000000000000000-0-0}})
[2024-08-22 16:05:05.615178] INFO [MDS] ~MdsTableImpl (mds_table_impl.ipp:79) [22972][T1004_Occam][T1004][YB420A92023B-0006204110D481C9-0-0] [lt=14] mds table destructed(*this={this:0x2baa78a08170, ls_id:{id:1002}, tablet_id:{id:200019}, flushing_scn:{val:18446744073709551615, v:3}, rec_scn:{val:4611686018427387903, v:0}, last_inner_recycled_scn:{val:1724313902884148280, v:0}, total_node_cnt:0, construct_sequence:5, debug_info:{do_init_tablet_pointer:0x2ba9fdc14d38, do_remove_tablet_pointer:null, init_ts:"2024-08-22 16:05:02.693469", last_reset_ts:"1970-01-01 08:00:00.0", remove_ts:"1970-01-01 08:00:00.0", last_flush_ts:"2024-08-22 16:05:02.994537", switch_to_empty_shell_ts:"1970-01-01 08:00:00.0", init_trace_id:YB420A92023B-000620411DE480DD-0-0, remove_trace_id:Y0-0000000000000000-0-0}})
SET ob_enable_show_trace='ON';
注意:需要在同一个会话中执行
obclient [test]> create table --创建表语句
obclient [test]> select last_trace_id();
obclient [test]> select * from oceanbase.gv$ob_sql_audit where trace_id='YB420BA1CC68-000615A0A8EA6511-0-0';
[root@x.x.x.x ~]$ grep "YB420BA1CC68-000615A0A8EA6511-0-0" rootservice.log
[root@x.x.x.x ~]$ grep "YB420BA1CC68-000615A0A8EA6511-0-0" observer.log
查看 test 库下 t1 表分布
SELECT * FROM oceanbase.DBA_OB_TABLE_LOCATIONS
WHERE DATABASE_NAME='test' and TABLE_NAME='t1' and TABLE_TYPE='USER TABLE';
查看 test 库下 t1 表分布
SELECT * FROM oceanbase.DBA_OB_TABLE_LOCATIONS
WHERE DATABASE_NAME='test' and TABLE_NAME='t1' and TABLE_TYPE='USER TABLE';
这个查询 也执行一下 改一下 库名和表名
得找只创建出两副本的那次建表的trace,而且三台机器上的日志都要找一下
obclient [test]> SELECT * FROM oceanbase.DBA_OB_TABLE_LOCATIONS
    -> WHERE DATABASE_NAME='test' and TABLE_NAME='mytest';
+---------------+------------+----------+------------+----------------+-------------------+------------+---------------+-----------+-------+-------+-------------+----------+----------+--------------+-----------------+-----------+-----------------+---------------+----------+
| DATABASE_NAME | TABLE_NAME | TABLE_ID | TABLE_TYPE | PARTITION_NAME | SUBPARTITION_NAME | INDEX_NAME | DATA_TABLE_ID | TABLET_ID | LS_ID | ZONE | SVR_IP | SVR_PORT | ROLE | REPLICA_TYPE | DUPLICATE_SCOPE | OBJECT_ID | TABLEGROUP_NAME | TABLEGROUP_ID | SHARDING |
+---------------+------------+----------+------------+----------------+-------------------+------------+---------------+-----------+-------+-------+-------------+----------+----------+--------------+-----------------+-----------+-----------------+---------------+----------+
| test | mytest | 500027 | USER TABLE | NULL | NULL | NULL | NULL | 200027 | 1002 | zone1 | 10.10.10.59 | 2882 | LEADER | FULL | NONE | 500027 | NULL | NULL | NULL |
| test | mytest | 500027 | USER TABLE | NULL | NULL | NULL | NULL | 200027 | 1002 | zone3 | 10.10.10.61 | 2882 | FOLLOWER | FULL | NONE | 500027 | NULL | NULL | NULL |
+---------------+------------+----------+------------+----------------+-------------------+------------+---------------+-----------+-------+-------+-------------+----------+----------+--------------+-----------------+-----------+-----------------+---------------+----------+
这个就是两副本的trace,只能在执行sql的机器上找到trace记录,别的机器找不到。而且使用2881端口连接丢失副本的节点是能找到mytest表的,数据也是正确的。但是好像没有用。
下面这个语句也显示的是在三个节点执行了建表。
obclient [test]> show create table mytest;
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Table | Create Table |
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| mytest | CREATE TABLE `mytest` (
  `flag` varchar(10) DEFAULT NULL,
  `creatime` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP
) DEFAULT CHARSET = utf8mb4 ROW_FORMAT = DYNAMIC COMPRESSION = 'zstd_1.3.8' REPLICA_NUM = 3 BLOCK_SIZE = 16384 USE_BLOOM_FILTER = FALSE TABLET_SIZE = 134217728 PCTFREE = 0 |
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
1 row in set (0.012 sec)
你这个确实是有点问题,建表语句里面是有REPLICA_NUM = 3,但是DBA_OB_TABLE_LOCATIONS只有两条数据。那个trace里确实看不到任何报错,搞不懂了。zone2的那台机器是正常的吗?