obclient -uroot@sys -h127.0.0.1 -P2881
1 新建一个 oracle/mysql 租户,租户规格自定(说明:本实验与租户模式无关),本文以 oracle
租户示例:
新建资源规格:
create resource unit u1_ora max_cpu=1,min_cpu=1,max_memory=‘2G’,min_memory=‘2G’,max_iops=128,max_disk_size=‘10G’,max_session_num=100;
新建资源池:
create resource pool pool_ora unit=‘u1_ora’,unit_num=1,zone_list=(‘zone1’);
新建租户:
CREATE TENANT IF NOT EXISTS ob_pay charset=‘utf8mb4’, replica_num=1,zone_list=(‘zone1’), resource_pool_list=(‘pool_ora’) SET ob_tcp_invited_nodes=’%’,ob_compatibility_mode=‘oracle’;
登录租户新建 tpcc 用户
obclient -usys@ob_pay -h127.0.0.1 -P2881
create user tpcc identified by obce_test;
grant dba to tpcc;
步骤 1 定义一张表,包括主键,建议至少两条记录。
obclient -utpcc@ob_pay -h127.0.0.1 -P2883 -p (一定要通过obproxy连接)
create table t1 (id number,name varchar2(12),primary key(id));
insert into t1 values(1,‘A1’);
insert into t1 values(2,‘B1’);
commit;
select * from t1;
步骤 2 调大业务租户变量 ob_query_timeout 、ob_trx_timeout、ob_trx_idle_timeout,比如
到 10 分钟。
obclient -usys@ob_pay -h127.0.0.1 -P2881
set global ob_query_timeout = 3600000000;
set global ob_trx_timeout = 3600000000;
set global ob_trx_idle_timeout = 3600000000;
步骤 3 模拟一个死锁的场景,打开两个新的会话。 #第一个会话执行如下操作
begin;
update t1 set name=‘C1’ where id = 1;
#第二个会话执行如下操作
begin;
update t1 set name=‘D2’ where id = 2;
update t1 set name=‘C2’ where id = 1;
#第一个会话执行如下操作
update t1 set name=‘D1’ where id = 2;
步骤 4 登录第三个会话,登录 sys 租户,根据访问的对象查询事务状态,锁源头和锁等待。
select table_name, table_id from __all_virtual_table where table_name=‘T1’;
±-----------±-----------------+
| table_name | table_id |
±-----------±-----------------+
| T1 | 1100611139453777 |
±-----------±-----------------+
1 row in set (0.011 sec)
根据 table_id 查看进行中的会话
select * from __all_virtual_trans_stat where partition
like ‘%1100611139453777%’ and now()-ctx_create_time>100\G
查看交易持有锁的信息
select * from __all_virtual_trans_lock_stat where table_id = 1100611139453777 \G
*************************** 1. row ***************************
tenant_id: 1001
trans_id: {hash:17418920681233242343, inc:169830, addr:“127.0.0.1:2882”, t:1681438900661003}
svr_ip: 127.0.0.1
svr_port: 2882
partition: {tid:1100611139453777, partition_id:0, part_cnt:0}
table_id: 1100611139453777
rowkey: table_id=1100611139453777 rowkey_object=[{“DECIMAL”:“1”}]
session_id: 3221487944
proxy_id: “172.16.22.145:2883”
ctx_create_time: 2023-04-14 10:21:40.661003
expired_time: 2023-04-14 11:21:40.661367
row_lock_addr: 139736568798328
*************************** 2. row ***************************
tenant_id: 1001
trans_id: {hash:1676114662546596750, inc:170835, addr:“127.0.0.1:2882”, t:1681438906809695}
svr_ip: 127.0.0.1
svr_port: 2882
partition: {tid:1100611139453777, partition_id:0, part_cnt:0}
table_id: 1100611139453777
rowkey: table_id=1100611139453777 rowkey_object=[{“DECIMAL”:“2”}]
session_id: 3221487947
proxy_id: “172.16.22.145:2883”
ctx_create_time: 2023-04-14 10:21:46.809695
expired_time: 2023-04-14 11:21:46.810183
row_lock_addr: 139736568859440
2 rows in set (0.012 sec)
查看锁等待信息
select * from __all_virtual_lock_wait_stat where table_id = 1100611139453777 \G
*************************** 1. row ***************************
svr_ip: 127.0.0.1
svr_port: 2882
table_id: 1100611139453777
rowkey: table_id=1100611139453777 rowkey_object=[{“NUMBER”:“2”}]
addr: 139739249609280
need_wait: 1
recv_ts: 1681438916332033
lock_ts: 1681438932150248
abs_timeout: 1681442516232033
try_lock_times: 3
time_after_recv: 17743447
session_id: 3221487944
block_session_id: 3221487944
type: 0
lock_mode: 0
total_update_cnt: 3
*************************** 2. row ***************************
svr_ip: 127.0.0.1
svr_port: 2882
table_id: 1100611139453777
rowkey: table_id=1100611139453777 rowkey_object=[{“NUMBER”:“1”}]
addr: 139739306249024
need_wait: 1
recv_ts: 1681438907101823
lock_ts: 1681438932149836
abs_timeout: 1681442507001823
try_lock_times: 4
time_after_recv: 26973704
session_id: 3221487947
block_session_id: 3221487947
type: 0
lock_mode: 0
total_update_cnt: 3
2 rows in set (0.002 sec)
查看死锁发生事件(此时无法显示死锁信息)
select * from __all_virtual_deadlock_event_history \G
根据查询到的 session_id,杀掉会话 2 最后一条 update
kill query 3221487947; #需要在 tpcc 的用户下执行操作
观察会话 1 和会话 2 的执行情况:会话 1 中仍处于锁等待状态,会话 2 中最后一条 update 已经被杀掉
对会话 2 执行 rollback,可以看到会话 1 的 SQL 执行结束
此时再次查看__all_virtual_trans_stat 、_all_virtual_trans_lock_stat、_all_virtual_lock_wait_stat 几张表,观察其中的变化
会话 2 交易已经结束,仅剩下会话 1 的交易在__all_virtual_trans_stat 表中可以查询到
根据 table_id 查看进行中的会话
select * from __all_virtual_trans_stat where partition
like ‘%1100611139453777%’ and now()-ctx_create_time>100\G
查看交易持有锁的信息
select * from __all_virtual_trans_lock_stat where table_id = 1100611139453777 \G
查看锁等待信息
select * from __all_virtual_lock_wait_stat where table_id = 1100611139453777 \G
会话 1 现在同时拥有 id=1 和 id=2 的两行锁
现在没有锁等待的情况发生
步骤 7 对会话 1 执行 rollback 操作
步骤 8 从 sql_audit 视图中查询刚刚发生的交易
select usec_to_time(request_time), svr_ip, sid, transaction_hash,user_name, query_sql, ret_code, plan_type, elapsed_time , retry_cnt from gv$sql_audit where tenant_id = 1001 and query_sql like ‘update t1%’ order by request_time desc limit 4;
现在只有 3 条 sql 语句,被 kill 的语句在 sql_audit 中没有记录,其中 retry_cnt 表示中锁等待中重试的次数
通过日志追查被杀掉 sql 的等待原因
从 obproxy_error.log 日志中查询会话 2 中被 kill 的语句
/root/obproxy/log/obproxy_error.log
grep “update t1 set name=‘C2’ where id = 1” obproxy_error.log
2023-04-14 10:24:52.215854,undefined,obce_test:ob_pay:TPCC,OB_ORACLE,t1,t1,COM_QUERY,UPDATE,failed,600,update t1 set name=‘C2’ where id = 1,185113993us,32us,0us,185113783us,Y0-7F010BB54DD0,0,127.0.0.1:2881,ORA-00600: internal error code, arguments: -5065, Query execution was interrupted
看到错误代码-5065:Query execution was interrupted
根据日志中获取 trace_id(Y0-7F010BB54DD0)以及 sql 时间戳(2023-04-14 10:24:52.215854)到obproxy.log 中获取 server_ip 和 server trace_id
grep Y0-7F3266B5DDD0 obproxy.log | grep ‘2023-04-14 10:24:52’
[2023-04-14 10:24:52.215660] WARN [PROXY.SM] print_mysql_complete_log (ob_mysql_sm.cpp:4418) [2407][Y0-7F010BB54DD0] [lt=31] [dc=0] finishing mysql tunnel((sm_id=13, cs_id=6, proxy_sessid=12398434586365984784, ss_id=16, server_sessid=3221487947, client_ip={127.0.0.1:45696}, server_ip={127.0.0.1:2881}, server_trace_id=YB427F000001-0005F9422A9857C3-0-0, proxy_user_name=tpcc@ob_pay#obce_test, database_name=TPCC, is_flow_controlled=false, cpu_flow_control_count=0, memory_flow_control_count=0, sql=update t1 set name=‘C2’ where id = 1, sql_cmd=“Query”, result={is_trans_completed:false, is_resp_completed:true, ending_type:2, is_partition_hit:true, has_new_sys_var:false, has_proxy_idc_name_user_var:false, is_server_db_reset:false, reserved_len:0, connection_id:0, scramble_buf:"", is_resultset_resp:false, server_capabilities_lower_.capability:0, ok_packet_action_type:2, last_ok_pkt_len:75, rewritten_last_ok_pkt_len:0, extra_info:{is_exist_sess_info:false, sess_info:""}, error_pkt:field_count:255, errcode:600, sqlstate:“70100”, message:“ORA-00600: internal error code, arguments: -5065, Query execution was interrupted”})
[2023-04-14 10:24:52.215741] WARN [PROXY.SM] update_cmd_stats (ob_mysql_sm.cpp:8263) [2407][Y0-7F010BB54DD0] [lt=47] [dc=0] Slow Query: ((client_ip={127.0.0.1:45696}, server_ip={127.0.0.1:2881}, obproxy_client_port={127.0.0.1:19384}, server_trace_id=YB427F000001-0005F9422A9857C3-0-0, route_type=ROUTE_TYPE_MAX, user_name=tpcc, tenant_name=ob_pay, cluster_name=obce_test, logic_database_name=, logic_tenant_name=, ob_proxy_protocol=0, cs_id=6, proxy_sessid=12398434586365984784, ss_id=16, server_sessid=3221487947, sm_id=13, cmd_size_stats={client_request_bytes:41, server_request_bytes:41, server_response_bytes:94, client_response_bytes:94}, cmd_time_stats={client_transaction_idle_time_us=290996, client_request_read_time_us=19, client_request_analyze_time_us=18, cluster_resource_create_time_us=0, pl_lookup_time_us=0, pl_process_time_us=0, congestion_control_time_us=0, congestion_process_time_us=0, do_observer_open_time_us=4, server_connect_time_us=0, server_sync_session_variable_time_us=0, server_send_saved_login_time_us=0, server_send_use_database_time_us=0, server_send_session_variable_time_us=0, server_send_all_session_variable_time_us=0, server_send_last_insert_id_time_us=0, server_send_start_trans_time_us=0, build_server_request_time_us=1, plugin_compress_request_time_us=0, prepare_send_request_to_server_time_us=32, server_request_write_time_us=22, server_process_request_time_us=185113783, server_response_read_time_us=3, plugin_decompress_response_time_us=0, server_response_analyze_time_us=12, ok_packet_trim_time_us=10, client_response_write_time_us=123, request_total_time_us=185113993}, sql=update t1 set name=‘C2’ where id = 1)
[2023-04-14 10:24:52.215866] INFO [PROXY] ob_tenant_stat_manager.cpp:62 [2407][Y0-7F010BB54DD0] [lt=4] [dc=0] succ to create tenant stat item(item={logic_tenant_name:“undefined”, logic_database_name:"", cluster_name:“obce_test”, tenant_name:“ob_pay”, database_name:“TPCC”, database_type:2, stmt_type:27, error_code:“600”, idle_period_count:0, stat_value:{value:[[0]0, [1]0, [2]0, [3]0, [4]0, [5]0, [6]0]}}, key={logic_tenant_hash:13937442267662686820, logic_database_hash:0, cluster_hash:7037252716020286560, tenant_hash:2325757893161853000, database_hash:14897564302911242005, stmt_type:27, error_code_hash:10879343988660408079})
server_trace_id=YB427F000001-0005F9422A9857C3-0-0
YB427F000001-0006049B31BA694D-0-0
根据以上获得信息去对应的 observer 日志中查询(示例如下):
cd /root/observer/log
grep YB427F000001-0005F9422A9857C3-0-0 observer.log.20230414103554 | vim -
rowkey_object=[{“NUMBER”:“1”}]
pkey={tid:1100611139453777, partition_id:0, part_cnt:0} trans_id={hash:17418920681233242343, inc:169830, addr:“127.0.0.1:2882”, t:1681438900661003}
从日志中看到发生了锁冲突 冲突发生在 table_id=1100611139453777 的表,主键 rowkey_object=[{“NUMBER”:“1”}] 锁的 holder 是 trans_id= hash:17418920681233242343, inc:169830,addr:"127.0.0.1:2882
根据锁 holder 信息查询 observer 日志
grep 17418920681233242343 observer.log.20230414103554| grep -v YB427F000001-0005F9422A9857C3-0-0 | vim - 从日志中看到发生了锁冲突
rowkey_object=[{“NUMBER”:“2”}
trans_id={hash:1676114662546596750, inc:170835, addr:“127.0.0.1:2882”, t:1681438906809695}
grep 5695719313326504729 observer.log | grep -v YB427F000001-0006049B31BA694D-0-0 | vim -
冲突发生在 table_id=1100611139453777 的 表 (与前面是同一张表) ,主键rowkey_object=[{“NUMBER”:“2”}]
锁的 holder 是 trans_id= hash:1676114662546596750, inc:170835,addr:“127.0.0.1:2882”,
9721306618834339968
通过两个日志对比可以发现,两个交易互相等锁,产生了死锁。
步骤 10 打开 OceanBase 死锁检查功能,观察之前场景变化
默认情况下死锁检查关闭,可以通过以下命令手动开启,或者通过 OCP 开启
obclient -uroot@sys -h127.0.0.1 -P2883 -p
alter system set _lcl_op_interval = ‘20ms’;
查看参数状态
select * from __all_virtual_sys_parameter_stat where name=’_lcl_op_interval’\G
再次执行步骤 3,模拟死锁的场景
重复步骤 4 中查看死锁发生事件
select * from __all_virtual_deadlock_event_history \G
******************* 1. row ***************************
tenant_id: 1001
event_id: 4756374708667105128
svr_ip: 127.0.0.1
svr_port: 2882
detector_id: 1
report_time: 1681440791141896
cycle_idx: 1
cycle_size: 2
role: victim
priority_level: normal
priority: 18445062632923592629
create_time: 1681440786493586
start_delay: 0
module: transaction
visitor: {hash:4352965127008317200, inc:361352, addr:“127.0.0.1:2882”, t:1681440785958986}
resource: table_id=1100611139453777 rowkey_object=[{“NUMBER”:“1”}]
extra_name1: current sql
extra_value1: update t1 set name=‘C2’ where id = 1
extra_name2:
extra_value2:
extra_name3:
extra_value3:
*************************** 2. row ***************************
tenant_id: 1001
event_id: 4756374708667105128
svr_ip: 127.0.0.1
svr_port: 2882
detector_id: 2
report_time: 1681440791141896
cycle_idx: 2
cycle_size: 2
role: witness
priority_level: normal
priority: 18445062632928614632
create_time: 1681440790479613
start_delay: 0
module: transaction
visitor: {hash:9345934103940005124, inc:361199, addr:“127.0.0.1:2882”, t:1681440780936983}
resource: table_id=1100611139453777 rowkey_object=[{“NUMBER”:“2”}]
extra_name1: current sql
extra_value1: update t1 set name=‘D1’ where id = 2
extra_name2:
extra_value2:
extra_name3:
extra_value3:
2 rows in set (0.010 sec)
通过以上信息看到系统发生死锁并做处理:role = victim,表示该事务陷入死锁等待 sql 语句被系统杀掉。