OceanBase SQL执行计划工作空间内存不足

问题现象

发现有些视图SQL获取不到执行计划~提示No memory or reach tenant memory limit
image

那么通过视图SQL拆解我们发现了一定规律,接下来将会进行讲述

翻查官方文档发现有一定参数调整

set global ob_sql_work_area_percentage = 10;

设置后发现~

image

情况一致

image

复现

环境

oceanbase 4.3.2 & 4.3.3 (皆进行试验过,都不行~)

obproxy 4.3.1

集群租户配置 12c 36g

表结构

CREATE TABLE `部门表` (
  `ID` bigint(255) NOT NULL,
  `上级ID` bigint(255) DEFAULT NULL,
  `编码` varchar(1024) COLLATE utf8mb4_bin DEFAULT NULL,
  `名称` varchar(1024) COLLATE utf8mb4_bin DEFAULT NULL,
  PRIMARY KEY (`ID`)
)  WITH COLUMN GROUP(all columns, each column);


CREATE TABLE `病案主页_T1` (
  `病人ID` bigint(255) NOT NULL,
  `主页ID` int(255) NOT NULL,
  `住院号` bigint(255) DEFAULT NULL,
  `出院科室ID` bigint(255) DEFAULT NULL,
  PRIMARY KEY (`病人ID`, `主页ID`)
) WITH COLUMN GROUP(all columns, each column);


CREATE TABLE `病人变动记录_T1` (
  `ID` bigint(255) NOT NULL,
  `病人ID` bigint(255) DEFAULT NULL,
  `主页ID` int(255) DEFAULT NULL,
  `开始时间` varchar(1024) COLLATE utf8mb4_bin DEFAULT NULL,
  `开始原因` int(255) DEFAULT NULL,
  PRIMARY KEY (`ID`)
) WITH COLUMN GROUP(all columns, each column);

样例数据

INSERT INTO `test`.`病案主页_T1` (`病人ID`, `主页ID`, `住院号`, `出院科室ID`) VALUES (1, 1, 666, 1);
INSERT INTO `test`.`病人变动记录_T1` (`ID`, `病人ID`, `主页ID`, `开始时间`, `开始原因`) VALUES (1, 666, 666, '2024-10-17 00:00:00', 1);
INSERT INTO `test`.`部门表` (`ID`, `上级ID`, `编码`, `名称`) VALUES (1, 2, '123', 'TEST');

执行SQL

explain
SELECT
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`))
FROM
	`ods_whfkyy_his_hisdbdg_zlhis`.`病案主页_T1` `a`,
	`ods_whfkyy_his_hisdbdg_zlhis`.`病人变动记录_T1` `c`
WHERE
	(`c`.`开始原因` IN (1, 3, 15))
	AND (`a`.`病人ID` = `c`.`病人ID`)
	AND (`a`.`主页ID` = `c`.`主页ID`)
LIMIT 0,200

结果

研究

验证1:删减同表子查询数

执行SQL

explain
SELECT
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`)),
	(
	SELECT
		`部门表`.`编码`
	FROM
		`ods_whfkyy_his_hisdbdg_zlhis`.`部门表`
	WHERE
		(`部门表`.`ID` = `a`.`出院科室ID`))
FROM
	`ods_whfkyy_his_hisdbdg_zlhis`.`病案主页_T1` `a`,
	`ods_whfkyy_his_hisdbdg_zlhis`.`病人变动记录_T1` `c`
WHERE
	(`c`.`开始原因` IN (1, 3, 15))
	AND (`a`.`病人ID` = `c`.`病人ID`)
	AND (`a`.`主页ID` = `c`.`主页ID`)
LIMIT 0,200

结果:发现explain十分快且正常返回

小结:根据SQL我们发现规律其实由于SQL里多次出现相同的表的子查询导致的~

验证2:删除关联主键

执行SQL:此处不赘述,是沿用复现的SQL进行操作的,仅删除来源的部门表中的主键字段

ALTER TABLE ods_whfkyy_his_hisdbdg_zlhis.部门表 DROP PRIMARY KEY;

结果:此时我们发现似乎验证1的结论被推翻了,同表子查询数很多也行!

小结:删除部门表主键后,意外可以~似乎是SQL引擎去检索索引的时候导致的执行计划生成异常

验证3:数据存储方式

执行SQL:此处就是将行存表变列存

ALTER table `ods_whfkyy_his_hisdbdg_zlhis`.`部门表` ADD COLUMN GROUP (each column);
ALTER TABLE  `ods_whfkyy_his_hisdbdg_zlhis`.`部门表` DROP COLUMN GROUP(all columns);

结果:在切换了行存、列存后发现,都有该问题

小结:于是可以排除是不同存储导致的执行计划生成异常,估摸着就是验证2的问题了

验证4:执行计划栈

发现在异常的SQL执行计划的栈里会一直报 SPF WILL USE BASIC METHOD

image

小弟不是很懂这个,大佬可以帮忙康康栈分析下

附上正常(无主键索引)

:paperclip:optimizer_trace_d9q8R2_trace_正常SQL栈.zip

附上异常(有主键索引)

:paperclip:optimizer_trace_mm4eD1_trace_t异常SQL栈.zip

结论

目前发现就是跟子查询表的主键索引有关,看看社区大佬们考虑是不是执行计划生成的陈年Bug~

1)设置trace信息
SET ob_enable_show_trace=‘ON’;

2)执行sql。

3)获取上个命令的trace
select last_trace_id();

4)获取trace对应的节点
select query_sql,svr_ip from gv$ob_sql_audit where trace_id=‘第三步获取的trace信息’;

5)取对应的svr_ip节点 过滤日志
grep “第三步获取的trace信息” observer.log*
grep “第三步获取的trace信息” rootservice.log*
6)提供日志信息即可。

麻烦提供一份跟踪tracleid ob日志,这边也联系下相关sql同学帮忙看一下