近两年ai的话题很多,最近也看了很多人分享通过ai去编程或者实现自己的想法,我抽时间也想写个关于OB的检查脚本,主要目的还是自用,现在整理了一部分,可能是我想法的30%,先拿出来跟大家分享一下,因为没有去生产场景验证,也希望有能用到的朋友给我反馈。
先说下这个检查部分是shell写的,是我一个脚本里的一部分,所以拿去用的时候,自己再补齐一些传参才可以,还有就是场景是对应4.x的,我自己测试的环境是425的版本。
因为帖子字数有限,展示效果我发在帖子下面
实际的模块脚本如下,需要的话自取,我后续完善之后可能会专门再发个文章。
log_dir="/home/admin/oceanbase/log"
node_ip=$(hostname -I | awk '{print $1}')
[ -z "$node_ip" ] && node_ip="127.0.0.1"
echo "=== 队列积压检查 ==="
# =============== 1. 检查日志(当前 + 最近5分钟) ===============
log_output=""
log_files=("$log_dir/observer.log")
for i in {0..5}; do
checkdate=$(date -d "now -$i minutes" +"%Y%m%d%H%M")
log_files+=("$log_dir/observer.log.$checkdate"* )
done
for logfile in "${log_files[@]}"; do
for f in $logfile; do
if [[ -f "$f" ]]; then
# 提取非零积压行,并格式化
while IFS= read -r line; do
if [[ $line =~ req_queue:total_size=[1-9] ]]; then
# 使用 awk 解析单行
parsed=$(echo "$line" | awk -v ip="$node_ip" '
{
gsub(/\[|\]/, "", $1); gsub(/\[|\]/, "", $2);
log_time = $1 " " $2;
tenant_str = $11;
gsub(/.*id:/, "", tenant_str);
gsub(/,.*/, "", tenant_str);
rest = "";
for(i=39; i<=NF; i++) rest = rest " " $i;
gsub(/ /, "", rest);
match(rest, /total_size=([0-9]+)/, t); total = (t[1] ? t[1] : "0");
q[0]=q[1]=q[2]=q[3]=q[4]="0";
for(j=0; j<=4; j++) {
pattern = "queue_" j "=([0-9]+)";
if(match(rest, pattern, a)) q[j] = a[1];
}
print log_time, ip, tenant_str, total, q[0], q[1], q[2], q[3], q[4];
}'
)
if [ -n "$parsed" ]; then
log_output+="$parsed"$'\n'
fi
fi
done < "$f"
fi
done
done
# =============== 2. 检查 SQL 表 ===============
sql_output=$(mysql -h"${obhost}" -P"${local_port}" -u"${sys_username}@sys" -p"${sys_password}" -c -Doceanbase -Nse "
SELECT
DATE_FORMAT(NOW(), '%Y-%m-%d %H:%i:%s') AS check_time,
svr_ip,
tenant_id,
req_queue_total_size,
queue_0, queue_1, queue_2, queue_3, queue_4
FROM __all_virtual_dump_tenant_info
WHERE req_queue_total_size > 0;
" 2>/dev/null)
# =============== 3. 输出结果 ===============
any_found=false
if [ -n "$log_output" ]; then
echo "【日志中发现队列积压】"
echo "检查时间 节点IP tenant_id 积压总量 queue_0 queue_1 queue_2 queue_3 queue_4"
echo "$log_output" | head -n 20 # 避免刷屏,可调整
any_found=true
fi
if [ -n "$sql_output" ]; then
echo ""
echo "【系统表中发现队列积压】"
echo "检查时间 节点IP tenant_id 积压总量 queue_0 queue_1 queue_2 queue_3 queue_4"
echo "$sql_output" | head -n 20
any_found=true
fi
if [ "$any_found" = false ]; then
echo "队列等待检查正常"
fi
echo "=== 主机资源使用检查 ==="
check_time=$(date '+%Y-%m-%d %H:%M:%S')
host_ip=$(hostname -I | awk '{print $1}')
[ -z "$host_ip" ] && host_ip="127.0.0.1"
# 获取总内存(字节)
total_mem_bytes=$(free -b | awk 'NR==2 {print $2}')
total_mem_gb=$(awk "BEGIN {printf \"%.2f\", $total_mem_bytes/1024/1024/1024}")
any_issue=false
# --- 1. CPU 使用率 >60% ---
cpu_high=false
cpu_line=$(top -bn1 | grep "Cpu(s)" | head -1)
if [ -n "$cpu_line" ]; then
cpu_user=$(echo "$cpu_line" | awk '{print $2}' | cut -d'%' -f1)
cpu_sys=$(echo "$cpu_line" | awk '{print $4}' | cut -d'%' -f1)
cpu_used=$(awk "BEGIN {printf \"%.1f\", $cpu_user + $cpu_sys}")
if (( $(echo "$cpu_used > 60" | bc -l) )); then
echo "【CPU 使用率过高】(>60%)"
printf "%-20s %-15s user=%s%%, sys=%s%%, 合计=%s%%\n" \
"$check_time" "$host_ip" "$cpu_user" "$cpu_sys" "$cpu_used"
any_issue=true
cpu_high=true
else
echo "【CPU 使用率】指标正常"
fi
else
echo "【CPU 使用率】指标正常(无法获取数据)"
fi
# --- 2. 内存使用率 (>80%) ---
mem_high=false
mem_info=$(free -b)
if [ -n "$mem_info" ]; then
total=$(echo "$mem_info" | awk 'NR==2 {print $2}')
used=$(echo "$mem_info" | awk 'NR==2 {print $3}')
buff_cache_bytes=$(echo "$mem_info" | awk 'NR==2 {print $6}')
if [ "$total" -gt 0 ]; then
total_gb=$(awk "BEGIN {printf \"%.2f\", $total/1024/1024/1024}")
used_gb=$(awk "BEGIN {printf \"%.2f\", $used/1024/1024/1024}")
mem_used_pct=$(awk "BEGIN {printf \"%.1f\", 100 * $used / $total}")
if (( $(echo "$mem_used_pct > 80" | bc -l) )); then
echo "【内存使用率过高】(>80%)"
printf "%-20s %-15s 总内存=%sGB, 已用=%sGB, 使用率=%.1f%%\n" \
"$check_time" "$host_ip" "$total_gb" "$used_gb" "$mem_used_pct"
any_issue=true
mem_high=true
else
echo "【内存使用率】指标正常"
fi
else
echo "【内存使用率】指标正常(total=0)"
fi
else
echo "【内存使用率】指标正常(无法获取数据)"
fi
# --- 3. buff/cache 占比 (>10%) ---
if [ -n "$mem_info" ] && [ "$total" -gt 0 ]; then
buff_cache_gb=$(awk "BEGIN {printf \"%.2f\", $buff_cache_bytes/1024/1024/1024}")
buff_cache_pct=$(awk "BEGIN {printf \"%.1f\", 100 * $buff_cache_bytes / $total}")
if (( $(echo "$buff_cache_pct > 10" | bc -l) )); then
echo "【buff/cache 占比过高】(>10%)"
printf "%-20s %-15s buff/cache=%sGB, 占总内存=%.1f%%\n" \
"$check_time" "$host_ip" "$buff_cache_gb" "$buff_cache_pct"
any_issue=true
else
echo "【buff/cache 占比】指标正常"
fi
else
echo "【buff/cache 占比】指标正常(无法获取数据)"
fi
# --- 4. Top 进程(仅在 CPU 或内存异常时显示)---
if [ "$cpu_high" = true ] || [ "$mem_high" = true ]; then
echo ""
echo "【资源占用 Top 进程】"
echo "PID COMMAND %CPU MEM(GB) %MEM"
{
ps -eo pid,comm,pcpu,pmem,rss --no-headers --sort=-pcpu | head -n 3
ps -eo pid,comm,pcpu,pmem,rss --no-headers --sort=-rss | head -n 3
} | sort -k1,1 -u | head -n 5 | while read pid comm pcpu pmem rss_kb; do
[ -z "$pid" ] && continue
rss_gb=$(awk "BEGIN {printf \"%.2f\", $rss_kb * 1024 / 1024/1024/1024}")
comm_short=$(echo "$comm" | cut -c1-15)
printf "%-8s %-15s %-6s %-8s %-5s\n" "$pid" "$comm_short" "$pcpu" "$rss_gb" "$pmem"
done
echo ""
fi
# --- 5. 磁盘 I/O 负载 >80% + 挂载目录 ---
io_high=false
if command -v iostat >/dev/null 2>&1 && command -v lsblk >/dev/null 2>&1; then
high_io_devices=$(iostat -x 1 2 -y 2>/dev/null | awk 'NR>2 && /^[a-z]/ && $NF+0 > 80 {print $1, $NF}')
if [ -n "$high_io_devices" ]; then
echo "【磁盘 I/O 负载过高】(%util >80%)"
echo "检查时间 节点IP 设备 %util 挂载目录"
while read dev util; do
mounts=$(lsblk -no MOUNTPOINT "/dev/$dev" 2>/dev/null | grep -v '^$' | sort -u)
if [ -z "$mounts" ]; then
mounts="(未挂载或无文件系统)"
fi
first=true
while IFS= read -r mp; do
if [ "$first" = true ]; then
printf "%-20s %-15s %-7s %s%% %s\n" "$check_time" "$host_ip" "$dev" "$util" "$mp"
first=false
else
printf "%-20s %-15s %-7s %s\n" "" "" "" "$mp"
fi
done <<< "$mounts"
done <<< "$high_io_devices"
any_issue=true
io_high=true
else
echo "【磁盘 I/O 负载】指标正常"
fi
else
echo "【磁盘 I/O 负载】指标正常(iostat 或 lsblk 未安装)"
fi
# --- 6. Load >50 ---
load_high=false
load_line=$(uptime 2>/dev/null | awk -F'load average:' '{print $2}')
if [ -n "$load_line" ]; then
load1=$(echo "$load_line" | awk '{print $1}' | tr -d ',')
if (( $(echo "$load1 > 50" | bc -l) )); then
echo "【系统 Load 过高】(load1 >50)"
printf "%-20s %-15s load1=%s\n" "$check_time" "$host_ip" "$load1"
any_issue=true
load_high=true
else
echo "【系统 Load】指标正常"
fi
else
echo "【系统 Load】指标正常(无法获取 uptime)"
fi
#--- 可选:整体总结(按需保留)---
if [ "$any_issue" = false ]; then
echo ""
echo "所有主机资源指标均正常"
fi
echo ""
echo "=== OceanBase 节点状态检查 ==="
# 执行查询并过滤出数据行(跳过表头和分隔线)
node_status_raw=$(mysql -h"${obhost}" -P${local_port} -u${sys_username}@sys -p"${sys_password}" \
-c -Doceanbase --batch --raw -e "
SELECT svr_ip,
with_rootserver AS rs,
zone,
ORA_DECODE(start_service_time, 0, NULL, CAST(usec_to_time(start_service_time) AS DATETIME)) AS start_service_time,
ORA_DECODE(stop_time, 0, NULL, CAST(usec_to_time(stop_time) AS DATETIME)) AS stop_time,
ORA_DECODE(last_offline_time, 0, NULL, CAST(usec_to_time(last_offline_time) AS DATETIME)) AS last_offline_time,
status,
SUBSTR(build_version, 1, INSTR(build_version, '-') - 1) AS build_version
FROM __all_server
ORDER BY svr_ip, with_rootserver;
" 2>/dev/null)
# 检查是否能连通
if [ $? -ne 0 ] || [ -z "$node_status_raw" ]; then
echo "【OB 节点状态】无法连接或查询失败"
any_issue=true
else
# 提取数据行(排除空行)
node_data=$(echo "$node_status_raw" | tail -n +2 | grep -v '^$')
if [ -z "$node_data" ]; then
echo "【OB 节点状态】无节点数据返回"
any_issue=true
else
abnormal_nodes=""
total_nodes=0
abnormal_count=0
while IFS=$'\t' read -r svr_ip rs zone start_service_time stop_time last_offline_time status build_version; do
((total_nodes++))
# 判断是否正常:
# - status == 'ACTIVE'
# - start_service_time != NULL (非空且不是 "NULL")
# - stop_time == "NULL"
# - last_offline_time == "NULL"
if [[ "$status" == "ACTIVE" ]] && \
[[ "$start_service_time" != "NULL" ]] && \
[[ "$start_service_time" != "" ]] && \
[[ "$stop_time" == "NULL" ]] && \
[[ "$last_offline_time" == "NULL" ]]; then
# 正常,不处理
continue
else
((abnormal_count++))
# 记录异常行(保留原始字段,用空格对齐)
abnormal_nodes+="$svr_ip|$rs|$zone|$start_service_time|$stop_time|$last_offline_time|$status|$build_version"$'\n'
fi
done <<< "$node_data"
if [ $abnormal_count -eq 0 ]; then
echo "【OB 节点状态】指标正常(共 $total_nodes 个节点)"
else
echo "【OB 节点状态异常】共 $abnormal_count / $total_nodes 个节点异常"
echo "svr_ip rs zone start_service_time stop_time last_offline_time status build_version"
echo "------------------------------------------------------------------------------------------------------------------"
while IFS='|' read -r ip rs zone start stop last_off status ver; do
[ -z "$ip" ] && continue
printf "%-16s %-3s %-6s %-22s %-10s %-18s %-9s %s\n" \
"$ip" "$rs" "$zone" "$start" "$stop" "$last_off" "$status" "$ver"
done <<< "$abnormal_nodes"
any_issue=true
fi
fi
fi
echo ""
echo "=== OceanBase 租户 CPU 使用率检查 ==="
# 执行租户 CPU 查询(使用 --batch --raw 便于解析)
tenant_cpu_raw=$(mysql -h"${obhost}" -P${local_port} -u${sys_username}@sys -p"${sys_password}" \
-c -Doceanbase --batch --raw -e "
WITH tmp AS (
SELECT
NOW() AS chenck_time,
t.tenant_name,
t.tenant_id,
ca.svr_ip,
SUM(CASE WHEN STAT_ID='140006' THEN ca.VALUE ELSE 0 END) AS cpu_usage,
SUM(CASE WHEN STAT_ID='140005' THEN ca.VALUE ELSE 0 END) AS cpu_max,
ROUND(100 * (SUM(CASE WHEN STAT_ID='140006' THEN ca.VALUE ELSE 0 END) /
SUM(CASE WHEN STAT_ID='140005' THEN ca.VALUE ELSE 0 END)), 2) AS cpu_percent
FROM (
SELECT s.CON_ID, s.svr_ip, s.STAT_ID, SUM(s.VALUE) AS VALUE
FROM gv\$sysstat s
WHERE s.STAT_ID IN (140005, 140006)
GROUP BY s.CON_ID, s.svr_ip, s.STAT_ID
) ca
JOIN __all_tenant t ON t.tenant_id = ca.CON_ID
GROUP BY t.tenant_name, t.tenant_id, ca.svr_ip
)
SELECT
NOW() AS chenck_time,
t2.tenant_id,
t2.tenant_name,
t2.SVR_IP,
ROUND((t1.cpu_usage + t2.cpu_usage) / 100, 2) AS cpu_usage_sec,
ROUND(t2.cpu_max / 100, 1) AS cpu_max_sec,
ROUND((t1.cpu_usage + t2.cpu_usage) / t2.cpu_max * 100, 2) AS cpu_percent
FROM tmp t1
JOIN tmp t2
ON SUBSTR(t1.tenant_name, 6) = t2.tenant_id
AND t1.SVR_IP = t2.SVR_IP
WHERE t1.tenant_name LIKE 'META%'
AND ROUND((t1.cpu_usage + t2.cpu_usage) / t2.cpu_max * 100, 2) > 10
ORDER BY t2.tenant_name, t2.SVR_IP;
" 2>/dev/null)
# 检查执行状态
if [ $? -ne 0 ]; then
echo "【租户 CPU 使用率】查询失败(可能无法连接或权限不足)"
any_issue=true
else
# 提取数据行(跳过可能的空行)
tenant_cpu_data=$(echo "$tenant_cpu_raw" | tail -n +2 | grep -v '^$')
if [ -z "$tenant_cpu_data" ]; then
echo "【租户 CPU 使用率】指标正常"
else
echo "【租户 CPU 使用率过高】(>10%)"
echo "check_time tenant_id tenant_name svr_ip cpu_usage(sec) cpu_max(sec) cpu_percent(%)"
echo "----------------------------------------------------------------------------------------------------------"
while IFS=$'\t' read -r check_time tenant_id tenant_name svr_ip cpu_usage cpu_max cpu_percent; do
[ -z "$svr_ip" ] && continue
printf "%-20s %-10s %-12s %-15s %-15s %-13s %s%%\n" \
"$check_time" "$tenant_id" "$tenant_name" "$svr_ip" "$cpu_usage" "$cpu_max" "$cpu_percent"
done <<< "$tenant_cpu_data"
any_issue=true
fi
fi
echo "=== OceanBase 租户 Memstore 使用率检查 ==="
# === 构造 SQL 查询(新增 freeze_cnt)===
SQL=$(cat <<EOF
SELECT
t.tenant_name,
t.tenant_id,
m.svr_ip,
u.memory_size / 1024 / 1024 / 1024 AS tenant_memory_gb,
m.memstore_limit_actual_gb,
ROUND(
u.memory_size *
CASE
WHEN COALESCE(p1.value, '0') != '0' THEN CAST(p1.value AS DECIMAL(5,2)) / 100.0
WHEN COALESCE(p2.value, '0') != '0' THEN CAST(p2.value AS DECIMAL(5,2)) / 100.0
WHEN u.memory_size <= 8 * 1024 * 1024 * 1024 THEN 0.4
ELSE 0.5
END
/ 1024 / 1024 / 1024,
2
) AS memstore_limit_theoretical_gb,
m.memstore_used_gb,
m.freeze_trigger_gb AS actual_freeze_trigger_gb,
ROUND(
(
u.memory_size *
CASE
WHEN COALESCE(p1.value, '0') != '0' THEN CAST(p1.value AS DECIMAL(5,2)) / 100.0
WHEN COALESCE(p2.value, '0') != '0' THEN CAST(p2.value AS DECIMAL(5,2)) / 100.0
WHEN u.memory_size <= 8 * 1024 * 1024 * 1024 THEN 0.4
ELSE 0.5
END
) * f.freeze_pct / 100.0
/ 1024 / 1024 / 1024,
2
) AS theoretical_freeze_trigger_gb,
m.freeze_cnt
FROM dba_ob_tenants t
JOIN gv\$ob_units u ON t.tenant_id = u.tenant_id
JOIN (
SELECT
tenant_id,
svr_ip,
ROUND(MEMSTORE_LIMIT / 1024 / 1024 / 1024, 2) AS memstore_limit_actual_gb,
ROUND(MEMSTORE_USED / 1024 / 1024 / 1024, 2) AS memstore_used_gb,
ROUND(FREEZE_TRIGGER / 1024 / 1024 / 1024, 2) AS freeze_trigger_gb,
FREEZE_CNT
FROM gv\$ob_memstore
) m ON t.tenant_id = m.tenant_id AND u.svr_ip = m.svr_ip
LEFT JOIN (SELECT tenant_id, CAST(value AS UNSIGNED) AS freeze_pct FROM __all_virtual_tenant_parameter_info WHERE name = 'freeze_trigger_percentage') f ON t.tenant_id = f.tenant_id
LEFT JOIN __all_virtual_tenant_parameter_info p1 ON t.tenant_id = p1.tenant_id AND p1.name = 'memstore_limit_percentage'
LEFT JOIN __all_virtual_tenant_parameter_info p2 ON t.tenant_id = p2.tenant_id AND p2.name = '_memstore_limit_percentage'
WHERE t.tenant_type = 'USER';
EOF
)
# === 执行 SQL ===
RESULT=$(mysql -h"${obhost}" -P${local_port} -u${sys_username}@sys -p"${sys_password}" -Doceanbase -ANe "$SQL" 2>/dev/null)
if [ $? -ne 0 ]; then
echo "无法连接 OceanBase 或执行查询,请检查配置。"
exit 1
fi
ALERT_FOUND=0
ALERT_MSG=""
while IFS=$'\t' read -r tenant_name tenant_id svr_ip tenant_memory_gb \
memstore_limit_actual_gb memstore_limit_theoretical_gb memstore_used_gb \
actual_freeze_trigger_gb theoretical_freeze_trigger_gb freeze_cnt; do
# 处理空值
memstore_limit_actual_gb=${memstore_limit_actual_gb:-0}
memstore_limit_theoretical_gb=${memstore_limit_theoretical_gb:-0}
memstore_used_gb=${memstore_used_gb:-0}
actual_freeze_trigger_gb=${actual_freeze_trigger_gb:-0}
theoretical_freeze_trigger_gb=${theoretical_freeze_trigger_gb:-0}
freeze_cnt=${freeze_cnt:-0}
# 计算差值
limit_diff=$(echo "$memstore_limit_actual_gb $memstore_limit_theoretical_gb" | awk '{print ($1>$2)?($1-$2):($2-$1)}')
freeze_diff=$(echo "$actual_freeze_trigger_gb $theoretical_freeze_trigger_gb" | awk '{print ($1>$2)?($1-$2):($2-$1)}')
# 告警条件
cond1=$(echo "$limit_diff > 2" | bc -l)
cond2=$(echo "$freeze_diff > 2" | bc -l)
cond3=$(echo "$memstore_used_gb > ($theoretical_freeze_trigger_gb + 2)" | bc -l)
cond4=$(echo "$freeze_cnt >= 50" | bc -l) # 新增:转储次数 ≥ 50
if [ "$cond1" = "1" ] || [ "$cond2" = "1" ] || [ "$cond3" = "1" ] || [ "$cond4" = "1" ]; then
ALERT_FOUND=1
msg="租户: ${tenant_name} (ID: ${tenant_id}) @ ${svr_ip}\n"
msg+=" 租户内存: ${tenant_memory_gb} GB\n"
msg+=" MemStore Limit (实际/理论): ${memstore_limit_actual_gb} / ${memstore_limit_theoretical_gb} GB (差值: ${limit_diff} GB)\n"
msg+=" Freeze Trigger (实际/理论): ${actual_freeze_trigger_gb} / ${theoretical_freeze_trigger_gb} GB (差值: ${freeze_diff} GB)\n"
msg+=" MemStore Used: ${memstore_used_gb} GB\n"
msg+=" 转储次数 (FREEZE_CNT): ${freeze_cnt}\n"
if [ "$cond3" = "1" ]; then
threshold=$(echo "$theoretical_freeze_trigger_gb + 2" | bc -l)
msg+=" ⚠️ MemStore 使用量已超过理论 Freeze 触发阈值 + 2G (${threshold} GB)\n"
fi
if [ "$cond4" = "1" ]; then
msg+=" ⚠️ 转储次数达到 ${freeze_cnt} 次(超过 50 次)\n"
fi
ALERT_MSG+="$msg\n"
fi
done <<< "$RESULT"
# === 输出结果 ===
if [ $ALERT_FOUND -eq 1 ]; then
echo "【租户 MemStore & Freeze 配置异常】发现以下问题:"
echo -e "$ALERT_MSG"
else
echo "【租户 MemStore & Freeze 配置】指标正常"
fi
echo "=== OceanBase 租户 内存模块检查 ==="
# 执行你的原始SQL(稍作格式优化)
output=$(mysql -h"${obhost}" -P${local_port} -u"${sys_username}@sys" -p"${sys_password}" -c -Doceanbase -ANe "
SELECT /*+ query_timeout(100000000) */
tenant_id,
svr_ip,
mod_name,
ctx_name,
SUM(hold) / 1024 / 1024 / 1024 AS hold_g
FROM __all_virtual_memory_info
GROUP BY tenant_id, svr_ip, mod_name, ctx_name
HAVING hold_g > 5 AND mod_name <> 'OB_KVSTORE_CACHE_MB'
ORDER BY svr_ip, hold_g DESC;
")
# 判断是否有输出
if [ -z "$output" ]; then
echo "【租户内存模块检查】指标正常"
else
# 输出表头 + 数据(模拟 mysql -t 的表格效果)
echo "tenant_id svr_ip mod_name ctx_name hold_g"
echo "$output"
fi
echo "=== 活跃会话检查 ==="
# === 模块1:SQL_ID 相同,数量最多(top 10)===
echo -e "\n【1. SQL_ID 相同的活跃会话(数量排序 Top 10)】"
result1=$(mysql -h"${obhost}" -P${local_port} -u"${sys_username}@sys" -p"${sys_password}" -c -Doceanbase -ANe "
SELECT tenant, user, svr_ip, IFNULL(sql_id, ''), AVG(retry_cnt), AVG(time), MAX(time), COUNT(*) AS cnt
FROM __all_virtual_processlist
WHERE tenant <> 'sys' AND sql_id <> ''
GROUP BY tenant, user, svr_ip, sql_id
ORDER BY cnt DESC
LIMIT 10;
" 2>/dev/null)
if [ -z "$result1" ]; then
echo " (无数据)"
else
echo -e "tenant\tuser\tsvr_ip\tsql_id\tavg_retry_cnt\tavg_time\tmax_time\tcnt"
echo "$result1"
fi
# === 模块2:SQL 文本前40字符相同,数量最多(top 10)===
echo -e "\n【2. SQL文本相似(前40字符)的活跃会话(数量排序 Top 10)】"
result2=$(mysql -h"${obhost}" -P${local_port} -u"${sys_username}@sys" -p"${sys_password}" -c -Doceanbase -ANe "
SELECT tenant, user, svr_ip, SUBSTR(info,1,40) AS q_sql, AVG(retry_cnt), AVG(time), MAX(time), COUNT(*) AS cnt
FROM __all_virtual_processlist
WHERE tenant <> 'sys' AND sql_id <> '' AND info IS NOT NULL
GROUP BY tenant, user, svr_ip, SUBSTR(info,1,40)
ORDER BY cnt DESC
LIMIT 10;
" 2>/dev/null)
if [ -z "$result2" ]; then
echo " (无数据)"
else
echo -e "tenant\tuser\tsvr_ip\tq_sql\tavg_retry_cnt\tavg_time\tmax_time\tcnt"
echo "$result2"
fi
# === 模块3:执行时间最长的活跃会话(top 10)===
echo -e "\n【3. 执行时间最长的活跃会话(Top 10)】"
result3=$(mysql -h"${obhost}" -P${local_port} -u"${sys_username}@sys" -p"${sys_password}" -c -Doceanbase -ANe "
SELECT tenant, user, svr_ip, IFNULL(sql_id, ''), time, retry_cnt, SUBSTR(info,1,60) AS short_sql
FROM __all_virtual_processlist
WHERE tenant <> 'sys' AND state = 'ACTIVE'
ORDER BY time DESC
LIMIT 10;
" 2>/dev/null)
if [ -z "$result3" ]; then
echo " (无数据)"
else
echo -e "tenant\tuser\tsvr_ip\tsql_id\ttime\tretry_cnt\tshort_sql"
echo "$result3"
fi
# === 模块4:各租户@节点的活跃会话数(仅 >=10)===
echo -e "\n【4. 各租户节点活跃会话数(仅展示 ≥10 的)】"
result4=$(mysql -h"${obhost}" -P${local_port} -u"${sys_username}@sys" -p"${sys_password}" -c -Doceanbase -ANe "
SELECT tenant, svr_ip, COUNT(*) AS active_sessions
FROM __all_virtual_processlist
WHERE tenant <> 'sys' AND state = 'ACTIVE'
GROUP BY tenant, svr_ip
HAVING active_sessions >= 10
ORDER BY active_sessions DESC;
" 2>/dev/null)
if [ -z "$result4" ]; then
echo " (无租户节点活跃会话 ≥10)"
else
echo -e "tenant\tsvr_ip\tactive_sessions"
echo "$result4"
fi
# === 模块5:retry_info <> '0' 的异常会话汇总 ===
echo -e "\n【5. 异常会话汇总】"
result5=$(mysql -h"${obhost}" -P${local_port} -u"${sys_username}@sys" -p"${sys_password}" -c -Doceanbase -ANe "
SELECT tenant, user, svr_ip, IFNULL(sql_id, '') AS sql_id, SUBSTR(info,1,40) AS short_sql, retry_info, SUM(retry_cnt) AS retry_cnt_sum, COUNT(*) AS cnt
FROM __all_virtual_processlist
WHERE retry_info <> '0' AND tenant <> 'sys' AND state = 'ACTIVE'
GROUP BY tenant, user, svr_ip, IFNULL(sql_id, ''), SUBSTR(info,1,40), retry_info
ORDER BY cnt DESC
LIMIT 5;
" 2>/dev/null)
if [ -z "$result5" ]; then
echo " (当前无 retry_info <> 0 的活跃会话)"
else
echo -e "tenant\tuser\tsvr_ip\tsql_id\tshort_sql\tretry_info\tretry_cnt_sum\tcnt"
echo "$result5"
fi
# === 模块6:活跃会话接近理论上限(≥80%)===
echo -e "\n【6. 活跃会话接近理论上限(≥80%)】"
result6=$(mysql -h"${obhost}" -P${local_port} -u"${sys_username}@sys" -p"${sys_password}" -c -Doceanbase -ANe "
SELECT
u.tenant_id,
t.tenant_name,
u.svr_ip,
u.max_cpu,
COALESCE(
CASE
WHEN p.value IS NULL OR TRIM(p.value) = '' OR CAST(p.value AS SIGNED) = 0 THEN 10
ELSE CAST(p.value AS SIGNED)
END,
10
) AS cpu_quota_concurrency,
ROUND(u.max_cpu * COALESCE(
CASE
WHEN p.value IS NULL OR TRIM(p.value) = '' OR CAST(p.value AS SIGNED) = 0 THEN 10
ELSE CAST(p.value AS SIGNED)
END,
10
), 2) AS theoretical_active_sessions,
COALESCE(act.cnt, 0) AS actual_active_sessions
FROM
gv\$ob_units u
JOIN
dba_ob_tenants t ON u.tenant_id = t.tenant_id
LEFT JOIN
__all_virtual_tenant_parameter_info p
ON u.tenant_id = p.tenant_id
AND p.name = 'cpu_quota_concurrency'
AND p.svr_ip = u.svr_ip
LEFT JOIN (
SELECT tenant_id, svr_ip, COUNT(*) AS cnt
FROM __all_virtual_processlist
WHERE state = 'ACTIVE'
GROUP BY tenant_id, svr_ip
) act
ON u.tenant_id = act.tenant_id AND u.svr_ip = act.svr_ip
WHERE
t.tenant_type <> 'META'
AND u.tenant_id <> 1 -- 排除 sys 租户(tenant_id=1),如需保留请删除此行
HAVING
COALESCE(act.cnt, 0) >= (u.max_cpu * COALESCE(
CASE
WHEN p.value IS NULL OR TRIM(p.value) = '' OR CAST(p.value AS SIGNED) = 0 THEN 10
ELSE CAST(p.value AS SIGNED)
END,
10
)) * 0.8
ORDER BY
actual_active_sessions DESC;
" 2>/dev/null)
if [ -z "$result6" ]; then
echo " 活跃会话接近理论上限(指标正常)"
else
echo -e "tenant_id\ttenant_name\tsvr_ip\tmax_cpu\tcpu_quota_concurrency\ttheoretical_active_sessions\tactual_active_sessions"
echo "$result6"
fi
echo "=== 长事务检查 ==="
# 第一步:静默检查是否有符合条件的长事务(用 -N 判断是否为空)
has_data=$(mysql -h"${obhost}" -P${local_port} -u"${sys_username}@sys" -p"${sys_password}" -c -Doceanbase -ANe "
SELECT 1
FROM __all_virtual_trans_stat a
LEFT JOIN __all_virtual_global_transaction b
ON a.tenant_id = b.tenant_id AND a.trans_id = b.trans_id
WHERE TIMESTAMPDIFF(SECOND, DATE_FORMAT(ctx_create_time,'%Y-%m-%d %T'), NOW()) > 600
AND a.is_exiting <> 1
LIMIT 1;
" 2>/dev/null)
if [ -z "$has_data" ]; then
echo "【长事务检查】指标正常"
else
# 第二步:有数据,用默认格式(不加 -N)执行完整查询,自动带列名和表格
mysql -h"${obhost}" -P${local_port} -u"${sys_username}@sys" -p"${sys_password}" -c -Doceanbase -e "
SELECT /*+query_timeout(720000000000) parallel(8)*/
a.svr_ip,
a.session_id,
a.ctx_create_time,
NOW() AS now_time,
TIMESTAMPDIFF(SECOND, DATE_FORMAT(ctx_create_time,'%Y-%m-%d %T'), NOW()) / 60 AS wait_time,
a.trans_type,
a.trans_id,
CASE
WHEN b.trans_id IS NOT NULL AND part_trans_action > 2 THEN 'XA悬挂'
WHEN b.trans_id IS NULL AND part_trans_action > 2 THEN '普通悬挂'
WHEN b.trans_id IS NOT NULL THEN '普通XA事务'
WHEN b.trans_id IS NULL THEN '普通长事务'
END AS \`事务类型\`,
a.participants
FROM __all_virtual_trans_stat a
LEFT JOIN __all_virtual_global_transaction b
ON a.tenant_id = b.tenant_id AND a.trans_id = b.trans_id
WHERE TIMESTAMPDIFF(SECOND, DATE_FORMAT(ctx_create_time,'%Y-%m-%d %T'), NOW()) > 600
AND a.is_exiting <> 1
ORDER BY a.ctx_create_time
LIMIT 50;
"
fi
echo "=== 日志盘使用率检查 ==="
# 第一步:静默检查是否有租户日志盘使用率超过阈值(默认80%)
has_alert=$(mysql -h"${obhost}" -P${local_port} -u"${sys_username}@sys" -p"${sys_password}" -c -Doceanbase -ANe "
SELECT 1
FROM gv\$ob_units u
JOIN dba_ob_tenants t ON u.tenant_id = t.tenant_id
LEFT JOIN __all_virtual_tenant_parameter_info p
ON u.tenant_id = p.tenant_id
AND p.name = 'log_disk_utilization_threshold'
AND p.svr_ip = u.svr_ip
WHERE t.tenant_type <> 'META'
AND u.log_disk_size > 0
AND (u.log_disk_in_use * 100.0 / u.log_disk_size) >= COALESCE(CAST(p.value AS UNSIGNED), 80)
LIMIT 1;
" 2>/dev/null)
if [ -z "$has_alert" ]; then
echo "【日志盘使用率】指标正常"
else
# 第二步:有超阈值的租户,输出完整信息(带列名、表格对齐)
mysql -h"${obhost}" -P${local_port} -u"${sys_username}@sys" -p"${sys_password}" -c -Doceanbase -e "
SELECT
u.svr_ip,
u.tenant_id,
t.tenant_name,
ROUND(u.log_disk_size / 1024 / 1024 / 1024, 2) AS all_log_disk_gb,
ROUND(u.log_disk_in_use / 1024 / 1024 / 1024, 2) AS used_log_disk_gb,
ROUND((u.log_disk_size - u.log_disk_in_use) / 1024 / 1024 / 1024, 2) AS free_log_disk_gb,
ROUND(u.log_disk_in_use * 100.0 / u.log_disk_size, 2) AS usage_pct,
COALESCE(CAST(p.value AS UNSIGNED), 80) AS threshold_pct
FROM gv\$ob_units u
JOIN dba_ob_tenants t ON u.tenant_id = t.tenant_id
LEFT JOIN __all_virtual_tenant_parameter_info p
ON u.tenant_id = p.tenant_id
AND p.name = 'log_disk_utilization_threshold'
AND p.svr_ip = u.svr_ip
WHERE t.tenant_type <> 'META'
AND u.log_disk_size > 0
AND (u.log_disk_in_use * 100.0 / u.log_disk_size) >= COALESCE(CAST(p.value AS UNSIGNED), 80)
ORDER BY usage_pct DESC;
"
fi
echo "=== 磁盘风险检查 ==="
# 第一步:静默检测是否存在风险(当前 >75% 或 预估 >90%)
has_risk=$(mysql -h"${obhost}" -P${local_port} -u"${sys_username}@sys" -p"${sys_password}" -c -Doceanbase -ANe "
WITH
tenant_threads AS (
SELECT
u.svr_ip,
LEAST(COALESCE(MAX(CAST(NULLIF(p.value, '0') AS UNSIGNED)), 8), 100) AS thread_count
FROM gv\$ob_units u
LEFT JOIN __all_virtual_tenant_parameter_info p
ON u.tenant_id = p.tenant_id AND p.name = 'ha_high_thread_score' AND p.svr_ip = u.svr_ip
WHERE u.tenant_id > 1
GROUP BY u.svr_ip
),
raw_tablets AS (
SELECT
svr_ip,
CASE
WHEN nested_offset = 0 THEN
IFNULL(data_block_count + index_block_count + linked_block_count, 0) * 2 * 1024 * 1024
ELSE
IFNULL(size, 0)
END AS data_size_bytes
FROM __all_virtual_table_mgr
),
top100_tablets AS (
SELECT
svr_ip,
data_size_bytes,
ROW_NUMBER() OVER (PARTITION BY svr_ip ORDER BY data_size_bytes DESC) AS rn
FROM raw_tablets
),
selected_tablets AS (
SELECT
t100.svr_ip,
t100.data_size_bytes
FROM top100_tablets t100
JOIN tenant_threads tt ON t100.svr_ip = tt.svr_ip
WHERE t100.rn <= tt.thread_count
),
disk_with_usage AS (
SELECT
d.svr_ip,
d.svr_port,
d.total_size,
d.free_size,
(d.total_size - d.free_size) AS used_size,
((d.total_size - d.free_size) + COALESCE(SUM(st.data_size_bytes), 0)) AS estimated_peak_bytes,
ROUND((d.total_size - d.free_size) / d.total_size * 100, 2) AS current_used_pct,
ROUND(((d.total_size - d.free_size) + COALESCE(SUM(st.data_size_bytes), 0)) / d.total_size * 100, 2) AS estimated_peak_pct
FROM __all_virtual_disk_stat d
LEFT JOIN selected_tablets st ON d.svr_ip = st.svr_ip
GROUP BY d.svr_ip, d.svr_port, d.total_size, d.free_size
)
SELECT 1
FROM disk_with_usage
WHERE current_used_pct > 75 OR estimated_peak_pct > 90
LIMIT 1;
" 2>/dev/null)
if [ -z "$has_risk" ]; then
echo "【磁盘风险】指标正常"
else
# 第二步:输出完整风险详情表(带当前使用率)
mysql -h"${obhost}" -P${local_port} -u"${sys_username}@sys" -p"${sys_password}" -c -Doceanbase -e "
WITH
tenant_threads AS (
SELECT
u.svr_ip,
LEAST(COALESCE(MAX(CAST(NULLIF(p.value, '0') AS UNSIGNED)), 8), 100) AS thread_count
FROM gv\$ob_units u
LEFT JOIN __all_virtual_tenant_parameter_info p
ON u.tenant_id = p.tenant_id AND p.name = 'ha_high_thread_score' AND p.svr_ip = u.svr_ip
WHERE u.tenant_id > 1
GROUP BY u.svr_ip
),
raw_tablets AS (
SELECT
svr_ip,
CASE
WHEN nested_offset = 0 THEN
IFNULL(data_block_count + index_block_count + linked_block_count, 0) * 2 * 1024 * 1024
ELSE
IFNULL(size, 0)
END AS data_size_bytes
FROM __all_virtual_table_mgr
),
top100_tablets AS (
SELECT
svr_ip,
data_size_bytes,
ROW_NUMBER() OVER (PARTITION BY svr_ip ORDER BY data_size_bytes DESC) AS rn
FROM raw_tablets
),
selected_tablets AS (
SELECT
t100.svr_ip,
t100.data_size_bytes
FROM top100_tablets t100
JOIN tenant_threads tt ON t100.svr_ip = tt.svr_ip
WHERE t100.rn <= tt.thread_count
)
SELECT
d.svr_ip,
d.svr_port,
CONCAT(ROUND(d.total_size / 1024 / 1024 / 1024, 2), 'G') AS total_disk,
CONCAT(ROUND((d.total_size - d.free_size) / 1024 / 1024 / 1024, 2), 'G') AS current_used,
CONCAT(ROUND((d.total_size - d.free_size) / d.total_size * 100, 2), '%') AS current_used_pct, -- ✅ 显式输出当前使用率
CONCAT(ROUND(COALESCE(SUM(st.data_size_bytes), 0) / 1024 / 1024 / 1024, 2), 'G') AS compaction_addition,
CONCAT(
ROUND(((d.total_size - d.free_size) + COALESCE(SUM(st.data_size_bytes), 0)) / 1024 / 1024 / 1024, 2),
'G'
) AS estimated_peak_usage,
CONCAT(
ROUND(((d.total_size - d.free_size) + COALESCE(SUM(st.data_size_bytes), 0)) / d.total_size * 100, 2),
'%'
) AS estimated_peak_pct,
CASE
WHEN (d.total_size - d.free_size) / d.total_size * 100 > 75 THEN '当前使用率过高'
WHEN ((d.total_size - d.free_size) + COALESCE(SUM(st.data_size_bytes), 0)) / d.total_size * 100 > 90 THEN '合并可能爆盘'
ELSE '无风险'
END AS risk_reason
FROM __all_virtual_disk_stat d
LEFT JOIN selected_tablets st ON d.svr_ip = st.svr_ip
GROUP BY d.svr_ip, d.svr_port, d.total_size, d.free_size
HAVING current_used_pct > 75 OR estimated_peak_pct > 90
ORDER BY current_used_pct DESC, estimated_peak_pct DESC;
"
fi
echo "=== Core 文件检查 ==="
# 读取 core_pattern
core_pattern=$(cat /proc/sys/kernel/core_pattern 2>/dev/null)
# 提取目录路径(假设格式为 /path/to/core-%e-%p-%t)
if [[ "$core_pattern" == /* ]]; then
# 是绝对路径
core_dir=$(dirname "$core_pattern")
else
# 非绝对路径(如 "core" 或 "|/usr/bin/..."),跳过或设为当前目录(通常不会)
echo "【Core 文件检查】core_pattern 不是绝对路径,跳过检查。"
exit 0
fi
# 检查目录是否存在
if [ ! -d "$core_dir" ]; then
echo "【Core 文件检查】core 目录不存在: $core_dir"
exit 0
fi
# 查找 2 小时(7200 秒)内修改过的 core 文件(以 core- 开头)
recent_cores=$(find "$core_dir" -maxdepth 1 -name 'core-*' -type f -mmin -120 2>/dev/null)
if [ -z "$recent_cores" ]; then
echo "【Core 文件检查】指标正常"
else
echo "【Core 文件检查】发现 2 小时内生成的 core 文件:"
ls -lt $(echo "$recent_cores" | head -n 5) # 最多列出 5 个
fi