CREATE DEFINER = `root`@`%` PROCEDURE `sjqx_jck_zt_zxxx_077_01`(IN `rwuuid` varchar(32)) MODIFIES SQL DATA COMMENT '资讯信息主题数据清洗' BEGIN #资讯信息主题数据清洗,在该存储过程中,调用主题模块的数据清洗存储过程,在主题模块的存储过程中完成具体的数据清洗逻辑, #重点: #1.在调用主题模块的数据清洗前,需要在主题清洗存储过程中记录日志, #插入主题模块日志表qxk.sj_qx_ztmkrz,用来描述每个主题模块当前批次的清洗情况; #2.在调用主题模块的数据清洗中,需要在主题模块清洗存储过程中记录日志, #插入主题模块明细日志表qxk.sj_qx_ztmkmxrz,用来描述主题模块中每一段逻辑当前批次的清洗情况; #3.在调用主题模块的数据清洗后,无论整个过程是成功或者失败,需要获取主题的执行状态, #插入主题日志表qxk.sj_qx_ztrz,用来描述每个主题当前批次的清洗情况; #4.判断执行情况,若执行失败,需要将相关的连锁相关的清洗结果回滚 #暂时未使用,可删除 DECLARE toatalTask INT DEFAULT 100;#执行总任务数 DECLARE currentTaskCount INT DEFAULT 0;#当前任务 DECLARE affectedCount INT DEFAULT 0;#影响行数 DECLARE errMsg text;#执行异常日志信息 DECLARE errCode varchar(100);#执行日志异常码 DECLARE zxycxx text DEFAULT null;#执行异常信息 #DECLARE sjrwdm VARCHAR(3) DEFAULT '001';#数据任务代码,001 代表公司类别数据清洗,每一个定时任务job对应一个数据任务代码 #DECLARE rwuuid VARCHAR(32) DEFAULT NULL;#任务日志,用来保存于定时任务启动的关联关系 DECLARE jgdm VARCHAR(2) DEFAULT '1';#定义主题模块明细执行结果代码,默认为1,表示成功。【1-全部成功;2-部分成功;3-全部失败;】 DECLARE zxycrz text DEFAULT null;#定义主题模块明细执行异常信息 DECLARE mkmxkssj TIMESTAMP DEFAULT now();#定义主题模块明细执行的开始时间 DECLARE mkmxjssj TIMESTAMP DEFAULT now();#定义主题模块明细执行的结束时间 #定时主题使用的变量 DECLARE sjlbdm VARCHAR(3) DEFAULT '18';#数据类别代码,18 代表为资讯 DECLARE sjztdm VARCHAR(4) DEFAULT '077';#数据主题代码,077 代表为新闻资讯信息 DECLARE rwdm VARCHAR(4) DEFAULT '011';#任务代码,011 代表为资讯 DECLARE mbbdm varchar(10);#目标表代码 DECLARE ztmkrzuuid VARCHAR(32) DEFAULT MD5(UUID());#生成uuid,为当前的主题模块日志uuid赋值 DECLARE ztmkdm VARCHAR(6) DEFAULT '0300';#定义主题模块代码 DECLARE zxkssj TIMESTAMP DEFAULT now();#当前主题执行开始时间,默认为系统当前时间 DECLARE zxjssj TIMESTAMP DEFAULT null;#当前主题执行结束时间,在该存储过程业务逻辑执行完毕时赋值 #主题日志uuid,采用数据类别,数据主题,执行开始时间动态生成 DECLARE ztrzuuid VARCHAR(32) DEFAULT MD5(CONCAT(sjlbdm,'_',sjztdm,'_',zxkssj)); DECLARE zxjgdm VARCHAR(2) DEFAULT '1';#主题执行结果代码,默认为1,表示成功。【1-全部成功;2-部分成功;3-全部失败;】 DECLARE sjlzbuuid VARCHAR(32) DEFAULT MD5(UUID());#数据流转表uuid #记录ETL任务执行影响主体记录数的参数 DECLARE insertCount INT DEFAULT 0; DECLARE updateCount INT DEFAULT 0; DECLARE deleteCount INT DEFAULT 0; DECLARE kmc VARCHAR(20) DEFAULT 'jck';#主体所在的库名称 DECLARE bmc VARCHAR(90) DEFAULT 'zx_jbxx';#主体所在的表名称 DECLARE startUuid VARCHAR(40) DEFAULT '';#注意:取当前需要清洗的批次专利基本信息表zl_sq_jbxx数据中最小的uuid DECLARE endUuid VARCHAR(40) DEFAULT ''; DECLARE maxUuid VARCHAR(40); DECLARE taskid VARCHAR(32) DEFAULT REPLACE(UUID(), '-', '');#任务id DECLARE totalCount INT DEFAULT 0; DECLARE totalCleanCount INT DEFAULT 0; DECLARE cleanCount INT DEFAULT 0; DECLARE cleanRound INT DEFAULT 0; DECLARE startTime VARCHAR(40) DEFAULT '2015-01-01';#待清洗的本批专利入库的开始时间 #定义游标,记录异常日志 DECLARE CONTINUE HANDLER FOR SQLEXCEPTION,SQLWARNING,NOT FOUND#存储过程执行出错的时候记录异常日志 err : BEGIN GET DIAGNOSTICS CONDITION 1 errCode = RETURNED_SQLSTATE, zxycxx = MESSAGE_TEXT; SET zxycxx = CONCAT(errCode,':\r\n',zxycxx); SET zxjgdm = '2'; END; SET @_rwdm = rwdm; SET @_qx_etl_kssj = null; ###################获取数据同步开始时间################### #1.0.1.创建新增数据临时表,拼接业务表对应的临时表建表语句,temporary SET @query_qx_etl_kssj_sql = CONCAT('SET @_qx_etl_kssj = (select t.jssj from qxk.sj_lzb t where t.sjlzhjdm = 2 and t.yxztdm = 2 and yxjgbz = 1 and t.rwdm = \'',@_rwdm,'\' order by t.cjrq desc limit 1);'); #1.0.2.预处理创建临时表sql PREPARE stmt FROM @query_qx_etl_kssj_sql; #1.0.3.根据预处理sql执行创建临时表的语句 EXECUTE stmt; #释放连接 DEALLOCATE PREPARE stmt; #记录主题数据信息主题开始清洗日志 CALL qxk.sjqx_qxk_zt_ks_log_01 ( rwuuid, ztrzuuid,#主题日志uuid sjlbdm,#数据类别代码 sjztdm,#数据主题代码 zxkssj#开始时间 ); START TRANSACTION; SET @_rwdm = rwdm; SET @qxkssj = null; ###################获取数据同步开始时间################### #1.0.1.创建新增数据临时表,拼接业务表对应的临时表建表语句,temporary SET @query_qxkssj_sql = CONCAT('set @qxkssj = (select t.kssj from qxk.sj_lzb t where t.sjlzhjdm = 2 and t.yxztdm = 2 and t.rwdm = @_rwdm order by t.cjrq desc limit 1);'); #1.0.2.预处理创建临时表sql PREPARE stmt FROM @query_qxkssj_sql; #1.0.3.根据预处理sql执行创建临时表的语句 EXECUTE stmt; #释放连接 DEALLOCATE PREPARE stmt; set startTime=@query_qxkssj_sql; /**/ SET zxycxx = NULL;#初始化默认执行异常日志为空 #获取目标表代码 SET mbbdm = (select bdm from qxk.sj_dm_sjb t where kmc = 'jck' and t.bmc = 'zx_jbxx'); #将清洗辅助的实体表清空 delete from etlgck.qxgc_zx_zxjbxx_jbxx_hz_0300;#资讯汇总表 delete from etlgck.qxgc_zx_zxjbxx_sgzxjbxx_cc_0300;#搜狗资讯基本信息临时存储 delete from etlgck.qxgc_zx_zxjbxx_sgzxjbxx_gl_0300;#搜狗资讯基本信息过滤 delete from etlgck.qxgc_zx_zxjbxx_wxzxjbxx_cc_0300;#无效的资讯临时存储 delete from etlgck.qxgc_zx_zxjbxx_pczhgjc_cc_0300;#排除的关键词的临时存储 delete from etlgck.qxgc_zx_zxjbxx_pczhgjc_fp_0300;#排除的关键词分批 SET jgdm = '1';#初始化默认执行结果代码为成功 SET zxycrz = NULL;#初始化默认执行异常日志为空 SET mkmxkssj = now();#当前主题模块明细执行的开始时间 #1.新闻资讯 jck.zx_jbxx #1.1百度新闻数据源,暂时按照新闻标题和日期去重 ##zxlyfldm暂时用来存储新闻是否敏感,对应zxmgfldm-资讯敏感分类代码【01:敏感;02:不敏感】 #插入百度新闻 insert ignore etlgck.qxgc_zx_zxjbxx_jbxx_hz_0300(uuid,zxbt,zxbt_wbdfh,zxnr,zxzz,zxwz,zxly,osspath,fbrq,wyuuid,sjlydm) select uuid,xwbt, REGEXP_REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(xwbt,'○',''),'|',''),'|',''),'~',''),'+',''),'丨',''),'│',''),'→',''),'▏',''),'✅',''),'<',''),'>',''), '[[:punct:]]', '') zxbt_wbdfh, xwnrbq,xwzz,xwwz,xwly,osspath,fbrq, MD5(CONCAT(REGEXP_REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(xwbt,'○',''),'|',''),'|',''),'~',''),'+',''),'丨',''),'│',''),'→',''),'▏',''),'✅',''),'<',''),'>',''), '[[:punct:]]', ''),'_',IFNULL(fbrq_date,''))) wyuuid,'01' as sjlydm from( select t1.uuid, case when t1.xwbt like '*%' then REPLACE(REPLACE(REPLACE(REPLACE(t1.xwbt,' ',''),' ',''),' ',''),'*','') when t1.xwbt like '%*' then REPLACE(REPLACE(REPLACE(REPLACE(t1.xwbt,' ',''),' ',''),' ',''),'*','') else REPLACE(REPLACE(REPLACE(t1.xwbt,' ',''),' ',''),' ','') end xwbt,t1.xwnr_bq xwnrbq,null xwzz,ysdz xwwz,xwly,t1.osspath, case when LENGTH(xwsj) >= 10 then xwsj else null end fbrq, case when LENGTH(xwsj) >= 10 then left(xwsj,10) else null end fbrq_date from etlgck.qxyl_gs_bd_xwzx_077 t1 where ysdz is not null and xwbt is not null and xwly != '南方财富网' #and t1.sftqdzd = 'Y' )t ; IF INSTR(zxycrz,'Duplicate') > 0 || INSTR(zxycrz,'Incorrect string') > 0 THEN SET jgdm = '1';#初始化执行结果代码为成功 SET zxycrz = NULL;#初始化异常日志 END IF; SET affectedCount = ROW_COUNT();#影响的行次 SET mkmxjssj = now();#当前主题模块明细执行的结束时间 #记录模块明细执行完毕的日志 CALL qxk.sjqx_qxk_ztmkmx_log_01 ( rwuuid,#任务uuid ztmkrzuuid,#主题模块日志uuid ztrzuuid,#主题日志uuid sjlbdm,#数据类别代码 sjztdm,#数据主题代码 ztmkdm,#主题模块代码 '300.1',#主题模块明细编码 mbbdm,#目标表代码 '1',#批次 mkmxkssj,#开始时间 mkmxjssj,#结束时间 jgdm,#结果代码【1-成功;2-失败;】 zxycrz,#执行异常日志 affectedCount#执行影响记录数 ); SET jgdm = '1';#初始化默认执行结果代码为成功 SET zxycrz = NULL;#初始化默认执行异常日志为空 SET mkmxkssj = now();#当前主题模块明细执行的开始时间 #1.2IT桔子新闻, #TODO #1.2.1新闻详情未抓取,目前保留的新闻外链,外链的新闻来源去重后4630个不同数据源 #select count(distinct xwly) from ylk.cyrz_itjz_rzxmxwxxyssj t where LENGTH(xwly) > 4 limit 1000; -- 4630 #1.2.2若抓取新闻详情,数据抓取需要对接各类不同网站的新闻页面结构,前端需要对接各类不同新闻网站的样式 insert ignore etlgck.qxgc_zx_zxjbxx_jbxx_hz_0300(uuid,zxbt,zxbt_wbdfh,zxnr,zxzz,zxwz,zxly,fbrq,wyuuid,sjlydm) select uuid,xwbt, REGEXP_REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(xwbt,'○',''),'|',''),'|',''),'~',''),'+',''),'丨',''),'│',''),'→',''),'▏',''),'✅',''),'<',''),'>',''), '[[:punct:]]', '') zxbt_wbdfh,xwnrbq,xwzz,xwwz,xwly,fbrq, MD5(CONCAT(REGEXP_REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(xwbt,'○',''),'|',''),'|',''),'~',''),'+',''),'丨',''),'│',''),'→',''),'▏',''),'✅',''),'<',''),'>',''), '[[:punct:]]', '') ,'_',IFNULL(fbrq_date,''))) wyuuid,'02' as sjlydm from( select MD5(t1.xwljdz) uuid, case when t1.xwbt like '*%' then REPLACE(REPLACE(REPLACE(REPLACE(t1.xwbt,' ',''),' ',''),' ',''),'*','') when t1.xwbt like '%*' then REPLACE(REPLACE(REPLACE(REPLACE(t1.xwbt,' ',''),' ',''),' ',''),'*','') else REPLACE(REPLACE(REPLACE(t1.xwbt,' ',''),' ',''),' ','') end xwbt,null xwnrbq,null xwzz,t1.xwljdz xwwz,t1.xwly, case when LENGTH(t1.xwfbrq) >= 10 then t1.xwfbrq else null end fbrq, case when LENGTH(t1.xwfbrq) >= 10 then left(t1.xwfbrq,10) else null end fbrq_date from etlgck.qxyl_cyrz_itjz_rzxmxwxxyssj_077 t1,etlgck.qxyl_cyrz_itjz_rzxmxwxxyssj_sfyx_077 t2 where t1.uuid = t2.uuid and t2.yxbz = 'Y' and t1.xwljdz is not null and LENGTH(t1.xwljdz) > 3 and t1.xwly != '南方财富网' )t ; IF INSTR(zxycrz,'Duplicate') > 0 || INSTR(zxycrz,'Incorrect string') > 0 THEN SET jgdm = '1';#初始化执行结果代码为成功 SET zxycrz = NULL;#初始化异常日志 END IF; SET affectedCount = ROW_COUNT();#影响的行次 SET mkmxjssj = now();#当前主题模块明细执行的结束时间 #记录模块明细执行完毕的日志 CALL qxk.sjqx_qxk_ztmkmx_log_01 ( rwuuid,#任务uuid ztmkrzuuid,#主题模块日志uuid ztrzuuid,#主题日志uuid sjlbdm,#数据类别代码 sjztdm,#数据主题代码 ztmkdm,#主题模块代码 '300.2',#主题模块明细编码 mbbdm,#目标表代码 '1',#批次 mkmxkssj,#开始时间 mkmxjssj,#结束时间 jgdm,#结果代码【1-成功;2-失败;】 zxycrz,#执行异常日志 affectedCount#执行影响记录数 ); SET jgdm = '1';#初始化默认执行结果代码为成功 SET zxycrz = NULL;#初始化默认执行异常日志为空 SET mkmxkssj = now();#当前主题模块明细执行的开始时间 insert ignore etlgck.qxgc_zx_zxjbxx_jbxx_hz_0300(uuid,zxbt,zxbt_wbdfh,zxnr,zxzz,zxwz,zxly,fbrq,wyuuid,sjlydm) select uuid,xwbt, REGEXP_REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(xwbt,'○',''),'|',''),'|',''),'~',''),'+',''),'丨',''),'│',''),'→',''),'▏',''),'✅',''),'<',''),'>',''), '[[:punct:]]', '') zxbt_wbdfh, xwnrbq,xwzz,xwwz,xwly,fbrq, MD5(CONCAT(REGEXP_REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(xwbt,'○',''),'|',''),'|',''),'~',''),'+',''),'丨',''),'│',''),'→',''),'▏',''),'✅',''),'<',''),'>',''), '[[:punct:]]', ''),'_',IFNULL(fbrq_date,''))) wyuuid,'02' as sjlydm from( select MD5(t1.xwljdz) uuid, case when t1.xwbt like '*%' then REPLACE(REPLACE(REPLACE(REPLACE(t1.xwbt,' ',''),' ',''),' ',''),'*','') when t1.xwbt like '%*' then REPLACE(REPLACE(REPLACE(REPLACE(t1.xwbt,' ',''),' ',''),' ',''),'*','') else REPLACE(REPLACE(REPLACE(t1.xwbt,' ',''),' ',''),' ','') end xwbt,null xwnrbq,null xwzz,t1.xwljdz xwwz,t1.xwly, case when LENGTH(t1.xwfbrq) >= 10 then t1.xwfbrq else null end fbrq, case when LENGTH(t1.xwfbrq) >= 10 then LEFT(t1.xwfbrq,10) else null end fbrq_date from etlgck.qxyl_cyrz_itjz_rzxmxwxx_077 t1 where t1.xwljdz is not null and LENGTH(t1.xwljdz) > 3 )t ; IF INSTR(zxycrz,'Duplicate') > 0 || INSTR(zxycrz,'Incorrect string') > 0 || INSTR(zxycrz,'Out') > 0 THEN SET jgdm = '1';#初始化执行结果代码为成功 SET zxycrz = NULL;#初始化异常日志 END IF; SET affectedCount = ROW_COUNT();#影响的行次 SET mkmxjssj = now();#当前主题模块明细执行的结束时间 #记录模块明细执行完毕的日志 CALL qxk.sjqx_qxk_ztmkmx_log_01 ( rwuuid,#任务uuid ztmkrzuuid,#主题模块日志uuid ztrzuuid,#主题日志uuid sjlbdm,#数据类别代码 sjztdm,#数据主题代码 ztmkdm,#主题模块代码 '300.3',#主题模块明细编码 mbbdm,#目标表代码 '1',#批次 mkmxkssj,#开始时间 mkmxjssj,#结束时间 jgdm,#结果代码【1-成功;2-失败;】 zxycrz,#执行异常日志 affectedCount#执行影响记录数 ); SET jgdm = '1';#初始化默认执行结果代码为成功 SET zxycrz = NULL;#初始化默认执行异常日志为空 SET mkmxkssj = now();#当前主题模块明细执行的开始时间 IF INSTR(zxycrz,'Duplicate') > 0 || INSTR(zxycrz,'UTF8MB4') > 0 || INSTR(zxycrz,'DEPRECATED') > 0 || INSTR(zxycrz,'Incorrect string') > 0 THEN SET jgdm = '1';#初始化执行结果代码为成功 SET zxycrz = NULL;#初始化异常日志 END IF; SET affectedCount = ROW_COUNT();#影响的行次 SET mkmxjssj = now();#当前主题模块明细执行的结束时间 #记录模块明细执行完毕的日志 CALL qxk.sjqx_qxk_ztmkmx_log_01 ( rwuuid,#任务uuid ztmkrzuuid,#主题模块日志uuid ztrzuuid,#主题日志uuid sjlbdm,#数据类别代码 sjztdm,#数据主题代码 ztmkdm,#主题模块代码 '300.4',#主题模块明细编码 mbbdm,#目标表代码 '1',#批次 mkmxkssj,#开始时间 mkmxjssj,#结束时间 jgdm,#结果代码【1-成功;2-失败;】 zxycrz,#执行异常日志 affectedCount#执行影响记录数 ); SET jgdm = '1';#初始化默认执行结果代码为成功 SET zxycrz = NULL;#初始化默认执行异常日志为空 SET mkmxkssj = now();#当前主题模块明细执行的开始时间 #插入搜狐新闻 insert ignore etlgck.qxgc_zx_zxjbxx_jbxx_hz_0300(uuid,zxbt,zxbt_wbdfh,zxnr,zxwz,zxly,fbrq,osspath,wyuuid,sjlydm) select uuid,zxbt,REGEXP_REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(zxbt,'○',''),'|',''),'|',''),'~',''),'+',''),'丨',''),'│',''),'→',''),'▏',''),'✅',''),'<',''),'>',''), '[[:punct:]]', '') zxbt_wbdfh,zxnr,zxwz,zxly,fbrq,osspath, MD5(CONCAT(REGEXP_REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(zxbt,'○',''),'|',''),'|',''),'~',''),'+',''),'丨',''),'│',''),'→',''),'▏',''),'✅',''),'<',''),'>',''), '[[:punct:]]', ''),'_',IFNULL(fbrq_date,''))) wyuuid,'04' sjlydm from( select t1.uuid, case when t1.bt like '*%' then REPLACE(REPLACE(REPLACE(REPLACE(t1.bt,' ',''),' ',''),' ',''),'*','') when t1.bt like '%*' then REPLACE(REPLACE(REPLACE(REPLACE(t1.bt,' ',''),' ',''),' ',''),'*','') else REPLACE(REPLACE(REPLACE(t1.bt,' ',''),' ',''),' ','') end zxbt, REPLACE(REPLACE(t1.xq,'
','') zxnr,t1.xqurl zxwz,t1.zzly zxly,t1.osspath, case when LENGTH(t1.fbrq) >= 10 then t1.fbrq else null end fbrq, case when LENGTH(t1.fbrq) >= 10 then LEFT(t1.fbrq,10) else null end fbrq_date from etlgck.qxyl_zx_shxw_077 t1 where LENGTH(t1.fbrq) > 4 and t1.xq is not null and t1.xq not like '%