拉链表设计:

  在企业中,由于有些流水表每日有几千万条记录,数据仓库保存5年数据的话很容易不堪重负,因此可以使用拉链表的算法来节省存储空间。

 例子:

-- 用户信息表;  采集当日全量数据存储到 (当日) 表中
CREATE TABLE dwd.user_info(
id string,
name string,
sex string,
biz_date string -- 业务日期
) -- 用户信息整合表
CREATE TABLE dws.user_merge_info(
id string,
name string,
sex string,
start_date string,
end_date string
) -- 测试插入用户信息
INSERT INTO dwd.user_info
SELECT
'','YaoMing','boy',''
UNION ALL
SELECT
'','YaoLinlin','girl',''
UNION ALL
SELECT
'','CaiLili','girl',''
UNION ALL
SELECT
'','ZhangSan','girl',''
UNION ALL
SELECT
'','LiSi','girl','' -- 查看数据
SELECT * FROM dwd.user_info

hive拉链表以及退链例子笔记-LMLPHP

-- 初始化用户信息整合表
INSERT overwrite TABLE dws.user_merge_info
SELECT
id,
name,
sex,
'' AS start_date,
'' AS end_date
FROM (
SELECT
id,
name,
sex,
row_number() over(PARTITION BY id ORDER BY biz_date) AS row_num -- 初始化时候根据主键id分组,取最新修改的数据
FROM dwd.user_info
) t
WHERE t.row_num = 1 -- 查看数据
SELECT * FROM dws.user_merge_info

hive拉链表以及退链例子笔记-LMLPHP

-- 现在biz_date='20190702'这天,新跑了一条全新数据id=6,以及修改了一条id=2的数据
INSERT INTO dwd.user_info
SELECT
'','WangWu','boy',''
UNION ALL
SELECT
'','YaoLinlin','boy','' -- 查看数据
SELECT * FROM dwd.user_info ORDER BY id,biz_date

hive拉链表以及退链例子笔记-LMLPHP

-- 新增修改以及完全新增
INSERT overwrite TABLE tmp.user_merge_info_new
-- 修改的数据
SELECT
b.id,
b.name,
b.sex,
'' AS start_date, -- ${bizdate} 业务日期
'' AS end_date -- 99991231代表有效数据
FROM dws.user_merge_info a,
dwd.user_info b
WHERE a.id = b.id
AND a.end_date = ''
AND b.biz_date = '' -- ${bizdate}只取当天数据
AND (
a.name!= b.name
OR a.sex != b.sex
) UNION ALL
-- 全新的数据
SELECT
b.id,
b.name,
b.sex,
''AS start_date,
''AS end_date
FROM dws.user_merge_info a
RIGHT JOIN dwd.user_info b
ON a.id = b.id
WHERE b.biz_date=''
AND a.id IS NULL;

hive拉链表以及退链例子笔记-LMLPHP

-- 闭链
INSERT overwrite TABLE tmp.user_merge_info_upt
SELECT
a.id,
a.name,
a.sex,
a.start_date,
'' -- 闭链,${biz_date}业务时间
FROM dws.user_merge_info a
LEFT JOIN dwd.user_info b
ON a.id=b.id
WHERE a.end_date=''
AND b.biz_date=''
AND (
a.name != b.name
OR a.sex != b.sex
)

hive拉链表以及退链例子笔记-LMLPHP

-- 历史数据
INSERT overwrite TABLE tmp.user_merge_info_new
SELECT
a.id,
a.name,
a.sex,
a.start_date,
a.end_date
FROM dws.user_merge_info a,
tmp.user_merge_info_upt b
WHERE a.id != b.id;

hive拉链表以及退链例子笔记-LMLPHP

-- 整合数据
INSERT OVERWRITE TABLE dws.user_merge_info
SELECT
id,
name,
sex,
start_date,
end_date
FROM tmp.user_merge_info_new
UNION ALL
SELECT
id,
name,
sex,
start_date,
end_date
FROM tmp.user_merge_info_upt
UNION ALL
SELECT
id,
name,
sex,
start_date,
end_date
FROM tmp.user_merge_info_his -- 查看下数据
SELECT * FROM dws.user_merge_info ORDER BY id,start_date

hive拉链表以及退链例子笔记-LMLPHP

以上拉链表就实现好了


以下是退链操作模板

#!/bin/bash

# 使用说明提示
if [ $# -ne 1 ]; then
echo "Usage : `basename $0` biz_date"
exit 1
fi #业务时间
biz_date=$1 # 判断是数据整合还是回退拉链表
isGoBack=`execHQL "select count(1) from dws.user_merge_info where (end_date>='$biz_date' or start_date>='$biz_date') and biz_date<>'';"` if [ $isGoBack -ne 0 ];then
# 回退模式
Log "\n## 【user_merge_info表回退】 执行开始 ##"
execHQL "
INSERT overwrite TABLE dws.user_merge_info
-- 完全不变的数据
SELECT
id
,name
,sex
,start_date
,end_date
FROM dws.user_merge_info
WHERE (start_date<'$biz_date' AND end_date='') OR end_date<'$biz_date' UNION ALL -- 重跑 重新开链的数据
SELECT
id
,name
,sex
,start_date
,'' AS end_date
FROM dws.user_merge_info
WHERE start_date<'$biz_date' AND end_date>='$biz_date' AND end_date<>'';
"
if [ $? -ne 0 ];then
Log "\n## 【user_merge_info表回退】 执行失败 ##"
exit 1
fi
Log "\n## 【user_merge_info表回退】 执行成功 ##" fi
05-11 08:43