set hive.fetch.task.conversion=more;       -- 避免触发MR job
select distinct name from employee_id limit 2;      -- limit: 随机取出
-- nested SELECT
with t1 as (  -- 第一种方式 nested SELECT using CTE
select * from employee_id
where gender_age.gender = "Male"
)
select name, gender_age.age, gender_age.gender from t1;

select name, gender_age.age, gender_age.gender from ( -- 第二种方式 using nested SELECT after the FROM statement. 更快
select * from employee_id
where gender_age.gender = "Male"
) t1; --given alias

-- where从句中的子查询。
-- 1) IN, NOT IN 仅支持单个字段:
select name, gender_age.age, gender_age.gender  -- 会使用MR job,极其耗时
from employee_id t1
where t1.gender_age in (  -- where中限定字段的表名要使用别称
select gender_age from employee_id
where gender_age.gender = "Male"
);
-- 2) EXIT, NOT EXIT
select name, gender_age.age, gender_age.gender  -- 会使用MR job,极其耗时
from employee_id t1
where exists ( -- 子查询必须同时指向内部和外部表
select * from employee_id t2
where t1.gender_age.gender = t2.gender_age.gender and t2.gender_age.gender = "Male"
);
create table employee_hr (
name string,
employee_id int,
sin_number string,
start_date date
)
row format delimited
fields terminated by "|"
stored as textfile;

-- 内连接:hive不支持unequal join
select emp01.name, emp02.sin_number
from employee_id emp01      -- m ∩ n
join employee_hr emp02 on emp01.name = emp02.name;

select emp01.name, emp02.sin_number
from employee_id emp01, employee_hr emp02 where emp01.name = emp02.name;

    -- 自联接:查询出同一天Apple和IBM的股价
SELECT a.ymd, a.price_close, b.price_close
FROM stocks a JOIN stocks b ON a.ymd = b.ymd
WHERE a.symbol = 'AAPL' AND b.symbol = 'IBM';
    -- 三表联接
select emp01.name, emp02.employee_id, emp01.sin_number
from employee_hr emp01
join employee_id emp02 on emp01.name = emp02.name
join employee_id emp03 on emp02.employee_id = emp03.employee_id;

-- 外连接和交叉联接
join不支持交换律,且始终为左结合
所有join均为:cross join 产生笛卡尔积
--outer join: left join, right join, full join

    -- cross join
set hive.strict.checks.cartesian.product=false;
set hive.mapred.mode=nonstrict;
    1.
select e1.name, e2.sin_number
from employee e1
join employee_hr e2;  -- 若不加条件,则join为交叉联接
    2.
select e1.name, e2.sin_number
from employee e1
join employee_hr e2 on 1 = 1; -- 或者联接条件始终返回true,也为交叉联接

    -- unqual join
select e1.name, e2.sin_number
from employee e1
cross join employee_hr e2   -- join始终在where之前发生
where e1.name <> e2.name;

-- mapjoin
set hive.auto.convert.join=true;   -- hive自动将join转化为map join。
-- regular map-side join
select /*+ mapjoin(employee) */ emp01.name, emp02.sin_number
from employee emp01
cross join employee_hr emp02
where emp01.name <> emp02.name;

-- bucket map join:
set hive.optimize.bucketmapjoin = true;    -- 开启bucket map join
set hive.optimize.bucketmapjoin.sortedmerge = true;
set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;  -- 开启sort-merge join,要求两表有相同的桶数

-- 获取两表的交集并从交集中查询出a表的name字段:
select a.name
from employee a
where exists (
select * from employee_id ei where a.name = ei.name
);
-- left semi join中select和where从句不能引用右侧表的字段,其比内联接更为高效,因为内联接在m * n数集中查询,而左半联接则是若右表满足联接条件,则返回左表中的记录。
-- hive中无右半联接
select a.name
from employee a
left semi join employee_id ei on a.name = ei.name;

-- union all 垂直操作结果集,包含重复元素
select a.name
from employee a
union all   -- 用在外层查询
select b.name
from employee_hr b;

-- 类似于union(去重)
select distanct name
from (  -- union all的子查询中要求:两子查询的字段数相同,并且相同位置的字段类型也相同。
select a.name as name
from employee a
union all
select b.name as name
from employee_hr b
) u1; -- 必须给定查询结果的别名
05-12 14:06