作为ROLAP引擎的SparkSQL具备丰富的SQL算子,比如关联、聚合、分组、窗口函数以及内置的各种函数、hint等等,在功能层面较好地满足了业务分析场景的需要。作为平台工具的开发者,在提供好用、稳定的工具的同时,更要掌握一定的SQL使用方式,来加深各个算子在实际场景的应用效果。故开此文,长期记录使用SQL进行经典OLAP分析的场景或者是一些奇淫技巧。

场景1

有用户每日访问数据表,user_visit_log(userid, visit_date, visit_count),使用SQL统计出每个用户在当月累积访问次数和总累计访问次数?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
create table user_visit_log(user_id string, visit_date string, visti_count int) using orc;
insert into usr_visit_log
values
('u01', '2017-01-01',5),('u02','2017-01-23',6),
('u03', '2017-01-22',8),('u04','2017-01-20',3),
('u01', '2017-01-23',6),('u01','2017-02-21',8),
('u02', '2017-01-23',6),('u01','2017-02-22',4);

with a as (
select
*,
date_format(visit_date, 'yyyyMM') as visit_month
from
user_visit_log
),
b as (
select
user_id,
visit_month,
sum(visit_count) as month_total_visit_count
from
a
group by
user_id, visit_month
)
select
*,
sum(month_total_visit_count) over(partition by user_id) as total_visit_count
from
b
order by
user_id, visit_month;

场景2

有用户表user(user_id, name, age)和观影记录表view_record(user_id, moive_name),记录各个年龄段的观影次数?(10年作为一个区间,70岁以上作为一类)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
with a as (
select
user_id,
count(1) as cnt
from
view_record
group by
user_id
),
b as (
select
case
when age<10 then '10岁以下'
when age>=10 and age<20 then '10-20'
when age>=20 and age<30 then '20-30'
when age>=30 and age<40 then '30-40'
when age>=40 and age<50 then '40-50'
when age>=50 and age<60 then '50-60'
when age>=60 and age<70 then '60-70'
else '70岁以上'
end as age_level,
a.cnt
from
a
left join user b on a.user_id = b.user_id
)
select
age_level,
sum(b.cnt) as cnt
from
b
group by age_level order by cnt;

场景3

有用户访问日志表user_access_log(date_time, user_id, age),请用SQL计算出活跃用户数量和平均年龄?(活跃用户表示连续两天访问的用户)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
create table user_access_log(date_time string, user_id string, age int) using orc;
insert into user_access_log
values
('2019-02-12', 2, 19), ('2019-02-11', 1, 23),
('2019-02-11', 3, 39), ('2019-02-11', 1, 23),
('2019-02-11', 3, 39), ('2019-02-13', 1, 23),
('2019-02-15', 2, 19), ('2019-02-11', 2, 19),
('2019-02-11', 1, 23), ('2019-02-16', 2, 19);

with a as (
select
*,
-- datediff(enddate, startdate) 相差的天数
datediff(
date_time,
-- 在当前行向前取一行
lead(date_time, -1, date_time) over(partition by user_id order by date_time)
) as flag
from
user_access_log
)
select
count(distinct(user_id)) as active_user_cnt,
avg(age) as avg_age
from
a
where a.flag = 1;

场景4

有用户购买记录表,user_pay_log(user_id, money, pay_time, order_id),请用sql写出所有用户中在2020年10月份第一次购买商品的金额?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
create table user_pay_log(user_id string, money double, pay_time string, order_id string) using orc;
insert into user_pay_log
values
('01',18.5,'2017-10-01','o00001'),('01',205,'2020-10-01','o00002'),
('02',18.5,'2017-09-01','o00003'),('02',20.5,'2020-10-11','o00004'),
('03',100,'2017-09-01','o00005');

insert into user_pay_log
values
('01',18.5,'2017-10-05','o00006'),('01',205,'2020-10-11','o00007'),
('02',18.5,'2017-09-15','o00008'),('02',20.5,'2020-10-30','o00009'),
('03',100,'2017-10-01','o00010');

with a as (
select
user_id,
money,
pay_time,
ROW_NUMBER() over(partition by user_id order by pay_time asc) as rank
from
user_pay_log
where
DATE_FORMAT(pay_time, 'yyyy-MM')='2020-10'
)
select
user_id,
pay_time,
money
from
a
where
rank = 1;

场景5

有表account(dist_id, account, gold_coin) ,请写出SQL语句,查询各自区组的money排名前3的账号?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
CREATE  table account(dist_id string, account string, gold_coin int) using orc;
insert into account
values
('A', 'user1', 10),('A', 'user2', 2),('A', 'user3', 11),
('A', 'user4', 8),('B', 'user1', 1),('B', 'user2', 103),
('C', 'user1', 10);

with a (
select
dist_id,
account,
gold_coin,
row_number() over(partition by dist_id order by gold_coin desc) as rank
from
account
)
select
dist_id,
account,
gold_coin
from
a
where
rank <= 3;

场景6

有充值记录表credit_log(dist_id, account, money, create_time),请写出SQL语句,查询充值日志表2020年08月08号每个区组下充值额最大的账号,要求结果: 区组id,账号,金额,充值时间 ?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
create table credit_log(dist_id int, account string, money int, create_time string) using orc;
insert into credit_log
values
(1, 'user1', 100, '2021-01-01'),
(1, 'user1', 200, '2021-08-08'),
(1, 'user2', 10, '2021-01-01'),
(1, 'user3', 1000, '2020-08-08'),
(2, 'user4', 100, '2021-01-01'),
(2, 'user5', 20, '2021-08-08'),
(3, 'user6', 10, '2021-01-01'),
(4, 'user7', 1500, '2020-08-08');
insert into credit_log
values
(1, 'user8', 1000, '2020-08-08');

with a (
select
*,
-- 这里用rank原因是防止有同等金额的都属于最大的。row_number是打编号,求topn
rank() over(partition by dist_id order by money desc) as rank
from
credit_log
where
create_time = '2020-08-08'
)
select
dist_id,
account,
money,
create_time
from
a
where
rank = 1;