自学内容网 自学内容网

Hive练习题16-20

题目16: 同时在线问题

如下为某直播平台主播开播及关播时间,根据该数据计算出平台最高峰同时在线的主播人数。
id          stt                 edt
1001,2021-06-14 12:12:12,2021-06-14 18:12:12
1003,2021-06-14 13:12:12,2021-06-14 16:12:12
1004,2021-06-14 13:15:12,2021-06-14 20:12:12
1002,2021-06-14 15:12:12,2021-06-14 16:12:12
1005,2021-06-14 15:18:12,2021-06-14 20:12:12
1001,2021-06-14 20:12:12,2021-06-14 23:12:12
1006,2021-06-14 21:12:12,2021-06-14 23:15:12
1007,2021-06-14 22:12:12,2021-06-14 23:10:12


-- DDL
create table t16 (
  `id` string comment '主播id',
  `stt` string comment '主播登入时间',
  `edt` string comment '主播登出时间')
 comment '主播登入记录表'
row format delimited fields terminated by '\t'
lines terminated by '\n' stored as orc;


insert overwrite table t16
select '1001', '2021-06-14 12:12:12', '2021-06-14 18:12:12' union all
select '1003', '2021-06-14 13:12:12', '2021-06-14 16:12:12' union all
select '1004', '2021-06-14 13:15:12', '2021-06-14 20:12:12' union all
select '1002', '2021-06-14 15:12:12', '2021-06-14 16:12:12' union all
select '1005', '2021-06-14 15:18:12', '2021-06-14 20:12:12' union all
select '1001', '2021-06-14 20:12:12', '2021-06-14 23:12:12' union all
select '1006', '2021-06-14 21:12:12', '2021-06-14 23:15:12' union all
select '1007', '2021-06-14 22:12:12', '2021-06-14 23:10:12' union all
select '1008', '2021-06-14 11:12:12', '2021-06-14 19:10:12' ;

1. 思路分析
    1. 每条数据与本表自关联(笛卡尔积)
    2. 筛选出 同时在线的数据
        同时在线的条件 :
            1001 1----------5
              2-----------6
                 t1.stt <= t2.edt
            and  t1.edt >= t2.stt

-- 这个题目的思路是:每一个主播同时都跟谁同时在线。
-- 查询sql
with t as (
    select t1.id,t1.stt,t1.edt,count(distinct t2.id) onlineNum from t16 t1 left join t16 t2
    on t1.stt <= t2.edt
            and  t1.edt >= t2.stt group by t1.id,t1.stt,t1.edt
)
select max(onlineNum) from t;

-- 查询结果
t1.id   t1.stt                  t1.edt                  cnt
1001    2021-06-14 12:12:12     2021-06-14 18:12:12     6
1001    2021-06-14 20:12:12     2021-06-14 23:12:12     5
1002    2021-06-14 15:12:12     2021-06-14 16:12:12     6
1003    2021-06-14 13:12:12     2021-06-14 16:12:12     6
1004    2021-06-14 13:15:12     2021-06-14 20:12:12     6
1005    2021-06-14 15:18:12     2021-06-14 20:12:12     6
1006    2021-06-14 21:12:12     2021-06-14 23:15:12     3
1007    2021-06-14 22:12:12     2021-06-14 23:10:12     3
1008    2021-06-14 11:12:12     2021-06-14 19:10:12     6

说明 : 如下为某直播平台主播开播及关播时间,根据该数据计算出平台最高峰同时在线的主播人数

-- 思路
-- 1. 将一条数据拆分成两条(id,dt,flag),并且对数据进行打标:开播为1,关播为-1
-- 2. 然后按照dt排序,累计到当前时刻的flag之和,这个即为当前时刻主播在线的人数
-- 3. 获取上一步得到的主播在线人数的最大值,即为最高峰同时在线的主播人数

-- 答案
with t as (
    select
    id,stt dt , 1 flag
    from t16
    union all
    select
        id,edt dt , -1 flag
        from t16
),t2 as (
    select id,dt,flag,sum(flag) over(order by dt) onlineNum from t
)
select max(onlineNum) from t2 ;


题目17: 车辆每天上传N条数据,要求获得每日最后一条数据,请用至少三种方式写出来,并说明三种方式的区别

vin                pkgts             value
1111111111111111,2022-06-01 16:54:57,60.0
1111111111111111,2022-06-01 07:40:27,6.0
2222222222222222,2022-06-01 19:40:27,7.0
1111111111111111,2022-06-26 12:16:45,64.0
2222222222222222,2022-06-05 23:00:42,18.0
2222222222222222,2022-06-05 09:10:17,44.0
2222222222222222,2022-06-05 13:00:25,19.0
2222222222222222,2022-06-13 16:14:04,43.0
2222222222222222,2022-06-13 07:21:08,40.0



-- 17三种方法第一种
row_number() 编号取最后
with s1 as
(select vin,split(pkgts,' ')[0] day,split(pkgts,' ')[1] times,value from t17),
    s2 as
(select *,row_number() over (partition by day order by times desc) key from s1)
select vin,concat(day,' ',times),value from s2 where key = 1;

-- 17三种方法第二种
排序 用first_value 取分组内排序后,截止到当前行,第一个值 或 last_value 分组内排序后,截止到当前行,最后一个值
with s1 as
(select vin,split(pkgts,' ')[0] day,split(pkgts,' ')[1] times,value from t17),
    s2 as
(select last_value(day) over(partition by day order by times) k1,
       first_value(times) over(partition by day order by times desc) k2,
       first_value(vin) over(partition by day order by times desc ) vin,
       first_value(value) over(partition by day order by times desc ) value
       from s1)
select distinct vin,concat(k1,' ',k2),value from s2 ;


show create table t17

-- 17三种方法第三种
用lead取下一天,要是最后一天就取不到为null把这列为null的全拿出来就是要的结果
with s1 as
(select *,split(pkgts,' ')[0] days from t17),
    s2 as
(select *,lead(pkgts) over (partition by days order by pkgts) key from s1)
select vin,pkgts,value from s2 where key is null;

select version()

题目18:打折日期交叉问题

如下为平台商品促销数据:字段为品牌,打折开始日期,打折结束日期
brand stt        edt
oppo,2021-06-05,2021-06-09
oppo,2021-06-11,2021-06-21
vivo,2021-06-05,2021-06-15
vivo,2021-06-09,2021-06-21
redmi,2021-06-05,2021-06-21
redmi,2021-06-09,2021-06-15
redmi,2021-06-17,2021-06-26
huawei,2021-06-05,2021-06-26
huawei,2021-06-09,2021-06-15
huawei,2021-06-17,2021-06-21

计算每个品牌总的打折销售天数,注意其中的交叉日期,比如 vivo 品牌,
第一次活动时间为 2021-06-05 到 2021-06-15,第二次活动时间为 2021-06-09 到 2021-06-21 其中 9 号到 15号为重复天数,
只统计一次,即 vivo 总打折天数为 2021-06-05 到 2021-06-21 共计 17 天。



with s1 as
(select *,
       lead(stt) over (partition by brand order by stt) nextstt,
       `if`(datediff(edt,lead(stt) over (partition by brand order by stt))>0,1,0) key from t18),
    s2 as
(select *,`if`(lag(key) over (partition by brand order by stt) = 1,'yes','no') finkey,
       row_number() over (partition by brand order by stt) num
       from s1),
    s3 as
(select *,lead(finkey) over (partition by brand order by stt) nextkey
from s2),
    s4 as
(select brand,
       `if`(num=1,`if`(nextkey='yes',datediff(lead(edt) over(partition by brand order by stt) ,stt)+1,datediff(edt,stt)+1),
           `if`(finkey='yes',0,datediff(edt,stt)+1)) n1
       from s3)
select brand,sum(n1) `打折时间` from s4 group by brand;

题目19: 【*】间隔连续问题

某游戏公司记录的用户每日登录数据
id         dt
1001 2021-12-12
1002 2021-12-12
1001 2021-12-13
1001 2021-12-14
1001 2021-12-16
1002 2021-12-16
1001 2021-12-19
1002 2021-12-17
1001 2021-12-20

计算每个用户最大的连续登录天数,可以间隔一天。解释:如果一个用户在 1,3,5,6 登录游戏,则视为连续 6 天登录。

create table login_in(
    id int,
    dt string
)
row format delimited
fields terminated by ' ';

load data local inpath '/home/hivedata/login.txt' into table login_in;

select * from login_in;


with t as (
  select *,
  lead(dt,1,dt) over (partition by id order by dt ) next_dt,
  if(datediff(lead(dt, 1, dt) over (partition by id order by dt ), dt) > 2, null,
          datediff(lead(dt, 1, dt) over (partition by id order by dt ), dt)
         )days
  from login_in
),t2 as (
  select *,sum(if(days <=2 ,0,1)) over (partition by id order by dt) groupId from t
),t3 as (
  select id,sum(days)+1 activeDays from t2 group by id,groupId
)
select id,max(activeDays) from t3 group by id;

题目20:找出所有科目成绩都大于某一学科平均成绩的学生

表结构:uid,subject_id,score

数据集如下
10010190
10010290
10010390
10020185
10020285
10020370
10030170
10030270
10030385

1)建表语句

create table score(
uid string,
subject_id string,
score int)
row format delimited fields terminated by '\t'; 

2)求出每个学科平均成绩

select
uid,
score,
avg(score) over(partition by subject_id) avg_score
from
score;t1

3)根据是否大于平均成绩记录flag,大于则记为0否则记为1

select
uid,
if(score>avg_score,0,1) flag
from
t1;t2

4)根据学生id进行分组统计flag的和,和为0则是所有学科都大于平均成绩

select
uid
from
t2
group by
uid
having
sum(flag)=0;

5)最终SQL

select
uid
from
(select
uid,
if(score>avg_score,0,1) flag
from
(select
uid,
score,
avg(score) over(partition by subject_id) avg_score
from
score)t1)t2
group by
uid
having
sum(flag)=0;


原文地址:https://blog.csdn.net/yyzzyyds/article/details/144773894

免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!