4.4 hive综合案例

  • 内容推荐数据处理

    • 需求
      • 根据用户行为以及文章标签筛选出用户最感兴趣(阅读最多)的标签
  • 相关数据

    ​ user_id article_id event_time

    11,101,2018-12-01 06:01:10
    22,102,2018-12-01 07:28:12
    33,103,2018-12-01 07:50:14
    11,104,2018-12-01 09:08:12
    22,103,2018-12-01 13:37:12
    33,102,2018-12-02 07:09:12
    11,101,2018-12-02 18:42:12
    35,105,2018-12-03 09:21:12
    22,104,2018-12-03 16:42:12
    77,103,2018-12-03 18:31:12
    99,102,2018-12-04 00:04:12
    33,101,2018-12-04 19:10:12
    11,101,2018-12-05 09:07:12
    35,102,2018-12-05 11:00:12
    22,103,2018-12-05 12:11:12
    77,104,2018-12-05 18:02:02
    99,105,2018-12-05 20:09:11
    
    • 文章数据
    artical_id,artical_url,artical_keywords
    101,http://www.itcast.cn/1.html,kw8|kw1
    102,http://www.itcast.cn/2.html,kw6|kw3
    103,http://www.itcast.cn/3.html,kw7
    104,http://www.itcast.cn/4.html,kw5|kw1|kw4|kw9
    105,http://www.itcast.cn/5.html,
    
  • 数据上传hdfs

    hadoop fs -mkdir /tmp/demo
    hadoop fs -mkdir /tmp/demo/user_action
    
  • 创建外部表

    • 用户行为表
    drop table if exists user_actions;
    CREATE EXTERNAL TABLE user_actions(
        user_id STRING,
        article_id STRING,
        time_stamp STRING
    )
    ROW FORMAT delimited fields terminated by ','
    LOCATION '/tmp/demo/user_action';
    
    • 文章表
    drop table if exists articles;
    CREATE EXTERNAL TABLE articles(
        article_id STRING,
        url STRING,
        key_words array<STRING>
    )
    ROW FORMAT delimited fields terminated by ',' 
    COLLECTION ITEMS terminated BY '|' 
    LOCATION '/tmp/demo/article_keywords';
    /*
    key_words array<STRING>  数组的数据类型
    COLLECTION ITEMS terminated BY '|'  数组的元素之间用'|'分割
    */
    
    • 查看数据
    select * from user_actions;
    select * from articles;
    
    • 分组查询每个用户的浏览记录

      • collect_set/collect_list作用:
        • 将group by中的某列转为一个数组返回
        • collect_list不去重而collect_set去重
      • collect_set
      select user_id,collect_set(article_id) 
      from user_actions group by user_id;
      
      11      ["101","104"]
      22      ["102","103","104"]
      33      ["103","102","101"]
      35      ["105","102"]
      77      ["103","104"]
      99      ["102","105"]
      
      • collect_list
      select user_id,collect_list(article_id) 
      from user_actions group by user_id;
      
      11      ["101","104","101","101"]
      22      ["102","103","104","103"]
      33      ["103","102","101"]
      35      ["105","102"]
      77      ["103","104"]
      99      ["102","105"]
      
      • sort_array: 对数组排序
      select user_id,sort_array(collect_list(article_id)) as contents 
      from user_actions group by user_id;
      
      11      ["101","101","101","104"]
      22      ["102","103","103","104"]
      33      ["101","102","103"]
      35      ["102","105"]
      77      ["103","104"]
      99      ["102","105"]
      
    • 查看每一篇文章的关键字 lateral view explode

      • explode函数 将array 拆分
      select explode(key_words) from articles;
      
      • lateral view 和 explode 配合使用,将一行数据拆分成多行数据,在此基础上可以对拆分的数据进行聚合
      select article_id,kw from articles lateral view explode(key_words) t as kw;
      
      101     kw8
      101     kw1
      102     kw6
      102     kw3
      103     kw7
      104     kw5
      104     kw1
      104     kw4
      104     kw9
      
      select article_id,kw from articles lateral view outer explode(key_words) t as kw;
      
      101     kw8
      101     kw1
      102     kw6
      102     kw3
      103     kw7
      104     kw5
      104     kw1
      104     kw4
      104     kw9
      105     NULL
      #含有outer
      
  • 根据文章id找到用户查看文章的关键字

    • 原始数据
    101     http://www.itcast.cn/1.html     ["kw8","kw1"]
    102     http://www.itcast.cn/2.html     ["kw6","kw3"]
    103     http://www.itcast.cn/3.html     ["kw7"]
    104     http://www.itcast.cn/4.html     ["kw5","kw1","kw4","kw9"]
    105     http://www.itcast.cn/5.html     []
    
    select a.user_id, b.kw from user_actions 
    as a left outer JOIN (select article_id,kw from articles
    lateral view outer explode(key_words) t as kw) b
    on (a.article_id = b.article_id)
    order by a.user_id;
    
    11      kw1
    11      kw8
    11      kw5
    11      kw1
    11      kw4
    11      kw1
    11      kw9
    11      kw8
    11      kw1
    11      kw8
    22      kw1
    22      kw7
    22      kw9
    22      kw4
    22      kw5
    22      kw7
    22      kw3
    22      kw6
    33      kw8
    33      kw1
    33      kw3
    33      kw6
    33      kw7
    35      NULL
    35      kw6
    35      kw3
    77      kw9
    77      kw1
    77      kw7
    77      kw4
    77      kw5
    99      kw3
    99      kw6
    99      NULL
    
  • 根据文章id找到用户查看文章的关键字并统计频率

    select a.user_id, b.kw,count(1) as weight 
    from user_actions as a 
    left outer JOIN (select article_id,kw from articles
    lateral view outer explode(key_words) t as kw) b
    on (a.article_id = b.article_id)
    group by a.user_id,b.kw 
    order by a.user_id,weight desc;
    
    11      kw1     4
    11      kw8     3
    11      kw5     1
    11      kw9     1
    11      kw4     1
    22      kw7     2
    22      kw9     1
    22      kw1     1
    22      kw3     1
    22      kw4     1
    22      kw5     1
    22      kw6     1
    33      kw3     1
    33      kw8     1
    33      kw7     1
    33      kw6     1
    33      kw1     1
    35      NULL    1
    35      kw3     1
    35      kw6     1
    77      kw1     1
    77      kw4     1
    77      kw5     1
    77      kw7     1
    77      kw9     1
    99      NULL    1
    99      kw3     1
    99      kw6     1
    
  • CONCAT: CONCAT(str1,str2,…)

    返回结果为连接参数产生的字符串。如有任何一个参数为NULL ,则返回值为 NULL。

    select concat(user_id,article_id) from user_actions;
    

    CONCAT_WS:

    使用语法为:CONCAT_WS(separator,str1,str2,…)

    CONCAT_WS() 代表 CONCAT With Separator ,是CONCAT()的特殊形式。第一个参数是其它参数的分隔符。分隔符的位置放在要连接的两个字符串之间。分隔符可以是一个字符串,也可以是其它参数。如果分隔符为 NULL,则结果为 NULL。

    select concat_ws(':',user_id,article_id) from user_actions;
    
  • 将用户查看的关键字和频率合并成 key:value形式

    select a.user_id, concat_ws(':',b.kw,cast (count(1) as string)) as kw_w 
    from user_actions as a 
    left outer JOIN (select article_id,kw from articles
    lateral view outer explode(key_words) t as kw) b
    on (a.article_id = b.article_id)
    group by a.user_id,b.kw;
    
    11      kw1:4
    11      kw4:1
    11      kw5:1
    11      kw8:3
    11      kw9:1
    22      kw1:1
    22      kw3:1
    22      kw4:1
    22      kw5:1
    22      kw6:1
    22      kw7:2
    22      kw9:1
    33      kw1:1
    33      kw3:1
    33      kw6:1
    33      kw7:1
    33      kw8:1
    35      1
    35      kw3:1
    35      kw6:1
    77      kw1:1
    77      kw4:1
    77      kw5:1
    77      kw7:1
    77      kw9:1
    99      1
    99      kw3:1
    99      kw6:1
    
  • 将用户查看的关键字和频率合并成 key:value形式并按用户聚合

    select cc.user_id,concat_ws(',',collect_set(cc.kw_w))
    from(
    select a.user_id, concat_ws(':',b.kw,cast (count(1) as string)) as kw_w 
    from user_actions as a 
    left outer JOIN (select article_id,kw from articles
    lateral view outer explode(key_words) t as kw) b
    on (a.article_id = b.article_id)
    group by a.user_id,b.kw
    ) as cc 
    group by cc.user_id;
    
    11      kw1:4,kw4:1,kw5:1,kw8:3,kw9:1
    22      kw1:1,kw3:1,kw4:1,kw5:1,kw6:1,kw7:2,kw9:1
    33      kw1:1,kw3:1,kw6:1,kw7:1,kw8:1
    35      1,kw3:1,kw6:1
    77      kw1:1,kw4:1,kw5:1,kw7:1,kw9:1
    99      1,kw3:1,kw6:1
    
  • 将上面聚合结果转换成map

    select cc.user_id,str_to_map(concat_ws(',',collect_set(cc.kw_w))) as wm
    from(
    select a.user_id, concat_ws(':',b.kw,cast (count(1) as string)) as kw_w 
    from user_actions as a 
    left outer JOIN (select article_id,kw from articles
    lateral view outer explode(key_words) t as kw) b
    on (a.article_id = b.article_id)
    group by a.user_id,b.kw
    ) as cc 
    group by cc.user_id;
    
    11      {"kw1":"4","kw4":"1","kw5":"1","kw8":"3","kw9":"1"}
    22      {"kw1":"1","kw3":"1","kw4":"1","kw5":"1","kw6":"1","kw7":"2","kw9":"1"}
    33      {"kw1":"1","kw3":"1","kw6":"1","kw7":"1","kw8":"1"}
    35      {"1":null,"kw3":"1","kw6":"1"}
    77      {"kw1":"1","kw4":"1","kw5":"1","kw7":"1","kw9":"1"}
    99      {"1":null,"kw3":"1","kw6":"1"}
    
  • 将用户的阅读偏好结果保存到表中

    create table user_kws as 
    select cc.user_id,str_to_map(concat_ws(',',collect_set(cc.kw_w))) as wm
    from(
    select a.user_id, concat_ws(':',b.kw,cast (count(1) as string)) as kw_w 
    from user_actions as a 
    left outer JOIN (select article_id,kw from articles
    lateral view outer explode(key_words) t as kw) b
    on (a.article_id = b.article_id)
    group by a.user_id,b.kw
    ) as cc 
    group by cc.user_id;
    
  • 从表中通过key查询map中的值

    select user_id, wm['kw1'] from user_kws;
    
    11      4
    22      1
    33      1
    35      NULL
    77      1
    99      NULL
    
  • 从表中获取map中所有的key 和 所有的value

    select user_id,map_keys(wm),map_values(wm) from user_kws;
    
    11      ["kw1","kw4","kw5","kw8","kw9"] ["4","1","1","3","1"]
    22      ["kw1","kw3","kw4","kw5","kw6","kw7","kw9"]     ["1","1","1","1","1","2","1"]
    33      ["kw1","kw3","kw6","kw7","kw8"] ["1","1","1","1","1"]
    35      ["1","kw3","kw6"]       [null,"1","1"]
    77      ["kw1","kw4","kw5","kw7","kw9"] ["1","1","1","1","1"]
    99      ["1","kw3","kw6"]       [null,"1","1"]
    
  • 用lateral view explode把map中的数据转换成多列

    select user_id,keyword,weight from user_kws lateral view explode(wm) t as keyword,weight;
    
    11      kw1     4
    11      kw4     1
    11      kw5     1
    11      kw8     3
    11      kw9     1
    22      kw1     1
    22      kw3     1
    22      kw4     1
    22      kw5     1
    22      kw6     1
    22      kw7     2
    22      kw9     1
    33      kw1     1
    33      kw3     1
    33      kw6     1
    33      kw7     1
    33      kw8     1
    35      1       NULL
    35      kw3     1
    35      kw6     1
    77      kw1     1
    77      kw4     1
    77      kw5     1
    77      kw7     1
    77      kw9     1
    99      1       NULL
    99      kw3     1
    99      kw6     1
    

results matching ""

    No results matching ""