1、Emulating full outer join
select * from t1
left join t2 on t1.id = t2.id
union
select * from t1
right join t2 on t1.id = t2.id
2、Top K per group: group_concat & find_in_set
select
t1.*
from books_toufang t1
join
(
select
app_id
, group_concat(id order by chapter_count desc) as group_id_set
from books_toufang
group by app_id
)t2
on t1.app_id=t2.app_id and find_in_set(t1.id, group_id_set) between 1 and 5
order by t1.app_id, t1.chapter_count desc
;
set [session | global] group_concat_max_len = 10240;
3、Top N rows per group (self-join)
SQL: Getting TOP N rows for a grouped query
select
t1.*, t2.rn
from books_toufang t1
join
(
select
t1.id
, count(*) as rn
from books_toufang t1
join books_toufang t2
on t1.app_id=t2.app_id and t1.chapter_count<=t2.chapter_count
group by t1.id
having count(*)<=5
) t2
on t1.id=t2.id
order by app_id, rn
;
4、Find the row holding a column's max value
select
user_id,
my_date
from
(
select
user_id,
my_date,
max(my_date) over (partition by user_id) as max_my_date
from users
) t
where my_date = max_my_date
;
select
a.*
from
users a
inner join
(
select user_id, max(my_date) as max_my_date from users group by user_id
) as b
on a.user_id = b.user_id and a.my_date = b.max_my_date;
5、select into vs insert into select
-- table2 must already exist
insert into table2(a, c, d) select a,c,5 from table1;
-- table2 must not exist; it is created automatically by the insert (SQL Server-style select into; MySQL does not support this form)
select a,c into table2 from table1;
6、Detect non-numeric values
select x from table where cast(field as double) is not null;
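For intuition, a minimal check of the cast behavior this relies on (Hive semantics; the literals are illustrative):
select cast('123' as double); -- 123.0
select cast('12a' as double); -- NULL, so the row is filtered out above
select cast('' as double); -- NULL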
7、MySQL table metadata
-- column names and comments
select column_name,
column_comment
from information_schema.columns
where table_schema ='db'
and table_name = 'tablename' ;
-- table comment
select table_name,
table_comment
from information_schema.tables
where table_schema = 'db'
and table_name ='tablename';
-- show the table's DDL
show create table table_name;
-- list all databases
select schema_name from information_schema.schemata;
8、Detect Chinese / multi-byte characters
regexp '[\u0391-\uFFE5]'
length(name) <> char_length(name)
-- regexp is case-insensitive by default; binary makes it case-sensitive
select 'a' regexp 'A', 'a' regexp binary 'A';
9、Hive: day of week from a date
-- returns 0-6 (0 = Sunday ... 6 = Saturday); 2017-01-01 was a Sunday
select pmod(datediff('2017-07-27', '2017-01-01'), 7) from dual;
10、Hive mapjoin with multiple small tables
select /*+ mapjoin(b, c) */ a.val, b.val, c.val from a
join b on (a.key = b.key1)
join c on (c.key = b.key1)
11、Show table structure and location
describe formatted table1;
describe formatted dbname.table1 partition(ds='20180808');
describe extended table1;
describe extended dbname.table1 partition(ds='20180808');
-- get the CREATE TABLE statement
show create table table1;
-- shows owner, location, format, # files, size, etc.
show table extended like table1;
-- shows # files, # rows, raw data size, total size
show tblproperties table1;
desc function extended datediff;
12、Add columns and change column order
alter table table1 change col_old_name col_new_name column_type after column_name;
alter table table1 change ts ts bigint;
alter table table1 change ts new_col bigint;
-- change column order
alter table table1 change column c c string after a;
-- add a column; cascade: also update the metadata of existing partitions, restrict (default): leave it unchanged
-- with cascade, the new column gets values when the data is backfilled; without it, the new column stays NULL for existing partitions even after refreshing the data
alter table table1 add columns (added_column string comment 'newly added column') cascade;
alter table table1 change column added_column added_column string after original_column1 cascade;
alter table table1 drop if exists partition(ds='${ds}');
13、Select partitions within a date range
select * from table1
where ds >= date_add(to_date('${bizdate}'), -30)
and ds < '${bizdate}'
-- alternative filter on a timestamp column (MaxCompute-style datediff):
datediff(to_date('${bizdate}', 'yyyymmdd'), gmt_create, 'dd') <= 30
14、MySQL batch update via join
update my_table t1
join (
select 1 as id, 10 as _c1, 20 as _c2
) t2 on t1.id = t2.id
set c1 = _c1, c2 = _c2;
15、MySQL: add a column
alter table report_all add column registered bigint(20) not null comment 'real-time activations' default 0 after id;
16、Explode a list-valued column into rows
user_id, rec_result
73008500 [43143,104248,43470,135783,112564,129673,26992,42237,34775,46552]
62331224 [67101,43477,85409,85397,121266,39521,36232,104000,58363,129365]
104767742 [39655,41067,42119,111065,36115,36187,67580,33333,33576,53355]
165109731 [100470,29320,25469,42522,135783,39897,26485,26992,46552,26817]
164701169 [143044,41470,46407,93705,46380,67070,33576,117227,41067,29879]
128165435 [43143,121255,95172,121256,117605,106697,106686,96297,43470,138398]
148097232 [121304,121256,117617,39797,75608,40534,19953,36232,121306,52857]
select
user_id, rec_book_id
from noveldw_dev.tdl_lsr_rec_free_banner_fdt
lateral view explode(split(substr(rec_result, 2, length(rec_result)-2), ',')) t as rec_book_id
where ds='2019-01-27'
limit 10;
17、Run a parameterized Hive script from the shell
hive --database "${db}" --hivevar ds='${ds}' --hivevar hh='${hh}' --hivevar mi='${mi}' \
-f adl_xxx_smt.sql \
>> ${LOG_BASE}/output.${ds}.${hh}.log 2>&1 &
-- inside adl_xxx_smt.sql, reference the parameters as ds='${ds}' and hh='${hh}'
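For reference, a minimal sketch of how those variables might be consumed inside adl_xxx_smt.sql (the target table and columns are illustrative, not from the original script):
-- adl_xxx_smt.sql (illustrative)
insert overwrite table adl_xxx_smt partition (ds='${ds}', hh='${hh}')
select user_id, count(1) as pv
from some_source_table
where ds='${ds}' and hh='${hh}'
group by user_id;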
18、Set job queue and priority
Hive:
SET mapreduce.job.queuename=root.etl.distcp;
SET mapreduce.job.priority=HIGH;
MapReduce:
hadoop jar app.jar -D mapreduce.job.queuename=root.etl.distcp -D mapreduce.job.priority=HIGH
19、Hive window functions
select
uid,
create_time,
order_id,
row_number() over(partition by uid order by create_time) as rn,
last_value(order_id) over(partition by uid order by create_time) as last,   -- default frame ends at the current row, so this returns the current row's order_id
first_value(order_id) over(partition by uid order by create_time desc) as first  -- descending order, so this is the latest order_id
from tmp_his_order
order by uid, create_time;
-- ntile(n): splits the ordered rows of each partition into n buckets and returns the bucket number of the current row
select
order_id,
create_time,
pv,
ntile(3) over(partition by order_id order by pv desc) as rn
from tmp_his_order;
-- rank(): ties get the same rank and leave gaps in the numbering
-- dense_rank(): ties get the same rank without gaps
select
order_id,
create_time,
pv,
rank() over(partition by order_id order by pv desc) as rn1,
dense_rank() over(partition by order_id order by pv desc) as rn2,
row_number() over(partition by order_id order by pv desc) as rn3
from tmp_his_order
where order_id = 'order1';
-- cume_dist: (rows with value <= current value) / (total rows in the partition)
select
dept,
userid,
price,
cume_dist() over(order by price) as rn1,
cume_dist() over(partition by dept order by price) as rn2
from tmp_his_order;
-- grouping__id
select
group_id
, phone_brand
, network_type
, provider_name
, count(1) user_num
from
(
select
cast(grouping__id as bigint)&7 as group_id, -- cast grouping__id to a numeric type before applying &
nvl(phone_brand, 'dedup total') phone_brand,
nvl(network_type, 'dedup total') network_type,
nvl(provider_name, 'dedup total') provider_name,
user_id
from temp.temp_active_user_info t
group by phone_brand, --1
network_type, --2
provider_name, --4
user_id --8
grouping sets (
(phone_brand, user_id), --9&7=1
(network_type, provider_name, user_id) --14&7=6
)
) t
group by group_id, phone_brand, network_type, provider_name
;
-- 1、cast grouping__id to a numeric type first
-- 2、do not put spaces around the & operator
-- 3、the number after & is the dedup column's bit value minus 1; e.g., user_id's bit value in the SQL above is 8, so & is followed by 7
-- cube: aggregate over every combination of the group by dimensions
select
cate1_name
, cate2_name
, count(distinct book_id) as book_count
, sum(week_click_count) as total_week_click_count
, grouping_id()
from tdl_book_info_dim
group by cate1_name, cate2_name
with cube
order by grouping_id()
;
-- rollup: hierarchical aggregation anchored on the left-most dimension
-- drill-up path, e.g., UV per (month, day) -> UV per month -> total UV
select
cate1_name
, cate2_name
, count(distinct book_id) as book_count
, sum(week_click_count) as total_week_click_count
, grouping_id()
from tdl_book_info_dim
group by cate1_name, cate2_name
with rollup
order by grouping_id()
;
-- lag(col, n, DEFAULT): value of col n rows above the current row within the window
-- lead(col, n, DEFAULT): value of col n rows below the current row within the window
select
cate1_name
, cate2_name
, book_name
, row_number() over(partition by cate1_name order by week_click_count) as rn
, lag(week_click_count, 1, '0') over(partition by cate1_name order by week_click_count desc) as lag_1
, lead(week_click_count, 1, '0') over(partition by cate1_name order by week_click_count desc) as lead_1
from tdl_book_info_dim
limit 100;
-- partition by doesn't actually roll up the data.
-- It allows you to reset something on a per group basis.
-- if rows between is not specified, the frame defaults to partition start through the current row
-- if order by is not specified, all rows of the partition are aggregated
-- preceding: rows before the current row
-- following: rows after the current row
-- current row: the current row
-- unbounded: no bound
-- unbounded preceding: from the start of the partition
-- unbounded following: to the end of the partition
select
order_id,
create_time,
pv,
-- default frame: from partition start to the current row
sum(pv) over(partition by order_id order by create_time) as pv1,
-- all rows in the partition
sum(pv) over(partition by order_id) as pv3,
-- current row + 3 preceding rows
sum(pv) over(partition by order_id order by create_time rows between 3 preceding and current row) as pv4,
-- current row + 3 preceding rows + 1 following row
sum(pv) over(partition by order_id order by create_time rows between 3 preceding and 1 following) as pv5,
-- current row + all following rows
sum(pv) over(partition by order_id order by create_time rows between current row and unbounded following) as pv6
from tmp_his_order;
20、Hive date functions
select date_sub('2017-11-20', 1) from dual; -- 2017-11-19
select date_add('2017-11-20', 1) from dual; -- 2017-11-21
select datediff('2017-11-21', '2017-11-01 14:00:00') from dual; -- 20
select unix_timestamp() from dual; -- 1511597988
select unix_timestamp('2017-11-25 13:01:03') from dual; -- 1511586063
select unix_timestamp('20171125 13:01:03','yyyyMMdd HH:mm:ss') from dual; -- 1511586063
select from_unixtime(1511597988, 'yyyyMMdd') from dual; -- 20171125
select to_date('2017-11-25 13:01:03') from dual; -- 2017-11-25
-- convert to the standard yyyy-MM-dd format
select to_date(from_unixtime(unix_timestamp('20200808', 'yyyyMMdd'))); -- 2020-08-08
select to_date(from_unixtime(unix_timestamp('2020 08 08', 'yyyy MM dd'))); -- 2020-08-08
-- 20171205 -> 2017-12-05
select from_unixtime(unix_timestamp('20171205','yyyyMMdd'),'yyyy-MM-dd') from dual;
-- 2017-12-05 -> 20171205
select from_unixtime(unix_timestamp('2017-12-05','yyyy-MM-dd'),'yyyyMMdd') from dual;
21、Avoid scientific notation for floating-point numbers
-- a is the total precision, b the number of decimal places; a must be >= the actual number of digits, otherwise the result is NULL
cast(column as decimal(a, b))
-- keep 4 decimal places and return a string
printf("%.4f", column)
format_number(column, 4)
22、Unpivot multiple columns into (name, value) rows
select
operation_name
, network_type
, type
, time
from temp.temp_page_detail_load_time
lateral view explode(map('shell_time', shell_time, 'shell_time2', shell_time2, 'load_time', load_time)) t as type, time
;
23、Quickly copy a partitioned table
create table odl_lsr_rec_user_book_30d_fht like ods_storage.odl_lsr_rec_user_book_30d_fht;
hadoop fs -cp /user/hive/warehouse/ods_storage.db/odl_lsr_rec_user_book_30d_fht /user/hive/warehouse/test.db/odl_lsr_rec_user_book_30d_fht/
msck repair table odl_lsr_rec_user_book_30d_fht;
24、Compute table statistics
analyze table als_30d_fht partition(ds='2017-11-21', hh='23') compute statistics;
-- all partitions
analyze table als_30d_fht partition(ds, hh) compute statistics;
-- column statistics
analyze table als_30d_fht partition(ds='2020-05-11') compute statistics for columns book_co_cnt;
25、concat_ws: join strings with a delimiter
select concat_ws(',', 'abc', 'def', 'gh') from dual; -- abc,def,gh
26、String functions
select regexp_replace('foobar', 'oo|ar', '') from dual;
select regexp_extract('foothebar', 'foo(.*?)(bar)', 1) from dual; -- the
select regexp_extract('foothebar', 'foo(.*?)(bar)', 2) from dual; -- bar
select regexp_extract('foothebar', 'foo(.*?)(bar)', 0) from dual; -- foothebar
select substr('abcde', 3); -- cde
select substr('abcde', -1); -- e
-- substr(string A, int start, int len)
select substr('abcde', 3 ,2); -- cd
select substr('abcde', -2, 2); -- de
-- substring_index(str, delim, count): returns everything before the count-th delimiter; positive count counts from the left, negative from the right
select substring_index('www.apache.org', '.', 2); -- www.apache
-- translate(input, from, to): each character of input that appears in from is replaced by the corresponding character of to (dropped if to is shorter)
select translate("MOBIN", "BIN", "M"); -- MOM
27、JSON parsing
{"store":
{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],
"bicycle":{"price":19.95,"color":"red"}
},
"email":"amy@only_for_json_udf_test.net",
"owner":"amy"
}
select get_json_object(src_json.json, '$.owner') from src_json; -- amy
select get_json_object(src_json.json, '$.store.fruit[0]') from src_json; -- {"weight":8,"type":"apple"}
select get_json_object(src_json.json, '$.non_exist_key') from src_json; -- NULL
-- build a struct (JSON-style output)
select named_struct('app', app, 'app_cnt', app_cnt);
-- flatten a JSON array in Hive
-- {"code":200,"data":[{"item_id":129938,"cpack":{"rec_id":"8321299389"}},{"item_id":46433,"cpack":{"rec_id":"832464339"}},{"item_id":159492,"cpack":{"rec_id":"8321594923"}},{"item_id":81595,"cpack":{"rec_id":"832815953"}},{"item_id":160330,"cpack":{"rec_id":"8321603305"}},{"item_id":113865,"cpack":{"rec_id":"8321138653"}},{"item_id":159429,"cpack":{"rec_id":"8321594291"}},{"item_id":137069,"cpack":{"rec_id":"8321370699"}},{"item_id":153300,"cpack":{"rec_id":"8321533000"}},{"item_id":156020,"cpack":{"rec_id":"8321560201"}}],"flow":"88fb13f0754d92","success":true,"upack":{"req_id":"344471"}}
select
get_json_object(bc, '$.item_id') as item_id
, get_json_object(bc, '$.cpack.rec_id') as rec_id
, flow
, get_json_object(upack, '$.req_id') as upack
from resp_log_fdt
lateral view json_tuple(resp, 'data', 'flow', 'upack') t as data, flow, upack
lateral view explode(split(regexp_replace(regexp_extract(t.data,'^\\[(.+)\\]$',1),'\\}\\,\\{', '\\}\\|\\|\\{'), '\\|\\|')) bb as bc
limit 100
;
28、Percentiles
explode(percentile_approx(cast(rating as double),array(0.05,0.5,0.95),9999)) as percentile
-- x is the center of each histogram bin
select explode(histogram_numeric(rating, 100)) from table1;
select inline(histogram_numeric(rating, 100)) from table1;
29、Row-wise max and min
select greatest(1,2,3) from dual; -- 3
select least(1,2,3) from dual; -- 1
30、Aggregate a group into a delimited string / array
select
a, b, concat_ws(',' , collect_set(cast(c as string)))
from table1
group by a, b;
31、Hive data load and export
load data local inpath '/tmp/test.txt' overwrite into table test;
load data local inpath "/tmp/test.txt" overwrite into table test_p partition (date=20140722)
insert overwrite local directory '/tmp/test' row format delimited fields terminated by ',' select * from test;
32、MySQL: strip all whitespace characters from a field
replace(replace(replace(replace(field ,char(13),''),char(10),''),char(9),''),' ','')
33、Hive table locks
-- check lock status
show locks <table_name>;
show locks <table_name> partition (<partition_desc>);
load data local inpath '/tmp/test.txt' into table test_p partition(pt=20140722);
-- load data triggers a lock; instead, create the partition first with alter table add partition, then import the data with hadoop dfs -put.
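A minimal sketch of that lock-avoiding workaround, reusing the test_p example above (the warehouse path is illustrative):
-- create the partition first, without going through load data
alter table test_p add if not exists partition (pt=20140722);
-- then copy the file straight into the partition directory
hadoop dfs -put /tmp/test.txt /user/hive/warehouse/test_p/pt=20140722/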
34、MySQL date/time handling
-- today
select * from t1 where to_days(create)=to_days(now());
select * from t1 where date(create)=curdate();
-- yesterday
select * from t1 where to_days(now())-to_days(create)<=1;
-- last 7 days
select * from t1 where date_sub(curdate(), interval 7 day)<=date(create);
-- this month
select * from t1 where date_format(create, '%y%m')=date_format(curdate(), '%y%m');
-- last month
select * from t1 where period_diff(date_format(now() , '%y%m'), date_format(create, '%y%m'))=1;
-- unix timestamp <-> date
select from_unixtime(1532544000)
select unix_timestamp(date('2018-07-26'))
Date types:
type       storage   format                range
datetime   8 bytes   YYYY-MM-DD HH:MM:SS   1000-01-01 00:00:00 ~ 9999-12-31 23:59:59
timestamp  4 bytes   YYYY-MM-DD HH:MM:SS   1970-01-01 00:00:01 ~ 2038
date       3 bytes   YYYY-MM-DD            1000-01-01 ~ 9999-12-31
datetime covers a much wider range; timestamp takes only half the storage of datetime. On insert/update, a timestamp column can be filled or updated automatically with the current time (CURRENT_TIMESTAMP). timestamp is also affected by the session time zone, the MySQL version, and the server's SQL mode.
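A minimal illustration of the automatic fill/update behavior described above (t_demo is a hypothetical table):
create table t_demo (
id bigint primary key,
-- filled on insert and refreshed on every update
updated_at timestamp not null default current_timestamp on update current_timestamp,
-- datetime gets no automatic value unless a default is declared
created_at datetime not null default current_timestamp
);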
-- date/string formatting
select date_format('2018-08-08 22:23:01', '%Y%m%d%H%i%s');
select str_to_date('08.08.2108 08:09:30', '%m.%d.%Y %h:%i:%s');
35、Hive regular expressions
select '123456aa' rlike '^\\d+';
select 'footbar' regexp '^f.*r$';
select regexp_extract('foothebar', 'foo(.*?)(bar)', 1); -- the
36、MySQL: Duplicate entry for key 'group_key'
set session max_heap_table_size=536870912;
set session tmp_table_size=536870912;
37、Hive order by, sort by, distribute by, cluster by
-- order by does one global sort; all rows go through a single reducer
-- sort by sorts within each reducer, guaranteeing only local order
-- distribute by controls how the map output is partitioned across reducers
select mid, money, name from store distribute by mid sort by mid asc, money asc
-- cluster by = distribute by + sort by on the same columns
select mid, money, name from store cluster by mid
38、MySQL: in vs exists
-- in hashes the subquery (inner) result and probes it from the outer table; exists loops over the outer table and re-runs the subquery for each row.
-- if the two tables are about the same size, in and exists perform similarly.
-- if one table is much smaller than the other: use exists when the subquery table is the large one, and in when the subquery table is the small one.
-- small table A, large table B
select * from A where exists(select cc from B where cc=A.cc);
select * from B where cc in (select cc from A);
39、Hive MD5 / SHA-256 via reflect
select reflect('org.apache.commons.codec.digest.DigestUtils', 'md5Hex', 'your_string') ;
select reflect('org.apache.commons.codec.digest.DigestUtils', 'sha256Hex', 'your_string');
40、User retention
-- per-user retention over the last 30 days
insert overwrite table tdl_user_retention_info
select
user_id
, active_date
, reg_date
, retention_n
from
(
select
app_id
, user_id
, pt as active_date
, substr(from_unixtime(cast(registertime/1000 as bigint)), 1, 10) as reg_date
, datediff(pt, substr(from_unixtime(cast(registertime/1000 as bigint)), 1, 10)) as retention_n
from dw_active_users
where pt>=date_sub('${ds}', 30) and pt<='${ds}'
and substr(from_unixtime(cast(registertime/1000 as bigint)), 1, 10)>=date_sub('${ds}', 30)
and substr(from_unixtime(cast(registertime/1000 as bigint)), 1, 10)<='${ds}'
)t
group by user_id, active_date, reg_date, retention_n
;
-- aggregate: next-day, day-3, and day-7 retention
select
t2.reg_date
, count(distinct if(retention_n=1, t2.user_id, null)) as active_uv1
, count(distinct if(retention_n=2, t2.user_id, null)) as active_uv2
, count(distinct if(retention_n=6, t2.user_id, null)) as active_uv7
, count(distinct if(retention_n=0, t1.user_id, null)) as reg_uv
from
(
select
user_id
from tdl_new_user_fdt
where ds>='2019-07-01'
)t1
join tdl_user_retention_info t2
on t1.user_id=cast(t2.user_id as string)
group by t2.reg_date
order by t2.reg_date
;
41、explode
-- explode
select tf.* from (select 0) t lateral view explode(array('A','B','C')) tf as col;
select tf.* from (select 0) t lateral view explode(map('A',10,'B',20,'C',30)) tf as key,value;
select tf.* from (select 0) t lateral view posexplode(array('A','B','C')) tf as pos,val;
-- inline (array of structs)
select tf.* from (select 0) t lateral view inline(array(struct('A', 10, date '2015-01-01'), struct('B', 20, date '2016-02-02'))) tf as col1, col2, col3;
-- stack (values)
select tf.* from (select 0) t lateral view stack(2, 'A', 10, date '2015-01-01', 'B', 20, date '2016-01-01') tf as col0, col1, col2;
42、Sort two arrays in lockstep (posexplode)
with array_table as (
select array(1, 2, 3, 4, 5) as col1, array(0.43, 0.01, 0.45, 0.22, 0.001) as col2
)
select
col1
, col2
, collect_list(c1_val) as new_col1
, collect_list(c2_val) as new_col2
from
(
select
t.col1 as col1
, t.col2 as col2
, val as c1_val
, t.col2[pos] as c2_val
, pos as c1_pos
from array_table t
lateral view posexplode(col1) c1 as pos, val
distribute by col1, col2
sort by c2_val
)s
group by col1, col2;
43、Hive data skew
Map skew
- Causes
1、Upstream table files are unevenly sized with many small files, so map tasks read skewed amounts of data and produce a long tail;
2、A map-side count distinct where some value is extremely frequent in certain files causes a long tail;
- Remedies
1、Tune the number of map tasks and the number of small files each map task reads;
2、Redistribute the data randomly with distribute by rand() and finish the aggregation/join in a later stage;
Join skew
1、If one join input is small, use mapjoin to avoid the shuffle-induced long tail;
2、If every join input is large and the tail is caused by NULL keys, turn the NULLs into random values: on coalesce(t1.key, rand()*9999) = t2.key (see the sketch below);
Reduce skew
1、count distinct over different columns of the same table inflates the map output and causes long tails in the downstream join/reduce stages;
2、Uneven key distribution during map-side aggregation causes a long reduce tail; handle the hot keys separately;
3、Too many dynamic partitions create too many small files and a long reduce tail;
4、Multiple distincts in one SQL statement make the data get redistributed several times, which not only inflates the data N-fold but also amplifies the long tail N-fold.
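A minimal sketch of the NULL-key remedy above (big_left, big_right, key and val are illustrative names, with key assumed to be a string; the concat salt is a variant of the rand()*9999 expression that also avoids accidental matches):
select t1.*, t2.val
from big_left t1
left join big_right t2
-- NULL keys can never match anyway; replacing them with a random salt spreads
-- those rows across reducers instead of piling them onto a single key
on coalesce(t1.key, concat('skew_', cast(rand() * 9999 as int))) = t2.key;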
44、Common Hive settings
-- mapjoin
set hive.auto.convert.join=true;
set hive.mapjoin.smalltable.filesize=64000000;
set hive.auto.convert.join.noconditionaltask=true;
set hive.auto.convert.join.noconditionaltask.size=256000000;
-- combine small files on the map input side
set mapred.max.split.size=256000000;
set mapred.min.split.size.per.node=256000000;
set mapred.min.split.size.per.rack=256000000;
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
-- merge small output files produced by the job
set hive.merge.mapfiles=true;
set hive.merge.mapredfiles=true;
set hive.merge.size.per.task=256000000;
set hive.merge.smallfiles.avgsize=12800000;
-- JVM reuse
set mapreduce.job.jvm.numtasks=-1;
-- concurrency support (note: parallel execution of independent stages is enabled with hive.exec.parallel=true)
set hive.support.concurrency=true;
set mapreduce.job.reduce.slowstart.completedmaps=0.9;
-- vectorized execution: rows are processed in batches of 1024
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled=true;
-- if a single map task carries heavy computation, reduce the split size so more map tasks run in parallel
set mapreduce.input.fileinputformat.split.maxsize=1000000;
45、AUC calculation
Reference: 深入理解AUC (an in-depth explanation of AUC)
Sort the test set's positive and negative samples by predicted score in ascending order. For the \(j\)-th positive sample, let its rank be \(r_j\); then \(r_j - j\) negative samples rank below it, so the probability that this positive sample scores higher than a randomly drawn negative sample is \((r_j - j)/N_-\), where \(N_-\) is the total number of negative samples. Averaging over all positive samples, the probability that a random positive sample scores higher than a random negative sample is
\[ AUC = \frac{1}{N_{+}} \sum_{j=1}^{N_{+}}\left(r_{j}-j\right) / N_{-}=\frac{\sum_{j=1}^{N_{+}} r_{j}-N_{+}\left(N_{+}+1\right) / 2}{N_{+} N_{-}} \]
select
(ry - 0.5*n1*(n1+1))/n0/n1 as auc
from
(
select
sum(if(y=0, 1, 0)) as n0,
sum(if(y=1, 1, 0)) as n1,
sum(if(y=1, r, 0)) as ry
from
(
select
y, row_number() over(order by score asc) as r
from
(
select y, score from some.table
)t1
)t2
)t3;
-- gauc
with
user_inst_info as (
select
mark
, label
, row_number() over(partition by mark order by score asc) as r
from
(
select
'uuid:'|| uuid || '_gauc' || '@'|| split(info, '_')[1] as mark
, cast(score as decimal(20, 6)) as score
, label
from model_predict
where uuid in ('1121004652213939')
)t
),
user_stat_info as (
select
mark
, sum_neg
, sum_pos
, num_ins
, 1.000000 * (sum_pos_rank - 0.5 * sum_pos * (sum_pos + 1)) / (sum_neg * sum_pos) as auc
from
(
select
mark
, sum(if(label<>1, 1, 0)) as sum_neg
, sum(if(label=1, 1, 0)) as sum_pos
, count(1) as num_ins
, sum(if(label=1, r, 0)) as sum_pos_rank
from user_inst_info
group by mark
having sum(if(label=1, 1, 0))>0 and sum(if(label<>1, 1, 0))>0
)t
)
select
split(mark, '@')[1] as mark
, count(1) as mids
, sum(num_ins) as num_ins
, sum(sum_pos) as sum_pos
, 1.00000 * sum(sum_pos) / sum(num_ins) as pos_rate
, 1.00000 * sum(auc * num_ins) / sum(num_ins) as gauc
from user_stat_info
group by 1
order by 1
;
46、Consecutive login days
-- each person's continuous working periods and the total salary earned in each period
select
name
, start_date
, end_date
, salary * (datediff(end_date, start_date) + 1) as salary
from
(
select
name
, date_sub(active_date, rn - 1) as flag_date
, salary
, min(active_date) as start_date
, max(active_date) as end_date
from
(
select
name
, active_date
, salary
, row_number() over(partition by name order by active_date) as rn
from test
group by name, active_date, salary
)t
group by name, date_sub(active_date, rn - 1), salary
)t
;
-- consecutive login days
select
user_id
, date_sub(login_date, rn - 1) as login_group
, min(login_date) as start_date
, max(login_date) as end_date
, count(1) as cont_days
from
(
select
user_id
, login_date
, row_number() over(partition by user_id order by login_date) as rn
from test
)t
group by user_id, date_sub(login_date, rn - 1)
;
47、with as (CTE)
with q1 as ( select key from src where key = '5')
select * from q1;
-- from style
with q1 as (select * from src where key= '5')
from q1
select *;
-- chaining CTEs
with q1 as ( select key from q2 where key = '5'),
q2 as ( select key from src where key = '5')
select * from (select key from q1) a;
with q1 as ( select key from src where key = '5'),
q2 as ( select key from q1 where key = '5')
select * from (select key from q2) a;
-- union example
with q1 as (select * from src where key= '5'),
q2 as (select * from src s2 where key = '4')
select * from q1 union all select * from q2;
-- insert example
create table s1 like src;
with q1 as ( select key, value from src where key = '5')
from q1
insert overwrite table s1
select *;
-- ctas example
create table s2 as
with q1 as ( select key from src where key = '4')
select * from q1;
48、Presto: get a table's storage path
select "$path" from dws_table;
49、Presto date/time conversion
select cast('2019-09-05' as date);
select cast('2019-09-05 09:50:55' as timestamp);
select date('2019-09-05');
select date(cast('2019-09-05 09:50:55' as timestamp));
select cast(to_unixtime(timestamp '2023-06-28 23:39:36') as bigint); -- 1687966776
select date_parse('20200825', '%Y%m%d'); -- 2020-08-25 00:00:00.000
select date_parse('20200825', '%Y%m%d') + interval '1' day; -- 2020-08-26 00:00:00.000
select date_parse('20200825', '%Y%m%d') + interval '-1' day; -- 2020-08-24 00:00:00.000
select format_datetime(date_parse('20200825', '%Y%m%d') + interval '1' day, 'yyyyMMdd'); -- 20200826
select date_parse('2019-09-05 09:50:55', '%Y-%m-%d %H:%i:%s');
select format_datetime(from_unixtime(cast(substr('1598865255', 1, 10) as int)), 'yyyy-MM-dd HH:mm:ss');
select format_datetime(cast('2019-09-05 09:50:55' as timestamp), 'yyyy-MM-dd HH:mm:ss');
select format_datetime(cast('2019-09-05 09:50:55' as timestamp), 'yyyy-MM-dd');
select format_datetime(date('2019-09-05'),'yyyy-MM-dd 00:00:00');
hour(from_unixtime(cast('1602539327' as int)))
-- comparing date-typed values
select date_diff('day', cast('2019-11-05' as date), cast('2019-11-07' as date));
select date_diff('day', cast('2019-09-05 09:50:55' as timestamp), cast('2019-09-07 09:50:55' as timestamp));
select date_diff('day', date('2019-09-05'), cast('2019-09-07 09:50:55' as timestamp));
select current_date; -- today's date
select current_date - interval '1' day;
select format_datetime(current_date - interval '1' day, 'yyyyMMdd');
select current_timestamp; -- current timestamp
select format_datetime(current_timestamp, 'yyyy-MM-dd HH:mm:ss');
select format_datetime(current_timestamp, 'yyyy-MM-dd');
select format_datetime(current_timestamp - interval '1' day, 'yyyyMMdd');
log_date between format_datetime(date_parse('20201216', '%Y%m%d') - interval '7' day, 'yyyyMMdd') and '${day_n}'
50、Presto cross join unnest
with t(a) as (
values array [1, 3, 4, 5], array [3, 5, 6, 1]
)
select
array_agg(value_sum order by ordinal)
from
(
select
ordinal, avg(value) as value_sum
from t
cross join unnest(a) with ordinality as x(value, ordinal)
group by ordinal
)t1
;
51、Presto array arithmetic (element-wise distance / cosine similarity)
select
user
, sqrt(reduce(zip_with(user_vec, center_vec, (x, y) -> pow(x - y, 2.0)), 0, (s, x) -> s + x, s -> s)) as score
from
(
values
('a', array[1,2,3], array[4, 5, 6]),
('b', array[4,5,6], array[7, 8, 9])
) as t(user, user_vec, center_vec)
;
-- cosine similarity
with
tdl_user_norm_emb_fdt as (
select
user
, transform(user_vec, x -> x / user_norm) as user_norm_vec
, transform(center_vec, x -> x / cent_norm) as cent_norm_vec
from
(
select
user
, cast(split(user_vec, ',') as array(double)) as user_vec
, cast(split(center_vec, ',') as array(double)) as center_vec
, reduce(cast(split(user_vec, ',') as array(double)), 0, (s, x) -> s + x * x, s -> sqrt(s)) as user_norm
, reduce(cast(split(center_vec, ',') as array(double)), 0, (s, x) -> s + x * x, s -> sqrt(s)) as cent_norm
from
(
values
('a', '1,2,3', '4,5,6'),
('b', '4,5,6', '7,8,9')
) as t(user, user_vec, center_vec)
)t
)
select
user
, reduce(zip_with(user_norm_vec, cent_norm_vec, (x, y) -> x * y), 0, (s, x) -> s + x, s -> s) as cos_sim
from tdl_user_norm_emb_fdt
;
52、Presto MD5
select lower(to_hex(md5(cast('C22D5673' as varbinary))));
53、Presto: build JSON
select
uid
, json_format(cast(
map_from_entries(array[('history',
array_agg(
map_from_entries(array[('vid', cast(vid as bigint)), ('time', cast(log_date as bigint))])
)
)])
as json)
) as value
from user_click
where log_date='20210131'
;
select
trackid
, mid
, cast(element_at(item, 'up_id') as bigint) as upid
, get_json_object(element_at(item, 'feature'), '$.sources_simple') as trigger_type
, get_json_object(user_feature, '$.page_number') as page_number
from rec_feed_show
cross join unnest(try_cast(json_parse(showlist) as array(map(varchar, varchar)))) as x(item)
where log_date='20230328'
54、Presto max_by / count_if
select
country,
count(case when status = 'logged-in' then 1 end) as logged_in_users,
count(case when status = 'logged-out' then 1 end) as logged_out_users
from (
select
user_id,
country,
status,
row_number() over (partition by user_id order by time desc) as rn
from user_activity
)
where rn = 1
group by country
order by count(case when status = 'logged-in' then 1 end) desc
;
select country,
count_if(status = 'logged-in') as logged_in_users,
count_if(status = 'logged-out') as logged_out_users
from (
select
user_id,
max_by(country, time) as country,
max_by(status, time) as status
from user_activity
group by user_id
)
group by 1
order by 2 desc
;
55、Presto zip_with
with
tdl_av_feature_fdt as (
select
key as itemid,
zip_with(
transform(filter(split(value, '|'), x->trim(x)<>''), x->split(x, ' ')[1])
, transform(filter(split(value, '|'), x->trim(x)<>''), x->trim(x))
, (x, y) -> (trim(x), trim(replace(y, x, '')))
) as features
from item_feature
where log_date='20220707' and log_hour='22'
)
select
k as feat_name
, count(1) as total
, count(distinct itemid) as avs
, count(distinct if(trim(v)='', null, trim(v))) as non_dis_vals
, sum(if(trim(v)='', 0, 1)) as non_empty_vals
from tdl_av_feature_fdt
cross join unnest(features) as t(k, v)
group by 1
order by 1
;
with
t_user_feature_fdt as (
select
key as mid,
zip_with(
transform(filter(split(value.value, '|'), x->trim(x)<>''), x->split(x, ' ')[1])
, transform(filter(split(value.value, '|'), x->trim(x)<>''), x->trim(x))
, (x, y) -> (trim(x), trim(replace(y, x, '')))
) as features
from user_feature
where log_date='20220712' and log_hour='20'
)
select
mid
, k
, array_join(array_sort(split(trim(v), ' ')), ' ') as v
from t_user_feature_fdt
cross join unnest(features) as t(k, v)
;