Commit a9dc207f authored by mengxiangxuan's avatar mengxiangxuan

00

parent 38c4dde2
......@@ -25,6 +25,7 @@ cursor.execute(sql11)
# Load the (slotid, app_id, cnt) rows returned by the preceding query.
df11 = pd.DataFrame(cursor.fetchall())
df11.columns=['slotid','app_id','cnt']
# Boolean-mask row selection goes through .loc: the .ix indexer was deprecated
# in pandas 0.20 and removed in pandas 1.0, while .loc behaves the same here.
df11=df11.loc[df11['slotid'].notnull()].astype('int')
df11=df11.loc[df11['cnt']>10] # drop ad slots with very low traffic
# cnt was only needed for the traffic filter above
df11=df11.drop(['cnt'],axis=1)
#全部有转化埋点非免费广告约1000
......@@ -37,7 +38,7 @@ cursor.execute(sql12)
# Load the rows of the preceding query into a single-column frame of advert ids.
df12 = pd.DataFrame(cursor.fetchall())
df12.columns=['advert_id']
# All ad/slot combinations: about 2.5 million
# All ad/slot combinations: about 3 million
# NOTE(review): the two estimates above look like the before/after versions of
# one comment in this diff view; confirm which one is current.
print('all ad_slot')
# Empty frame with the (slotid, advert_id) schema; rows are appended to it
# further down (df1.append(df_temp)).
df1=pd.DataFrame(columns=['slotid','advert_id'])
for i in df12['advert_id']:
......@@ -46,7 +47,7 @@ for i in df12['advert_id']:
df1=df1.append(df_temp)
#有历史发券且置信或能拿到预估cvr的 ad+slot 约12w
#有历史发券且置信或能拿到预估cvr的 ad+slot 约10w
sql='''
select advert_id,slotid
from (
......@@ -60,7 +61,7 @@ left outer join
tmp.tmp_cpc_act_advert_df b
on a.advert_id = b.advert_id
group by a.advert_id,a.slotid) t
where act_click_cnt>2 or launch_cnt>100
where launch_cnt>100
'''.format(yestodayn,yestoday1)
cursor.execute(sql)
ad_slot_launch = pd.DataFrame(cursor.fetchall())
......
......@@ -72,7 +72,7 @@ sql2='''CREATE TABLE if not exists advert.dws_not_luanch_create_samples_mxx as
where dt>='{0}' and dt<='{1}') c
distribute by slotid sort by slotid,rank_num desc
)a
where rnb<=500'''.format(yestodayn,yestoday1)
where rnb<=200'''.format(yestodayn,yestoday1)
#未定向组合
# sql3='''drop table advert.dws_not_luanch_slot_ad_mxx'''
# sql4='''create table advert.dws_not_luanch_slot_ad_mxx (slotid string,advert_id string)
......@@ -100,15 +100,20 @@ cursor.execute(sql2)
###广告相关信息
sql='''select distinct advert_id,account_id,case when length(match_tag_nums)=16 then substr(match_tag_nums,7)
sql='''select advert_id,account_id,
case when length(match_tag_nums)=16 then substr(match_tag_nums,7)
when length(match_tag_nums)=22 then substr(match_tag_nums,13)
else match_tag_nums end match_tag_nums
else match_tag_nums end match_tag_nums,avg(fee) fee
from advert.dws_advert_order_wide_v4_level_6_di
where dt>='{0}' and dt<='{1}'
and advert_id is not null'''.format(yestodayn,yestoday1)
where dt>='{0}' and dt<='{1}' and advert_id is not null
group by advert_id,account_id,
case when length(match_tag_nums)=16 then substr(match_tag_nums,7)
when length(match_tag_nums)=22 then substr(match_tag_nums,13)
else match_tag_nums end'''.format(yestodayn,yestoday1)
cursor.execute(sql)
ad_info = pd.DataFrame(cursor.fetchall())
ad_info.columns=['advert_id','account_id','match_tag_nums']
ad_info.columns=['advert_id','account_id','match_tag_nums','fee']
# Keep only adverts whose average fee (avg(fee) in the query) is positive.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
ad_info=ad_info.loc[ad_info['fee']>0]
# Left join: every ad/slot pair is kept even when no advert info matches.
ad_info_match_slot=pd.merge(ad_slot_df,ad_info,how='left',on='advert_id')
# Pairs without a match get account_id 0; the id is then normalised to a string.
ad_info_match_slot['account_id']=ad_info_match_slot['account_id'].fillna(value=0).astype('int').astype('str')
......@@ -198,16 +203,38 @@ cursor.execute(sql)
# Scene feature data: load the query rows, name the columns from `la`
# (defined outside this excerpt), stringify everything and write a
# pipe-separated CSV.
df1 = pd.DataFrame(cursor.fetchall())
df1.columns=list(la)
df1=df1.astype('str')
# NOTE(review): the two to_csv calls below look like the before/after versions
# of one line in this diff view (old and new file name); confirm which is current.
df1.to_csv('not_luanch_scene.csv',index=False,sep='|')
df1.to_csv('scene_info.csv',index=False,sep='|')
# Ad feature data
# df2 is an alias of ad_info_match_slot, so renaming the columns below also
# renames them on ad_info_match_slot.
df2=ad_info_match_slot
# NOTE(review): the two column assignments below look like the before/after
# versions of one line (the second adds 'fee'); confirm which is current.
df2.columns=['f108001','f101001','f106001','f102001']
df2.columns=['f108001','f101001','f106001','f102001','fee']
df2=df2.astype('str')
# NOTE(review): same before/after pattern for the output file name.
df2.to_csv('not_luanch_ad_info.csv',index=False,sep='|')
df2.to_csv('ad_info.csv',index=False,sep='|')
# Values of f108001 that occur in both the scene data and the ad data.
slotid=list(set(df1['f108001']) & set(df2['f108001']))
pd.DataFrame(slotid).to_csv('slotid.csv',index=None)
# Minimum launch ARPU per slot: ARPU (sum(charge_fees)/count(1)) is computed for
# each (slotid, advert_id, orientation_id) group, and for every slot the smallest
# positive value among groups with more than 5 launches is kept.
sql_min_arpu='''
select slotid,min(arpu) min_arpu from
(select slotid,advert_id,orientation_id,
sum(charge_fees) cost,
count(1) launch_cnt,
sum(charge_fees)/count(1) arpu
from advert.dws_advert_order_wide_v4_level_6_di
where dt>='{0}' and dt<='{1}'
group by slotid,advert_id,orientation_id) a
where arpu>0 and launch_cnt>5
group by slotid
'''.format(yestodayn,yestoday1)
cursor.execute(sql_min_arpu)
slot_min_arpu = pd.DataFrame(cursor.fetchall())
slot_min_arpu.columns=['slotid','min_arpu']
# Drop rows without a slot id. .loc replaces the .ix indexer (removed in
# pandas 1.0); .copy() makes the filtered frame independent so the column
# assignment below does not write into a slice of the unfiltered frame.
slot_min_arpu=slot_min_arpu.loc[pd.notnull(slot_min_arpu['slotid'])].copy()
# Normalise slot ids to strings so they compare equal to the stringified
# f108001 values of df1/df2 in the merge below.
slot_min_arpu['slotid']=slot_min_arpu['slotid'].astype('int').astype('str')
# Keep only slots present in both the scene data (df1) and the ad data (df2).
slotid_df=pd.DataFrame(list(set(df1['f108001']) & set(df2['f108001'])),columns=['slotid'])
slot_min_arpu=pd.merge(slot_min_arpu,slotid_df,how='inner',on=['slotid'])
slot_min_arpu.to_csv('slot_min_arpu.csv',index=None)
#############################
#java读取 场景info,广告info,slotid,拼接样本,流式处理
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment