-
Notifications
You must be signed in to change notification settings - Fork 4
/
2feature_listing_info.py
85 lines (77 loc) · 5.1 KB
/
2feature_listing_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# -*- coding: utf-8 -*-
#@author: limeng
#@file: 2feature_listing_info.py
#@time: 2019/6/8 8:39
"""
文件说明:对feature_listing_info特征构造
"""
import pandas as pd
import numpy as np
import gc
from dateutil.relativedelta import relativedelta
# path = "F:/数据集/1906拍拍/"
# outpath = "F:/数据集处理/1906拍拍/"
path = "/data/dev/lm/paipai/ori_data/"
outpath = "/data/dev/lm/paipai/feature/"
# Y指标基础表
basic = pd.read_csv(open(outpath + "feature_main_key.csv", encoding='utf8'),parse_dates=['auditing_date'])
basic["auditing_date_last3"] = basic["auditing_date"].apply(lambda x: x - relativedelta(months=+3))
basic["auditing_date_last6"] = basic["auditing_date"].apply(lambda x: x - relativedelta(months=+6))
basic["auditing_date_last9"] = basic["auditing_date"].apply(lambda x: x - relativedelta(months=+9))
basic["auditing_date_last12"] = basic["auditing_date"].apply(lambda x: x - relativedelta(months=+12))
basic = basic.sort_values(["user_id", "listing_id"])
listing_info = pd.read_csv(open(path+"listing_info.csv",encoding='utf8'),parse_dates=['auditing_date'])
# 近n个月特征
agg = {
"term": ["count", "max", "min","mean","std"],
"rate":["max","min","mean","std"],
"principal": ["sum","max", "min", "mean","std"],
}
#左连接筛选卡时间
listing_info.columns = ["user_id","listing_id_info","auditing_date_info","term","rate","principal"]
basic_union = basic.merge(listing_info,how='left',on='user_id')
print(basic_union.shape)
for month in [3,6,9,12]:
print("近几个月的统计:",month)
basic_tmp = basic_union.loc[(basic_union["auditing_date_info"]<basic_union["auditing_date"])&(
basic_union["auditing_date_info"]>=basic_union["auditing_date_last{}".format(month)])]
if month == 12:
basic_tmp["date_diff"]=(basic_tmp["auditing_date"]-basic_tmp["auditing_date_info"]).apply(lambda x:int(x.days))
agg['date_diff']=["max","min","mean","std"]
basic_tmp = basic_tmp.groupby(["user_id","listing_id"], as_index=False).agg(agg)
basic_tmp.columns = ['listing_info_last{}_'.format(month) + i[0] + '_' + i[1] for i in basic_tmp.columns]
basic_tmp = basic_tmp.rename(columns={"listing_info_last{}_user_id_".format(month):"user_id","listing_info_last{}_listing_id_".format(month):"listing_id"})
basic = basic.merge(basic_tmp,how='left',on=["user_id","listing_id"])
#当前标的属性
listing_info.columns = ['user_id','listing_id','auditing_date','listing_info_term','listing_info_rate','listing_info_principal']
basic = basic.merge(listing_info,how='left',on=['user_id','listing_id','auditing_date'])
#全部历史借款距当前订单最小天/最大天
basic_union = basic_union.loc[basic_union["auditing_date_info"]<basic_union["auditing_date"]]
basic_union["date_diff"] = (basic_union["auditing_date"]-basic_union["auditing_date_info"]).dt.days
date_diff_cnt = basic_union.groupby(["user_id","listing_id"],as_index=False).agg({'date_diff':["count","max",'min','mean','std']})
date_diff_cnt.columns = ['listing_info_date_diff_' + i[0] + '_' + i[1] for i in date_diff_cnt.columns]
date_diff_cnt = date_diff_cnt.rename(columns={"listing_info_date_diff_user_id_":"user_id","listing_info_date_diff_listing_id_":"listing_id"})
basic = basic.merge(date_diff_cnt,how='left',on=["user_id","listing_id"])
# 当前标的期数/费率/金额占3/6/9/12均值的比例
for i in [3,6,9,12]:
print("当前标的占比:",i)
basic["listing_info_term_ratio_last{}".format(i)] = basic["listing_info_term"]/basic["listing_info_last{}_term_mean".format(i)]
basic["listing_info_rate_ratio_last{}".format(i)] = basic["listing_info_rate"]/basic["listing_info_last{}_rate_mean".format(i)]
basic["listing_info_principal_ratio_last{}".format(i)] = basic["listing_info_principal"] / basic["listing_info_last{}_principal_sum".format(i)]
# 近3月标的期数/费率/金额占6/9/12均值的比例
for i in [6,9,12]:
print("近3月标的占比:",i)
basic["listing_info_last3_term_mean_ratio_last{}".format(i)] = basic["listing_info_last3_term_mean"]/basic["listing_info_last{}_term_mean".format(i)]
basic["listing_info_last3_rate_mean_ratio_last{}".format(i)] = basic["listing_info_last3_rate_mean"]/basic["listing_info_last{}_rate_mean".format(i)]
basic["listing_info_last3_principal_sum_ratio_last{}".format(i)] = basic["listing_info_last3_principal_sum"] / basic["listing_info_last{}_principal_sum".format(i)]
# 近6月标的期数/费率/金额占9/12均值的比例
for i in [9,12]:
print("近6月标的占比:",i)
basic["listing_info_last6_term_mean_ratio_last{}".format(i)] = basic["listing_info_last6_term_mean"]/basic["listing_info_last{}_term_mean".format(i)]
basic["listing_info_last6_rate_mean_ratio_last{}".format(i)] = basic["listing_info_last6_rate_mean"]/basic["listing_info_last{}_rate_mean".format(i)]
basic["listing_info_last6_principal_sum_ratio_last{}".format(i)] = basic["listing_info_last6_principal_sum"] / basic["listing_info_last{}_principal_sum".format(i)]
del basic["auditing_date_last3"]
del basic["auditing_date_last6"]
del basic["auditing_date_last9"]
del basic["auditing_date_last12"]
basic.to_csv(outpath+'feature_listing_info0619.csv',index=None)