KDD Cup 2015:二分类问题,预测学员是否会中途退课(dropout)。代码写了很久了,突然想起来简单整理一下,以备后用。
步骤 1:预处理。利用 numpy 和 pandas 库将特征数值化,简单而优雅。
#!/usr/bin/env python
# coding=utf-8
"""Step 1 of the KDD Cup 2015 pipeline: convert the raw files to numeric tables.

Running this script builds lookup tables for dates, courses and log objects,
then rewrites the train/test log files and the course tables with the
categorical columns replaced by integer codes.
"""
import pickle

import numpy as np
import pandas as pd

# Integer codes for the ``source`` column of the log files.
source_dict = {'server': 0, 'browser': 1}

# Integer codes for the ``event`` column of the log files.
event_dict = {
    "problem": 5,
    "video": 3,
    "access": 1,
    "wiki": 4,
    "discussion": 6,
    "navigate": 2,
    "page_close": 0,
}


def _dump_pickle(obj, path):
    """Pickle *obj* to *path* and close the file, even if dumping fails."""
    # Binary mode: a pickle is a byte stream, and text mode can corrupt it on
    # platforms that translate line endings.
    with open(path, 'wb') as fw:
        pickle.dump(obj, fw)


def gen_time_dict():
    """Build, save and return the date -> day-number lookup.

    Returns:
        A ``pd.Series`` indexed by every calendar day from 2013-10-27 to
        2014-08-01 whose values count the days from 0.
    """
    rng = pd.date_range('2013-10-27', '2014-08-01')
    time_dict = pd.Series(np.arange(len(rng)), index=rng)
    # The file keeps its historical ``.csv`` name although it holds a pickle.
    _dump_pickle(time_dict, 'data/time_dict.csv')
    return time_dict


def gen_courseid_dict():
    """Build, save and return the course_id -> integer code lookup.

    Codes follow the order in which courses first appear in ``data/date.csv``.
    """
    df = pd.read_csv('data/date.csv', usecols=[0])
    course_ids = pd.factorize(df['course_id'])[1]
    course_dict = dict(zip(course_ids, range(len(course_ids))))
    _dump_pickle(course_dict, 'data/course_idTrain2.csv')
    print("course_dict done")
    return course_dict


def gen_object_dict():
    """Build, save and return the object -> integer code lookup.

    Objects of the training log are numbered first, in order of appearance;
    objects that only occur in the test log continue the numbering.
    """
    train = pd.read_csv('data/log_train.csv', usecols=[4])
    train_objects = pd.factorize(train['object'])[1]
    obj_dict = dict(zip(train_objects, range(len(train_objects))))

    test = pd.read_csv('data/test/log_test.csv', usecols=[4])
    test_objects = pd.factorize(test['object'])[1]
    unseen = [name for name in test_objects if name not in obj_dict]
    first_new_code = len(obj_dict)
    obj_dict.update(
        zip(unseen, range(first_new_code, first_new_code + len(unseen))))

    _dump_pickle(obj_dict, 'data/object_pkl.csv')
    print("obj_dict done..")
    return obj_dict


def time_map(x):
    """Return the day number for a timestamp string.

    Only the first 10 characters of *x* (the date part) are looked up in the
    module-level ``time_dict``.
    """
    return time_dict[x[:10]]


def obj_map(x):
    """Return the integer code of log object *x* (KeyError if unknown)."""
    return obj_dict[x]


def course_map(x):
    """Return the integer code of course *x* (KeyError if unknown)."""
    return course_dict[x]


# The lookups are built at import time because the converter functions above
# read them as module-level globals.
time_dict = gen_time_dict()
course_dict = gen_courseid_dict()
obj_dict = gen_object_dict()


def _encode_source_event(df):
    """Replace the ``source`` and ``event`` strings of *df* by their codes.

    Modifies *df* in place. A value missing from ``source_dict`` or
    ``event_dict`` raises ``KeyError`` instead of silently becoming NaN.
    """
    df['source'] = df['source'].map(lambda value: source_dict[value])
    df['event'] = df['event'].map(lambda value: event_dict[value])


def log_trainData():
    """Encode ``data/log_train.csv`` and write ``data/log_trainData.csv``."""
    print("read log_train.csv ")
    # Column 1 is converted with time_map and column 4 with obj_map on read.
    df1 = pd.read_csv('data/log_train.csv',
                      converters={1: time_map, 4: obj_map})
    print(df1.head())

    _encode_source_event(df1)
    print(df1.head())
    print(df1.tail())
    df1.to_csv('data/log_trainData.csv', index=False)


def course_Data():
    """Join each enrollment with its course dates and save the result.

    Writes ``data/course_Trainpkl.csv`` and ``data/test/course_Testpkl.csv``,
    both sorted by ``enrollment_id``.
    """
    train = pd.read_csv('data/enrollment_train.csv', usecols=[0, 2],
                        converters={2: course_map})
    dates = pd.read_csv('data/date.csv',
                        converters={0: course_map, 1: time_map, 2: time_map})

    # NOTE(review): the training table uses an outer join while the test table
    # below uses the default inner join. An outer join would add rows for any
    # course without enrollments -- confirm this difference is intended.
    merged = pd.merge(train, dates, on='course_id', how='outer')
    # sort_index(by=...) is the sorting API of the old pandas this script
    # targets; newer releases spell it sort_values.
    merged = merged.sort_index(by='enrollment_id')
    print(merged.tail(10))
    merged.to_csv("data/course_Trainpkl.csv", index=False)

    test = pd.read_csv('data/test/enrollment_test.csv', usecols=[0, 2],
                       converters={2: course_map})
    merged = pd.merge(test, dates)
    merged = merged.sort_index(by='enrollment_id')
    print(merged.tail(10))
    merged.to_csv("data/test/course_Testpkl.csv", index=False)


def log_testData():
    """Encode ``data/test/log_test.csv`` into ``data/test/log_testData.csv``."""
    print("read log_test.csv ")
    df1 = pd.read_csv('data/test/log_test.csv',
                      converters={1: time_map, 4: obj_map})
    print(df1.tail(10))

    _encode_source_event(df1)
    print(df1.tail(10))
    df1.to_csv('data/test/log_testData.csv', index=False)


log_trainData()
log_testData()
course_Data()
步骤 2:使用多种机器学习方法进行建模和预测。
#!/usr/bin/env python
# coding=utf-8
"""Step 2 of the KDD Cup 2015 pipeline: first models on simple count features.

Builds per-enrollment features straight from the raw logs, trains a gradient
boosting model and a grid-searched SVM, and blends their predictions.
"""
import pickle

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC

debug = True
# N is not referenced anywhere in this script.
if debug:
    N = 5000
else:
    N = 20000


class DropOutPredict(object):
    """Train the models and write the prediction files for the contest."""

    # Replaced per instance in __init__ by the unpickled lookup.
    course_dict = {}

    # Files blended by em_result(). The defaults are the names em_result()
    # always read; gbdt_clf()/svc_clf() overwrite them with the file they
    # actually wrote, so a run of drop_predict() followed by em_result()
    # blends that run's output instead of failing on a missing file.
    _gbdt_result_path = "data/test/gbdt_res.csv"
    _svc_result_path = "data/test/svc_res.csv"

    def __init__(self):
        print("welcom kdd2015 contest, jkmiao@526588996")

        # presumably a course_id -> integer code mapping; verify against the
        # script that writes data/coursePkl.pkl.
        with open("data/coursePkl.pkl", "rb") as fr:
            self.course_dict = pickle.load(fr)

    @staticmethod
    def datestr2num(s):
        """Parse date string *s* with ``pd.to_datetime``.

        Declared static because the function takes no ``self``; without the
        decorator a call on an instance passed the instance as *s*.
        """
        return pd.to_datetime(s)

    def norm_res(self, x):
        """Snap a probability to 0 below 0.0001 and to 1.0 above 0.98."""
        if x < 0.0001:
            x = 0
        elif x > 0.98:
            x = 1.0
        return x

    def norm_course(self, c):
        """Return the entry of ``self.course_dict`` for course *c*."""
        return self.course_dict[c]

    def _build_features(self, log_path, enrollment_path):
        """Return the per-enrollment feature table for one data set.

        Args:
            log_path: log CSV; columns 0, 2, 3 and 4 are read
                (enrollment_id, source, event, object).
            enrollment_path: enrollment CSV; columns 0 and 2 are read
                (enrollment_id, course_id).

        Returns:
            A DataFrame indexed by enrollment_id with the event counts, the
            "browser"/"server" counts, the course code and summary
            statistics, NaN replaced by 0.
        """
        log = pd.read_csv(log_path, usecols=[0, 2, 3, 4])
        # sort=True numbers the categories in sorted order rather than in
        # order of first appearance. Train and test then get the same codes
        # whenever they contain the same set of values, and source code 0 is
        # always "browser" and 1 always "server", as the column names below
        # require.
        # NOTE(review): the object sets of train and test presumably differ,
        # so object codes are still not comparable between the two tables.
        for column in ("source", "event", "object"):
            log[column] = pd.factorize(log[column], sort=True)[0]

        per_enrollment = log.groupby("enrollment_id")
        per_source = log.groupby(["enrollment_id", "source"])

        # Index the course codes by enrollment_id: column assignment aligns on
        # the index, and the feature table is indexed by enrollment_id. With
        # the default 0..n-1 row index the codes landed on the wrong rows.
        enrollments = pd.read_csv(enrollment_path, usecols=[0, 2])
        course_codes = pd.Series(
            pd.factorize(enrollments["course_id"], sort=True)[0],
            index=enrollments["enrollment_id"].values)

        data = log.pivot_table("source", rows="enrollment_id", cols="event",
                               aggfunc="count", fill_value=0)
        source_counts = per_source.event.count().unstack()
        data["browser"] = source_counts[0]
        data["server"] = source_counts[1]
        data["course_id"] = course_codes
        data["cnt"] = per_enrollment.event.count()
        data["std"] = per_enrollment.object.std()
        data["var"] = per_enrollment.event.var()
        data["mean"] = per_enrollment.event.mean()
        return data.fillna(0)

    def loadTrainData(self):
        """Return ``(X, y)``: the training feature matrix and drop labels."""
        labels = pd.read_csv('data/truth_train.csv', usecols=[1],
                             names=["drop"])
        data = self._build_features('./data/log_train.csv',
                                    'data/enrollment_train.csv')
        print(data.head())
        X = data.values
        y = np.ravel(labels["drop"])
        return X, y

    def loadTestData(self):
        """Return the feature matrix of the test set."""
        data = self._build_features('data/test/log_test.csv',
                                    'data/test/enrollment_test.csv')
        print("test data head():...\n%s" % (data.head(),))
        return data.values

    def gbdt_clf(self, x_train, x_test, y_train, y_test, test):
        """Fit gradient boosting, print the hold-out AUC, save predictions.

        Predictions for *test* are written to ``data/test/gbdt_clf.csv``.
        """
        clf = GradientBoostingClassifier(n_estimators=450, learning_rate=0.1,
                                         random_state=20)
        clf.fit(x_train, y_train)
        y_pred = clf.predict_proba(x_test)[:, 1]

        scores = roc_auc_score(y_test, y_pred)
        print("gbdt_clf scores ...  %s" % (scores,))
        pred = clf.predict_proba(test)[:, 1]
        print(pred[:5])
        out_path = "data/test/gbdt_clf.csv"
        self.saveResult(pred, out_path)
        self._gbdt_result_path = out_path

    def svc_clf(self, x_train, x_test, y_train, y_test, test):
        """Grid-search an SVM, print the hold-out AUC, save predictions.

        The output file name carries the hold-out score.

        Returns:
            The first five predicted probabilities for *test*.
        """
        tuned_parameters = [
            {'kernel': ['poly'], 'C': [10, 500, 1200]},
            {'kernel': ['linear'], 'C': [200, 500, 800]},
        ]
        clf = GridSearchCV(SVC(probability=True), tuned_parameters, cv=5,
                           scoring="roc_auc")
        clf.fit(x_train, y_train)
        print("Best parameters set found : ")
        print(clf.best_params_)

        y_pred = clf.predict_proba(x_test)[:, 1]
        scores = roc_auc_score(y_test, y_pred)
        print("svm clf scores... %s" % (scores,))
        pred = clf.predict_proba(test)[:, 1]
        out_path = "data/test/svc_res" + str(scores) + ".csv"
        self.saveResult(pred, out_path)
        self._svc_result_path = out_path
        return pred[:5]

    def saveResult(self, pred, fileName):
        """Write ``enrollment_id,drop`` rows for the test set to *fileName*.

        The file has no header and no index column.
        """
        enrollment_test = pd.read_csv('./data/test/enrollment_test.csv',
                                      usecols=[0])
        enrollment_test['drop'] = pred
        res = enrollment_test[['enrollment_id', 'drop']]
        print("***" * 30)
        print(res.head())
        res.to_csv(fileName, index=False, header=False)

    def drop_predict(self):
        """Load the data, then train and evaluate both models."""
        print("loading train data...")
        X, y = self.loadTrainData()

        x_train, x_test, y_train, y_test = train_test_split(
            X, y, test_size=0.23, random_state=20)
        print("loading test data...")
        test = self.loadTestData()

        print("moding gbdt_clf...")
        self.gbdt_clf(x_train, x_test, y_train, y_test, test)

        x_train, x_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=28)
        print("moding svm_clf ...")
        self.svc_clf(x_train, x_test, y_train, y_test, test)

    def em_result(self):
        """Blend the GBDT and SVM predictions into ``data/test/em_res.csv``.

        The blend is 0.7 * GBDT + 0.3 * SVM, passed through ``norm_res``.
        An earlier variant (now removed) also blended a kNN result and two
        further GBDT runs with weights 0.4/0.2/0.2/0.1/0.1.
        """
        print("ensemable results...")
        df_gbdt = pd.read_csv(self._gbdt_result_path, header=None,
                              names=["id", "drop1"])
        df_svm = pd.read_csv(self._svc_result_path, header=None, usecols=[1],
                             names=["id", "drop2"])

        df = pd.concat([df_gbdt, df_svm], axis=1)
        df["drop"] = df.drop1 * 0.7 + df.drop2 * 0.3
        df["drop"] = [self.norm_res(value) for value in df["drop"]]
        print(df.head())
        df.drop(["drop1", "drop2"], axis=1, inplace=True)
        print(df.head())
        df.to_csv("data/test/em_res.csv", header=False, index=False)


if __name__ == '__main__':
    drop = DropOutPredict()
    drop.drop_predict()
    drop.em_result()
    print("done.")
    # Author's note: accuracy/recall/AUC reach roughly 84% with this version.
步骤 3:继续做特征工程,加强特征提取,AUC 值接近 89%。
#!/usr/bin/env python
# coding=utf-8
"""Step 3 of the KDD Cup 2015 pipeline: richer features and more models.

Works on the numeric files produced by the preprocessing step and adds
per-enrollment summary statistics plus the course dates as features.
"""
import cPickle as pickle

import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import svm
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import scale


def norm(x):
    """Snap a probability to 0 below 0.000001 and to 1 above 0.96."""
    if x < 0.000001:
        x = 0
    elif x > 0.96:
        x = 1
    return x


def last_time(x):
    """Return the spread (max minus min) of the values in *x*."""
    return x.max() - x.min()


def _describe(grouped_column, unwanted):
    """Return per-enrollment summary statistics without *unwanted* columns.

    ``drop`` returns a new table, so its result has to be kept; calling it as
    a bare statement left every statistic in place.
    """
    stats = grouped_column.describe().unstack()
    return stats.drop(unwanted, axis=1)


def _build_features(log, courses):
    """Assemble the per-enrollment feature table.

    Args:
        log: numeric log table with the columns enrollment_id, time, source,
            event and object.
        courses: table with the columns course_id, from and to, one row per
            enrollment in the same order as the grouped log.

    Returns:
        The feature DataFrame (NaN not yet filled).
    """
    grouped = log.groupby("enrollment_id")

    # Number of log rows per event type.
    event_counts = log.pivot_table("source", rows='enrollment_id',
                                   cols="event", aggfunc='count',
                                   fill_value=0)

    # Join the feature tables side by side.
    data = pd.concat(
        [
            event_counts,
            grouped.event.describe().unstack(),
            _describe(grouped.time, ['count']),
            _describe(grouped.source, ['count', 'min', 'max']),
            _describe(grouped.object, ['count']),
        ],
        axis=1)

    # Activity span and course features: course code and its start/end day.
    data['dtime'] = grouped.time.apply(last_time)
    data["course_id"] = courses["course_id"].values
    data["from"] = courses["from"].values
    data["to"] = courses["to"].values
    return data


def loadTrainData():
    """Build the training features from the numeric log and cache them.

    The unscaled table is written to ``data/trainData.csv``.

    Returns:
        ``(X, y)``: the standardised feature matrix and the drop labels.
    """
    log = pd.read_csv('data/log_trainData.csv')
    print(log.head())
    print(log.tail())
    labels = pd.read_csv('data/truth_train.csv', header=None, usecols=[1],
                         names=["drop"])
    courses = pd.read_csv('data/course_Trainpkl.csv', usecols=[1, 2, 3])

    data = _build_features(log, courses)

    print("origin data: ")
    print(data.tail())
    data = data.fillna(0)
    data.to_csv('data/trainData.csv', index=False)

    # Standardise to zero mean and unit variance. The author notes that
    # min-max scaling brought no improvement.
    # NOTE(review): train and test are scaled independently of each other.
    X = scale(data.values)
    y = np.ravel(labels['drop'])
    print("y:  %s" % (y[:5],))
    return X, y


def loadTestData():
    """Build the test features from the numeric log and cache them.

    The unscaled table is written to ``data/test/testData.csv`` so later runs
    can read it directly.

    Returns:
        The standardised test feature matrix.
    """
    log = pd.read_csv('data/test/log_testData.csv')
    print(log.head())
    courses = pd.read_csv('data/test/course_Testpkl.csv', usecols=[1, 2, 3])

    data = _build_features(log, courses)

    print("test data: ")
    print(data.tail(10))
    data = data.fillna(0)
    data.to_csv('data/test/testData.csv', index=False)

    return scale(data.values)


def svc_clf(x_train, x_test, y_train, y_test, test):
    """Fit a linear SVM, print the hold-out AUC and save the predictions."""
    clf = svm.SVC(kernel='linear', probability=True, random_state=42)
    clf.fit(x_train, y_train)
    y_pred = clf.predict_proba(x_test)[:, 1]
    # The true labels must come first, otherwise roc_auc_score raises.
    scores = roc_auc_score(y_test, y_pred)
    print("svm scores:... %s" % (scores,))
    pred = clf.predict_proba(test)[:, 1]
    saveResult(pred, 'data/test/svc_res.csv')


def lr_clf(x_train, x_test, y_train, y_test, test):
    """Fit logistic regression, print raw and snapped AUC, save predictions."""
    clf = linear_model.LogisticRegression()
    clf.fit(x_train, y_train)
    y_pred = clf.predict_proba(x_test)[:, 1]
    scores = roc_auc_score(y_test, y_pred)
    print("lr_clf scores:  %s" % (scores,))

    # Score again after snapping near-certain predictions to exactly 0 or 1.
    y_pred = [norm(p) for p in y_pred]
    score2 = roc_auc_score(y_test, y_pred)
    print("after nomailzied score ...  %s" % (score2,))

    pred = clf.predict_proba(test)[:, 1]
    saveResult(pred, 'data/test/lr_res.csv')


def rf_clf(x_train, x_test, y_train, y_test, test):
    """Fit a random forest, print the hold-out AUC and save the predictions."""
    clf = RandomForestClassifier(n_estimators=100)
    # Fit against the labels; the features were passed twice before.
    clf.fit(x_train, y_train)
    y_pred = clf.predict_proba(x_test)[:, 1]
    scores = roc_auc_score(y_test, y_pred)
    # predict() returns a 1-D array of class labels; the probability of the
    # positive class comes from predict_proba, as in the other models.
    pred = clf.predict_proba(test)[:, 1]
    print("rf_scores:  %s" % (scores,))
    saveResult(pred, './data/test/rf_res.csv')


def gbdt_clf(x_train, x_test, y_train, y_test, test):
    """Fit gradient boosting, print the hold-out AUC, save the predictions.

    The output file name carries the hold-out score.
    """
    clf = GradientBoostingClassifier(n_estimators=500)
    clf.fit(x_train, y_train)
    y_pred = clf.predict_proba(x_test)[:, 1]
    scores = roc_auc_score(y_test, y_pred)
    pred = clf.predict_proba(test)[:, 1]
    print("gbdt_clf scores:  %s" % (scores,))
    saveResult(pred, 'data/test/gbdt_clf' + str(scores) + '.csv')


def saveResult(pred, fileName):
    """Write ``enrollment_id,drop`` rows for the test set to *fileName*.

    The file has no header and no index column.
    """
    # Enrollment ids of the test set.
    df = pd.read_csv('data/test/enrollment_test.csv', usecols=[0])
    # Attach the predictions.
    df['drop'] = pred
    print(df.head())
    # Write the submission file.
    df.to_csv(fileName, index=False, header=False)


# Blend the better-performing results into one ensemble.
def em_res():
    """Blend three saved prediction files into ``data/test/final_res.csv``.

    The blend is 0.5 * gbdt_res + 0.2 * gbdt_clf0.875919444048
    + 0.3 * the current final_res. Because final_res.csv is both an input and
    the output, every call folds the previous blend into the new one.
    """
    blended = pd.read_csv("data/test/gbdt_res.csv", header=None,
                          names=["id", "drop"])
    tagged_gbdt = pd.read_csv("data/test/gbdt_clf0.875919444048.csv",
                              header=None, usecols=[1], names=["drop1"])
    previous = pd.read_csv("data/test/final_res.csv", header=None,
                           usecols=[1], names=["drop2"])
    blended["drop"] = (blended["drop"] * 0.5
                       + tagged_gbdt["drop1"] * 0.2
                       + previous["drop2"] * 0.3)
    # Explicit False (instead of None) for "no index, no header", as in
    # saveResult.
    blended.to_csv("data/test/final_res.csv", index=False, header=False)


# For later runs, read the cached feature files directly to save time.
def loadPickleTrainData():
    """Load the cached training features and the pickled labels.

    Returns:
        ``(X, y)``: the values of ``data/trainData.csv`` (scale(X) is left
        disabled here) and the object unpickled from
        ``data/train/trainLabel.txt``.
    """
    features = pd.read_csv('data/trainData.csv')
    print(features.head())
    X = features.values
    # Binary mode plus a with-block: the handle was never closed before.
    with open("data/train/trainLabel.txt", "rb") as fr2:
        y = pickle.load(fr2)
    return X, y


def loadPickleTestData():
    """Return the values of the cached test features (unscaled)."""
    features = pd.read_csv('data/test/testData.csv')
    return features.values


def dropPredict():
    """Blend the saved results, then train and evaluate the models.

    NOTE(review): em_res() runs first, so it blends result files left by
    earlier runs rather than the models trained below -- confirm this order
    is intended.
    """
    em_res()
    print("loading train data...")
    # Call loadTrainData() instead to rebuild the features from the logs.
    X, y = loadPickleTrainData()

    print("loading test data... ")
    # Call loadTestData() instead to rebuild the features from the logs.
    test = loadPickleTestData()

    print("\nmodeling lr...")
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.31, random_state=148)
    lr_clf(x_train, x_test, y_train, y_test, test)

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.28, random_state=151)
    # The random forest call is disabled; only its banner is printed.
    print("\nmodeling rf...")

    print("\nmodeling gbdt...")
    gbdt_clf(x_train, x_test, y_train, y_test, test)

    print("\nmodeling svm...")
    svc_clf(x_train, x_test, y_train, y_test, test)


if __name__ == "__main__":
    print("start>>>")
    dropPredict()
    print("done")