from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # scikit-learn < 0.18 only ships the legacy module
    from sklearn.cross_validation import cross_val_score
# Load the dataset. One read both decodes the file as UTF-8 and parses the
# "Date" column; reading the CSV twice only discarded the first result.
data_filename = "NBA15_16_dataset/basketball.csv"
dataset = pd.read_csv(data_filename, parse_dates=["Date"], encoding="utf-8")

# Clean the data: replace the raw CSV header with descriptive column names.
dataset.columns = [
    "Date", "Start(ET)", "Visitor Team", "VisitorPts", "Home Team",
    "HomePts", "OT?", "Score Type", "Attend.", "Notes",
]

# Extract the target: True when the home team scored more than the visitor.
dataset["HomeWin"] = dataset["VisitorPts"] < dataset["HomePts"]
y_true = dataset["HomeWin"].values
# Fraction of games won by the home team; kept in a name instead of being
# evaluated and thrown away, so it can serve as a baseline for the scores.
home_win_rate = dataset["HomeWin"].mean()

# Encode team names as integers. The encoder is fitted on the teams of BOTH
# columns so that a team appearing only as a visitor does not make
# transform() raise; when every team also plays at home the resulting
# labels are the same as fitting on the home column alone.
encoding = LabelEncoder()
encoding.fit(
    np.concatenate(
        [dataset["Home Team"].values, dataset["Visitor Team"].values]
    )
)
home_teams = encoding.transform(dataset["Home Team"].values)
visitor_teams = encoding.transform(dataset["Visitor Team"].values)
# One row per game: column 0 is the home-team label, column 1 the visitor's.
X_teams = np.vstack([home_teams, visitor_teams]).T

# One-hot encode the integer labels so the tree does not treat them as
# ordered numbers. toarray() yields a plain ndarray; todense() would return
# numpy.matrix, which current scikit-learn estimators reject.
onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams).toarray()
print(X_teams.shape)

# Cross-validated accuracy of a decision tree on the team-identity features.
# The number of folds is left at the library default.
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring="accuracy")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

# NBA prediction: removing the influence of team names

# import pandas as pd 
# import numpy as np
# # # Load the dataset
# data_filename = "NBA15_16_dataset/basketball.csv"
# dataset = pd.read_csv(data_filename,encoding="utf-8")
# # Clean the data
# #1 
# dataset = pd.read_csv(data_filename,parse_dates=["Date"])
# #2
# dataset.columns = ["Date", "Start(ET)","Visitor Team", "VisitorPts", "Home Team", "HomePts", "OT?", "Score Type","Attend.", "Notes"]
# # Extract a new feature
# dataset["HomeWin"] = dataset["VisitorPts"] < dataset["HomePts"]
# # dataset.head()
# y_true = dataset["HomeWin"].values
# dataset["HomeWin"].mean()
# # Build new attributes: whether each of the two teams in the game being predicted won its own previous game
# from collections import defaultdict


# won_last = defaultdict(int)
# dataset["HomeLastWin"] = 0
# dataset["VisitorLastWin"] = 0
# for index, row in dataset.iterrows():

#     home_team = row["Home Team"]
#     visitor_team = row["Visitor Team"]
#     row["HomeLastWin"] = won_last[home_team]

#     dataset.set_value(index, "HomeLastWin", won_last[home_team])
#     dataset.set_value(index, "VisitorLastWin", won_last[visitor_team])
#     won_last[home_team] = int(row["HomeWin"])
#     won_last[visitor_team] = 1 - int(row["HomeWin"])

# # Predict with a decision tree
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.cross_validation import cross_val_score
# import numpy as np

# clf = DecisionTreeClassifier(random_state=14)
# x_previouswins = dataset[["HomeLastWin", "VisitorLastWin"]].values

# scores = cross_val_score(clf, x_previouswins, y_true, scoring="accuracy")
# print(scores)
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
# # New feature: standings rank
# standings_filename = "NBA15_16_dataset/standings.csv"
# standings = pd.read_csv(standings_filename, skiprows=0, encoding="utf-8")
# standings.head()
# dataset["HomeTeamRanksHigher"] = 0
# for index, row in dataset.iterrows():
#     home_team = row["Home Team"]
#     visitor_team = row["Visitor Team"]
#     home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
#     visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
#     dataset.set_value(index, "HomeTeamRanksHigher",int(home_rank < visitor_rank))
# X_homehigher = dataset[["HomeTeamRanksHigher","HomeLastWin", "VisitorLastWin",]].values

# clf = DecisionTreeClassifier(random_state=14)
# scores = cross_val_score(clf, X_homehigher, y_true, scoring="accuracy")
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
# dataset["HomeTeamRanksHigher"] = 0
# for index, row in dataset.iterrows():
#     home_team = row["Home Team"]
#     visitor_team = row["Visitor Team"]
#     home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
#     visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
#     dataset.set_value(index, "HomeTeamRanksHigher",int(home_rank < visitor_rank))
# X_homehigher = dataset[["HomeTeamRanksHigher","HomeLastWin", "VisitorLastWin",]].values

# clf = DecisionTreeClassifier(random_state=14)
# scores = cross_val_score(clf, X_homehigher, y_true, scoring="accuracy")
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
# 12-31 02:51