# Predict whether the home team wins an NBA game using only the identities
# of the two teams (one-hot encoded) as features for a decision tree.
import pandas as pd
from collections import defaultdict
from sklearn.tree import DecisionTreeClassifier
try:
    # cross_val_score lives here in current scikit-learn releases.
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Fallback for old installs that only ship the legacy module.
    from sklearn.cross_validation import cross_val_score
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Load the dataset once, parsing the "Date" column while reading.
data_filename = "NBA15_16_dataset/basketball.csv"
dataset = pd.read_csv(data_filename, encoding="utf-8", parse_dates=["Date"])

# Clean the data: give the columns readable, fixed names.
dataset.columns = ["Date", "Start(ET)", "Visitor Team", "VisitorPts",
                   "Home Team", "HomePts", "OT?", "Score Type", "Attend.",
                   "Notes"]

# Extract the target: True when the home team outscored the visitor.
dataset["HomeWin"] = dataset["VisitorPts"] < dataset["HomePts"]
y_true = dataset["HomeWin"].values
# Fraction of games won by the home team (kept for inspection; not printed).
home_win_rate = dataset["HomeWin"].mean()

# Encode team names as integers. Fit on BOTH columns so a team that only
# ever appears as a visitor does not make transform() raise on an unseen label.
encoding = LabelEncoder()
encoding.fit(np.concatenate([dataset["Home Team"].values,
                             dataset["Visitor Team"].values]))
home_teams = encoding.transform(dataset["Home Team"].values)
visitor_teams = encoding.transform(dataset["Visitor Team"].values)
X_teams = np.vstack([home_teams, visitor_teams]).T

# One-hot encode the integer ids so the tree does not treat them as ordered.
# toarray() yields a plain ndarray; todense() would yield np.matrix instead.
onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams).toarray()
print(X_teams.shape)

# Cross-validated accuracy of a decision tree on the team-identity features.
# NOTE: the number of folds is the library default for the installed version.
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring="accuracy")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
# NBA prediction: removing the influence of team names (NBA 预测 消除队名的影响)
# (Commented-out earlier experiments; one statement per line, loop-body
# indentation reconstructed.)
# import pandas as pd
# import numpy as np
#
# # Load the dataset
# data_filename = "NBA15_16_dataset/basketball.csv"
# dataset = pd.read_csv(data_filename,encoding="utf-8")
# # Clean the data
# #1
# dataset = pd.read_csv(data_filename,parse_dates=["Date"])
# #2
# dataset.columns = ["Date", "Start(ET)","Visitor Team", "VisitorPts", "Home Team", "HomePts", "OT?", "Score Type","Attend.", "Notes"]
# # Extract a new feature
# dataset["HomeWin"] = dataset["VisitorPts"] < dataset["HomePts"]
# # dataset.head()
# y_true = dataset["HomeWin"].values
# dataset["HomeWin"].mean()
# # Build new attributes: whether each of the two teams won its previous game
# from collections import defaultdict
# won_last = defaultdict(int)
# dataset["HomeLastWin"] = 0
# dataset["VisitorLastWin"] = 0
# for index, row in dataset.iterrows():
#     home_team = row["Home Team"]
#     visitor_team = row["Visitor Team"]
#     row["HomeLastWin"] = won_last[home_team]
#     dataset.set_value(index, "HomeLastWin", won_last[home_team])
#     dataset.set_value(index, "VisitorLastWin", won_last[visitor_team])
#     won_last[home_team] = int(row["HomeWin"])
#     won_last[visitor_team] = 1 - int(row["HomeWin"])
# # Predict with a decision tree
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.cross_validation import cross_val_score
# import numpy as np
# clf = DecisionTreeClassifier(random_state=14)
# x_previouswins = dataset[["HomeLastWin", "VisitorLastWin"]].values
# scores = cross_val_score(clf, x_previouswins, y_true, scoring="accuracy")
# print(scores)
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
# # New feature: team ranking
# standings_filename = "NBA15_16_dataset/standings.csv"
# standings = pd.read_csv(standings_filename, skiprows=0, encoding="utf-8")
# standings.head()
# dataset["HomeTeamRanksHigher"] = 0
# for index, row in dataset.iterrows():
#     home_team = row["Home Team"]
#     visitor_team = row["Visitor Team"]
#     home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
#     visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
#     dataset.set_value(index, "HomeTeamRanksHigher",int(home_rank < visitor_rank))
# X_homehigher = dataset[["HomeTeamRanksHigher","HomeLastWin", "VisitorLastWin",]].values
# clf = DecisionTreeClassifier(random_state=14)
# scores = cross_val_score(clf, X_homehigher, y_true, scoring="accuracy")
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
# dataset["HomeTeamRanksHigher"] = 0
# for index, row in dataset.iterrows():
#     home_team = row["Home Team"]
#     visitor_team = row["Visitor Team"]
#     home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
#     visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
#     dataset.set_value(index, "HomeTeamRanksHigher",int(home_rank < visitor_rank))
# X_homehigher = dataset[["HomeTeamRanksHigher","HomeLastWin", "VisitorLastWin",]].values
# clf = DecisionTreeClassifier(random_state=14)
# scores = cross_val_score(clf, X_homehigher, y_true, scoring="accuracy")
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))