import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Load the game results (the file name indicates the 2014 NBA season). The raw
# "Date" column is parsed as datetimes before the headers are replaced with
# the Chinese column names used throughout the rest of the script.
data_filename = 'data/leagues_NBA_2014_games_games.csv'
index = ["日期", "分数类型", "客队", "客队得分", "主队", "主队得分", "加时", "备注"]
data = pd.read_csv(data_filename, parse_dates=["Date"])
data.columns = index
# Target label: True when the home score is strictly greater than the
# visitor score.
data['主队获胜'] = data['主队得分'].gt(data['客队得分'])
# Placeholder feature columns (visitor first, then home), initialised to 0.
for _feature in ('客队上场胜利', '主队上场胜利'):
    data[_feature] = 0
# New features: did the visiting team / the home team win its previous game?
# Columns are addressed by name rather than by position, so adding or
# reordering columns elsewhere cannot silently redirect the reads and writes.
# Rows are processed in file order, which is assumed to be chronological
# -- TODO confirm against the CSV.
won_last = defaultdict(int)
visitor_won_last = []
home_won_last = []
for visitor, home, home_won in zip(data['客队'], data['主队'], data['主队获胜']):
    # Record the flags before updating the tracker so a row never sees its
    # own result. `.get` is used so unseen teams are not inserted.
    visitor_won_last.append(int(won_last.get(visitor) == 1))
    home_won_last.append(int(won_last.get(home) == 1))
    if home_won:
        won_last[home] = 1
        won_last[visitor] = 0
    else:
        won_last[home] = 0
        won_last[visitor] = 1
data['客队上场胜利'] = np.array(visitor_won_last, dtype=np.int64)
data['主队上场胜利'] = np.array(home_won_last, dtype=np.int64)
# Baseline: cross-validate a decision tree on the two "won previous game"
# flags and print the mean accuracy as a percentage.
clf = DecisionTreeClassifier(random_state=14)
X = data[["客队上场胜利", "主队上场胜利"]].to_numpy(dtype=int)
y = data['主队获胜'].to_numpy(dtype=int)
scores_0 = cross_val_score(clf, X, y, scoring='accuracy')
print("正确率={:.2f}%".format(scores_0.mean() * 100))
# New feature: is the home team placed higher than the visitor in the
# standings file? A team's rank is its row position in that file.
# NOTE(review): assumes the standings rows are ordered best team first
# -- confirm against the CSV.
standings_filename = 'data/leagues_NBA_2013_standings_expanded-standings.csv'
standings = pd.read_csv(standings_filename)
standings_dict = defaultdict(int)
for rank, team in enumerate(standings['Team']):
    standings_dict[team] = rank
# Map through a plain-dict copy: Series.map would otherwise fall back to the
# defaultdict's default (rank 0) for unknown teams instead of yielding NaN.
rank_lookup = dict(standings_dict)
home_rank = data['主队'].map(rank_lookup)
visitor_rank = data['客队'].map(rank_lookup)
# Fail loudly on teams that are absent from the standings (for example a name
# that differs between the two files) instead of hitting a TypeError from
# comparing None values.
unranked = (set(data.loc[home_rank.isna(), '主队'])
            | set(data.loc[visitor_rank.isna(), '客队']))
if unranked:
    raise ValueError(
        "teams missing from {}: {}".format(
            standings_filename, sorted(map(str, unranked))
        )
    )
# A smaller row position means a higher placement.
data['主水平更高'] = (home_rank < visitor_rank).astype('int64')
X = np.array(data[["客队上场胜利", "主队上场胜利", "主水平更高"]])
scores_1 = cross_val_score(clf, X, y, scoring='accuracy')
print("正确率={:.2f}%".format(np.mean(scores_1) * 100))
# New feature: did the current home team win the previous meeting between
# these two teams? A first meeting and a previous loss both leave the flag
# at 0.
last_won = defaultdict(str)
head_to_head = []
for home, visitor, home_won in zip(data['主队'], data['客队'], data['主队获胜']):
    # Order-independent key so both fixtures of a pairing share one entry.
    key = tuple(sorted((home, visitor)))
    # `.get` returns None for an unseen pairing, which never equals a team.
    head_to_head.append(1 if last_won.get(key) == home else 0)
    last_won[key] = home if home_won else visitor
data['对局情况'] = np.array(head_to_head, dtype=np.int64)
X = data[["主水平更高", "对局情况"]].to_numpy()
scores_2 = cross_val_score(clf, X, y, scoring='accuracy')
print("正确率={:.2f}%".format(scores_2.mean() * 100))
# New feature: has the home team won more games so far than the visitor?
# The comparison uses the counts from before the current game, so a row never
# sees its own result.
won_num = defaultdict(int)
more_wins = []
for home, visitor, home_won in zip(data['主队'], data['客队'], data['主队获胜']):
    home_win = won_num.get(home, 0)
    visitor_win = won_num.get(visitor, 0)
    more_wins.append(1 if home_win > visitor_win else 0)
    if home_won:
        won_num[home] = home_win + 1
    else:
        # Credit the win to the visitor. The earlier code wrote
        # `visitor_win + 1` into the home team's counter, corrupting both
        # teams' totals.
        won_num[visitor] = visitor_win + 1
data['胜利场数'] = np.array(more_wins, dtype=np.int64)
X = np.array(data[["主水平更高", "胜利场数"]])
scores_3 = cross_val_score(clf, X, y, scoring='accuracy')
print("正确率={:.2f}%".format(np.mean(scores_3) * 100))
# New feature: is the home team on a longer winning streak than the visitor?
# Streaks are compared as they stood before the current game.
winning_streak = defaultdict(int)
longer_streak = []
for home, visitor, home_won in zip(data['主队'], data['客队'], data['主队获胜']):
    home_streak = winning_streak.get(home, 0)
    visitor_streak = winning_streak.get(visitor, 0)
    longer_streak.append(int(home_streak > visitor_streak))
    # The winner extends its streak; the loser's streak resets to zero.
    if home_won:
        winning_streak[home] = home_streak + 1
        winning_streak[visitor] = 0
    else:
        winning_streak[home] = 0
        winning_streak[visitor] = visitor_streak + 1
data['连胜场数'] = np.array(longer_streak, dtype=np.int64)
X = data[["主水平更高", "胜利场数", "连胜场数"]].to_numpy()
scores_4 = cross_val_score(clf, X, y, scoring='accuracy')
print("正确率={:.2f}%".format(scores_4.mean() * 100))
# Convert the team-name strings to integer labels.
encoding = LabelEncoder()
# Fit on the names from both columns: fitting on the home column alone makes
# `transform` fail for any team that never appears as a home team.
encoding.fit(np.concatenate([data['主队'].values, data['客队'].values]))
home_teams = encoding.transform(data['主队'].values)
visitor_teams = encoding.transform(data['客队'].values)
# One row per game: [home label, visitor label].
X_teams = np.vstack([home_teams, visitor_teams]).T
# One-hot encode the integer labels so the classifier does not treat them as
# ordered quantities.
onehot = OneHotEncoder()
X_teams_expanded = onehot.fit_transform(X_teams).toarray()
X_per = np.array(data[["主水平更高", "连胜场数"]])
# Final design matrix: one-hot team columns followed by the two flags.
X = np.concatenate((X_teams_expanded, X_per), axis=1)
scores_5 = cross_val_score(clf, X, y, scoring='accuracy')
print("正确率={:.2f}%".format(np.mean(scores_5) * 100))
# Ensemble learning: cross-validate a random forest on the current X and y
# and print its mean accuracy as a percentage.
clf_r = RandomForestClassifier(random_state=14)
scores_6 = cross_val_score(clf_r, X, y, scoring='accuracy')
print("正确率={:.2f}%".format(scores_6.mean() * 100))
# Candidate hyper-parameters for the random forest.
parameter_space = dict(
    n_estimators=[100],
    criterion=["gini", "entropy"],
    min_samples_leaf=[2, 4, 6],
)
# Search the grid with GridSearchCV, then print the best cross-validated
# score and the estimator that achieved it.
grid = GridSearchCV(estimator=clf_r, param_grid=parameter_space)
grid.fit(X, y)
print("正确率={:.2f}%".format(grid.best_score_ * 100))
print(grid.best_estimator_)
# "评论区" (comments section) -- leftover page text, not part of the script