# 特征选择 (Feature selection)
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from scipy.stats import pearsonr
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
data_filename = 'data/adult.data'
adult = pd.read_csv(data_filename, sep=', ', header=None, engine='python',
names=["Age", "Work-Class", "fnlwgt",
"Education", "Education-Num",
"Marital-Status", "Occupation",
"Relationship", "Race", "Sex",
"Capital-gain", "Capital-loss",
"Hours-per-week", "Native-Country",
"Earnings-Raw"])
# 计算某列的统计量
print(adult['Hours-per-week'].describe())
print(adult['Work-Class'].unique())
print('----------------------------------------------------')
# 创建LongHours(时长)特征,表示一个人每周工作时长是否多于40小时。
adult["LongHours"] = adult["Hours-per-week"] > 40
# VarianceThreshold drops features whose variance is below a threshold
# (default 0, i.e. constant columns). Demo on a toy 10x3 array.
demo = np.arange(30).reshape(10, 3)
demo[:, 1] = 1  # make the middle column constant so it gets removed
selector = VarianceThreshold()
reduced = selector.fit_transform(demo)
print(reduced)
print(selector.variances_)
print('----------------------------------------------------')
# Univariate feature selection: SelectKBest keeps the k features with the
# best score (here chi^2) against the target; SelectPercentile would keep
# the top r% instead. Target y: whether the person earns more than $50K.
numeric_features = ["Age", "Education-Num", "Capital-gain",
                    "Capital-loss", "Hours-per-week"]
X = np.array(adult[numeric_features])
y = np.array(adult["Earnings-Raw"] == '>50K')
chi2_selector = SelectKBest(score_func=chi2, k=3)
Xt_chi2 = chi2_selector.fit_transform(X, y)
print(Xt_chi2)
print(chi2_selector.scores_)
print('----------------------------------------------------')
# 使用皮尔逊(Pearson)相关系数计算相关性
def multivariate_pearsonr(X, y):
    """Score each column of X by its absolute Pearson correlation with y.

    Returns a (scores, pvalues) pair of 1-D arrays with one entry per
    column, matching the score_func contract expected by SelectKBest.
    The absolute value makes strong negative correlations rank as highly
    as strong positive ones.
    """
    results = [pearsonr(X[:, col], y) for col in range(X.shape[1])]
    scores = np.array([abs(r) for r, _ in results])
    pvalues = np.array([p for _, p in results])
    return scores, pvalues
# Select the 3 best features by absolute Pearson correlation.
pearson_selector = SelectKBest(score_func=multivariate_pearsonr, k=3)
Xt_pearson = pearson_selector.fit_transform(X, y)
print(pearson_selector.scores_)
print('----------------------------------------------------')
# Compare classification accuracy on the chi^2-selected versus the
# Pearson-selected feature subsets with a decision tree.
tree = DecisionTreeClassifier(random_state=14)
scores_chi2 = cross_val_score(tree, Xt_chi2, y, scoring='accuracy')
scores_pearson = cross_val_score(tree, Xt_pearson, y, scoring='accuracy')
print('卡方检验的准确率为:{:.2f}%'.format(np.mean(scores_chi2)*100),';皮尔逊相关系数的准确率为:{:.2f}%'.format(np.mean(scores_pearson)*100))
# 使用主成分分析找到能用较少信息描述数据集的特征组合 (Use PCA to find a feature combination that describes the data set with less information)
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import matplotlib as mpl
# NOTE(review): pyplot is imported above before the backend is selected;
# on modern matplotlib mpl.use() can still switch the backend afterwards,
# but the conventional order is backend first — confirm on the target version.
mpl.use("TkAgg")
# Load and preprocess the "Internet Advertisements" data set.
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
data_filename = 'data/ad.data'
# The raw file marks missing values as '?', sometimes preceded by a space.
# The original list repeated ' ?' four times; na_values is treated as a
# set by pandas, so the duplicates were redundant and are removed here.
missing_values = [' ?', '?']
ads = pd.read_csv(data_filename, header=None, na_values=missing_values, low_memory=False)
# Column 1558 is the class label: True for 'ad.', False otherwise.
ads[1558] = ads[1558] == 'ad.'
# Drop every row that has any missing attribute value.
ads.dropna(inplace=True)
X = np.array(ads.drop(1558, axis=1), dtype=float)
y = np.array(ads[1558])
# Project the data onto its first five principal components and report
# how much variance each component explains.
pca = PCA(n_components=5)
Xd = pca.fit_transform(X)
np.set_printoptions(precision=3, suppress=True)
print(pca.explained_variance_ratio_)
print('----------------------------------------------------')
# Check the accuracy of a decision tree trained on the reduced features.
model = DecisionTreeClassifier(random_state=14)
scores_reduced = cross_val_score(model, Xd, y, scoring='accuracy')
print('主成分分析获取特征值的准确率为:{:.2f}%'.format(np.mean(scores_reduced) * 100))
# Scatter plot of the first two principal components, one colour per class.
for label_value, colour in zip(set(y), ['red', 'green']):
    print(label_value, colour)
    selected = (y == label_value)
    plt.scatter(Xd[selected, 0], Xd[selected, 1], marker='o', color=colour,
                label=int(label_value))
plt.legend()
plt.show()
# 评论区 (comment-section marker left over from the original web page; not code)