admin 管理员组文章数量: 1087131
特征工程2
第三章 特征增强:清洗数据
import os
os.listdir()
['.config', 'sample_data']
!git clone /********/Feature-Engineering-Made-Easy.git
Cloning into 'Feature-Engineering-Made-Easy'...
remote: Enumerating objects: 63, done.[K
remote: Total 63 (delta 0), reused 0 (delta 0), pack-reused 63[K
Unpacking objects: 100% (63/63), done.
Checking out files: 100% (62/62), done.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns%matplotlib inline
plt.style.use('fivethirtyeight')
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.import pandas.util.testing as tm
pima = pd.read_csv('/content/Feature-Engineering-Made-Easy/data/pima.data')
pima.head()
6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
1 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
2 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
3 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
4 | 5 | 116 | 74 | 0 | 0 | 25.6 | 0.201 | 30 | 0 |
# The data file has no header row (reading it without `names` turned the
# first record into column labels), so supply the column names explicitly.
pima_column_names = [
    'times_pregnant',
    'plasma_glucose_concentration',
    'diastolic_blood_pressure',
    'triceps_thickness',
    'serum_insulin',
    'bmi',
    'pedigree_function',
    'age',
    'onset_diabetes',
]
pima = pd.read_csv(
    '/content/Feature-Engineering-Made-Easy/data/pima.data',
    names=pima_column_names,
)
pima.head()
times_pregnant | plasma_glucose_concentration | diastolic_blood_pressure | triceps_thickness | serum_insulin | bmi | pedigree_function | age | onset_diabetes | |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
pima['onset_diabetes'].value_counts(normalize = True)
0 0.651042
1 0.348958
Name: onset_diabetes, dtype: float64
# Overlay the histograms of one feature for the two outcome classes.
col = 'plasma_glucose_concentration'
for outcome, legend_label in ((0, 'non_diabetes'), (1, 'diabetes')):
    # Select the feature values of the rows belonging to this class.
    class_values = pima.loc[pima['onset_diabetes'] == outcome, col]
    plt.hist(class_values, alpha=0.5, label=legend_label)
plt.xlabel(col)
plt.ylabel('Frequency')
plt.title('Histogram of {}'.format(col))
plt.legend(loc='upper right')
plt.show()
# Draw the per-class histogram (10 bins) for every feature column,
# one figure per feature.
for col in ['times_pregnant', 'plasma_glucose_concentration',
            'diastolic_blood_pressure', 'triceps_thickness', 'serum_insulin',
            'bmi', 'pedigree_function', 'age']:
    plt.hist(pima[pima['onset_diabetes'] == 0][col], bins=10, alpha=0.5,
             label='non_diabetes')
    plt.hist(pima[pima['onset_diabetes'] == 1][col], bins=10, alpha=0.5,
             label='diabetes')
    plt.legend(loc='upper right')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.title('Histogram of {}'.format(col))
    plt.show()
import seaborn as sns

# Heatmap of the pairwise correlations between all columns of `pima`.
sns.heatmap(pima.corr())
<matplotlib.axes._subplots.AxesSubplot at 0x7f5e2606fc50>
pima.isnull().sum()
times_pregnant 0
plasma_glucose_concentration 0
diastolic_blood_pressure 0
triceps_thickness 0
serum_insulin 0
bmi 0
pedigree_function 0
age 0
onset_diabetes 0
dtype: int64
pima.shape
(768, 9)
pima['onset_diabetes'].value_counts(normalize = True)
# Null accuracy: the share of the most frequent class, i.e. the score a model
# would get by always predicting that class.
0 0.651042
1 0.348958
Name: onset_diabetes, dtype: float64
pima.describe()
times_pregnant | plasma_glucose_concentration | diastolic_blood_pressure | triceps_thickness | serum_insulin | bmi | pedigree_function | age | onset_diabetes | |
---|---|---|---|---|---|---|---|---|---|
count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
mean | 3.845052 | 120.894531 | 69.105469 | 20.536458 | 79.799479 | 31.992578 | 0.471876 | 33.240885 | 0.348958 |
std | 3.369578 | 31.972618 | 19.355807 | 15.952218 | 115.244002 | 7.884160 | 0.331329 | 11.760232 | 0.476951 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 | 0.000000 |
25% | 1.000000 | 99.000000 | 62.000000 | 0.000000 | 0.000000 | 27.300000 | 0.243750 | 24.000000 | 0.000000 |
50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 | 29.000000 | 0.000000 |
75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
# describe() shows a minimum of 0 in these columns, so missing values appear
# to have been recorded as 0. Replace those zeros with NaN so that pandas
# counts them as missing.
columns = ['serum_insulin', 'bmi', 'plasma_glucose_concentration',
           'diastolic_blood_pressure', 'triceps_thickness']
for col in columns:
    pima[col] = pima[col].replace(0, np.nan)
pima.isnull().sum()
times_pregnant 0
plasma_glucose_concentration 5
diastolic_blood_pressure 35
triceps_thickness 227
serum_insulin 374
bmi 11
pedigree_function 0
age 0
onset_diabetes 0
dtype: int64
pima.head()
times_pregnant | plasma_glucose_concentration | diastolic_blood_pressure | triceps_thickness | serum_insulin | bmi | pedigree_function | age | onset_diabetes | |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148.0 | 72.0 | 35.0 | NaN | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85.0 | 66.0 | 29.0 | NaN | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183.0 | 64.0 | NaN | NaN | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89.0 | 66.0 | 23.0 | 94.0 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137.0 | 40.0 | 35.0 | 168.0 | 43.1 | 2.288 | 33 | 1 |
pima.describe()
times_pregnant | plasma_glucose_concentration | diastolic_blood_pressure | triceps_thickness | serum_insulin | bmi | pedigree_function | age | onset_diabetes | |
---|---|---|---|---|---|---|---|---|---|
count | 768.000000 | 763.000000 | 733.000000 | 541.000000 | 394.000000 | 757.000000 | 768.000000 | 768.000000 | 768.000000 |
mean | 3.845052 | 121.686763 | 72.405184 | 29.153420 | 155.548223 | 32.457464 | 0.471876 | 33.240885 | 0.348958 |
std | 3.369578 | 30.535641 | 12.382158 | 10.476982 | 118.775855 | 6.924988 | 0.331329 | 11.760232 | 0.476951 |
min | 0.000000 | 44.000000 | 24.000000 | 7.000000 | 14.000000 | 18.200000 | 0.078000 | 21.000000 | 0.000000 |
25% | 1.000000 | 99.000000 | 64.000000 | 22.000000 | 76.250000 | 27.500000 | 0.243750 | 24.000000 | 0.000000 |
50% | 3.000000 | 117.000000 | 72.000000 | 29.000000 | 125.000000 | 32.300000 | 0.372500 | 29.000000 | 0.000000 |
75% | 6.000000 | 141.000000 | 80.000000 | 36.000000 | 190.000000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
填充缺失值
empty_plasma_index = pima[pima['plasma_glucose_concentration'].isnull()].index
empty_plasma_index
Int64Index([75, 182, 342, 349, 502], dtype='int64')
pima.loc[empty_plasma_index]['plasma_glucose_concentration']
75 NaN
182 NaN
342 NaN
349 NaN
502 NaN
Name: plasma_glucose_concentration, dtype: float64
from sklearn.impute import SimpleImputer

# Fill every NaN with the mean of its column. fit_transform returns a plain
# array rather than a DataFrame, which the type() call below displays.
imputer = SimpleImputer(strategy='mean')
pima_imputed = imputer.fit_transform(pima)
type(pima_imputed)
numpy.ndarray
# Wrap the imputed array back into a DataFrame with the original column names.
pima_imputed = pd.DataFrame(pima_imputed, columns=pima_column_names)
pima_imputed.head()
times_pregnant | plasma_glucose_concentration | diastolic_blood_pressure | triceps_thickness | serum_insulin | bmi | pedigree_function | age | onset_diabetes | |
---|---|---|---|---|---|---|---|---|---|
0 | 6.0 | 148.0 | 72.0 | 35.00000 | 155.548223 | 33.6 | 0.627 | 50.0 | 1.0 |
1 | 1.0 | 85.0 | 66.0 | 29.00000 | 155.548223 | 26.6 | 0.351 | 31.0 | 0.0 |
2 | 8.0 | 183.0 | 64.0 | 29.15342 | 155.548223 | 23.3 | 0.672 | 32.0 | 1.0 |
3 | 1.0 | 89.0 | 66.0 | 23.00000 | 94.000000 | 28.1 | 0.167 | 21.0 | 0.0 |
4 | 0.0 | 137.0 | 40.0 | 35.00000 | 168.000000 | 43.1 | 2.288 | 33.0 | 1.0 |
pima_imputed.isnull().sum()
times_pregnant 0
plasma_glucose_concentration 0
diastolic_blood_pressure 0
triceps_thickness 0
serum_insulin 0
bmi 0
pedigree_function 0
age 0
onset_diabetes 0
dtype: int64
在机器学习流水线上填充值
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# Double brackets keep X a one-column DataFrame instead of a Series.
X = pima[['serum_insulin']].copy()
y = pima['onset_diabetes'].copy()
X.isnull().sum()
serum_insulin 374
dtype: int64
X.shape
(768, 1)
# NOTE: this mean is computed over the whole data set, before any train/test
# split is made, so rows that later land in the test set contribute to it.
entire_data_set_mean = X.mean()
# Fill every missing value with that single whole-data-set mean.
X = X.fillna(entire_data_set_mean)
print(entire_data_set_mean)
serum_insulin 155.548223
dtype: float64
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 99)
X_train.shape,y_train.shape
((576, 1), (576,))
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
knn.score(X_test,y_test)
0.65625
上例的做法是错误的:在划分训练集和测试集之前,就用整个数据集的均值填充了缺失值,测试集的信息因此泄露到了训练过程中。
# A proper approach: split first, while X still contains its missing values,
# so the fill value can later be computed from the training rows only.
X = pima[['serum_insulin']].copy()
y = pima['onset_diabetes'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99)
X.isnull().sum()
serum_insulin 374
dtype: int64
X_test.shape,y_test.shape,X_train.shape,y_train.shape
((192, 1), (192,), (576, 1), (576,))
# Compute the fill value on the training split only and reuse that same value
# for the test split.
training_mean = X_train.mean()
X_train = X_train.fillna(training_mean)
X_test = X_test.fillna(training_mean)
print(training_mean)
serum_insulin 158.546053
dtype: float64
X_test.shape,y_test.shape,X_train.shape,y_train.shape
((192, 1), (192,), (576, 1), (576,))
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
print(knn.score(X_test,y_test))
0.4895833333333333
Pipeline
from sklearn.pipeline import Pipeline

# The parameter grid must be redefined to match the pipeline: each key is
# prefixed with the name of the step it belongs to ("classify").
knn_params = {'classify__n_neighbors': [1, 2, 3, 4, 5, 6, 7]}

knn = KNeighborsClassifier()
mean_impute = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('classify', knn),
])

X = pima.drop('onset_diabetes', axis=1)
y = pima['onset_diabetes']

grid = GridSearchCV(mean_impute, knn_params)
grid.fit(X, y)
print(grid.best_score_, grid.best_params_)
0.7305407011289364 {'classify__n_neighbors': 7}
from sklearn.pipeline import Pipeline

# The parameter grid must be redefined to match the pipeline: each key is
# prefixed with the name of the step it belongs to ("classify").
knn_params = {'classify__n_neighbors': [1, 2, 3, 4, 5, 6, 7]}

knn = KNeighborsClassifier()
median_impute = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('classify', knn),
])

X = pima.drop('onset_diabetes', axis=1)
y = pima['onset_diabetes']

grid = GridSearchCV(median_impute, knn_params)
grid.fit(X, y)
print(grid.best_score_, grid.best_params_)
0.7292589763177999 {'classify__n_neighbors': 7}
标准化与归一化
# Impute with SimpleImputer's default strategy (the mean, per the
# scikit-learn documentation) and rebuild a DataFrame for plotting.
impute = SimpleImputer()
pima_imputed_mean = pd.DataFrame(impute.fit_transform(pima),
                                 columns=pima_column_names)
# Trailing semicolons suppress the notebook's textual echo of the axes array.
pima_imputed_mean.hist(figsize=(15, 15));
# Same histograms again, this time with a shared x-axis.
pima_imputed_mean.hist(figsize=(15, 15), sharex=True);
Z_score
from sklearn.preprocessing import StandardScaler

# Standardize the mean-imputed columns and plot them on a shared x-axis.
scale = StandardScaler()
pima_imputed__mean_scaled = pd.DataFrame(
    scale.fit_transform(pima_imputed_mean),
    columns=pima_column_names,
)
pima_imputed__mean_scaled.hist(figsize=(15, 15), sharex=True);
# Add z-score standardization to the pipeline.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# The parameter grid must be redefined to match the pipeline: every key is
# "<step name>__<parameter name>".
knn_params = {
    'imputer__strategy': ['mean', 'median'],
    'classify__n_neighbors': [1, 2, 3, 4, 5, 6, 7],
}

# `knn` is the KNeighborsClassifier instance created in an earlier cell.
mean_impute_standardize = Pipeline([
    ('imputer', SimpleImputer()),
    ('standardize', StandardScaler()),
    ('classify', knn),
])

X = pima.drop('onset_diabetes', axis=1)
y = pima['onset_diabetes']

grid = GridSearchCV(mean_impute_standardize, knn_params)
grid.fit(X, y)
print(grid.best_score_, grid.best_params_)
0.7539173245055598 {'classify__n_neighbors': 7, 'imputer__strategy': 'mean'}
# Search over both the imputation strategy and the number of neighbours;
# every key is "<step name>__<parameter name>".
knn_params = {
    'imputer__strategy': ['mean', 'median'],
    'classify__n_neighbors': [1, 2, 3, 4, 5, 6, 7],
}

# `knn` is the KNeighborsClassifier instance created in an earlier cell.
mean_impute_standardize = Pipeline([
    ('imputer', SimpleImputer()),
    ('standardize', StandardScaler()),
    ('classify', knn),
])

X = pima.drop('onset_diabetes', axis=1)
y = pima['onset_diabetes']

grid = GridSearchCV(mean_impute_standardize, knn_params)
grid.fit(X, y)
print(grid.best_score_, grid.best_params_)
0.7539173245055598 {'classify__n_neighbors': 7, 'imputer__strategy': 'mean'}
本文标签: 特征工程2
版权声明:本文标题:特征工程2 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.roclinux.cn/p/1699044184a324995.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论