admin 管理员组文章数量: 1087649
数据挖掘与机器学习作业
逻辑回归
导入包
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from my_tools import *
import warnings
warnings.filterwarnings("ignore")
jibing_res = pd.read_excel("./jibing_feature_res_final.xlsx")
jibing = pd.read_excel("./jibing_feature_final.xlsx")
jibing.head()
左右 | 是否外伤 | 症状持续时间 | 明显夜间痛 | 年龄 | 高血压 | 高血脂 | 2型糖尿病 | 吸烟与否 | 饮酒与否 | ... | 腺苷脱氨酶ADA | 果糖胺 | 肌酸激酶 | α-L-盐藻糖苷酶 | 乳酸 | 淀粉酶 | 同型半胱氨酸 | 铁 | 总铁结合力 | 血型 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 3 | 0 | 65 | 1 | 0 | 0 | 0 | 0 | ... | 10.0 | 1.32 | 48.0 | 12.0 | 1.9 | 49.0 | 9.9 | 12.3 | 43.5 | 3 |
1 | 1 | 1 | 2 | 0 | 62 | 1 | 0 | 0 | 0 | 0 | ... | 10.0 | 1.67 | 77.0 | 16.0 | 1.4 | 81.0 | 9.2 | 16.9 | 55.5 | 0 |
2 | 1 | 0 | 4 | 1 | 55 | 0 | 0 | 0 | 0 | 0 | ... | 15.0 | 1.86 | 78.0 | 22.0 | 1.9 | 89.0 | 9.9 | 7.0 | 51.4 | 0 |
3 | 1 | 0 | 3 | 0 | 60 | 0 | 0 | 0 | 0 | 0 | ... | 16.0 | 1.68 | 92.0 | 12.0 | 1.4 | 69.0 | 9.3 | 15.8 | 53.0 | 0 |
4 | 0 | 1 | 3 | 0 | 61 | 0 | 0 | 0 | 0 | 0 | ... | 13.0 | 1.60 | 58.0 | 14.0 | 1.7 | 153.0 | 8.1 | 13.2 | 45.9 | 0 |
5 rows × 60 columns
jibing_res.head()
结果 | |
---|---|
0 | 0 |
1 | 1 |
2 | 1 |
3 | 0 |
4 | 1 |
归一化
jibing = guiyihua(jibing)
标准化
jibing = biaozhunhua(jibing)
jibing.iloc[0]
左右 0.000000
是否外伤 0.000000
症状持续时间 3.000000
明显夜间痛 0.000000
年龄 0.402864
高血压 1.000000
高血脂 0.000000
2型糖尿病 0.000000
吸烟与否 0.000000
饮酒与否 -0.448892
红细胞计数*10^12/L -0.111242
血红蛋白 -1.262287
红细胞压积 -0.628449
血小板计数 1.836626
血小板压积 -0.016066
总蛋白g/L 0.117665
白蛋白g/L -0.783686
球蛋白g/L 0.892589
白球比 -1.141215
ALT丙氨酸氨基转移酶 -0.955624
碱性磷酸酶 0.577122
谷氨酸转肽酶 -0.458009
AST:ALT 1.972187
总胆红素 -0.567388
直接胆红素 0.058454
间接胆红素 -0.700329
钾 1.331665
钠 -0.154827
氯 -0.203053
钙 -1.011273
磷 -0.094543
镁 1.419808
葡萄糖 -0.813153
肌酐 0.219459
尿素 0.950509
尿酸 -0.222815
甘油三酯 0.111053
总胆固醇 0.102856
H高密度胆固醇 0.085759
L低密度胆固醇 0.101836
载脂蛋白A1 -0.047968
载脂蛋白B 0.763163
载脂蛋白E mg/l -0.397325
aPoB/aPoA1 0.073904
脂蛋白小a -0.059640
乳酸脱氢酶LDH -1.057407
β-2微球蛋白 1.273672
胆碱酯酶 -1.187449
前白蛋白mg/l 0.070510
总胆汁酸 -0.415554
腺苷脱氨酶ADA -0.396787
果糖胺 -0.160764
肌酸激酶 -0.176406
α-L-盐藻糖苷酶 -1.241122
乳酸 0.269307
淀粉酶 -0.755958
同型半胱氨酸 -0.420427
铁 -0.880622
总铁结合力 -1.226099
血型 3.000000
Name: 0, dtype: float64
为解决样本不均衡问题
使用 SMOTE 补充数据
SMOTE:插值的方法扩充数据
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from imblearn.over_sampling import SMOTE
f1_list = []
set_font()
for i in range(1,60):smote = SMOTE(sampling_strategy=1, random_state=42)selector = SelectKBest(mutual_info_classif, k=i)jibing_ = selector.fit_transform(jibing, jibing_res)Xtrain,Xtest,Ytrain,Ytest = train_test_split(jibing_,jibing_res,test_size=0.3,random_state=42)Xtrain, Ytrain = smote.fit_resample(Xtrain,Ytrain) clf = LogisticRegression(random_state=42)clf.fit(Xtrain, Ytrain)y_pre = clf.predict(Xtest)metrics_ = res_metrics(Ytest,y_pre,"调参")f1_list.append(metrics_["f1-score"])
zhexiantu(range(1,60),f1_list,"f1 - 特征筛选")
发现 f1-score 的最高值出现在 0-10 之间
f1_list=[]
for i in range(5,11):smote = SMOTE(sampling_strategy=1, random_state=42)selector = SelectKBest(mutual_info_classif, k=i)jibing_ = selector.fit_transform(jibing, jibing_res)Xtrain,Xtest,Ytrain,Ytest = train_test_split(jibing_,jibing_res,test_size=0.3,random_state=42)Xtrain, Ytrain = smote.fit_resample(Xtrain,Ytrain)clf = LogisticRegression(random_state=42)clf.fit(Xtrain, Ytrain)y_pre = clf.predict(Xtest)metrics_ = res_metrics(Ytest,y_pre,"调参")f1_list.append(metrics_["f1-score"])
zhexiantu(range(5,11),f1_list,"f1 - 特征筛选")
选取最佳的6个特征进行训练,K = 6
smote = SMOTE(sampling_strategy=1, random_state=42)
selector = SelectKBest(mutual_info_classif, k=6)
jibing_ = selector.fit_transform(jibing, jibing_res)
Xtrain,Xtest,Ytrain,Ytest = train_test_split(jibing_,jibing_res,test_size=0.3,random_state=42)Xtrain, Ytrain = smote.fit_resample(Xtrain,Ytrain)
clf = LogisticRegression(random_state=42)
clf.fit(Xtrain, Ytrain)y_pre = clf.predict(Xtest)
metrics_ = res_metrics(Ytest,y_pre,"f1 - k=6")
#####################f1 - k=6#####################
+--------------------+--------------------+--------------------+
| precision | recall | f1 |
+--------------------+--------------------+--------------------+
| 0.8265539116030419 | 0.6724137931034483 | 0.7415586728925839 |
+--------------------+--------------------+--------------------+
尝试降维
PCA
f1_list = []
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
for i in range(1,6):clf = LogisticRegression(random_state=42)pca = PCA(n_components=i,random_state=42)Xtrain_ = pca.fit_transform(Xtrain,Ytrain)clf.fit(Xtrain_, Ytrain)Xtest_ = pca.fit_transform(Xtest)y_pre = clf.predict(Xtest_)metrics_ = res_metrics(Ytest,y_pre,"调参")f1_list.append(metrics_["f1-score"])
zhexiantu(range(1,6),f1_list,"f1 - PCA")
f1_list = []
from sklearn.manifold import TSNE
for i in range(1,4):clf = LogisticRegression(random_state=42)tsne = TSNE(n_components=i,random_state=42)Xtrain_ = tsne.fit_transform(Xtrain,Ytrain)clf.fit(Xtrain_, Ytrain)Xtest_ = tsne.fit_transform(Xtest)y_pre = clf.predict(Xtest_)metrics_ = res_metrics(Ytest,y_pre,"调参")f1_list.append(metrics_["f1-score"])
zhexiantu(range(1,4),f1_list,"tsne - F1")
放弃降维,直接使用 K = 6 的特征筛选
smote = SMOTE(sampling_strategy=1, random_state=42)
selector = SelectKBest(mutual_info_classif, k=6)
jibing_ = selector.fit_transform(jibing, jibing_res)
Xtrain,Xtest,Ytrain,Ytest = train_test_split(jibing_,jibing_res,test_size=0.3,random_state=42)
Xtrain, Ytrain = smote.fit_resample(Xtrain,Ytrain)
# 训练,拟合
clf = LogisticRegression(random_state=42)
clf.fit(Xtrain, Ytrain)
y_pre = clf.predict(Xtest)
metrics_ = res_metrics(Ytest,y_pre,"逻辑回归-特征筛选")
####################逻辑回归-特征筛选#####################
+--------------------+--------------------+--------------------+
| precision | recall | f1 |
+--------------------+--------------------+--------------------+
| 0.8263447971781305 | 0.7068965517241379 | 0.7619678246723789 |
+--------------------+--------------------+--------------------+
最高的 f1-score 为0.76
本文标签: 数据挖掘与机器学习作业
版权声明:本文标题:数据挖掘与机器学习作业 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.roclinux.cn/p/1700324403a397197.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论