1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.model_selection import train_test_split %matplotlib inline #データの読込み df = pd.read_csv('train.csv') #欠損値処理1 Fareは平均値、乗船地は一般的なSを代入 df['Fare'] = df['Fare'].fillna(df['Fare'].median()) df['Embarked'] = df['Embarked'].fillna('S') #欠損地処理2 #年齢をfillnaで平均を取るとランダムフォレストの結果が #訓練スコア 0.96 #テストスコア 0.80 #年齢は学習において重要なのでNaの場合行ごと削除した結果 #訓練スコア 0.98 #テストスコア 0.81に上昇した df=df.dropna(subset=['Age']) #カテゴリ変数の変換 df['Sex'] = df['Sex'].apply(lambda x: 1 if x == 'male' else 0) df['Embarked'] = df['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int) #学習に不要と思われるデータを削除 df = df.drop(['Cabin','Name','PassengerId','Ticket'],axis=1) #訓練データとテストデータに分離 train_X = df.drop('Survived', axis=1) train_y = df.Survived (train_X, test_X ,train_y, test_y) = train_test_split(train_X, train_y, test_size = 0.3, random_state = 666) #複数のモデル構築でどの分類器がベストかを調べる from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.linear_model import LinearRegression from sklearn.linear_model import Ridge #決定木 ki = DecisionTreeClassifier(random_state=0).fit(train_X, train_y) #ランダムフォレスト mori = RandomForestClassifier(random_state=0).fit(train_X,train_y) #ロジスティック回帰 logi = LogisticRegression(C=0.1).fit(train_X,train_y) #KNN KNN = KNeighborsClassifier(4).fit(train_X,train_y) #SVC svc = SVC(probability=True).fit(train_X,train_y) #linear linear = LinearRegression().fit(train_X,train_y) #ridge ridge = Ridge(alpha=1).fit(train_X,train_y) data ={"clf": ["tree", "forest","logistic","KNN","svc","Linear","Ridge"], "traning score":[(ki.score(train_X,train_y)),(mori.score(train_X,train_y)), (logi.score(train_X,train_y)),(KNN.score(train_X,train_y)), (svc.score(train_X,train_y)),(linear.score(train_X,train_y)), (ridge.score(train_X,train_y))], "test score":[(ki.score(test_X,test_y)),(mori.score(test_X,test_y)), logi.score(train_X,train_y),(KNN.score(test_X,test_y)), (svc.score(test_X,test_y)),(linear.score(train_X,train_y)),(ridge.score(train_X,train_y))] } frame = pd.DataFrame(data,index=["tree", "forest","logistic","KNN","svc","Linear","Ridge"]) frame.plot(kind="bar") |