c値、交差検証のそれぞれ最良のパラメーターを調べる
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# Find the best SVC hyper-parameters (C and gamma) for the Pima Indians
# Diabetes data set using a cross-validated grid search.
#
# Steps: load the CSV, replace zero placeholders with column medians,
# split into train/test, standardize, fit a default SVC as a baseline,
# then run a 6 x 6 grid search with 10-fold cross-validation.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
# `sklearn.preprocessing.Imputer` was removed from scikit-learn;
# `SimpleImputer` is its replacement and always imputes column-wise
# (the equivalent of the old `axis=0`).
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Load the data (the file has no header row).
# NOTE(review): confirm this URL is still being served by the archive.
diabetes_df = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data',
    header=None)

# Column 8 is the target variable; the first eight columns are features.
# Casting to float yields an independent copy, so the in-place imputation
# below neither writes through a slice of `diabetes_df` nor tries to store
# fractional medians in integer columns.
X = diabetes_df.iloc[:, :8].astype(float)
y = diabetes_df.iloc[:, 8:].values.flatten()  # flatten to 1-D

# X shape: (768, 8), y shape: (768,)
print('X shape: {}, y shape: {}'.format(X.shape, y.shape))

# Missing-value handling: in columns 1..5 a value of 0 is treated as
# missing and replaced by the median of that column.
# NOTE(review): the imputer is fitted on the full data set before the
# train/test split, exactly as in the original script.
med_imp = SimpleImputer(missing_values=0, strategy='median')
med_imp.fit(X.iloc[:, 1:6])
X.iloc[:, 1:6] = med_imp.transform(X.iloc[:, 1:6])

# Split into training data and test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Standardize the features; the scaler is fitted on the training split only
# and then applied to both splits.
std_scl = StandardScaler()
std_scl.fit(X_train)
X_train = std_scl.transform(X_train)
X_test = std_scl.transform(X_test)

# Baseline: SVC with default settings, judged by its confusion matrix
# (true/false positives and negatives) on the test split.
svc = SVC()
svc.fit(X_train, y_train)
print('Confusion matrix:\n{}'.format(
    confusion_matrix(y_test, svc.predict(X_test))))

# Try the six values 0.001, 0.01, 0.1, 1, 10, 100 for both C and gamma,
# i.e. 6 x 6 = 36 parameter combinations, each evaluated with 10-fold
# cross-validation.
svc_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100]
}

svc_grid_search = GridSearchCV(SVC(), svc_param_grid, cv=10)
svc_grid_search.fit(X_train, y_train)

# Report how the tuned search object scores on each split, followed by its
# confusion matrix on the test split.
print('Train score: {:.3f}'.format(svc_grid_search.score(X_train, y_train)))
print('Test score: {:.3f}'.format(svc_grid_search.score(X_test, y_test)))
print('Confusion matrix:\n{}'.format(
    confusion_matrix(y_test, svc_grid_search.predict(X_test))))