머신러닝으로 타이타닉 생존자 예측하기
- 다양한 머신러닝 알고리즘을 이용해서
- 교차검증 방식으로 모델을 훈련시키고
- 예측 정확도를 통해 평가해 봄
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# Read the passenger records from the CSV file into a DataFrame.
titanic = pd.read_csv(filepath_or_buffer='../data/titanic.csv')
# Preview the first five rows (shown as the notebook cell output).
titanic.head(n=5)
|
pclass |
survived |
name |
sex |
age |
sibsp |
parch |
ticket |
fare |
cabin |
embarked |
0 |
1 |
1 |
Allen, Miss. Elisabeth Walton |
female |
29.0000 |
0 |
0 |
24160 |
211.3375 |
B5 |
S |
1 |
1 |
1 |
Allison, Master. Hudson Trevor |
male |
0.9167 |
1 |
2 |
113781 |
151.5500 |
C22 C26 |
S |
2 |
1 |
0 |
Allison, Miss. Helen Loraine |
female |
2.0000 |
1 |
2 |
113781 |
151.5500 |
C22 C26 |
S |
3 |
1 |
0 |
Allison, Mr. Hudson Joshua Creighton |
male |
30.0000 |
1 |
2 |
113781 |
151.5500 |
C22 C26 |
S |
4 |
1 |
0 |
Allison, Mrs. Hudson J C (Bessie Waldo Daniels) |
female |
25.0000 |
1 |
2 |
113781 |
151.5500 |
C22 C26 |
S |
전처리
- 분석대상 컬럼들은 반드시 숫자형 값들로 구성되어야 함
- sex, embarked 컬럼을 숫자형으로 변환
- cabin은 결측치가 많기 때문에 컬럼자체를 제거
- ticket은 분석하기에 너무 많은 범주를 포함함 - 과감히 제거
- 승객 이름에서 직함(title)을 추출해서 분석대상 컬럼으로 지정하고 숫자형으로 변환
- 최종컬럼 : pclass, age, sibsp, parch, fare, gender(sex 변환), Embarked(embarked 변환), Title(title 변환)
cabin, ticket 컬럼제거
# Remove the columns that are not used for modeling: `cabin` and `ticket`.
# `columns=` already identifies the axis, so the redundant `axis=1`
# argument (ignored by pandas when `columns` is given) is omitted.
titanic.drop(columns=['cabin', 'ticket'], inplace=True)
# Preview the remaining columns.
titanic.head()
|
pclass |
survived |
name |
sex |
age |
sibsp |
parch |
fare |
embarked |
0 |
1 |
1 |
Allen, Miss. Elisabeth Walton |
female |
29.0000 |
0 |
0 |
211.3375 |
S |
1 |
1 |
1 |
Allison, Master. Hudson Trevor |
male |
0.9167 |
1 |
2 |
151.5500 |
S |
2 |
1 |
0 |
Allison, Miss. Helen Loraine |
female |
2.0000 |
1 |
2 |
151.5500 |
S |
3 |
1 |
0 |
Allison, Mr. Hudson Joshua Creighton |
male |
30.0000 |
1 |
2 |
151.5500 |
S |
4 |
1 |
0 |
Allison, Mrs. Hudson J C (Bessie Waldo Daniels) |
female |
25.0000 |
1 |
2 |
151.5500 |
S |
승객나이, 승선위치 결측치 처리
# Count the missing values in each column.
titanic.isnull().sum()
pclass 0
survived 0
name 0
sex 0
age 263
sibsp 0
parch 0
fare 1
embarked 2
dtype: int64
# Replace missing ages with the median age.
median = titanic['age'].median()
# Assign the filled column back instead of calling fillna(inplace=True) on
# `titanic.age`: an in-place call on a column selected from the frame is a
# chained assignment, which pandas warns about and which no longer updates
# the parent DataFrame once Copy-on-Write is enabled.
titanic['age'] = titanic['age'].fillna(median)
# Re-check the missing-value counts.
titanic.isna().sum()
pclass 0
survived 0
name 0
sex 0
age 0
sibsp 0
parch 0
fare 1
embarked 2
dtype: int64
# Only a few rows still lack a fare or embarked value, so drop those rows outright.
titanic.dropna(axis=0, how='any', inplace=True)
# Re-check the missing-value counts per column.
titanic.isnull().sum()
pclass 0
survived 0
name 0
sex 0
age 0
sibsp 0
parch 0
fare 0
embarked 0
dtype: int64
승객이름에서 직함이라는 파생변수 생성
# Passenger names look like "Allen, Miss. Elisabeth Walton".
# The pattern captures the run of letters that follows a space and is
# terminated by a period, i.e. the title ("Miss" in the example above).
# A raw string keeps `\.` as a regex escape; in a normal string literal it is
# an invalid escape sequence that newer Python versions warn about.
fmt = r' ([a-zA-Z]+)\.'
# Select the column by key so the lookup cannot be confused with an attribute.
titanic['title'] = titanic['name'].str.extract(fmt)
# Frequency of each extracted title.
titanic.title.value_counts()
Mr 756
Miss 259
Mrs 196
Master 61
Rev 8
Dr 8
Col 4
Mlle 2
Ms 2
Major 2
Capt 1
Sir 1
Dona 1
Jonkheer 1
Countess 1
Don 1
Mme 1
Lady 1
Name: title, dtype: int64
성별, 승선위치, 직함 등을 숫자형으로 변환
# Encode passenger sex as integer codes in a new `gender` column.
gender = titanic['sex']
encoder = LabelEncoder().fit(gender)  # female -> 0, male -> 1
titanic['gender'] = encoder.transform(gender)
titanic.head()
|
pclass |
survived |
name |
sex |
age |
sibsp |
parch |
fare |
embarked |
title |
gender |
0 |
1 |
1 |
Allen, Miss. Elisabeth Walton |
female |
29.0000 |
0 |
0 |
211.3375 |
S |
Miss |
0 |
1 |
1 |
1 |
Allison, Master. Hudson Trevor |
male |
0.9167 |
1 |
2 |
151.5500 |
S |
Master |
1 |
2 |
1 |
0 |
Allison, Miss. Helen Loraine |
female |
2.0000 |
1 |
2 |
151.5500 |
S |
Miss |
0 |
3 |
1 |
0 |
Allison, Mr. Hudson Joshua Creighton |
male |
30.0000 |
1 |
2 |
151.5500 |
S |
Mr |
1 |
4 |
1 |
0 |
Allison, Mrs. Hudson J C (Bessie Waldo Daniels) |
female |
25.0000 |
1 |
2 |
151.5500 |
S |
Mrs |
0 |
# Encode the port of embarkation as integer codes in a new `Embarked` column.
embarked = titanic['embarked']
encoder = LabelEncoder().fit(embarked)  # e.g. 'S' -> 2 in the preview output
titanic['Embarked'] = encoder.transform(embarked)
titanic.head()
|
pclass |
survived |
name |
sex |
age |
sibsp |
parch |
fare |
embarked |
title |
gender |
Embarked |
0 |
1 |
1 |
Allen, Miss. Elisabeth Walton |
female |
29.0000 |
0 |
0 |
211.3375 |
S |
Miss |
0 |
2 |
1 |
1 |
1 |
Allison, Master. Hudson Trevor |
male |
0.9167 |
1 |
2 |
151.5500 |
S |
Master |
1 |
2 |
2 |
1 |
0 |
Allison, Miss. Helen Loraine |
female |
2.0000 |
1 |
2 |
151.5500 |
S |
Miss |
0 |
2 |
3 |
1 |
0 |
Allison, Mr. Hudson Joshua Creighton |
male |
30.0000 |
1 |
2 |
151.5500 |
S |
Mr |
1 |
2 |
4 |
1 |
0 |
Allison, Mrs. Hudson J C (Bessie Waldo Daniels) |
female |
25.0000 |
1 |
2 |
151.5500 |
S |
Mrs |
0 |
2 |
# Encode the extracted passenger title as integer codes in a new `Title` column.
title = titanic['title']
encoder = LabelEncoder().fit(title)  # e.g. Master -> 9, Miss -> 10, Mr -> 13, Mrs -> 14
titanic['Title'] = encoder.transform(title)
titanic.head()
## Written just for fun: a hand-rolled alternative encoding (left disabled).
# title = pd.DataFrame(titanic.title)
# title = pd.DataFrame(title.groupby('title').value_counts()).index
# title = list(title)
# titanic.title.apply(lambda x: title.index(x))
|
pclass |
survived |
name |
sex |
age |
sibsp |
parch |
fare |
embarked |
title |
gender |
Embarked |
Title |
0 |
1 |
1 |
Allen, Miss. Elisabeth Walton |
female |
29.0000 |
0 |
0 |
211.3375 |
S |
Miss |
0 |
2 |
10 |
1 |
1 |
1 |
Allison, Master. Hudson Trevor |
male |
0.9167 |
1 |
2 |
151.5500 |
S |
Master |
1 |
2 |
9 |
2 |
1 |
0 |
Allison, Miss. Helen Loraine |
female |
2.0000 |
1 |
2 |
151.5500 |
S |
Miss |
0 |
2 |
10 |
3 |
1 |
0 |
Allison, Mr. Hudson Joshua Creighton |
male |
30.0000 |
1 |
2 |
151.5500 |
S |
Mr |
1 |
2 |
13 |
4 |
1 |
0 |
Allison, Mrs. Hudson J C (Bessie Waldo Daniels) |
female |
25.0000 |
1 |
2 |
151.5500 |
S |
Mrs |
0 |
2 |
14 |
# Feature matrix: pick the model inputs by column name rather than by
# position, so the selection cannot silently shift to other columns if the
# column order of `titanic` ever changes.
feature_columns = ['pclass', 'age', 'sibsp', 'parch', 'fare',
                   'gender', 'Embarked', 'Title']
data = titanic[feature_columns]
# Target vector: the `survived` column.
target = titanic.survived
data.head(5)
|
pclass |
age |
sibsp |
parch |
fare |
gender |
Embarked |
Title |
0 |
1 |
29.0000 |
0 |
0 |
211.3375 |
0 |
2 |
10 |
1 |
1 |
0.9167 |
1 |
2 |
151.5500 |
1 |
2 |
9 |
2 |
1 |
2.0000 |
1 |
2 |
151.5500 |
0 |
2 |
10 |
3 |
1 |
30.0000 |
1 |
2 |
151.5500 |
1 |
2 |
13 |
4 |
1 |
25.0000 |
1 |
2 |
151.5500 |
0 |
2 |
14 |
# Evenly sized classes would be ideal; the counts here show that the
# survival labels are not balanced.
target.value_counts(sort=True)
0 808
1 498
Name: survived, dtype: int64
from sklearn.model_selection import train_test_split

# Split into 70% training / remaining test data, stratified on the target so
# both parts keep the same class proportions; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    train_size=0.7,
    stratify=target,
    random_state=2211161315,
)
머신러닝 알고리즘 적용
# Train a decision tree on the training split, then score it on the test split.
from sklearn.tree import DecisionTreeClassifier

dtclf = DecisionTreeClassifier(random_state=2211161315).fit(X_train, y_train)
pred = dtclf.predict(X_test)
# Fraction of test passengers whose survival was predicted correctly.
accuracy_score(y_true=y_test, y_pred=pred)
0.7448979591836735
# Train a random forest on the training split, then score it on the test split.
rfclf = RandomForestClassifier(random_state=2211161315).fit(X_train, y_train)
pred = rfclf.predict(X_test)
# Fraction of test passengers whose survival was predicted correctly.
accuracy_score(y_true=y_test, y_pred=pred)
0.7729591836734694
교차검증을 통한 머신러닝 알고리즘 적용
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of a decision tree over the full data set.
dtclf = DecisionTreeClassifier(random_state=2211161315)
scores = cross_val_score(
    estimator=dtclf,
    X=data,
    y=target,
    scoring='accuracy',
    cv=10,
)
# Average accuracy across the folds.
scores.mean()
0.7188138578978274
# 10-fold cross-validated accuracy of a random forest over the full data set.
rfclf = RandomForestClassifier(random_state=2211161315)
scores = cross_val_score(
    estimator=rfclf,
    X=data,
    y=target,
    scoring='accuracy',
    cv=10,
)
# Average accuracy across the folds.
scores.mean()
0.7448385202583676
# Save the preprocessed DataFrame, including its row index, to a new CSV file.
titanic.to_csv(path_or_buf='../data/titanic2.csv', index=True)