import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
Data Preprocessing
- Data preprocessing is every bit as important a step as learning the
  machine learning algorithms themselves.
- Above all, there are basics that must be handled in the data before
  any machine learning algorithm is applied:
Missing Value Handling
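The notebook itself has no missing-value cell, but the idea is simple: inspect with isna(), then impute or drop. A minimal sketch on a small made-up DataFrame (column names are illustrative only, not from the notebook):

# inspect, impute, and drop missing values (a sketch, not from the notebook)
import pandas as pd

df = pd.DataFrame({'age': [29.0, None, 2.0], 'fare': [211.3, 151.6, None]})
print(df.isna().sum())                             # missing-value count per column
df['age'] = df['age'].fillna(df['age'].median())   # impute age with the median
df = df.dropna(subset=['fare'])                    # drop rows still missing fare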
One-Hot Encoding
- Machine learning algorithms do not accept string values as input.
- Therefore, every string value must first be encoded into numeric form.
- Free-form text data, by contrast, is handled by vectorizing it.
- The encodings used for machine learning include label encoding
  and one-hot encoding.
Label Encoding
from sklearn.preprocessing import LabelEncoder
items = ['티비','냉장고','가스렌지','에어콘','컴퓨터']
encoder = LabelEncoder()
encoder.fit(items)
labels = encoder.transform(items)
labels
array([4, 1, 0, 2, 3])
# check the encoded class order (classes_ is sorted)
encoder.classes_
array(['가스렌지', '냉장고', '에어콘', '컴퓨터', '티비'], dtype='<U4')
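The mapping can also be reversed with inverse_transform, which is part of the same LabelEncoder API:

encoder.inverse_transform([4, 1, 0, 2, 3])
array(['티비', '냉장고', '가스렌지', '에어콘', '컴퓨터'], dtype='<U4')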
Problems with LabelEncoder
- Once string values are converted to numbers this way, the encoded values
  acquire a magnitude ordering, which a model can mistake for importance.
- In other words, the encoding imposes an ordinal scale on the values.
- So for algorithms that are sensitive to magnitude (linear models,
  distance-based models), this can hurt accuracy. One-hot encoding avoids
  the problem by giving each category its own 0/1 column, as laid out below:
'티비'  '냉장고'  '가스렌지'  '에어콘'  '컴퓨터'
  1       0        0         0        0
  0       1        0         0        0
  0       0        1         0        0
  0       0        0         1        0
  0       0        0         0        1
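Note: recent scikit-learn versions let OneHotEncoder consume string categories directly, so the LabelEncoder detour used below is only required on older versions. A minimal sketch:

# one-hot encode strings directly (works on scikit-learn >= 0.20)
import numpy as np
from sklearn.preprocessing import OneHotEncoder

items = ['티비', '냉장고', '가스렌지', '에어콘', '컴퓨터']
enc = OneHotEncoder()
onehot = enc.fit_transform(np.array(items).reshape(-1, 1))  # strings in, sparse 0/1 matrix out
print(onehot.toarray())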
from sklearn.preprocessing import OneHotEncoder
# on older scikit-learn, first convert the strings to integers with LabelEncoder,
# then reshape the 1-D array into a 2-D column vector, since the encoder expects 2-D input
labels = labels.reshape(-1, 1)  # (-1, 1): n rows, 1 column
labels
array([[4],
[1],
[0],
[2],
[3]])
onehot = OneHotEncoder()
onehot.fit(labels)
onlabels = onehot.transform(labels)  # returns a sparse matrix
# ['티비','냉장고','가스렌지','에어콘','컴퓨터']
onlabels.toarray()
array([[0., 0., 0., 0., 1.],
[0., 1., 0., 0., 0.],
[1., 0., 0., 0., 0.],
[0., 0., 1., 0., 0.],
[0., 0., 0., 1., 0.]])
One-hot encoding with pandas
- Use the get_dummies function.
- get_dummies accepts a Series or a DataFrame; here the items are wrapped in a
  DataFrame with an empty column name, so the dummy columns come out prefixed
  with just "_".
df = pd.DataFrame({'': items})
pd.get_dummies(df)
   _가스렌지  _냉장고  _에어콘  _컴퓨터  _티비
0      0        0        0        0       1
1      0        1        0        0       0
2      1        0        0        0       0
3      0        0        1        0       0
4      0        0        0        1       0
Feature Scaling: Standardization and Normalization
- Bringing variables with different ranges and units onto a common scale
  is called feature scaling.
- When a dataset mixes integer and floating-point values,
- or when value ranges vary wildly (1~100, 0~0.001, 1~10000, and so on),
- analysis can demand a lot of CPU power and memory,
- and training can slow down or fail to produce good results.
- The two standard remedies are normalization and standardization.
Standardization
- Transforms feature values into z-scores with mean 0 and standard deviation 1,
  i.e. z = (x - mean) / std. (It rescales the distribution; it does not by itself
  make the data normally distributed.)
X = np.arange(9) - 3
X
array([-3, -2, -1, 0, 1, 2, 3, 4, 5])
np.mean(X, axis=0), np.std(X, axis=0)
(1.0, 2.581988897471611)
from sklearn.preprocessing import StandardScaler
X = X.reshape(-1, 1)
scaler = StandardScaler()
scaler.fit(X)  # compute the mean and std used for scaling
XX = scaler.transform(X)
XX
array([[-1.54919334],
[-1.161895 ],
[-0.77459667],
[-0.38729833],
[ 0. ],
[ 0.38729833],
[ 0.77459667],
[ 1.161895 ],
[ 1.54919334]])
print(np.min(XX, axis=0))
print(np.max(XX, axis=0))
[-1.54919334]
[1.54919334]
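A quick sanity check (a minimal sketch) that the scaler matches the z-score formula above:

manual = (X - X.mean()) / X.std()   # population std (ddof=0), as StandardScaler uses
print(np.allclose(XX, manual))      # True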
Normalization (min-max scaling)
- Rescales feature values into the range 0 to 1: (x - min) / (max - min).
- Commonly applied when the data does not appear to follow a normal distribution.
from sklearn.preprocessing import MinMaxScaler
X = X.reshape(-1, 1)
scaler = MinMaxScaler()
scaler.fit(X)  # compute the min and max used for scaling
XXX = scaler.transform(X)
print(np.min(XXX, axis=0))
print(np.max(XXX, axis=0))
[0.]
[1.]
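Again, a quick check against the formula (a sketch):

manual = (X - X.min()) / (X.max() - X.min())
print(np.allclose(XXX, manual))     # True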
Median- and quartile-based scaling
- RobustScaler
- A scaling technique that is less affected by outliers and extreme values:
  (x - median) / IQR, where IQR is the interquartile range Q3 - Q1.
from sklearn.preprocessing import RobustScaler
X = X.reshape(-1, 1)
scaler = RobustScaler()
scaler.fit(X)  # compute the median and IQR used for scaling
XXXX = scaler.transform(X)
print(np.min(XXXX, axis=0))
print(np.max(XXXX, axis=0))
[-1.]
[1.]
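And the same check for RobustScaler (a sketch):

q1, q3 = np.percentile(X, [25, 75])
manual = (X - np.median(X)) / (q3 - q1)
print(np.allclose(XXXX, manual))    # True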
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
titanic = pd.read_csv('../data/titanic2.csv')
Applying Standardization
(Note: for simplicity the scalers below are fit on the full dataset before the
train/test split; in practice, fit only on the training split to avoid leakage.)
titanic.head()
data = titanic.iloc[:, [1, 5, 6, 7, 8, 11, 12, 13]]  # pclass, age, sibsp, parch, fare, gender, Embarked, Title
scaler = StandardScaler()
scaler.fit(data)
data2 = scaler.transform(data)
target = titanic.survived
X_train, X_test, y_train, y_test = train_test_split(data2, target, train_size = 0.7,
stratify=target)
dtclf = DecisionTreeClassifier()
dtclf.fit(X_train, y_train)
pred = dtclf.predict(X_test)
accuracy_score(y_test, pred)
0.7551020408163265
rfclf = RandomForestClassifier()
rfclf.fit(X_train, y_train)
pred = rfclf.predict(X_test)
accuracy_score(y_test, pred)
0.7882653061224489
rfclf = RandomForestClassifier()
scores = cross_val_score(rfclf, data2, target, scoring='accuracy', cv = 10)
np.mean(scores)
0.7425249559600705
Applying Normalization
data = titanic.iloc[:, [1, 5,6, 7, 8, 11, 12, 13]]
scaler = MinMaxScaler()
scaler.fit(data)
data2 = scaler.transform(data)
target = titanic.survived
X_train, X_test, y_train, y_test = train_test_split(data2, target, train_size = 0.7,
stratify=target)
dtclf = DecisionTreeClassifier()
dtclf.fit(X_train, y_train)
pred = dtclf.predict(X_test)
accuracy_score(y_test, pred)
0.7729591836734694
rfclf = RandomForestClassifier()
rfclf.fit(X_train, y_train)
pred = rfclf.predict(X_test)
accuracy_score(y_test, pred)
0.7908163265306123
rfclf = RandomForestClassifier()
scores = cross_val_score(rfclf, data2, target, scoring='accuracy', cv = 10)
np.mean(scores)
0.7448502642395772
Applying the Median-Based Scaler
data = titanic.iloc[:, [1, 5,6, 7, 8, 11, 12, 13]]
scaler = RobustScaler()
scaler.fit(data)
data2 = scaler.transform(data)
target = titanic.survived
X_train, X_test, y_train, y_test = train_test_split(data2, target, train_size = 0.7,
stratify=target)
dtclf = DecisionTreeClassifier()
dtclf.fit(X_train, y_train)
pred = dtclf.predict(X_test)
accuracy_score(y_test, pred)
0.7780612244897959
rfclf = RandomForestClassifier()
rfclf.fit(X_train, y_train)
pred = rfclf.predict(X_test)
accuracy_score(y_test, pred)
0.7908163265306123
rfclf = RandomForestClassifier()
scores = cross_val_score(rfclf, data2, target, scoring='accuracy', cv = 10)
np.mean(scores)
0.7364004697592483
titanic.head()
t_age = np.array(titanic.age).reshape(-1, 1)    # 2-D column vector for the scaler
t_fare = np.array(titanic.fare).reshape(-1, 1)
age_scaler = StandardScaler()
s_age = age_scaler.fit_transform(t_age)         # age scaled with age statistics
fare_scaler = StandardScaler()
s_fare = fare_scaler.fit_transform(t_fare)      # fare scaled with fare statistics
titanic['age_scale'] = s_age
titanic['fare_scale'] = s_fare
titanic
data = titanic.iloc[:, [1, 6, 7, 11, 12, 13, 14, 15]]  # pclass, sibsp, parch, gender, Embarked, Title, age_scale, fare_scale
target = titanic.survived
data
[data preview: pclass, sibsp, parch, gender, Embarked, Title, age_scale, fare_scale; 1306 rows × 8 columns]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, train_size = 0.7,
stratify=target, random_state=2211161315)
from sklearn.tree import DecisionTreeClassifier
dtclf = DecisionTreeClassifier(random_state=2211161315)
dtclf.fit(X_train, y_train)
pred = dtclf.predict(X_test)
accuracy_score(y_test, pred)
0.7448979591836735
age_scaler = MinMaxScaler()
s_age = age_scaler.fit_transform(t_age)         # age scaled with age min/max
fare_scaler = MinMaxScaler()
s_fare = fare_scaler.fit_transform(t_fare)      # fare scaled with fare min/max
titanic['age_scale'] = s_age
titanic['fare_scale'] = s_fare
titanic
data = titanic.iloc[:, [1, 6, 7, 11, 12, 13, 14, 15]]
target = titanic.survived
[titanic preview: Unnamed: 0, pclass, survived, name, sex, age, sibsp, parch, fare, embarked, title, gender, Embarked, Title, age_scale, fare_scale; 1306 rows × 16 columns]
X_train, X_test, y_train, y_test = train_test_split(data, target, train_size = 0.7,
stratify=target, random_state=2211161315)
scaler = RobustScaler()
scaler.fit(t_age)  # compute the median and IQR of age
XX = scaler.transform(t_age)
titanic['age_scale'] = XX
titanic
[titanic preview: Unnamed: 0, pclass, survived, name, sex, age, sibsp, parch, fare, embarked, title, gender, Embarked, Title, age_scale; 1306 rows × 15 columns]
Converting passenger age and fare to categorical features
# age bins: 5, 10, 20, 30, 40, 50, 60, 70, 80, 90+
def getAge(x):
result = 'a'
if 80 < x <= 90:
result = 'b'
elif 70 < x <= 80:
result = 'c'
elif 60 < x <= 70:
result = 'd'
elif 50 < x <= 60:
result = 'e'
elif 40 < x <= 50:
result = 'f'
elif 30 < x <= 40:
result = 'g'
elif 20 < x <= 30:
result = 'h'
elif 10 < x <= 20:
result = 'i'
elif 5 < x <= 10:
result = 'j'
elif x <= 5:
result = 'k'
return result
titanic['Age_Area'] = titanic.age.apply(getAge)
titanic
[titanic preview with the new Age_Area column, e.g. age 29.0 -> 'h', 0.9167 -> 'k', 14.5 -> 'i'; 1306 rows × 16 columns]
# fare bins: 100, 200, 300, 400, 500
def getFare(x):
result = 'a'
if 400 < x <= 500:
result = 'b'
elif 300 < x <= 400:
result = 'c'
elif 200 < x <= 300:
result = 'd'
elif 100 < x <= 200:
result = 'e'
elif x <= 100:
result = 'f'
return result
titanic['Fare_Area'] = titanic.fare.apply(getFare)
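Both helpers do simple interval binning; pd.cut expresses the same thing more concisely. A sketch, with bin edges and labels taken from the functions above:

# pd.cut equivalent of getAge/getFare (a sketch; note that values outside the
# edges, or NaN, come out as NaN rather than the 'a' default of the helpers)
age_bins  = [0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90]
fare_bins = [0, 100, 200, 300, 400, 500]
age_cat  = pd.cut(titanic.age,  bins=age_bins,  labels=list('kjihgfedcb'), include_lowest=True)
fare_cat = pd.cut(titanic.fare, bins=fare_bins, labels=list('fedcb'), include_lowest=True)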
titanic
[titanic preview with Age_Area and Fare_Area letter categories, e.g. fare 211.3375 -> 'd', 151.55 -> 'e', 7.225 -> 'f'; 1306 rows × 16 columns]
encoder = LabelEncoder()
encoder.fit(titanic.Age_Area)
titanic['Age_Area'] = encoder.transform(titanic.Age_Area)
encoder = LabelEncoder()
encoder.fit(titanic.Fare_Area)
titanic['Fare_Area'] = encoder.transform(titanic.Fare_Area)
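Aside: LabelEncoder is designed for target labels; for feature columns, scikit-learn provides OrdinalEncoder, which handles several columns at once. A sketch with the same effect here (the letter categories sort alphabetically), run instead of the two LabelEncoder cells above:

from sklearn.preprocessing import OrdinalEncoder

ord_enc = OrdinalEncoder()
titanic[['Age_Area', 'Fare_Area']] = ord_enc.fit_transform(titanic[['Age_Area', 'Fare_Area']])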
titanic.head()
[titanic.head(): Age_Area and Fare_Area are now integer codes, e.g. 'h' -> 5, 'k' -> 8, 'd' -> 1, 'e' -> 2; 5 rows × 16 columns]
data = titanic.iloc[:, [1, 6, 7, 11, 12, 13, 14, 15]]  # pclass, sibsp, parch, gender, Embarked, Title, Age_Area, Fare_Area
# data
scaler = StandardScaler()
scaler.fit(data)
data2 = scaler.transform(data)
target = titanic.survived
pd.DataFrame(data2)
[standardized feature matrix: 1306 rows × 8 columns, each column rescaled to mean 0 / std 1]
X_train, X_test, y_train, y_test = train_test_split(data2, target, train_size = 0.7,
stratify=target)
rfclf = RandomForestClassifier()
rfclf.fit(X_train, y_train)
pred = rfclf.predict(X_test)
accuracy_score(y_test, pred)
0.8061224489795918
data = titanic.iloc[:, [1, 6, 7, 11, 12, 13, 14, 15]]
# data
scaler = MinMaxScaler()
scaler.fit(data)
data2 = scaler.transform(data)
target = titanic.survived
pd.DataFrame(data2)
[min-max scaled feature matrix: 1306 rows × 8 columns, each column rescaled into the range 0 to 1]
X_train, X_test, y_train, y_test = train_test_split(data2, target, train_size = 0.7,
stratify=target)
rfclf = RandomForestClassifier()
rfclf.fit(X_train, y_train)
pred = rfclf.predict(X_test)
accuracy_score(y_test, pred)
0.7908163265306123
data = titanic.iloc[:, [1, 6, 7, 11, 12, 13, 14, 15]]
# data
scaler = RobustScaler()
scaler.fit(data)
data2 = scaler.transform(data)
target = titanic.survived
pd.DataFrame(data2)
[robust-scaled feature matrix: 1306 rows × 8 columns, each column centered on its median and scaled by its IQR]
X_train, X_test, y_train, y_test = train_test_split(data2, target, train_size = 0.7,
stratify=target)
rfclf = RandomForestClassifier()
rfclf.fit(X_train, y_train)
pred = rfclf.predict(X_test)
accuracy_score(y_test, pred)
0.7780612244897959
dtclf = DecisionTreeClassifier()
dt_scores = cross_val_score(dtclf, data2, target, scoring='accuracy', cv=10)  # decision-tree baseline (not averaged below)
rfclf = RandomForestClassifier()
scores = cross_val_score(rfclf, data2, target, scoring='accuracy', cv = 10)
np.mean(scores)
0.7433529066353495