# 도미 데이터 머신러닝(KNN)-2  (Bream data machine learning with KNN, part 2)
#
# 발뛰 2022. 3. 17. 19:09
# -*- coding: utf-8 -*-

# Commented out IPython magic to ensure Python compatibility.
# %run my_init.py

# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline
#정적그래프를 그릴때 사용하는 옵션

"""#데이터 준비하기"""

# Raw measurements: lengths and weights for 35 bream and 14 smelt.
bream_length = [25.4, 26.3, 26.5, 29.0, 29.0, 29.7, 29.7, 30.0, 30.0, 30.7, 31.0, 31.0,
                31.5, 32.0, 32.0, 32.0, 33.0, 33.0, 33.5, 33.5, 34.0, 34.0, 34.5, 35.0,
                35.0, 35.0, 35.0, 36.0, 36.0, 37.0, 38.5, 38.5, 39.5, 41.0, 41.0]
bream_weight = [242.0, 290.0, 340.0, 363.0, 430.0, 450.0, 500.0, 390.0, 450.0, 500.0, 475.0, 500.0,
                500.0, 340.0, 600.0, 600.0, 700.0, 700.0, 610.0, 650.0, 575.0, 685.0, 620.0, 680.0,
                700.0, 725.0, 720.0, 714.0, 850.0, 1000.0, 920.0, 955.0, 925.0, 975.0, 950.0]

smelt_length = [9.8, 10.5, 10.6, 11.0, 11.2, 11.3, 11.8, 11.8, 12.0, 12.2, 12.4, 13.0, 14.3, 15.0]
smelt_weight = [6.7, 7.5, 7.0, 9.7, 9.8, 8.7, 10.0, 9.9, 9.8, 12.2, 13.4, 12.2, 19.7, 19.9]

# Bream first, then smelt; the order matters for the labels built below.
length = bream_length + smelt_length
weight = bream_weight + smelt_weight

# One (length, weight) tuple per fish.
fish_data = list(zip(length, weight))

# Labels aligned with fish_data: 1 = bream, 0 = smelt.
fish_target = [1] * len(bream_length) + [0] * len(smelt_length)

# Sequential train/test split: the first 35 samples train, the rest test.
# Because the data is ordered by species, the training set holds only bream
# and the test set only smelt.
train_input = fish_data[:35]
train_target = fish_target[:35]
test_input = fish_data[35:]
# Test labels must come from fish_target (the original sliced fish_data here,
# which produced feature tuples instead of class labels).
test_target = fish_target[35:]

"""##훈련하기"""

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Build the classifier with its default settings and train it on the
# current training split. fit() has to run once before predicting.
kn = KNeighborsClassifier()
kn.fit(X=train_input, y=train_target)

"""#평가하기"""

# Evaluation: compare the predictions for the test inputs with the test
# labels. kn.score(test_input, test_target) is the alternative call.
accuracy_score(y_true=test_target, y_pred=kn.predict(test_input))

"""#numpy 연습"""

import numpy as np

# Fix the seed so the random matrix is the same on every run.
np.random.seed(42)
# 15 random integers from [1, 100), arranged as 3 rows x 5 columns.
X = np.random.randint(1, 100, size=15).reshape(3, -1)
X

# Sum over every element.
np.sum(X)

# axis=0 collapses the rows and gives one sum per column;
# axis=1 would give one sum per row instead.
np.sum(X, axis=0)

"""#샘플링 편향 해결하기"""

# Fix the seed so the shuffled split is reproducible.
np.random.seed(42)
# One index per sample, derived from the data rather than a hard-coded 49
# so the split keeps working if the dataset size changes.
index = np.arange(len(fish_data))
index

np.random.shuffle(index)  # shuffles the array in place
index

# Convert to arrays so they can be indexed with an array of positions.
input_arr = np.array(fish_data)
target_arr = np.array(fish_target)

# First 35 shuffled positions form the training set, the rest the test set.
train_index, test_index = index[:35], index[35:]
train_input = input_arr[train_index]
train_target = target_arr[train_index]
test_input = input_arr[test_index]
test_target = target_arr[test_index]

train_input.shape, train_target.shape

test_input.shape, test_target.shape

# Commented out IPython magic to ensure Python compatibility.
# %matplotlib notebook
# NOTE(review): `plt` is not imported anywhere in the visible file; presumably
# it was provided by the `%run my_init.py` magic that is commented out above.
# Confirm before running this as a plain script.
# Draw the training split first and the test split second, each as its own
# scatter series (column 0 on the x axis, column 1 on the y axis).
for split in (train_input, test_input):
    plt.scatter(split[:, 0], split[:, 1])

plt.xlabel('length')
plt.ylabel('weight')
plt.legend(['train', 'test'])

# Retrain the existing classifier on the shuffled split and evaluate it.
kn.fit(X=train_input, y=train_target)
kn.score(X=test_input, y=test_target)

accuracy_score(y_true=test_target, y_pred=kn.predict(test_input))

"""###Quartile Range"""

np.random.seed(42)
arr = np.random.randint(1, 101, 100)  # 100 random integers from [1, 100]
arr

# Values at the 25th, 50th and 75th percentiles of arr.
Q = np.percentile(arr, [25, 50, 75])
Q

q1, q2, q3 = Q

# Interquartile range: spread of the middle half of the data.
IQR = q3 - q1

"""이상치찾기"""
# Outlier detection: anything outside [lfence, ufence] counts as an outlier.

# low fence
lfence = q1 - 1.5 * IQR
lfence

# upper fence
ufence = q3 + 1.5 * IQR
ufence

# Inject a value after the fences were computed so there is something to find.
arr[30] = 200

arr < lfence

arr > ufence

# A boolean mask can be used directly as an index.
is_outlier = (arr < lfence) | (arr > ufence)
is_outlier

outliers = arr[is_outlier]
outliers

# Everything that is not an outlier. Negating the same mask keeps the two
# sets complementary, so values exactly on a fence are kept as normal (the
# original used `arr > lfence`, which dropped a value equal to the low fence
# from both sets).
normal = arr[~is_outlier]
normal