06-1 군집 알고리즘

레이블(종속 데이터)가 없음
추천 시스템에 사용
k-means
과일 사진 데이터 준비하기

  
!wget https://bit.ly/fruits_300_data -O fruits_300.npy
import numpy as np
import matplotlib.pyplot as plt
fruits = np.load('fruits_300.npy')
print(fruits.shape)  # (300, 100, 100)
print(fruits[0, 0, :]) 
# [  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   2   1
#    2   2   2   2   2   2   1   1   1   1   1   1   1   1   2   3   2   1
#    2   1   1   1   1   2   1   3   2   1   3   1   4   1   2   5   5   5
#   19 148 192 117  28   1   1   2   1   4   1   1   3   1   1   1   1   1
#    2   2   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
#    1   1   1   1   1   1   1   1   1   1]

plt.imshow(fruits[0], cmap='gray')
plt.show()

  
plt.imshow(fruits[0], cmap='gray_r')
plt.show()

  
fig, axs = plt.subplots(1, 2)
axs[0].imshow(fruits[100], cmap='gray_r')
axs[1].imshow(fruits[200], cmap='gray_r')
plt.show()

픽셀 값 분석하기

  
apple = fruits[0:100].reshape(-1, 100*100)  # 3차원 데이터를 2차원으로 변경
pineapple = fruits[100:200].reshape(-1, 100*100)
banana = fruits[200:300].reshape(-1, 100*100)
print(apple.shape)  # (100, 10000)
print(apple.mean(axis=1))  # 각 컬럼에 대한 row 평균
# [ 88.3346  97.9249  87.3709  98.3703  92.8705  82.6439  94.4244  95.5999
#   90.681   81.6226  87.0578  95.0745  93.8416  87.017   97.5078  87.2019
#   88.9827 100.9158  92.7823 100.9184 104.9854  88.674   99.5643  97.2495
#   94.1179  92.1935  95.1671  93.3322 102.8967  94.6695  90.5285  89.0744
#   97.7641  97.2938 100.7564  90.5236 100.2542  85.8452  96.4615  97.1492
#   90.711  102.3193  87.1629  89.8751  86.7327  86.3991  95.2865  89.1709
#   96.8163  91.6604  96.1065  99.6829  94.9718  87.4812  89.2596  89.5268
#   93.799   97.3983  87.151   97.825  103.22    94.4239  83.6657  83.5159
#  102.8453  87.0379  91.2742 100.4848  93.8388  90.8568  97.4616  97.5022
#   82.446   87.1789  96.9206  90.3135  90.565   97.6538  98.0919  93.6252
#   87.3867  84.7073  89.1135  86.7646  88.7301  86.643   96.7323  97.2604
#   81.9424  87.1687  97.2066  83.4712  95.9781  91.8096  98.4086 100.7823
#  101.556  100.7027  91.6098  88.8976]

  
plt.hist(np.mean(apple, axis=1), alpha=0.8)
plt.hist(np.mean(pineapple, axis=1), alpha=0.8)
plt.hist(np.mean(banana, axis=1), alpha=0.8)
plt.legend(['apple', 'pineapple', 'banana'])
plt.show()

  
fig, axs = plt.subplots(1, 3, figsize=(20, 5))
axs[0].bar(range(10000), np.mean(apple, axis=0))
axs[1].bar(range(10000), np.mean(pineapple, axis=0))
axs[2].bar(range(10000), np.mean(banana, axis=0))
plt.show()

  
apple_mean = np.mean(apple, axis=0).reshape(100, 100)  # 컬럼별 평균
pineapple_mean = np.mean(pineapple, axis=0).reshape(100, 100)
banana_mean = np.mean(banana, axis=0).reshape(100, 100)

fig, axs = plt.subplots(1, 3, figsize=(20, 5))
axs[0].imshow(apple_mean, cmap='gray_r')
axs[1].imshow(pineapple_mean, cmap='gray_r')
axs[2].imshow(banana_mean, cmap='gray_r')
plt.show()

평균값과 가까운 사진 고르기

  
abs_diff = np.abs(fruits - apple_mean)
abs_mean = np.mean(abs_diff, axis=(1,2))
print(abs_mean.shape)  # (300,)

apple_index = np.argsort(abs_mean)[:100]
fig, axs = plt.subplots(10, 10, figsize=(10,10))
for i in range(10):
    for j in range(10):
        axs[i, j].imshow(fruits[apple_index[i*10 + j]], cmap='gray_r')
        axs[i, j].axis('off')
plt.show()

  
abs_diff = np.abs(fruits - pineapple_mean)
abs_mean = np.mean(abs_diff, axis=(1, 2))

pineapple_index = np.argsort(abs_mean)[:100]
fig, axs = plt.subplots(10, 10, figsize=(10,10))
for i in range(10):
    for j in range(10):
        axs[i, j].imshow(fruits[pineapple_index[i*10 + j]], cmap='gray_r')
        axs[i, j].axis('off')
plt.show()

확인문제

  
abs_diff = np.abs(fruits - banana_mean)
abs_mean = np.mean(abs_diff, axis=(1,2))

banana_index = np.argsort(abs_mean)[:100]
fig, axs = plt.subplots(10, 10, figsize=(10,10))
for i in range(10):
    for j in range(10):
        axs[i, j].imshow(fruits[banana_index[i*10 + j]], cmap='gray_r')
        axs[i, j].axis('off')
plt.show()

06-2 K-means

비지도학습 모델
- Clustering(군집합) 모델중 하나
Similarity Based Model
이상치를 찾는데 유용하게 쓰일 수 있다

센트로이드(centroid) 또는 클러스터 중심(cluster center)

k-means 군집 알고리즘이 자동으로 찾은 중심값
KMeans 클래스
```python !wget https://bit.ly/fruits_300_data -O fruits_300.npy

import numpy as np

fruits = np.load(‘fruits_300.npy’) fruits_2d = fruits.reshape(-1, 100*100)

from sklearn.cluster import KMeans

km = KMeans(n_clusters=3, random_state=42) km.fit(fruits_2d)

print(km.labels_)

[2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

2 2 2 2 2 0 2 0 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 0 0 2 2 2 2 2 2 2 2 0 2

2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

1 1 1 1]

print(np.unique(km.labels_, return_counts=True))

(array([0, 1, 2], dtype=int32), array([111, 98, 91]))

import matplotlib.pyplot as plt

def draw_fruits(arr, ratio=1): n = len(arr) # n은 샘플 개수입니다 # 한 줄에 10개씩 이미지를 그립니다. 샘플 개수를 10으로 나누어 전체 행 개수를 계산합니다. rows = int(np.ceil(n/10)) # 행이 1개 이면 열 개수는 샘플 개수입니다. 그렇지 않으면 10개입니다. cols = n if rows < 2 else 10 fig, axs = plt.subplots(rows, cols, figsize=(colsratio, rowsratio), squeeze=False) for i in range(rows): for j in range(cols): if i10 + j < n: # n 개까지만 그립니다. axs[i, j].imshow(arr[i10 + j], cmap=’gray_r’) axs[i, j].axis(‘off’) plt.show()

```python
draw_fruits(fruits[km.labels_==0])

  
draw_fruits(fruits[km.labels_==1])

  
draw_fruits(fruits[km.labels_==2])

클러스터 중심

  
draw_fruits(km.cluster_centers_.reshape(-1, 100, 100), ratio=3)

  
print(km.transform(fruits_2d[100:101]))  # [[3393.8136117  8837.37750892 5267.70439881]]

print(km.predict(fruits_2d[100:101]))  # [0]

draw_fruits(fruits[100:101])

  
# centroid가 이동한 횟수
print(km.n_iter_)  # 4

최적의 k 찾기

  
inertia = []
for k in range(2, 7):
    km = KMeans(n_clusters=k, n_init='auto', random_state=42)
    km.fit(fruits_2d)
    inertia.append(km.inertia_)

plt.plot(range(2, 7), inertia)
plt.xlabel('k')
plt.ylabel('inertia')
plt.show()

꺽이는 부분: k의 개수를 늘렸을때 효율이 상대적으로 적어지는 부분 - 최적의 k

06-3 주성분 분석(PCA: Principal Component Analysis)

속성을 압축하는 과정
ex) 체중 + 키 속성을 bmi로 합치는 행위
오히려 늘릴 수도 있다 ```python !wget https://bit.ly/fruits_300_data -O fruits_300.npy

import numpy as np

fruits = np.load(‘fruits_300.npy’) fruits_2d = fruits.reshape(-1, 100*100)

from sklearn.decomposition import PCA

pca = PCA(n_components=50) pca.fit(fruits_2d)

print(pca.components_.shape) # (50, 10000)

import matplotlib.pyplot as plt

![](https://i.imgur.com/G5PAPOF.png)

```python
draw_fruits(pca.components_.reshape(-1, 100, 100))

print(fruits_2d.shape)  # (300, 10000)

fruits_pca = pca.transform(fruits_2d)

print(fruits_pca.shape)  # (300, 50)

원본 데이터 재구성

  
fruits_inverse = pca.inverse_transform(fruits_pca)
print(fruits_inverse.shape)

fruits_reconstruct = fruits_inverse.reshape(-1, 100, 100)

for start in [0, 100, 200]:
    draw_fruits(fruits_reconstruct[start:start+100])
    print("\n")

설명된 분산

  
print(np.sum(pca.explained_variance_ratio_))  # 0.9215627696228221

plt.plot(pca.explained_variance_ratio_)

다른 알고리즘과 함께 사용하기

  
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

target = np.array([0] * 100 + [1] * 100 + [2] * 100)

from sklearn.model_selection import cross_validate

scores = cross_validate(lr, fruits_2d, target)
print(np.mean(scores['test_score']))  # 0.9966666666666667
print(np.mean(scores['fit_time']))  # 0.2854924201965332

scores = cross_validate(lr, fruits_pca, target)
print(np.mean(scores['test_score']))  # 1.0
print(np.mean(scores['fit_time']))  # 0.010412645339965821

pca = PCA(n_components=0.5)
pca.fit(fruits_2d)

print(pca.n_components_)  # 2

fruits_pca = pca.transform(fruits_2d)
print(fruits_pca.shape)  # (300, 2)

scores = cross_validate(lr, fruits_pca, target)
print(np.mean(scores['test_score']))  # 0.9933333333333334
print(np.mean(scores['fit_time']))  # 0.013453817367553711

from sklearn.cluster import KMeans

km = KMeans(n_clusters=3, random_state=42)
km.fit(fruits_pca)

print(np.unique(km.labels_, return_counts=True))  # (array([0, 1, 2], dtype=int32), array([110,  99,  91]))

for label in range(0, 3):
    draw_fruits(fruits[km.labels_ == label])
    print("\n")

  
for label in range(0, 3):
    data = fruits_pca[km.labels_ == label]
    plt.scatter(data[:,0], data[:,1])
plt.legend(['apple', 'banana', 'pineapple'])
plt.show()

파라미터 정리

K-means 클러스터링 (K-means Clustering)

n_clusters: 군집의 수를 결정하는 파라미터 (default: 8)
init: 초기 중심점의 설정 방법 (default: ‘k-means++’)
n_init: 중심 초기화를 몇 번 반복할지 결정 (default: 10)
max_iter: 알고리즘의 최대 반복 횟수 (default: 300)
tol: 수렴 조건을 판단하기 위한 허용 오차 (default: 1×10−41×10−4)
precompute_distances: 거리 계산을 미리 할지 결정 (default: ‘auto’)
verbose: 상세한 정보 출력 여부 (default: 0)
random_state: 결과의 재현성을 위한 랜덤 시드 (default: None)
copy_x: 데이터를 복사할지 여부 (default: True)
n_jobs: 병렬 처리를 위한 CPU 코어 수 (default: None)
algorithm: 사용할 K-means 알고리즘 버전 (default: ‘auto’)

주성분 분석 (Principal Component Analysis, PCA)

n_components: 주성분의 수를 결정하는 파라미터 (default: None, 가능한 값: 정수, ‘mle’, 0-1 사이의 실수)
svd_solver: SVD (Singular Value Decomposition) 해결 방법 (default: ‘auto’, 가능한 값: ‘auto’, ‘full’, ‘arpack’, ‘randomized’)
whiten: 주성분을 희석화할지 여부, 일반적으로 클러스터링이나 분류에서 유용 (default: False)
tol: SVD를 위한 허용 오차 (default: 0.0, svd_solver가 ‘arpack’일 경우에 사용)
random_state: 결과의 재현성을 위한 랜덤 시드 (default: None)

MLE (Maximum Likelihood Estimation, 최대우도추정)

정의: MLE는 주어진 데이터를 가장 잘 설명하는 확률 분포의 파라미터를 찾는 통계적 방법입니다.
PCA에서의 역할:
- n_components에 ‘mle’를 설정하면, PCA는 최대우도추정을 사용하여 자동으로 주성분의 수를 선택합니다.
- 이 방법은 주어진 데이터에 가장 적합한 차원의 수를 자동으로 결정하려고 할 때 유용합니다.
작동 원리:
1. 데이터에 가장 잘 맞는 확률 모델을 가정합니다.
2. 모델의 파라미터를 조정하여 데이터의 우도(likelihood)를 최대화합니다.
목적:
- 데이터를 가장 잘 설명하는 모델의 파라미터를 찾아, 모델의 성능을 개선하려는 목적으로 사용됩니다.

[SeSAC]혼공 머신러닝+딥러닝 Ch6. 비지도 학습

06-1 군집 알고리즘

과일 사진 데이터 준비하기

픽셀 값 분석하기

평균값과 가까운 사진 고르기

확인문제

06-2 K-means

KMeans 클래스

[2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

2 2 2 2 2 0 2 0 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 0 0 2 2 2 2 2 2 2 2 0 2

2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

1 1 1 1]

(array([0, 1, 2], dtype=int32), array([111, 98, 91]))

클러스터 중심

최적의 k 찾기

06-3 주성분 분석(PCA: Principal Component Analysis)

원본 데이터 재구성

설명된 분산

다른 알고리즘과 함께 사용하기

파라미터 정리

K-means 클러스터링 (K-means Clustering)

주성분 분석 (Principal Component Analysis, PCA)

MLE (Maximum Likelihood Estimation, 최대우도추정)

Further Reading

[SeSAC]혼공 머신러닝+딥러닝 Ch3, 4

[SeSAC]혼공 머신러닝+딥러닝 Ch5. 트리 알고리즘

[SeSAC]혼공 머신러닝+딥러닝 Ch7. 딥러닝