Skip to content

문서 클러스터링

from sklearn.decomposition import NMF
# Factorize `dtm` with NMF into 15 non-negative components; fit_transform
# returns one 15-dimensional weight vector per row of `dtm`.
# NOTE(review): `dtm` is defined outside this section — presumably a
# document-term matrix; confirm against the cell that builds it.
nmf = NMF(n_components=15, random_state=1234, max_iter=500)
doc_emb = nmf.fit_transform(dtm)
# Inspect the embedding of the first row.
doc_emb[0]
array([0.04495691, 0.00993037, 0.00414172, 0.        , 0.08992402,
       0.01568343, 0.        , 0.20330137, 0.00417763, 0.        ,
       0.        , 0.        , 0.10685757, 0.        , 0.18545175])
# Squared L2 norm of the first raw embedding; the value is not 1, so the NMF
# output rows are not unit-length yet.
(doc_emb[0] ** 2).sum()
0.09762898174810467
from sklearn.preprocessing import normalize
# Rescale each embedding row with sklearn's normalize (defaults: L2 norm,
# applied per row) so that every document vector has unit length.
norm_doc_emb = normalize(doc_emb)
# Inspect the normalized embedding of the first row.
norm_doc_emb[0]
array([0.1438822 , 0.03178161, 0.01325536, 0.        , 0.28779706,
       0.05019399, 0.        , 0.65065523, 0.0133703 , 0.        ,
       0.        , 0.        , 0.34199196, 0.        , 0.59352848])
# Sanity check: the squared L2 norm of a normalized row should be 1 (up to
# floating-point rounding).
(norm_doc_emb[0] ** 2).sum()
0.9999999999999999
from sklearn.cluster import KMeans
# Sweep k = 2..29: fit one KMeans model per k on the normalized embeddings,
# keep every fitted model keyed by k, and record its inertia (within-cluster
# sum of squared distances) for the elbow plot.
ks = list(range(2, 30))
kms = {}
inertia = []
for k in ks:
    # The same seed is used for every k so the sweep is reproducible.
    km = KMeans(n_clusters=k, n_init='auto', random_state=1234).fit(norm_doc_emb)
    kms[k] = km
    inertia.append(km.inertia_)
import matplotlib.pyplot as plt
# Elbow plot: inertia as a function of the number of clusters k.
plt.plot(ks, inertia)
[<matplotlib.lines.Line2D at 0x246c68391e0>]
# Number of clusters used for the final model (presumably read off the
# inertia plot — confirm); must be one of the k values fitted in the sweep.
n_cluster = 7
km = kms[n_cluster]
# The model stored in `kms` was already fitted on norm_doc_emb, so take its
# training labels directly. Calling fit_predict() here would re-run the whole
# KMeans fit only to reproduce the same assignments (random_state is fixed).
cluster = km.labels_
# Display the cluster index assigned to each document.
cluster
array([2, 5, 1, 2, 2, 2, 3, 2, 5, 1, 0, 3, 5, 4, 0, 0, 1, 2, 1, 2, 2, 0,
       0, 3, 0, 5, 3, 6, 5, 6, 3, 0, 2, 1, 5, 2, 3, 0, 2, 0, 0, 4, 2, 2,
       0, 2, 6, 2, 5, 0, 1, 3, 4, 0, 5, 0, 3, 2, 1, 1, 2, 2, 1, 1, 1, 2,
       2, 3, 5, 2, 6, 6, 2, 2, 4, 3, 2, 2, 2, 5, 2, 5, 2, 1, 2, 6, 3, 3,
       2, 4, 5, 2, 4, 3, 2, 2, 1, 6, 1, 2, 1, 2, 3, 2, 5, 5, 4, 3, 3, 0,
       5, 3, 2, 5, 5, 2, 1, 0, 3, 2, 2, 3, 1, 0, 3, 3, 1, 2, 0, 5, 5, 2,
       1, 1, 0, 5, 4, 4, 2, 2, 2, 2, 2, 2, 0, 3, 1, 2, 3, 1, 5, 2, 3, 6,
       5, 4, 6, 2, 5, 2, 5, 2, 0, 6, 0, 3, 1, 0, 0, 3, 5, 0, 2, 3, 0, 3,
       2, 0, 0, 3, 0, 0, 4, 3, 3, 6, 3, 1, 2, 3, 2, 1, 5, 3, 6, 6, 5, 5,
       2, 5, 3, 0, 0, 2, 6, 2, 3, 3, 0, 3, 2, 2, 1, 0, 5, 1, 2, 1, 5, 1,
       2, 0, 5, 5, 2, 2, 5, 2, 4, 3, 5, 2, 3, 4, 4, 2, 5, 6, 0, 4, 2, 4,
       4, 2, 0, 0, 2, 4, 0, 0, 2, 2, 2, 2, 6, 2, 2, 6, 5, 2, 2, 6, 4, 2,
       3, 2, 5, 1, 4, 3, 0, 6, 3, 3, 2, 0, 0, 2, 4, 3, 5, 1, 3, 3, 6, 3,
       4, 6, 2, 0, 3, 2, 4, 0, 6, 4, 0, 2, 6, 4, 4, 6, 0, 0, 0, 0, 4, 0,
       0, 3, 0, 4, 0, 2, 4, 4, 2, 6, 5, 2, 2, 6, 4, 3, 2, 2, 0, 3, 4, 0,
       6, 6, 2, 2, 0, 2, 6, 6, 4, 1, 1, 2, 4, 6, 4, 2, 2, 4, 4, 3, 6, 5,
       4, 3, 2, 6, 3, 0, 6, 0, 2, 2, 2, 2, 2, 3, 0, 4, 3, 2, 5, 0, 3, 2,
       2, 5, 4, 5, 6, 2, 1, 2, 2, 2, 5, 1, 6, 5, 6, 5, 5, 4, 3, 2, 6, 6,
       0, 6, 6, 6, 2, 2, 3, 0, 6, 6, 6, 6, 6, 2, 6, 6, 2, 0, 5, 2, 3, 0,
       6, 0, 6, 1, 6, 6, 2, 4, 5, 1, 2, 3, 5, 2, 1, 2, 2, 1, 2, 0, 5])
# Preview the first rows of `df` whose document was assigned to cluster 0.
# NOTE(review): assumes `df` rows are in the same order as the rows of
# norm_doc_emb — confirm where `df` and `dtm` are built.
df[cluster == 0].head()
status ko_title en_title abstract
10 등록 탈모방지 및 발모촉진용 헤어샴푸 조성물과 이의 제조방법 hair shampoo composition for preventing hair l... 본 발명은 샴푸 전동 의자에 관한 것으로, 회전암대 회동수단; 일측이 회전암대 회동...
14 등록 영유아 샴푸 보조 기구 SHAMPOO ASSIST DEVICE FOR INFANT AND TODDLER 본 발명은 직립형 샴푸 캡에 관한 것으로, 사용자의 머리에 착용 가능하도록 형성되고...
15 등록 컨디셔닝 샴푸 조성물 Conditioning shampoo composition 본 발명의 실시예에 따른 영유아 샴푸 보조 기구는 아기의 머리를 지지하는 상단 돌출...
21 등록 샴푸 용기용 펌프 디스펜서 PUMP DISPENSER FOR SHAMPOO VESSEL 본 발명은 샴푸 용기용 펌프 디스펜서에 관한 것으로서, 승강수단은 상기 피스톤(23...
22 등록 석창포 추출물을 함유하는 발모촉진과 탈모방지 및 비듬방지를 위한 샴푸 조성물과 조성... Acoris gramineus Sol shampoo composition and m... 본 발명은 샴푸 용기용 펌프 디스펜서에 관한 것으로서, 승강수단은 상기 피스톤(23...
from sklearn.metrics import silhouette_samples, silhouette_score
# Mean silhouette coefficient over all documents for the chosen clustering.
slh_avg = silhouette_score(norm_doc_emb, cluster)
slh_avg
0.2648528175529998
# Per-document silhouette coefficients, one value per row of norm_doc_emb.
slh_values = silhouette_samples(norm_doc_emb, cluster)
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np

# Silhouette plot: one horizontal band per cluster, stacked vertically with a
# 10-unit gap between bands. Within a band the per-document silhouette values
# are drawn in ascending order.
y_lower = 10
for i in range(n_cluster):
    # Boolean-mask selection of this cluster's values, sorted ascending.
    cluster_slh = np.sort(slh_values[cluster == i])
    size = len(cluster_slh)
    y_upper = y_lower + size

    # Spread the cluster colors evenly over the nipy_spectral colormap.
    color = cm.nipy_spectral(i / n_cluster)
    plt.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_slh,
                      facecolor=color, edgecolor=color, alpha=0.7)

    # Label the band with its cluster index, vertically centered on the band.
    plt.text(-0.05, y_lower + 0.5 * size, str(i))

    # Start the next band 10 units above the top of this one.
    y_lower = y_upper + 10

# Dashed vertical line at the mean silhouette score across all documents.
plt.axvline(slh_avg, linestyle='--', color='gray')
# The y axis only encodes stacking position, so hide its ticks.
plt.yticks([])
([], [])