logo

[텍스트 분석] 문서 클러스터링

from sklearn.decomposition import NMF
# Factorize `dtm` into 15 non-negative components; each row of `doc_emb`
# is the 15-dimensional representation of one input row.
# NOTE(review): `dtm` is defined outside this view -- presumably a
# document-term matrix; confirm against the earlier cells.
nmf = NMF(
    n_components=15,
    max_iter=500,
    random_state=1234,  # fixed seed so the factorization is reproducible
)
doc_emb = nmf.fit_transform(dtm)
doc_emb[0]
array([0.04495691, 0.00993037, 0.00414172, 0.        , 0.08992402,
       0.01568343, 0.        , 0.20330137, 0.00417763, 0.        ,
       0.        , 0.        , 0.10685757, 0.        , 0.18545175])
# Sum of squared components of the first embedding, i.e. its squared
# Euclidean length before any normalization.
(doc_emb[0] ** 2).sum()
0.09762898174810467
from sklearn.preprocessing import normalize
# Rescale every row of the embedding matrix to unit Euclidean length.
# `norm="l2"` and `axis=1` are scikit-learn's defaults, spelled out here
# so the intent (per-row L2 scaling) is visible at the call site.
norm_doc_emb = normalize(doc_emb, norm="l2", axis=1)
norm_doc_emb[0]
array([0.1438822 , 0.03178161, 0.01325536, 0.        , 0.28779706,
       0.05019399, 0.        , 0.65065523, 0.0133703 , 0.        ,
       0.        , 0.        , 0.34199196, 0.        , 0.59352848])
# Sum of squared components of the first normalized embedding; used as a
# sanity check on the normalization step.
(norm_doc_emb[0] ** 2).sum()
0.9999999999999999
from sklearn.cluster import KMeans
# Sweep the candidate cluster counts 2..29. Every fitted model is kept,
# keyed by its k, so a model can be picked later without refitting;
# `inertia` holds the matching inertia values in the same order as `ks`.
ks = list(range(2, 30))
kms = {
    k: KMeans(n_clusters=k, n_init='auto', random_state=1234).fit(norm_doc_emb)
    for k in ks
}
inertia = [kms[k].inertia_ for k in ks]
import matplotlib.pyplot as plt
# Plot inertia (y) against the number of clusters (x) for the sweep above.
plt.plot(ks, inertia)
[<matplotlib.lines.Line2D at 0x246c68391e0>]
# Number of clusters to keep.
# NOTE(review): presumably chosen by inspecting the inertia curve -- confirm.
n_cluster = 7

# Look up the model that the sweep already fitted for this k.
km = kms[n_cluster]

# The model in `kms` was fitted on `norm_doc_emb`, so its `labels_`
# attribute already holds the cluster index of every row. Calling
# `fit_predict` again would rerun the whole fit on the same data (and
# overwrite the stored model) only to arrive at the same assignment.
cluster = km.labels_
cluster
array([2, 5, 1, 2, 2, 2, 3, 2, 5, 1, 0, 3, 5, 4, 0, 0, 1, 2, 1, 2, 2, 0,
       0, 3, 0, 5, 3, 6, 5, 6, 3, 0, 2, 1, 5, 2, 3, 0, 2, 0, 0, 4, 2, 2,
       0, 2, 6, 2, 5, 0, 1, 3, 4, 0, 5, 0, 3, 2, 1, 1, 2, 2, 1, 1, 1, 2,
       2, 3, 5, 2, 6, 6, 2, 2, 4, 3, 2, 2, 2, 5, 2, 5, 2, 1, 2, 6, 3, 3,
       2, 4, 5, 2, 4, 3, 2, 2, 1, 6, 1, 2, 1, 2, 3, 2, 5, 5, 4, 3, 3, 0,
       5, 3, 2, 5, 5, 2, 1, 0, 3, 2, 2, 3, 1, 0, 3, 3, 1, 2, 0, 5, 5, 2,
       1, 1, 0, 5, 4, 4, 2, 2, 2, 2, 2, 2, 0, 3, 1, 2, 3, 1, 5, 2, 3, 6,
       5, 4, 6, 2, 5, 2, 5, 2, 0, 6, 0, 3, 1, 0, 0, 3, 5, 0, 2, 3, 0, 3,
       2, 0, 0, 3, 0, 0, 4, 3, 3, 6, 3, 1, 2, 3, 2, 1, 5, 3, 6, 6, 5, 5,
       2, 5, 3, 0, 0, 2, 6, 2, 3, 3, 0, 3, 2, 2, 1, 0, 5, 1, 2, 1, 5, 1,
       2, 0, 5, 5, 2, 2, 5, 2, 4, 3, 5, 2, 3, 4, 4, 2, 5, 6, 0, 4, 2, 4,
       4, 2, 0, 0, 2, 4, 0, 0, 2, 2, 2, 2, 6, 2, 2, 6, 5, 2, 2, 6, 4, 2,
       3, 2, 5, 1, 4, 3, 0, 6, 3, 3, 2, 0, 0, 2, 4, 3, 5, 1, 3, 3, 6, 3,
       4, 6, 2, 0, 3, 2, 4, 0, 6, 4, 0, 2, 6, 4, 4, 6, 0, 0, 0, 0, 4, 0,
       0, 3, 0, 4, 0, 2, 4, 4, 2, 6, 5, 2, 2, 6, 4, 3, 2, 2, 0, 3, 4, 0,
       6, 6, 2, 2, 0, 2, 6, 6, 4, 1, 1, 2, 4, 6, 4, 2, 2, 4, 4, 3, 6, 5,
       4, 3, 2, 6, 3, 0, 6, 0, 2, 2, 2, 2, 2, 3, 0, 4, 3, 2, 5, 0, 3, 2,
       2, 5, 4, 5, 6, 2, 1, 2, 2, 2, 5, 1, 6, 5, 6, 5, 5, 4, 3, 2, 6, 6,
       0, 6, 6, 6, 2, 2, 3, 0, 6, 6, 6, 6, 6, 2, 6, 6, 2, 0, 5, 2, 3, 0,
       6, 0, 6, 1, 6, 6, 2, 4, 5, 1, 2, 3, 5, 2, 1, 2, 2, 1, 2, 0, 5])
# Preview the first rows of `df` whose cluster label is 0.
# NOTE(review): assumes `df` rows are in the same order as the rows of
# `norm_doc_emb` -- `df` is defined outside this view; confirm.
in_cluster_0 = cluster == 0
df.loc[in_cluster_0].head()
 | status | ko_title | en_title | abstract
10 | 등록 | 탈모방지 및 발모촉진용 헤어샴푸 조성물과 이의 제조방법 | hair shampoo composition for preventing hair l... | 본 발명은 샴푸 전동 의자에 관한 것으로, 회전암대 회동수단; 일측이 회전암대 회동...
14 | 등록 | 영유아 샴푸 보조 기구 | SHAMPOO ASSIST DEVICE FOR INFANT AND TODDLER | 본 발명은 직립형 샴푸 캡에 관한 것으로, 사용자의 머리에 착용 가능하도록 형성되고...
15 | 등록 | 컨디셔닝 샴푸 조성물 | Conditioning shampoo composition | 본 발명의 실시예에 따른 영유아 샴푸 보조 기구는 아기의 머리를 지지하는 상단 돌출...
21 | 등록 | 샴푸 용기용 펌프 디스펜서 | PUMP DISPENSER FOR SHAMPOO VESSEL | 본 발명은 샴푸 용기용 펌프 디스펜서에 관한 것으로서, 승강수단은 상기 피스톤(23...
22 | 등록 | 석창포 추출물을 함유하는 발모촉진과 탈모방지 및 비듬방지를 위한 샴푸 조성물과 조성... | Acoris gramineus Sol shampoo composition and m... | 본 발명은 샴푸 용기용 펌프 디스펜서에 관한 것으로서, 승강수단은 상기 피스톤(23...
from sklearn.metrics import silhouette_samples, silhouette_score
# Single silhouette score for the whole clustering: the embeddings together
# with the cluster label assigned to each of them.
slh_avg = silhouette_score(norm_doc_emb, cluster)
slh_avg
0.2648528175529998
# Per-sample silhouette values, one entry per row of `norm_doc_emb`;
# these are grouped by cluster label for the plot below.
slh_values = silhouette_samples(norm_doc_emb, cluster)
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np

# Silhouette plot: one horizontal band per cluster, stacked bottom to top.
# Within a band the per-sample silhouette values are drawn in ascending
# order, and bands are separated by a fixed vertical gap.
BAND_GAP = 10

band_start = BAND_GAP
for label in range(n_cluster):
    # Boolean indexing returns a fresh array, so sorting it leaves
    # `slh_values` untouched.
    band = np.sort(slh_values[cluster == label])
    band_height = band.shape[0]
    band_end = band_start + band_height

    # Spread the cluster colours evenly over the colormap.
    shade = cm.nipy_spectral(label / n_cluster)
    plt.fill_betweenx(
        np.arange(band_start, band_end),
        0,
        band,
        facecolor=shade,
        edgecolor=shade,
        alpha=0.7,
    )

    # Put the cluster index just left of x=0, at the band's vertical middle.
    plt.text(-0.05, band_start + 0.5 * band_height, str(label))
    band_start = band_end + BAND_GAP

# Dashed vertical line marking the average silhouette score.
plt.axvline(slh_avg, linestyle='--', color='gray')
# The y positions are only stacking offsets, so hide the tick marks.
plt.yticks([])
([], [])
Previous
NMF
Next
LDA