logo

[텍스트 분석] RNN

import pandas as pd
df = pd.read_excel('yelp.xlsx')
import tensorflow as tf
NUM_WORDS = 1000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=NUM_WORDS)
tokenizer.index_word
{1: 'the',
 2: 'and',
 3: 'i',
 4: 'was',
 5: 'a',

...
tokenizer.fit_on_texts(df['review'])
seq = tokenizer.texts_to_sequences(df['review'])
seq[0]
[428, 165, 8, 15]
from sklearn.model_selection import train_test_split

# Pad the index sequences at the end ('post') so they all share one length.
pad_seq = tf.keras.preprocessing.sequence.pad_sequences(seq, padding='post')

# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    pad_seq,
    df['sentiment'],
    test_size=0.2,
    random_state=0,
)
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Binary sentiment classifier: word embedding -> LSTM -> single sigmoid unit.
# NOTE(review): the inputs are post-padded and the Embedding layer does not set
# mask_zero, so the LSTM presumably also steps over the padding positions.
# Confirm whether masking is wanted before changing it; the recorded scores
# belong to this exact configuration.
model = tf.keras.Sequential()
model.add(Embedding(input_dim=NUM_WORDS, output_dim=32))
model.add(LSTM(units=32))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 10 epochs on the training split, then score the held-out split.
model.fit(x_train, y_train, epochs=10)
model.evaluate(x_test, y_test)
7/7 [==============================] - 0s 2ms/step - loss: 0.5871 - accuracy: 0.8400

[0.5870778560638428, 0.8399999737739563]
Previous
정칙화