import pandas as pd
import tensorflow as tf

# Load the review dataset. The rest of the script reads its 'review' column
# (raw text) and its 'sentiment' column (the training target).
df = pd.read_excel('yelp.xlsx')

# Vocabulary cap handed to the tokenizer; reused later as the embedding's
# input dimension.
NUM_WORDS = 1000

# NOTE(review): tf.keras.preprocessing.text.Tokenizer is marked deprecated in
# recent TensorFlow releases; consider tf.keras.layers.TextVectorization when
# upgrading -- confirm against the TensorFlow version in use.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=NUM_WORDS)

# The vocabulary must be built before it can be inspected: index_word is
# empty until fit_on_texts has run.
tokenizer.fit_on_texts(df['review'])

# Peek at the first few entries of the index -> word mapping.
print(dict(list(tokenizer.index_word.items())[:5]))
# Output recorded from the original run:
# {1: 'the',
#  2: 'and',
#  3: 'i',
#  4: 'was',
#  5: 'a',
#  ...

# Encode every review as a list of integer word indices.
seq = tokenizer.texts_to_sequences(df['review'])
print(seq[0])
# Output recorded from the original run:
# [428, 165, 8, 15]

# Pad the variable-length sequences into one 2-D array; padding='post'
# appends the padding after the real tokens.
pad_seq = tf.keras.preprocessing.sequence.pad_sequences(seq, padding='post')
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Hold out 20% of the padded sequences for evaluation; the fixed random_state
# keeps the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    pad_seq, df['sentiment'], test_size=0.2, random_state=0)

# Embedding -> LSTM -> single sigmoid unit.
# NOTE(review): the sigmoid output with binary_crossentropy assumes
# df['sentiment'] holds binary 0/1 labels -- confirm against the dataset.
# NOTE(review): the sequences are post-padded, so the LSTM also processes the
# trailing padding steps; Embedding(..., mask_zero=True) would let it skip
# them. Left unchanged here so the recorded metrics below stay comparable.
model = tf.keras.Sequential([
    Embedding(NUM_WORDS, 32),  # input_dim matches the tokenizer's num_words
    LSTM(32),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train, epochs=10)

# evaluate returns [loss, accuracy], following the metrics passed to compile.
test_loss, test_accuracy = model.evaluate(x_test, y_test)
print([test_loss, test_accuracy])
# Output recorded from the original run:
# 7/7 [==============================] - 0s 2ms/step - loss: 0.5871 - accuracy: 0.8400
# [0.5870778560638428, 0.8399999737739563]