Word Embedding Example 1

NOTE: PLEASE MAKE SURE YOU ARE RUNNING THIS IN A PYTHON3 ENVIRONMENT

import tensorflow as tf
print(tf.__version__)

# This is needed for the iterator over the data
# But not necessary if you have TF 2.0 installed
#!pip install tensorflow==2.0.0-beta0


#tf.enable_eager_execution()

!pip install -q tensorflow-datasets
2.4.1
/bin/bash: pip: command not found
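(The !pip install -q tensorflow-datasets line fails here because pip is not on the shell PATH in this environment; the import in the next cell still works, so tensorflow-datasets was evidently already installed.)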
import tensorflow_datasets as tfds
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /home/yunshu/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...
Generating splits: train (25,000 examples), test (25,000 examples), unsupervised (50,000 examples)
Dataset imdb_reviews downloaded and prepared to /home/yunshu/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.
import numpy as np

train_data, test_data = imdb['train'], imdb['test']


training_sentences = []
training_labels = []

testing_sentences = []  
testing_labels = []


# str(s.numpy()) is needed in Python3 instead of just s.numpy()
for s,l in train_data:
  training_sentences.append(str(s.numpy()))
  training_labels.append(l.numpy())
  
  
for s,l in test_data:
  testing_sentences.append(str(s.numpy()))
  testing_labels.append(l.numpy())
 
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

vocab_size = 10000
embedding_dim = 16 # dimension of the vector each word is mapped to
max_length = 120
trunc_type='post'
oov_tok = "<OOV>"


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type, padding='post')

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length)
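As a quick sanity check on the tokenizer and padding (not part of the original notebook; the sentence below is a made-up example), an unseen sentence can be converted the same way:

# Hypothetical example: tokenize and pad a made-up sentence with the fitted tokenizer
sample = ["I really loved this quirky zzzxqw movie"]
sample_seq = tokenizer.texts_to_sequences(sample)
print(sample_seq)        # words outside the top-10,000 vocabulary (e.g. 'zzzxqw') map to 1, the <OOV> index
sample_padded = pad_sequences(sample_seq, maxlen=max_length, truncating=trunc_type, padding='post')
print(sample_padded.shape)  # (1, 120): zeros are appended after the real token ids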


reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

print(decode_review(padded[1]))
print(training_sentences[1])
b'i have been known to fall asleep during films but this is usually due to a combination of things including really tired being warm and comfortable on the <OOV> and having just eaten a lot however on this occasion i fell asleep because the film was rubbish the plot development was constant constantly slow and boring things seemed to happen but with no explanation of what was causing them or why i admit i may have missed part of the film but i watched the majority of it and everything just seemed to happen of its own <OOV> without any real concern for anything else i cant recommend this film at all ' ? ? ? ? ? ? ?
b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was constant. Constantly slow and boring. Things seemed to happen, but with no explanation of what was causing them or why. I admit, I may have missed part of the film, but i watched the majority of it and everything just seemed to happen of its own accord without any real concern for anything else. I cant recommend this film at all.'
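Comparing the two outputs: the tokenizer lowercases the text and strips punctuation, words outside the 10,000-word vocabulary (such as 'sette') come back as <OOV>, and the trailing '?' marks correspond to the zero padding, since index 0 is not in reverse_word_index.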
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
=================================================================
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________
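The parameter counts follow from the layer shapes: the Embedding layer stores vocab_size × embedding_dim = 10,000 × 16 = 160,000 weights, Flatten reshapes the (120, 16) output into a 1,920-dimensional vector, the first Dense layer has 1,920 × 6 + 6 = 11,526 parameters (weights plus biases), and the output layer has 6 × 1 + 1 = 7.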
num_epochs = 10
model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))
Epoch 1/10
782/782 [==============================] - 20s 23ms/step - loss: 0.5984 - accuracy: 0.6450 - val_loss: 0.3516 - val_accuracy: 0.8438
Epoch 2/10
782/782 [==============================] - 2s 2ms/step - loss: 0.2451 - accuracy: 0.9080 - val_loss: 0.3743 - val_accuracy: 0.8349
Epoch 3/10
782/782 [==============================] - 1s 2ms/step - loss: 0.1010 - accuracy: 0.9764 - val_loss: 0.4639 - val_accuracy: 0.8195
Epoch 4/10
782/782 [==============================] - 1s 2ms/step - loss: 0.0294 - accuracy: 0.9963 - val_loss: 0.5523 - val_accuracy: 0.8164
Epoch 5/10
782/782 [==============================] - 1s 2ms/step - loss: 0.0074 - accuracy: 0.9995 - val_loss: 0.6410 - val_accuracy: 0.8126
Epoch 6/10
782/782 [==============================] - 1s 2ms/step - loss: 0.0023 - accuracy: 1.0000 - val_loss: 0.6722 - val_accuracy: 0.8180
Epoch 7/10
782/782 [==============================] - 1s 2ms/step - loss: 9.7437e-04 - accuracy: 1.0000 - val_loss: 0.7204 - val_accuracy: 0.8171
Epoch 8/10
782/782 [==============================] - 2s 2ms/step - loss: 5.2066e-04 - accuracy: 1.0000 - val_loss: 0.7731 - val_accuracy: 0.8143
Epoch 9/10
782/782 [==============================] - 1s 2ms/step - loss: 2.9587e-04 - accuracy: 1.0000 - val_loss: 0.8290 - val_accuracy: 0.8113
Epoch 10/10
782/782 [==============================] - 1s 2ms/step - loss: 1.8574e-04 - accuracy: 1.0000 - val_loss: 0.8574 - val_accuracy: 0.8135





<tensorflow.python.keras.callbacks.History at 0x7ff8c5a3e8e0>
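Training accuracy reaches 1.0 by epoch 6 while validation accuracy stays around 0.81 and validation loss rises steadily after the first epoch, so the model is clearly overfitting the training reviews.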
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)
(10000, 16)
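Each row of this matrix is the learned 16-dimensional vector for one word index, so a single word's embedding can be looked up through word_index (a small illustrative snippet, not part of the original notebook; it assumes 'movie' falls within the top 10,000 words):

# Hypothetical lookup: the learned embedding for the word 'movie'
print(weights[word_index['movie']])   # 16 floats, one per embedding dimension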
import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

# try:
#   from google.colab import files
# except ImportError:
#   pass
# else:
#   files.download('vecs.tsv')
#   files.download('meta.tsv')
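The vecs.tsv and meta.tsv files written above can be loaded into the TensorFlow Embedding Projector at https://projector.tensorflow.org (vecs.tsv as the vectors, meta.tsv as the metadata/labels) to visualize the learned embeddings.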
