Note for course TF 3: NLP in TF

Anh-Thi Dinh

Tokenizing + padding

📙 Notebook: Tokenizer basic examples.
📙 Notebook: Sarcasm detection.
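  • A common simple character encoding is ASCII, but encoding individual letters does not capture the meaning of a word (e.g. "silent" and "listen" use the same letters but mean different things).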
  • We can encode each word as a number (token) — Tokenizer.
  • Tokenize words → collect all the words to make a corpus → turn your sentences into lists of values based on these tokens → manipulate these lists (make them the same length, for example).
1from tensorflow.keras.preprocessing.text import Tokenizer
2
3sentences = [
4    'i love my dog',
5    'I, love my cat',
6    'You love my dog so much!'
7]
8
9tokenizer = Tokenizer(num_words = 100, oov_token="<OOV>")
10            # num_words: max of words to be tokenized & pick
11            #   the most common 100 words.
12            # More words, more accuracy, more time to train
13            # oov_token: replace unseen words by "<OOV>"
14tokenizer.fit_on_texts(sentences) # fix texts based on tokens
1# indexing words
2word_index = tokenizer.word_index
3print(word_index)
4# {'<OOV>': 1, 'love': 2, 'my': 3, 'i': 4, 'dog': 5, 'cat': 6, 'you': 7, 'so': 8, 'much': 9}
5# "!", ",", capital, ... are removed
1# encode sentences
2sequences = tokenizer.texts_to_sequences(sentences)
3print(sequences)
4# [[4, 2, 3, 5],
5#  [4, 2, 3, 6],
6#  [7, 2, 3, 5, 8, 9]]
7# if a word is not in the word index, texts_to_sequences() drops it (or replaces it with the OOV token when oov_token is set)
1# make encoded sentences equal
2from tensorflow.keras.preprocessing.sequence import pad_sequences
3
4padded = pad_sequences(sequences, value=-1,
5                       maxlen=5, padding="post", truncating="post")
6         # maxlen: max len of encoded sentence
7         # value: value to be filled (default 0)
8         # padding: add missing values at beginning or ending of sentence?
9         # truncating: longer than maxlen? cut at beginning or ending?
10print(padded)
11# [[ 4  2  3  5 -1]
12#  [ 4  2  3  6 -1]
13#  [ 7  2  3  5  8]]
1# read json text
2import json
3with open("/tmp/sarcasm.json", 'r') as f:
4    datastore = json.load(f)
5
6sentences = []
7labels = []
8urls = []
9for item in datastore:
10    sentences.append(item['headline'])
11    labels.append(item['is_sarcastic'])
12    urls.append(item['article_link'])

Word embeddings

IMDB review dataset

📙 Notebook: Train IMDB review dataset.
👉 Video explaining the code.
  • Word embeddings = the idea in which words and associated words are clustered as vectors in a multi-dimensional space. That allows words with similar meaning to have a similar representation.
  • The meaning of the words can come from labeling of the dataset.
    • Example: "dull" and "boring" show up a lot in negative reviews → they have similar sentiments → they appear close to each other in sentences → thus their vectors will be similar → the NN trains and learns these vectors, associating them with the labels to come up with what's called an embedding.
  • The embedding dimension is the number of dimensions of the vector representing each word's encoding.
1import tensorflow as tf
2print(tf.__version__) # check version of tensorflow
3
4# If you are using tf1, you need below code
5tf.enable_eager_execution()
1# IMDB reviews dataset
2import tensorflow_datasets as tfds
3imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)
4
5train_data, test_data = imdb['train'], imdb['test']
6
7for s,l in train_data: # "s" for sentences "l" for labels
8    # The values for "s" and "l" are tensors
9    # so we need to extract their values
10    training_sentences.append(s.numpy().decode('utf8'))
11    training_labels.append(l.numpy())
1# Prepare for the NN
2vocab_size = 10000
3embedding_dim = 16 # embedding to dim 16
4max_length = 120 # of each sentence
5trunc_type='post' # cut the last words
6oov_tok = "<OOV>" # replace not-encoded words by this
7
8from tensorflow.keras.preprocessing.text import Tokenizer
9from tensorflow.keras.preprocessing.sequence import pad_sequences
10
11tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
12tokenizer.fit_on_texts(training_sentences)
13    # encoding the words
14word_index = tokenizer.word_index
15    # list of word index (built based on training set)
16    # there may be many oov_tok in test set
17sequences = tokenizer.texts_to_sequences(training_sentences)
18    # apply on sentences
19padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)
20    # padding the sentences
21
22# apply to the test set
23testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
24testing_padded = pad_sequences(testing_sequences,maxlen=max_length)
1# Simple NN
2model = tf.keras.Sequential([
3    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
4                              # The result of embedding will be a 2D array:
5                              # length of sentence x embedding_dim
6    tf.keras.layers.Flatten(),
7    # Alternatively (a little diff on speed and accuracy):
8    # tf.keras.layers.GlobalAveragePooling1D()
9    #   average across the vectors to flatten it out
10    tf.keras.layers.Dense(6, activation='relu'),
11    tf.keras.layers.Dense(1, activation='sigmoid')
12])
13model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
14model.summary()
1# Training
2model.fit(padded, training_labels_final, epochs=10, validation_data=(testing_padded, testing_labels_final))
1# the result
2e = model.layers[0] # get the result of the embedding layers
3weights = e.get_weights()[0]
4print(weights.shape) # shape: (vocab_size, embedding_dim)
If you want to visualize the result (in 3D) with the Embedding Projector:
1import io
2
3out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
4out_m = io.open('meta.tsv', 'w', encoding='utf-8')
5for word_num in range(1, vocab_size):
6  word = reverse_word_index[word_num]
7  embeddings = weights[word_num]
8  out_m.write(word + "\\n")
9  out_v.write('\\t'.join([str(x) for x in embeddings]) + "\\n")
10out_v.close()
11out_m.close()
12
13try:
14  from google.colab import files
15except ImportError:
16  pass
17else:
18  files.download('vecs.tsv')
19  files.download('meta.tsv')

Sarcasm dataset

📙 Notebook: Train Sarcasm dataset.
  • With text data, it often happens that the accuracy increases over the training epochs but the loss also increases sharply. We can "play" with the hyperparameters to see the effect.
1# Run this to ensure TensorFlow 2.x is used
2try:
3  # %tensorflow_version only exists in Colab.
4  %tensorflow_version 2.x
5except Exception:
6  pass

Pre-tokenized datasets

  • Someone has already done the work (tokenization) for you.
  • Try on IMDB dataset that has been pre-tokenized.
  • The tokenization is done on subwords!
  • The sequence of words can be just as important as their existence.
1# load imdb dataset from tensorflow
2import tensorflow_datasets as tfds
3imdb, info = tfds.load("imdb_reviews/subwords8k", with_info=True, as_supervised=True)
4
5# extract train/test sets
6train_data, test_data = imdb['train'], imdb['test']
7
8# take the tokenizer
9tokenizer = info.features['text'].encoder
10
11print(tokenizer.subwords)
12# ['the_', ', ', '. ', 'a_', 'and_', 'of_', 'to_', 's_', 'is_',...
1sample_string = 'TensorFlow, from basics to mastery'
2
3tokenized_string = tokenizer.encode(sample_string)
4print ('Tokenized string is {}'.format(tokenized_string))
5# Tokenized string is [6307, 2327, 4043, 2120, 2, 48, 4249, 4429, 7, 2652, 8050]
6
7original_string = tokenizer.decode(tokenized_string)
8print ('The original string: {}'.format(original_string))
9# The original string: TensorFlow, from basics to mastery
1# take a look on tokenized string
2# case sensitive + punctuation maintained
3for ts in tokenized_string:
4  print ('{} ----> {}'.format(ts, tokenizer.decode([ts])))
5
6# 6307 ----> Ten
7# 2327 ----> sor
8# 4043 ----> Fl
9# ...
  • The code runs for quite a long time (4 minutes per epoch if using a GPU on Colab) because there are a lot of hyperparameters and sub-words.
  • Result: 50% acc & loss is decreasing but very small.
    • Because we are using sub-words, not full words → on their own the sub-words are often nonsensical → they only become meaningful when we put them together in sequences → learning from sequences would be a great way forward → RNN (Recurrent Neural Networks)

Sequence models

  • The relative ordering, the sequence of words, matters for the meaning of the sentence.
  • For NN to take into account for the ordering of the words: RNN (Recurrent Neural Networks), LSTM (Long short-term memory).
  • Why LSTM rather than a plain RNN? With an RNN, the context is preserved from timestamp to timestamp BUT it may get lost in longer sentences → LSTM does better because it has a cell state.
  • Example of using LSTM: "I grew up in Ireland, I went to school and at school, they made me learn how to speak..." → "speak" is the context and we go back to the beginning to catch "Ireland", then the next word could be "learn how to speak Gaelic"!

RNN idea

  • The usual NN, something like "f(data, labels)=rules", cannot take sequences into account.
  • An example of using sequences: Fibonacci sequence → the result of current function is the input of next function itself,...
RNN basic idea (source).

LSTM idea

  • Sometimes, context from far back in the sequence gets lost, as in the earlier "Ireland" and "Gaelic" example.
  • LSTM has an additional pipeline called the Cell State. It can pass through the network to impact it and helps keep context from earlier tokens relevant.
LSTM basic idea (image from the course).
1# SINGLE LAYER LSTM
2model = tf.keras.Sequential([
3    tf.keras.layers.Embedding(tokenizer.vocab_size, 64),
4    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
5      # 64: #outputs desired (but the result may be different)
6    tf.keras.layers.Dense(64, activation='relu'),
7    tf.keras.layers.Dense(1, activation='sigmoid')
8])
1# MULTI-LAYER LSTM
2model = tf.keras.Sequential([
3    tf.keras.layers.Embedding(tokenizer.vocab_size, 64),
4    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
5      # return_sequences=True: required if we wanna feed LSTM into another one
6      # It ensures that the output of LSTM match the desired inputs of the next one
7    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
8    tf.keras.layers.Dense(64, activation='relu'),
9    tf.keras.layers.Dense(1, activation='sigmoid')
10])
1 layer vs 2 layer LSTM accuracy after 50 epochs (image from the course). 2 layers is better (smoother), which makes us more confident about the model. The validation accuracy is stuck at about 80% because we used 8000 sub-words taken from the training set, so there may be many tokens from the test set that are out of vocabulary.

With vs without LSTM

1# WITHOUT LSTM (like previous section)
2model = tf.keras.Sequential([
3    tf.keras.layers.Embedding(vocab_size, embedding_dim,
4                              input_length=max_length),
5    #
6    tf.keras.layers.Flatten(),
7    tf.keras.layers.GlobalmaxPooling1D(),
8    #
9    tf.keras.layers.Dense(6, activation='relu'),
10    tf.keras.layers.Dense(1, activation='sigmoid')
11])
1# WITH LSTM
2model = tf.keras.Sequential([
3    tf.keras.layers.Embedding(vocab_size, embedding_dim,
4                              input_length=max_length),
5    #
6    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
7    #
8    tf.keras.layers.Dense(6, activation='relu'),
9    tf.keras.layers.Dense(1, activation='sigmoid')
10])
With vs without LSTM (image from the course). With LSTM is really better but there is still overfitting here.

Using a ConvNet

1model = tf.keras.Sequential([
2    tf.keras.layers.Embedding(tokenizer.vocab_size, 64),
3    #
4    tf.keras.layers.Conv1D(128, 5, activation='relu'),
5    #
6    tf.keras.layers.GlobalAveragePooling1D(),
7    tf.keras.layers.Dense(64, activation='relu'),
8    tf.keras.layers.Dense(1, activation='sigmoid')
9])
Using Convolution network. (image from the course). It's really better but there is overfitting there.

IMDB dataset

Try with 4 different choices:
  • Simple NN: 5s/epoch, 170K params, nice acc, overfitting.
  • LSTM: 43s/epoch, 30K params, acc better, overfitting.
  • GRU (Gated Recurrent Unit layer, a different type of RNN): 20s/epoch, 169K params, very good acc, overfitting.
  • Conv1D: 6s/epoch, 171K params, good acc, overfitting.
Remark: With text, you'll probably get a bit more overfitting than you would with images, because there are out-of-vocabulary words in the validation data.

Sequence models and literature

One application of sequence models: read text then generate another look-alike text.
  • How do they predict a new word in the notebook? → Check this video.
1input_sequences = []
2for line in corpus:
3	# convert each sentence to list of numbers
4	token_list = tokenizer.texts_to_sequences([line])[0]
5	# convert each list to n-gram sequence
6	# eg. from [1,2,3,4,5]
7	# 		to [1,2], [1,2,3], [1,2,3,4], [1,2,3,4,5]
8	for i in range(1, len(token_list)):
9		n_gram_sequence = token_list[:i+1]
10		input_sequences.append(n_gram_sequence)
11
12# pad sequences to the maximum length of all sentences
13max_sequence_len = max([len(x) for x in input_sequences])
14input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
15
16# create predictors and label
17# [0,0,1,2] -> 2 is label
18# [0,1,2,3] -> 3 is label
19# [1,2,3,4] -> 4 is label
20xs, labels = input_sequences[:,:-1],input_sequences[:,-1]
21
22# one-hot encoding the labels (classification problem)
23ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
1model = Sequential()
2model.add(Embedding(total_words, 64, input_length=max_sequence_len-1))
3model.add(Bidirectional(LSTM(20))) # take only 20 units (bi-direction) to train
4model.add(Dense(total_words, activation='softmax'))
5model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
6history = model.fit(xs, ys, epochs=500, verbose=1)
1seed_text = "Laurence went to dublin"
2next_words = 100
3
4for _ in range(next_words):
5	token_list = tokenizer.texts_to_sequences([seed_text])[0]
6	# "went to dublin" -> [134, 13, 59]
7	token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
8	#  [0, 0, 0, 0, 0, 0, 0, 134, 13, 59]
9	predicted = model.predict_classes(token_list, verbose=0)
10	output_word = ""
11	# revert an index back to the word
12	for word, index in tokenizer.word_index.items():
13		if index == predicted:
14			output_word = word
15			break
16	# add predicted word to the seed text and make another prediction
17	seed_text += " " + output_word
18print(seed_text)
19# all the words are predicted based on the probability
20# next one will be less certain than the previous
21# -> less meaningful
  • Using more words will help.
1# read from a file
2tokenizer = Tokenizer()
3data = open('/tmp/irish-lyrics-eof.txt').read()
4corpus = data.lower().split("\\n")
A few changes from the previous model:
1model = Sequential()
2model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
3model.add(Bidirectional(LSTM(150)))
4model.add(Dense(total_words, activation='softmax'))
5adam = Adam(lr=0.01) # customized optimizer
6model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
7#earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')
8history = model.fit(xs, ys, epochs=100, verbose=1)
  • Different convergences can create different poetry.
  • If we use one-hot for a very big corpus → take a lot of RAM → use character-based prediction → #unique characters is far less than #unique words. → notebook "Text generation with RNN"