Detect hate speech

Today you are a machine learning engineer on the Birdwatch team at Twitter.

The objective of this task is to detect hate speech in tweets. For the sake of simplicity, here a tweet contains hate speech if it has a racist or sexist sentiment associated with it. In other words, we need to separate racist or sexist tweets from all other tweets.

A labelled dataset of 31,962 tweets (late 2017 to early 2018) is provided in the form of a compressed CSV file, with each line storing a tweet id, its label, and the tweet. Label '1' denotes that the tweet is racist/sexist, while label '0' denotes that it is not.

We will first approach the problem in a traditional way: clean the raw text using simple regexes (regular expressions), extract features, and build naive Bayes models to classify tweets; then we will build a deep learning model and explain it with LIME.

Learning Objectives

By the end of this lesson, you will be able to:

Task 1. Data Preprocessing

  1. Start with dependencies.

    Most modules are pre-installed in Colab; however, we need to update gensim to a recent version and install lime.
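For example, in a Colab cell (you may need to restart the runtime after the upgrade):

```python
# upgrade gensim and install lime in the Colab runtime
!pip install -U gensim lime
```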

  1. Connect Colab to your Google Drive.
  1. Use pandas.read_csv to load the tweets in tweets.csv.gz and save the pd.DataFrame into raw. Make sure the path points to where the data is located in your Google Drive.
  1. Sample 5 random tweets from the dataset for each label and display label and tweet columns. Hint: one option is to use sample() followed by groupby.
  1. The tweets are in English and all words should already be in lowercase. Now calculate the number of characters in each tweet and assign the values to a new column len_tweet in raw.
  1. What are the summary statistics of len_tweet for each label? Hint: use groupby and describe.
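A minimal sketch of the steps above (mount Drive, load the data, sample per label, and summarize len_tweet), assuming the columns are named label and tweet; the Drive path is hypothetical:

```python
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# hypothetical path -- point this at wherever tweets.csv.gz lives in your Drive
raw = pd.read_csv('/content/drive/MyDrive/tweets.csv.gz')

# 5 random tweets per label: shuffle, then take the first 5 rows of each group
print(raw.sample(frac=1, random_state=42).groupby('label').head(5)[['label', 'tweet']])

# number of characters per tweet, then per-label summary statistics
raw['len_tweet'] = raw['tweet'].str.len()
print(raw.groupby('label')['len_tweet'].describe())
```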

Note that this is again an imbalanced dataset: the ratio of non-hate speech to hate speech is roughly 13:1.

  1. Clean the tweets.

    We use re to perform basic text manipulation. Specifically, we remove anonymized user handles, numbers, and special characters except hashtags. A combined sketch of these steps follows the example below.

  1. Remove user handles, i.e., anything directly following the symbol @, from the text in tweet, and save the resulting tweets to a new column tidy_tweet in raw.

    Hint: you can use re.sub on individual text and apply a simple lambda function to the series raw['tweet'].

  1. Remove non-alphabetic characters, except the symbol #, from tidy_tweet and save the result back in tidy_tweet. In other words, keep only the 26 letters and #.

    Note: in some applications, punctuation, emojis, or whether a word is in all caps can be useful. You should decide whether to extract such features for your application and perform error analysis to gain insight.

  1. Remove words that are shorter than 4 characters from the processed tweets.

    For example,

    i m on a mission to ride all of the animals #teamchanlv #vegas #lasvegas #funtimes

    will be reduced to

    mission ride animals #teamchanlv #vegas #lasvegas #funtimes
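A combined sketch of the three cleaning steps above; clean_tweet is a hypothetical helper, and the exact regexes may differ from the intended solution:

```python
import re

def clean_tweet(text):
    text = re.sub(r'@\w+', '', text)             # remove user handles (@...)
    text = re.sub(r'[^a-zA-Z#]', ' ', text)      # keep letters and '#', replace everything else with a space
    return ' '.join(w for w in text.split() if len(w) >= 4)  # drop words shorter than 4 characters

raw['tidy_tweet'] = raw['tweet'].apply(clean_tweet)
```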

  1. Remove stop words and normalize the text.

    We will use the stopwords collection and SnowballStemmer from nltk for this task. Before doing so, we need to tokenize the tweets. Tokens are individual terms or words, and tokenization simply splits a string of text into tokens. You can use str.split() on individual text, apply a simple lambda function to the series raw['tidy_tweet'], and save the result into tokenized_tweet. A sketch covering these steps appears after the last step of this task.

    Check out some methods for the built-in type str here.

  1. Extract stop words and remove them from the tokens.

    Note: depending on the task / industry, it is highly recommended that one curate a custom stop word list.

  1. Create a new instance of a language-specific SnowballStemmer with the language set to "english"; see how to.
  1. Lastly, let's stitch these tokens in tokenized_tweet back together and save them in raw['tidy_tweet']. Use str.join() and apply.
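A sketch of the tokenize / stop word removal / stemming / re-join steps above, assuming the default English stop word list from nltk:

```python
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

# tokenize, drop stop words, stem, then stitch the tokens back together
tokenized_tweet = raw['tidy_tweet'].apply(lambda t: t.split())
tokenized_tweet = tokenized_tweet.apply(
    lambda tokens: [stemmer.stem(w) for w in tokens if w not in stop_words])
raw['tidy_tweet'] = tokenized_tweet.apply(lambda tokens: ' '.join(tokens))
```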

Task 2. Wordcloud and Hashtag

In this task, we want to gain a general idea of what the common words were and how hashtags were used in tweets. We will create wordclouds and extract the top hashtags used in each label.

  1. Before doing so, to guard against possible data leakage, split raw['tidy_tweet'] (and the corresponding labels) into training and test datasets in a stratified fashion; set the test size to 0.25 and the random state to 42.

    Save the results into X_train, X_test, y_train, and y_test.
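A sketch of the stratified split, assuming the label column is named label:

```python
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    raw['tidy_tweet'], raw['label'],
    test_size=0.25, random_state=42, stratify=raw['label'])
```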

  1. A word cloud is a cluster of words depicted in different sizes. The bigger a word appears, the more often it occurs in the given text. It can offer an easy visual presentation that reveals the theme of a topic.

    Function plot_wordcloud is provided to plot the 50 most frequent words from the given text in the shape of Twitter's logo. You may need to replace the image path accordingly.

  1. Visualize the wordcloud.

    Note that the function expects one long string. Stitch together all tidy tweets from the training set and save the single string to all_words, then visualize the wordcloud for all the words.

  1. Visualize the wordcloud just for the text from the tweets identified as hate speech.

    Similarly, you need to stitch together all the tidy tweets in the training set that were identified as hate speech. Save the long string to negative_words.

  1. Hashtags are a defining feature of tweets, and we would like to inspect whether they provide information for our classification task.

    Function hashtag_extract is provided to extract hashtags from an iterable (list or series) and return the hashtags in a list.

  1. Extract hashtags from non-hate speech tweets and save them to HT_regular.
  1. Now extract hashtags from hate speech tweets and save them to HT_negative.
  1. Both HT_regular and HT_negative are nested lists; use the following trick to un-nest (flatten) both lists.
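For example, one common flattening idiom (the trick provided in the notebook may differ):

```python
# un-nest a list of lists into a single flat list
HT_regular = sum(HT_regular, [])
HT_negative = sum(HT_negative, [])
```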
  1. Complete the function top_hashtags that takes a list of hashtags and returns the top n hashtag keywords and their frequencies.
  1. Apply the function to the hashtag lists from the non-hate speech tweets and the hate speech tweets.
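A possible completion of top_hashtags and its application, using collections.Counter:

```python
from collections import Counter

def top_hashtags(hashtags, n=10):
    # return the n most frequent hashtags with their counts
    return Counter(hashtags).most_common(n)

print(top_hashtags(HT_regular, 10))
print(top_hashtags(HT_negative, 10))
```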
  1. Discuss: do these hashtags make sense? Should we include them as features, or should we strip the # before tokenizing (that is, treat "#love" the same as "love")? Why or why not?

       YOUR ANSWER HERE

Task 3. Features

Note that almost all machine learning related Python modules expect a numerical representation of the data; thus we need to transform our text first. We will experiment with bag of words, TF-IDF, and Word2Vec.

  1. Convert the collection of text documents to a matrix of token counts.

    Check the official documentation.

    Create an instance of CountVectorizer named bow_vectorizer with max_features set to MAX_FEATURES. Learn the vocabulary dictionary, return the document-term matrix, and save it to bow_train. Use .fit_transform.

  1. Print the first three rows from bow_train. Hint: .toarray().
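A sketch of the bag-of-words features; the value of MAX_FEATURES is an assumption (use the constant defined in the notebook):

```python
from sklearn.feature_extraction.text import CountVectorizer

MAX_FEATURES = 1000  # assumed value

bow_vectorizer = CountVectorizer(max_features=MAX_FEATURES)
bow_train = bow_vectorizer.fit_transform(X_train)   # learn the vocabulary, return the document-term matrix

print(bow_train[:3].toarray())                      # first three rows as a dense array
```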
  1. Similarly, convert the collection of text documents to a matrix of TF-IDF features.

    Create an instance of TfidfVectorizer named tfidf_vectorizer, set max_features to be MAX_FEATURES.

    Learn the vocabulary and IDF, return the document-term matrix, and save it to tfidf_train.
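Similarly, a sketch for the TF-IDF features:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)   # learn the vocabulary and IDF, return the document-term matrix
```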

  1. Extract word embedding using Word2Vec. We will use gensim for this task.

    The Word2Vec model takes either a list of lists of tokens or an iterable that streams the sentences directly from disk/network. Here, we tokenize the tidy tweets in X_train and save the list (pd.Series) of lists of tokens to tokenized_tweet.

  1. Import Word2Vec from gensim.models; see doc.

    Create a skip-gram Word2Vec instance named w2v that learns on tokenized_tweet, with vector_size set to MAX_FEATURES; the other parameters are provided.

  1. Train the skip-gram model, set the epochs at 20.
  1. Let's see how the model performs. Specify a word (e.g., 'dinner' or 'trump') and print out the 10 most similar words from our tweets in the training set. Use most_similar. Hint: print the type of w2v and w2v.wv.
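A sketch of the Word2Vec steps above; window, min_count, workers, and seed are illustrative choices, not the provided parameters:

```python
from gensim.models import Word2Vec

# tokenize the tidy tweets in the training set
tokenized_tweet = X_train.apply(lambda t: t.split())

# skip-gram model (sg=1)
w2v = Word2Vec(vector_size=MAX_FEATURES, sg=1, window=5, min_count=2, workers=2, seed=42)
w2v.build_vocab(tokenized_tweet)
w2v.train(tokenized_tweet, total_examples=w2v.corpus_count, epochs=20)

print(type(w2v), type(w2v.wv))                  # the model vs. the KeyedVectors it wraps
print(w2v.wv.most_similar('dinner', topn=10))   # assumes 'dinner' survived preprocessing and min_count
```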
  1. Discuss: how does it calculate the similarities?

    It uses cosine similarity. Cosine similarity measures the similarity between two vectors of an inner product space. It is the cosine of the angle between the two vectors and indicates whether they point in roughly the same direction.
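For instance, a minimal sketch of the formula applied to two learned vectors:

```python
import numpy as np

def cosine_similarity(a, b):
    # cos(theta) = (a . b) / (||a|| * ||b||)
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# assumes both words are in the learned vocabulary
print(cosine_similarity(w2v.wv['dinner'], w2v.wv['lunch']))
```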

  1. Discuss: do you think Word2Vec is supervised or unsupervised?

    Word2Vec is an unsupervised learning technique that can generate vectors of features that can then be clustered.

  1. Engineer features.

    For each tweet, we calculate the average of its word embeddings (function word_vector) and then apply this to every tidy tweet in X_train (function tokens_to_array). Both functions are provided; inspect the code and save the features in w2v_train.
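A rough sketch of what the provided helpers might look like; the actual implementations in the notebook may differ:

```python
import numpy as np

def word_vector(tokens, size):
    # average the embeddings of the tokens that are in the Word2Vec vocabulary
    vec, count = np.zeros(size), 0
    for word in tokens:
        if word in w2v.wv:
            vec += w2v.wv[word]
            count += 1
    return vec / count if count else vec

def tokens_to_array(tweets, size):
    return np.vstack([word_vector(tweet.split(), size) for tweet in tweets])

w2v_train = tokens_to_array(X_train, MAX_FEATURES)
```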

  1. Prepare test data before modeling for each approach:
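For example, transforming the held-out tweets with the already-fitted vectorizers (never refit on test data):

```python
bow_test = bow_vectorizer.transform(X_test)
tfidf_test = tfidf_vectorizer.transform(X_test)
w2v_test = tokens_to_array(X_test, MAX_FEATURES)
```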

Task 4. Naive Bayes classifiers

In this task, you will build naive Bayes classifiers (another ref) to identify the hate speech tweets using the different sets of features from the last task, and evaluate their performance.

In the era of deep learning, naive Bayes classifiers remain useful due to their simplicity and reasonable performance, especially when there is not much training data available. A common interview question is "Why is naive Bayes naive?".

We will use the multivariate Bernoulli naive Bayes BernoulliNB; try other flavors of naive Bayes if time permits. The code is straightforward; a combined sketch follows the training steps below.

  1. Import BernoulliNB for modeling and classification_report for reporting performance.
  1. Create an instance of BernoulliNB named BNBmodel.

    We can use it for all three feature sets.

  1. Train the multivariate Bernoulli naive Bayes using bag of words features and print the performance report.
  1. Similarly, train the model using tf-idf features and print the performance report.

    Is the performance expected? Why or why not?

  1. Finally, train the model using Word2Vec embeddings and report the performance.
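A combined sketch of this task, reusing the feature names from the earlier sketches (note that BernoulliNB binarizes the continuous Word2Vec features at 0 by default):

```python
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report

BNBmodel = BernoulliNB()

# refit the same estimator on each feature set and report test performance
for name, (features_train, features_test) in {
        'bag of words': (bow_train, bow_test),
        'tf-idf': (tfidf_train, tfidf_test),
        'word2vec (averaged embeddings)': (w2v_train, w2v_test)}.items():
    BNBmodel.fit(features_train, y_train)
    print(name)
    print(classification_report(y_test, BNBmodel.predict(features_test)))
```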
  1. Discuss the differences in performance using TF-IDF vs. skip-gram embeddings.

    YOUR ANSWER HERE

  1. Examine a few tweets where the model(s) failed. What other features would you include in the next iteration?

    YOUR ANSWER HERE

Task 5. Bidirectional LSTM

In this task, you will build a bidirectional LSTM (BiLSTM) model to detect tweets identified as hate speech, and visualize the embedding layer using the TensorBoard projector.

Why BiLSTM? An LSTM, at its core, preserves information from inputs that have already passed through it using the hidden state. A unidirectional LSTM only preserves information from the past because the only inputs it has seen are from the past. A BiLSTM runs inputs in both directions, one from past to future and one from future to past, and shows very good results because it can understand context better (ref).

  1. Tokenizing and padding.

    As the LSTM expects every sequence to be of the same length, in addition to using Tokenizer with a vocabulary size of VOCAB_SIZE, we need to pad shorter tweets with 0s until their length is MAX_LEN and truncate longer tweets to be exactly MAX_LEN long.

    Function tokenize_pad_sequences is provided, except that you need to supply the correct num_words and filters; do NOT filter out #.

    We feed the processed tidy_tweet to tokenize_pad_sequences, but one could also perform the preprocessing steps in Tokenizer and apply it directly to the raw tweets.
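A sketch of what tokenize_pad_sequences might look like; VOCAB_SIZE, MAX_LEN, the 'post' padding/truncating choices, and returning the tokenizer are assumptions, and the provided function may differ:

```python
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

VOCAB_SIZE = 5000   # assumed values; use the constants defined in the notebook
MAX_LEN = 50

def tokenize_pad_sequences(texts, num_words=VOCAB_SIZE, max_len=MAX_LEN):
    # note: '#' is removed from Keras' default filter list so hashtags survive
    tokenizer = Tokenizer(num_words=num_words,
                          filters='!"$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post'), tokenizer

X, tokenizer = tokenize_pad_sequences(raw['tidy_tweet'])
```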

  1. Let's split X into training and test datasets, saving 25% for testing. Then split the training dataset into training and validation datasets, with 20% for validation. Set random_state to 42 for both. Both splits should be stratified.
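A sketch of the two stratified splits, assuming y comes from raw['label']:

```python
from sklearn.model_selection import train_test_split

y = raw['label'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.20, random_state=42, stratify=y_train)
```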
  1. Now build a sequential model:

    • an embedding layer
    • a bidirectional LSTM with 32 units and set return_sequences=True in LSTM
    • a global average pooling operation for temporal data
    • a dropout layer with 20% rate
    • a dense layer of 32 units and set the activation function to be ReLU
    • a dense layer of 1 unit and set the proper activation function for classification
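A sketch of the architecture just described; EMBEDDING_DIM is an assumed size, not a value given in the lab:

```python
import tensorflow as tf
from tensorflow.keras import layers

EMBEDDING_DIM = 64   # assumed embedding size

model = tf.keras.Sequential([
    layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM),
    layers.Bidirectional(layers.LSTM(32, return_sequences=True)),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid'),   # binary classification
])
```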
  1. Compile the model.

    Fill in a proper loss function and use adam as the optimizer. For metrics, include precision and recall in addition to accuracy.
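For example:

```python
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy',
                       tf.keras.metrics.Precision(name='precision'),
                       tf.keras.metrics.Recall(name='recall')])
```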

  1. Train the model for 10 epochs on training dataset with a validation set.
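A sketch of the training call; the batch size is an assumption:

```python
history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=10, batch_size=32)
```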
  1. Function plot_graphs is provided below to visualize how the performance of the model progresses as a function of the epoch.

    Visualize accuracy and loss.

  1. The model starts to overfit in a couple of epochs. Consider using early stopping to stop training when a monitored metric has stopped improving.

    What can we do to tame overfitting?

    Options include using cross-validation, training with more data, adding regularization (e.g., dropout), or stopping earlier (as suggested).
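For example, early stopping with Keras (the patience and the monitored metric are illustrative choices):

```python
from tensorflow.keras.callbacks import EarlyStopping

# stop once validation loss stops improving and keep the best weights seen
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                    epochs=10, batch_size=32, callbacks=[early_stop])
```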
  1. Print the classification report of the model on test dataset.
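For example, thresholding the predicted probabilities at 0.5:

```python
from sklearn.metrics import classification_report

y_pred = (model.predict(X_test) >= 0.5).astype(int).ravel()
print(classification_report(y_test, y_pred))
```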
  1. Discuss: how does the BiLSTM model improve the classification over naive Bayes?

    YOUR ANSWER HERE

  1. Visualize the embedding using the Embedding Projector in TensorBoard. The setup for TensorBoard can be tricky; most of the code is provided (a combined sketch follows the final step of this task).

    TensorBoard reads tensors and metadata from the logs of your TensorFlow projects. The path to the log directory is specified with log_dir below.

    In order to load the data into TensorBoard, we need to save a training checkpoint to that directory, along with metadata that allows for visualization of a specific layer of interest in the model.

    Load the TensorBoard notebook extension and import projector from tensorboard.plugins.

  1. Clear any logs from previous runs, if any.
  1. Set up a logs directory, so TensorBoard knows where to look for data.
  1. Save the first VOCAB_SIZE most frequent words in the vocabulary as metadata.tsv.
  1. Save the weights we want to analyze as a variable. Note that the first value represents any unknown word, which is not in the metadata, so we will remove it here.
  1. Create a checkpoint from the embedding; the filename and key are the name of the tensor.
  1. Set up config.
  1. The name of the tensor will be suffixed by /.ATTRIBUTES/VARIABLE_VALUE.
  1. Verify that the following files exist under the current directory.
  1. Now run TensorBoard on the log data we just saved.

    You may need to run this cell twice to see the projector correctly. Use Chrome for the least friction.
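A combined sketch of the projector setup above, meant for Colab cells (the %-magics are notebook commands) and closely following the standard TensorBoard embedding-projector recipe; the log directory name and the assumption that the embedding layer is model.layers[0] are mine, not the lab's:

```python
%load_ext tensorboard

import os
import shutil
import tensorflow as tf
from tensorboard.plugins import projector

# clear old logs and set up a fresh log directory
log_dir = 'logs/embedding'
shutil.rmtree('logs', ignore_errors=True)
os.makedirs(log_dir, exist_ok=True)

# embedding weights; row 0 corresponds to padding / unknown words and has no metadata entry
weights = model.layers[0].get_weights()[0]

# write one metadata line per remaining embedding row (most frequent words first)
words = [w for w, i in sorted(tokenizer.word_index.items(), key=lambda kv: kv[1]) if i < weights.shape[0]]
with open(os.path.join(log_dir, 'metadata.tsv'), 'w') as f:
    f.write('\n'.join(words) + '\n')

# save the weights (minus row 0) as a checkpointed variable named "embedding"
embedding_var = tf.Variable(weights[1:])
checkpoint = tf.train.Checkpoint(embedding=embedding_var)
checkpoint.save(os.path.join(log_dir, 'embedding.ckpt'))

# point the projector config at the checkpointed tensor and the metadata file
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = 'embedding/.ATTRIBUTES/VARIABLE_VALUE'
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)

%tensorboard --logdir logs/embedding
```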

The TensorBoard Projector can be a great tool for interpreting and visualizing embeddings. The dashboard allows users to search for specific terms, and highlights words that are adjacent to each other in the embedding (low-dimensional) space. Try a few words in the Search box and see if the embeddings make sense.

Task 6. Interpretation

Lastly, let's try to understand the predictions made by the BiLSTM using a model-agnostic approach -- Local Interpretable Model-agnostic Explanations (LIME).

  1. Import LimeTextExplainer from the lime_text module in the lime package.
  1. Create an instance of LimeTextExplainer and call it explainer.
  1. The method explain_instance expects classifier_fn to be a function; we provide the function predict_proba below.
  1. Read about explain_instance.

    Create an instance named exp to explain the tidy tweet at index 16399 of the original dataset, i.e., raw.tidy_tweet.iloc[16399].
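A sketch of the LIME pieces, reusing the tokenizer and MAX_LEN from the Task 5 sketch; the provided predict_proba may differ in details:

```python
import numpy as np
from lime.lime_text import LimeTextExplainer

explainer = LimeTextExplainer(class_names=['not hate speech', 'hate speech'])

def predict_proba(texts):
    # LIME expects an (n_samples, n_classes) array of class probabilities
    seqs = pad_sequences(tokenizer.texts_to_sequences(texts),
                         maxlen=MAX_LEN, padding='post', truncating='post')
    p_pos = model.predict(seqs)            # probability of the positive (hate speech) class
    return np.hstack([1 - p_pos, p_pos])

exp = explainer.explain_instance(raw.tidy_tweet.iloc[16399], predict_proba, num_features=10)
exp.show_in_notebook()
```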

  1. Pick another random tweet and generate explanations for the prediction.
  1. Jot down your observations from explaining the model.

    YOUR ANSWER HERE

Acknowledgement & Reference

Answers to additional questions

How does the Naive Bayes Classifier work? What is Posterior Probability?

The naive Bayes classifier uses Bayes' theorem to predict the probability that a data point belongs to each class. The posterior probability is the probability of a class given the observed data; Bayes' theorem computes it from the prior probability of the class and the likelihood of the data under that class.

What is the difference between stemming and lemmatization in NLP?

Stemming simply removes characters from the ends of words, which can lead to incorrect meanings (e.g., univers for both universal and universe). Lemmatization converts words to a base form while considering the context, which allows the meaning of the words to be kept with the base form. It is interesting to note that when the same word is used in different contexts, and therefore has different meanings, it can have different lemmas.
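A small nltk illustration of the difference; it assumes the WordNet data has been downloaded:

```python
import nltk
from nltk.stem import SnowballStemmer, WordNetLemmatizer

nltk.download('wordnet')

stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

print(stemmer.stem('universal'), stemmer.stem('universe'))   # univers univers
print(lemmatizer.lemmatize('leaves', pos='n'))               # leaf (noun context)
print(lemmatizer.lemmatize('leaves', pos='v'))               # leave (verb context)
```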

What is Word2Vec and how does it work?

Word2Vec is a predictive, neural network based model that attempts to capture the relationships between words based on their co-occurrences within a body of text. It works by converting words into vector representations. Mathematical functions, such as cosine similarity, can be applied to the word vectors to measure their similarity.

When to use GRU over LSTM?

GRU stands for gated recurrent unit and LSTM stands for long short-term memory; both are types of recurrent neural networks. GRUs should be used when you have limited computing resources or need answers faster. LSTMs tend to be more accurate, especially on larger/longer sequences. The main differences are that the GRU does not have an output gate and that its input and forget gates are combined into a single update gate.