By Dr. Michael Fire
For this lecture, we are going to use the Kaggle, TuriCreate, Gensim, pyLDAvis, spaCy, NLTK, Plotly Express, and Afinn packages. Let's set them up:
!pip install turicreate
!pip install kaggle
!pip install gensim
!pip install pyLDAvis
!pip install spaCy
!pip install afinn
!pip install nltk
!pip install plotly_express
import nltk
nltk.download('stopwords')
nltk.download('punkt')
!python -m spacy download en_core_web_lg # Important! You need to restart the runtime after installing
# Setting up the Kaggle & TuriCreate packages
import json
import os
!mkdir /root/.kaggle/
# Installing the Kaggle package
# Important Note: complete this with your own key - after running this for the first time, remember to **remove** your API key
api_token = {"username":"<Insert Your Kaggle User Name>","key":"<Insert Your Kaggle API key>"}
# creating kaggle.json file with the personal API-Key details
# You can also put this file on your Google Drive
with open('/root/.kaggle/kaggle.json', 'w') as file:
json.dump(api_token, file)
!chmod 600 /root/.kaggle/kaggle.json
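As the comment above mentions, instead of pasting the key into the notebook you can keep kaggle.json on your Google Drive and copy it in. A minimal sketch for Colab (the Drive path below is only an example and needs to match wherever you actually saved the file):
# Alternative: copy kaggle.json from Google Drive instead of hard-coding the key (Colab only)
from google.colab import drive
drive.mount('/content/drive')
# assuming kaggle.json was saved at the top level of "My Drive" - adjust the path as needed
!cp "/content/drive/My Drive/kaggle.json" /root/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json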
In this example, we are going to use the methods we learned to create a fake news classifier. We will use the Fake News Dataset. First, let's load the dataset into an SFrame object:
!mkdir ./datasets
!mkdir ./datasets/fake-news
# download the dataset from Kaggle and unzip it
!kaggle datasets download jruvika/fake-news-detection -p ./datasets/fake-news
!unzip ./datasets/fake-news/*.zip -d ./datasets/fake-news/
import turicreate as tc
%matplotlib inline
fake_news_dataset_path = "./datasets/fake-news/data.csv"
sf = tc.SFrame.read_csv(fake_news_dataset_path)
sf
sf['full_text'] = sf.apply(lambda r: r['Headline'] + "\n\n" + r['Body'])
sf
Let's use TuriCreate to create topic models for the unreliable news:
import turicreate as tc
from nltk.corpus import stopwords
from nltk.stem.porter import *
from functools import lru_cache
from collections import Counter
from nltk.tokenize import word_tokenize
import nltk
stop_words_set = set(stopwords.words("english"))
stemmer = PorterStemmer()
# Using caching for faster performance
@lru_cache(maxsize=None)
def word_stemming(w):
return stemmer.stem(w)
def skip_word(w):
if len(w) <2:
return True
if w.isdigit():
return True
if w in stop_words_set or stemmer.stem(w) in stop_words_set:
return True
return False
def text_to_bow(text):
text = text.lower()
l = [word_stemming(w) for w in word_tokenize(text) if not skip_word(w) ]
l = [w for w in l if not skip_word(w)]
d = Counter(l)
return dict(d)
f_sf = sf[sf['Label'] == 1]
bow_list = []
for t in f_sf['Headline']:
bow_list.append(text_to_bow(t))
f_sf['bow'] = bow_list
bow_list = []
for t in f_sf['full_text']:
bow_list.append(text_to_bow(t))
f_sf['full_bow'] = bow_list
f_sf.materialize()
docs = f_sf['bow']
docs[:2]
topic_model = tc.topic_model.create(docs, num_topics=100)
topic_model.get_topics().print_rows(200)
Let's use BM25 to find the items most relevant to different queries, such as Trump, Obama, and Brexit.
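Roughly, BM25 scores a document for a query by summing, over the query terms, an IDF weight multiplied by a saturating, length-normalized term-frequency factor. Here is a minimal sketch of that scoring idea, for intuition only; it is not TuriCreate's exact implementation, and the helper below (with its doc_bow, doc_freqs, n_docs, and avgdl parameters) is hypothetical:
# A rough sketch of Okapi BM25 scoring (not TuriCreate's exact implementation)
import math

def bm25_score(query_terms, doc_bow, doc_freqs, n_docs, avgdl, k1=1.2, b=0.75):
    # score one bag-of-words document against a list of query terms
    doc_len = sum(doc_bow.values())
    score = 0.0
    for t in query_terms:
        tf = doc_bow.get(t, 0)                                # term frequency in the document
        df = doc_freqs.get(t, 0)                              # number of documents containing the term
        idf = math.log((n_docs - df + 0.5) / (df + 0.5) + 1)  # smoothed inverse document frequency
        score += idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avgdl))
    return score
With that intuition in mind, let's apply TuriCreate's built-in bm25 to our bag-of-words column: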
tc.text_analytics.bm25(f_sf['bow'], ['trump', 'obama']).sort('bm25', ascending=False)
f_sf[945]['Headline']
f_sf[358]['Headline']
tc.text_analytics.bm25(f_sf['bow'], ['brexit']).sort('bm25', ascending=False)
f_sf[1323]['Headline']
Let's find the most common people/organizations/locations in the texts:
import spacy
from tqdm import tqdm
nlp = spacy.load('en_core_web_lg')
def get_entites_from_text(text):
entities_dict= {}
#using spaCy to get entities
doc = nlp(text)
for entity in doc.ents:
label = entity.label_
if label not in entities_dict:
entities_dict[label] = set()
entities_dict[label].add(entity.text)
return entities_dict
l =[]
for i in tqdm(range(len(sf['full_text']))):
t = sf[i]['full_text']
l.append(get_entites_from_text(t))
sf['entities_dict'] = l
f_sf = sf[sf['Label'] == 1]
f_sf
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from collections import Counter
%matplotlib inline
def draw_word_cloud(words_list, min_times=10):
stopwords = set(STOPWORDS)
stopwords_parts = {"'s", " ' s'", " `s" }
wordcloud = WordCloud(width = 800, height = 800,
background_color ='white',
stopwords = stopwords,
min_font_size = 10)
def skip_entity(e):
if e in stopwords:
return True
for p in stopwords_parts:
if p in e:
return True
return False
c = Counter(words_list)
# keep only the entities that appear frequently enough
d = {k:v for k,v in dict(c).items() if v > min_times and not skip_entity(k)}
wordcloud.generate_from_frequencies(d)
plt.figure(figsize = (20, 20), facecolor = None)
plt.imshow(wordcloud)
find_most_common_person = []
for d in f_sf['entities_dict']:
if 'PERSON' in d:
find_most_common_person += d['PERSON']
draw_word_cloud(find_most_common_person, min_times=20)
find_most_common_location = []
for d in f_sf['entities_dict']:
if 'LOC' in d:
find_most_common_location += d['LOC']
draw_word_cloud(find_most_common_location, min_times=10)
find_most_common_event = []
for d in f_sf['entities_dict']:
if 'EVENT' in d:
find_most_common_event += d['EVENT']
draw_word_cloud(find_most_common_event, min_times=10)
find_most_common_gpe = []
for d in f_sf['entities_dict']:
if 'GPE' in d:
find_most_common_gpe += d['GPE']
draw_word_cloud(find_most_common_gpe, min_times=20)
find_most_common_fac = []
for d in f_sf['entities_dict']:
if 'FAC' in d:
find_most_common_fac += d['FAC']
draw_word_cloud(find_most_common_fac, min_times=4)
tc.text_analytics.bm25(f_sf['full_text'], ['world', 'war', 'iii' ]).sort('bm25', ascending=False)
f_sf[695]['full_text']
Let's create a classifier which predicts whether a text item is fake or not.
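As features we will use bag-of-words counts and their TF-IDF weights. TF-IDF down-weights words that appear in many documents, so very common words contribute less to the classifier. Here is a rough sketch of the idea on a toy corpus (TuriCreate's exact weighting may differ):
# A rough sketch of TF-IDF weighting on a toy corpus (TuriCreate's exact formula may differ)
import math
from collections import Counter

toy_docs = [["fake", "news", "news"], ["real", "news"]]
doc_freq = Counter(t for doc in toy_docs for t in set(doc))   # in how many documents each term appears
n_docs = len(toy_docs)

def toy_tf_idf(doc):
    tf = Counter(doc)
    return {t: tf[t] * math.log(n_docs / doc_freq[t]) for t in tf}

print(toy_tf_idf(toy_docs[0]))   # "news" appears in every document, so its weight is 0
Now let's build the features and train the classifier: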
sf['bow'] = tc.text_analytics.count_words(sf['full_text'])
sf['bow'] = sf['bow'].apply(lambda d: {k:v for k,v in d.items() if v > 1})
sf['tfidf'] = tc.text_analytics.tf_idf(sf['bow'])
train, test = sf.random_split(0.8)
cls = tc.classifier.create(train, features=["bow", "tfidf"], target="Label")
cls.evaluate(test)
# Let's also compute a spaCy document vector for each item and compare classifiers trained with and without it
l = []
for t in tqdm(sf['full_text']):
l.append(nlp(t).vector)
sf['vector'] = l
train, test = sf.random_split(0.8)
cls1 = tc.classifier.create(train, features=["bow", "tfidf"], target="Label")
cls1.evaluate(test)
cls2 = tc.classifier.create(train, features=["vector"], target="Label")
cls2.evaluate(test)
In this example, we will analyze SMS texts and try to predict if they are spam or not. Throughout this example we will use the SMS Spam Collection Dataset. Let's load this data into an SFrame object:
!mkdir ./datasets
!mkdir ./datasets/sms-spam
# download the dataset from Kaggle and unzip it
!kaggle datasets download uciml/sms-spam-collection-dataset -p ./datasets/sms-spam
!unzip ./datasets/sms-spam/*.zip -d ./datasets/sms-spam/
import pandas as pd
path = "./datasets/sms-spam/spam.csv"
# the raw file is not UTF-8 encoded, so we read it with the latin-1 encoding
df = pd.read_csv(path, encoding='latin-1')
df
import turicreate as tc
sf = tc.SFrame(df[['v1', 'v2']])
sf = sf.rename({'v1':'class', 'v2':'text'})
sf
Let's explore the data a little before constructing a classifier:
import seaborn as sns
%matplotlib inline
sns.set()
sf['length'] = sf['text'].apply(lambda t: len(t))
sns.distplot(sf[sf['class'] == 'ham']['length'], axlabel="Text Length (Chars)", color='g')
sns.distplot(sf[sf['class'] == 'spam']['length'], axlabel="Text Length (Chars)", color='r')
sf['words_num'] = sf['text'].apply(lambda t: len(t.split()))
sns.distplot(sf[sf['class'] == 'ham']['words_num'], axlabel="Text Length (Words)", color='g')
sns.distplot(sf[sf['class'] == 'spam']['words_num'], axlabel="Text Length (Words)", color='r')
Let's find the most common 2-grams in the text:
from collections import Counter
from nltk.tokenize import word_tokenize
def get_most_common_bigrams(txt):
words = word_tokenize(txt)
bigrams = [f"{w1} {w2}" for w1,w2 in zip(words, words[1:])]
c = Counter(bigrams)
return c
txt = "\n".join(sf['text'])
c = get_most_common_bigrams(txt)
c.most_common(20)
from wordcloud import WordCloud
import matplotlib.pyplot as plt
def draw_sms_words_cloud(d):
wordcloud = WordCloud(width = 800, height = 800,
background_color ='white',
min_font_size = 10)
wordcloud.generate_from_frequencies(d)
plt.figure(figsize = (20, 20), facecolor = None)
plt.imshow(wordcloud)
draw_sms_words_cloud(dict(c))
txt = "\n".join(sf[sf['class'] == 'ham']['text'])
c = get_most_common_bigrams(txt)
c.most_common(20)
draw_sms_words_cloud(dict(c))
txt = "\n".join(sf[sf['class'] == 'spam']['text'])
c = get_most_common_bigrams(txt)
c.most_common(20)
draw_sms_words_cloud(dict(c))
In this dataset, the differences between spam and ham messages are obvious right away from the above figures: spam messages are on average longer and tend to call for action. Let's create a classifier which can predict if a text is spam or ham.
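Before training, we can quantify the length difference with a quick aggregation over the length columns we computed earlier (a small sanity check of the claim above):
import turicreate.aggregate as agg
# average message length per class - spam should come out noticeably longer
sf.groupby('class', {'avg_chars': agg.AVG('length'), 'avg_words': agg.AVG('words_num')})
Now, on to the classifier features: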
sf['1grams-words'] = tc.text_analytics.count_ngrams(sf['text'], n=1, method="word")
sf['2grams-words'] = tc.text_analytics.count_ngrams(sf['text'], n=2, method="word")
sf['1grams-chars'] = tc.text_analytics.count_ngrams(sf['text'], n=1, method="character")
sf['2grams-chars'] = tc.text_analytics.count_ngrams(sf['text'], n=2, method="character")
train,test = sf.random_split(0.8)
cls1 = tc.classifier.create(train, features=["2grams-words"], target="class")
cls2 = tc.classifier.create(train, features=["2grams-chars"], target="class")
cls3 = tc.classifier.create(train, features=["2grams-chars", "1grams-chars", "2grams-words", "1grams-words"], target="class")
cls1.evaluate(test)
cls2.evaluate(test)
cls3.evaluate(test)
Let's look at the spam SMS messages that were classified as ham:
cls3.classify(test)
test['predicted_prob'] = cls3.classify(test)['probability']
test['predicted_class'] = cls3.classify(test)['class']
spam_test =test[test['class'] == 'spam']
l = list(spam_test[spam_test['predicted_class'] == 'ham']['text'])
print(len(l))
for m in l:
print(f"{m}\n\n")
In this example, we will develop a simple classifier which predicts a movie's genre from its Wikipedia plot summary. To achieve this, we will use the Wikipedia Movie Plots dataset. Let's load the dataset into an SFrame object:
!mkdir ./datasets
!mkdir ./datasets/movie-plots
# download the dataset from Kaggle and unzip it
!kaggle datasets download jrobischon/wikipedia-movie-plots -p ./datasets/movie-plots
!unzip ./datasets/movie-plots/*.zip -d ./datasets/movie-plots/
import pandas as pd
import turicreate as tc
import turicreate.aggregate as agg
path = "./datasets/movie-plots/wiki_movie_plots_deduped.csv"
df = pd.read_csv(path)
sf = tc.SFrame(df[['Title', 'Genre', 'Plot']])
sf
g = sf.groupby('Genre', {'count':agg.COUNT()})
g.sort("count", ascending=False).print_rows(100)
We can see that some of the movies have multiple genres. Let's normalize the data so each row will contain only one genre:
def get_genres(genre):
genre = str(genre).lower().strip()
if genre == 'unknown' or genre == '':
return None
if "," not in genre and "/" not in genre:
return [genre]
l = []
genre = genre.replace(",", "/")
if "/" in genre:
l = genre.split("/")
return [g.strip() for g in l]
sf['GenreNorm'] = sf['Genre'].apply(lambda g: get_genres(g))
sf
sf = sf[sf['GenreNorm'] != None]
sf = sf.stack('GenreNorm', new_column_name='GenreNorm')
sf
g = sf.groupby('GenreNorm', {'count':agg.COUNT()})
g.sort("count", ascending=False).print_rows(100)
Let's remove all the genres with fewer than 100 movies, and use spaCy to create a vector from each movie's plot:
import spacy
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
sns.set()
genres_set = set(g[g['count'] > 100]['GenreNorm'])
sf = sf[sf['GenreNorm'].apply(lambda g: g in genres_set)]
sf.materialize()
print(f"We are left with {len(genres_set)} geners and {len(sf)} movies")
plt.figure(figsize=(15,8))
g = sf.groupby('GenreNorm', {'count':agg.COUNT()})
g = g.sort("count", ascending=False)
g = g.rename({"GenreNorm": "Genre"})
df = g.to_dataframe()
sns.barplot(y=df['Genre'], x=df["count"], palette="rocket")
from tqdm import tqdm
nlp = spacy.load('en_core_web_lg')
vector_list = []
for plot in tqdm(sf['Plot']):
vector_list.append(nlp(plot).vector)
sf['vector'] = vector_list
train,test = sf.random_split(0.8)
cls = tc.classifier.create(train, features=["vector"], target="GenreNorm")
e = cls.evaluate(test)
e
e['confusion_matrix'].sort('count', ascending=False).print_rows(100)
It is important to remember that each movie can belong to several categories. Therefore, a fully faithful evaluation would treat this as a multilabel problem (or use multilabel classifiers), as sketched below. Nevertheless, we can see that our out-of-the-box single-label classifier still obtains decent results.
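For completeness, here is one way such a multilabel evaluation could look with scikit-learn, assuming we had kept the full list of genres per movie. The X and y_lists variables below are stand-ins (we did not build them above), so treat this as a sketch of the approach rather than a result:
# A minimal multilabel classification sketch with scikit-learn (placeholder data)
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

X = np.random.rand(1000, 300)                        # stand-in for the spaCy plot vectors
y_lists = [["drama"], ["comedy", "romance"]] * 500   # stand-in for the per-movie genre lists

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_lists)                       # one binary indicator column per genre

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
clf = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X_train, Y_train)
print(f1_score(Y_test, clf.predict(X_test), average="micro"))   # micro-averaged F1 over all labels
Back to our single-label results: let's visualize some of them using t-SNE: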
import numpy as np
from sklearn.manifold import TSNE
#Note: hopefully this code is correct...
# The code was inspired from https://nlpforhackers.io/word-embeddings/
X = []
for v in sf['vector']:
X.append(v)
X = np.array(X)
print("Computed X: ", X.shape)
X_embedded = TSNE(n_components=2, n_iter=250, verbose=2).fit_transform(X)
print("Computed t-SNE", X_embedded.shape)
df = pd.DataFrame(columns=['x', 'y', 'Genre'])
df['x'], df['y'], df['Genre'] = X_embedded[:,0], X_embedded[:,1], sf['GenreNorm']
df
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
g_set = set(df['Genre'])
d = dict(zip(g_set, range(len(g_set))))
colors = [d[g] for g in df["Genre"]]
plt.figure(figsize=(20,10))
sns.scatterplot(x="x", y="y", hue="Genre", data=df)
plt.xlim(-0.5,0.8)
plt.ylim(-1,1)
df2 = df[df["Genre"].apply(lambda g: g in {'war', "western", "romance","animation"})]
plt.figure(figsize=(20,10))
sns.scatterplot(x="x", y="y", hue="Genre", data=df2)
Charles John Huffam Dickens (1812–1870) was an English writer who created some of the world's best-known fictional characters, such as Oliver Twist. Dickens is regarded by many as the greatest novelist of the Victorian era. In this example, we are going to analyze Dickens' works using NLP. We will use The Works of Charles Dickens dataset. Let's start by finding the main characters' names in Oliver Twist:
import kaggle
!mkdir ./datasets
!mkdir ./datasets/dickens
# download the dataset from Kaggle and unzip it
!kaggle datasets download fuzzyfroghunter/dickens -p ./datasets/
!unzip ./datasets/dickens.zip -d ./datasets/
import spacy
nlp = spacy.load('en_core_web_lg')
datasets_path = "./datasets/dickens"
oliver_path = f"{datasets_path}/pg730.txt"
def get_entites_dict_from_text(text):
entities_dict= {}
#using spaCy to get entities
doc = nlp(text)
for entity in doc.ents:
label = entity.label_
e = entity.text.lower()
if label not in entities_dict:
entities_dict[label] = {}
if e not in entities_dict[label]:
entities_dict[label][e] = 0
entities_dict[label][e] += 1
return entities_dict
def get_book_entities(path, person_min_times, other_entities_min_times=3):
txt = open(path,"r", encoding="utf8", errors="ignore").read()
txt = txt.replace("\n", " ")
    d = get_entites_dict_from_text(txt)
entities_dict = {}
for k in d.keys():
min_times = other_entities_min_times
if k == "PERSON":
min_times = person_min_times
entity_dict = {k:v for k,v in d[k].items() if v>min_times}
entities_dict[k] = entity_dict
return entities_dict
entities_dict = get_book_entities(oliver_path, 20)
entities_dict
By extracting only the entities, we can learn a lot about the book. Let's create a network among the book's characters, where a link connects two people who appear in the same paragraph:
from tqdm import tqdm
txt = open(oliver_path).read()
paragraphs_list = txt.split("\n\n")
links_dict = {}
def get_persons_links(txt):
links_set = set()
doc = nlp(txt)
l = [entity.text.lower().strip() for entity in doc.ents if entity.label_ == "PERSON"]
for e1 in l:
for e2 in l:
if e1 == e2 or len(e1) < 2 or len(e2)< 2:
continue
if e1 > e2:
e1, e2 = e2, e1 # switch order
links_set.add((e1,e2))
return links_set
links_list = []
for para in tqdm(paragraphs_list):
# for each paragraph each link counts only once
links_list += list(get_persons_links(para))
from collections import Counter
import networkx as nx
c = Counter(links_list)
c.most_common(40)
We have counted the co-appearance links; now let's construct the graph and visualize it:
g = nx.Graph()
for e,count in dict(c).items():
if count < 6:
# only links that appeared at least 6 times
continue
v1,v2 = e
g.add_edge(v1,v2, weight=count)
nx.info(g)
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(20,20))
nx.draw_kamada_kawai(g, with_labels=True)
Without reading the book, we can understand that Oliver Twist is the main character, and we can also visualize his social network with its communities. For example, Mr. Bumble is connected to one part of the network, while Mr. Brownlow is connected to another.
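We can back these observations up with numbers, for example by looking at degree centrality and at the communities networkx finds in the graph (a quick sketch; the exact communities depend on the link-count threshold we chose above):
# the most connected characters, and the communities they form
from networkx.algorithms import community
sorted(nx.degree_centrality(g).items(), key=lambda kv: -kv[1])[:10]
communities = community.greedy_modularity_communities(g)
[sorted(c) for c in communities]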
Let's try to find which of the Dickens books are the most similar to each other. First, let's create a word2vec model using Dickens' texts:
import os
import nltk
from nltk.tokenize import word_tokenize
files = [p for p in os.listdir(datasets_path) if p.endswith(".txt")]
txt = ""
for p in files:
txt += open(f"{datasets_path}/{p}").read()
print(f"Number of chars={len(txt)} and words={len(word_tokenize(txt))}")
import re
import gensim
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
re_words_split = re.compile(r"(\w+)")
def txt2words(s):
s = re.sub("[^a-zA-Z]", " ", s).lower()
return re_words_split.findall(s)
class Sentences(object):
def __init__(self, txt):
self._txt = txt
def __iter__(self):
for s in tokenizer.tokenize(self._txt):
yield txt2words(s)
# We will create a Word2Vec model based on Dickens work
sentences = Sentences(txt)
model = gensim.models.Word2Vec(sentences, size=200, window=5, min_count=3, workers=6)
model.wv.most_similar("oliver")
According to the constructed model, Barnaby and Nicholas are the most similar to Oliver. According to Wikipedia, "Dickens began writing Nickleby while still working on Oliver Twist." Let's calculate the average vector of each book:
import numpy as np
def txt2vector(txt):
words = word_tokenize(txt)
words = [w for w in words if w in model]
if len(words) != 0:
return np.mean([model[w] for w in words], axis=0)
return None
vectors = [txt2vector(open(f"{datasets_path}/{p}").read()) for p in files]
import turicreate as tc
sf = tc.SFrame({'Path': files, 'Vector':vectors})
meta_sf = tc.SFrame.read_csv(f"{datasets_path}/metadata.tsv", delimiter="\t")
sf = sf.join(meta_sf)
sf
meta_sf.print_rows(33)
from sklearn.manifold import TSNE
#Note: hopefully this code is correct...
# The code was inspired from https://nlpforhackers.io/word-embeddings/
X = []
for v in sf['Vector']:
X.append(v)
X = np.array(X)
print("Computed X: ", X.shape)
X_embedded = TSNE(n_components=2, n_iter=250, verbose=2).fit_transform(X)
print("Computed t-SNE", X_embedded.shape)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
df = pd.DataFrame(columns=['x', 'y', 'Title'])
df['x'], df['y'], df['Title'] = X_embedded[:,0], X_embedded[:,1], sf['Title']
g_set = set(df['Title'])
d = dict(zip(g_set, range(len(g_set))))
colors = [d[g] for g in df["Title"]]
plt.figure(figsize=(20,30))
sns.scatterplot(x="x", y="y", data=df)
def label_point(x, y, val, ax):
a = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
for i, point in a.iterrows():
ax.text(point['x']+.02, point['y'], str(point['val']))
label_point(df['x'], df['y'], df['Title'], plt.gca())
From the above results, we can see that Oliver Twist is somewhat of an outlier.
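To get concrete numbers rather than a picture, we can also compute the cosine similarity between the average book vectors directly (a short sketch using the Vector and Title columns we already built above):
# pairwise cosine similarity between the average book vectors
from sklearn.metrics.pairwise import cosine_similarity
sim = cosine_similarity(np.array(list(sf['Vector'])))
titles = list(sf['Title'])
# for each book, print the most similar other book
for i, t in enumerate(titles):
    j = max((k for k in range(len(titles)) if k != i), key=lambda k: sim[i][k])
    print(f"{t} -> {titles[j]} ({sim[i][j]:.2f})")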
In this example, we are going to explore the Oscars speeches over the last 80 years. Let's start by loading the Oscars Speeches dataset into an SFrame object:
import kaggle
!mkdir ./datasets
!mkdir ./datasets/oscar_speech
# download the dataset from Kaggle and unzip it
!kaggle datasets download cerosdotcom/oscars-speeches -p ./datasets/oscar_speech
!unzip ./datasets/oscar_speech/*.zip -d ./datasets/oscar_speech
import turicreate as tc
import turicreate.aggregate as agg
import pandas as pd
%matplotlib inline
oscar_speeces_dataset = "./datasets/oscar_speech/oscar_speech_db.csv"
df = pd.read_csv(oscar_speeces_dataset)
df
import re
from nltk.tokenize import word_tokenize
sf = tc.SFrame(df[['Year', 'Category', 'Speech']])
r_year = re.compile(r"\d{4}")
sf['Year'] = sf['Year'].apply(lambda s: r_year.findall(s))
sf['Year'] = sf['Year'].apply(lambda l: int(l[0]) if len(l) > 0 else None)
sf['Chars Number'] = sf['Speech'].apply(lambda s: len(s))
sf['Words Number'] = sf['Speech'].apply(lambda s: len(word_tokenize(s)))
sf
Now, to better understand the data, let's visualize various speech statistics. For the visualization, we will use the Plotly-Express package:
import seaborn as sns
sns.set()
sns.distplot(sf['Words Number'])
import plotly_express as px
import turicreate.aggregate as agg
g = sf.groupby("Year", {'Words AVG': agg.AVG('Words Number')})
g = g.sort('Year')
px.line(g.to_dataframe(), x="Year", y="Words AVG")
g = sf.groupby("Category", {'count': agg.COUNT()})
g = g[g["count"] >= 40]
selected_categories_set = set(g["Category"])
sf2 = sf[sf['Category'].apply(lambda c: c in selected_categories_set)]
g = sf2.groupby(["Year",'Category'], {'Words AVG': agg.AVG('Words Number')})
g = g.sort(['Year','Category'])
px.line(g.to_dataframe(), x="Year", y="Words AVG", color="Category")
We can see that, on average, speech lengths have increased over time, especially for Honorary Award speeches in recent years. Let's try to find the different topics of the speeches:
docs = tc.text_analytics.count_ngrams(sf['Speech'], n=1, method="word")
docs = docs.dict_trim_by_keys(tc.text_analytics.stop_words(lang='en'), exclude=True)
topic_model = tc.topic_model.create(docs, num_topics=30)
topic_model.get_topics().print_rows(100)
Let's try to create a classifier that predicts whether the winner is male or female based on the speech:
g_sf = sf[sf['Category'].apply(lambda c: 'Actor' in c or 'Actress' in c)]
def get_gender(c):
if 'actor' in c.lower():
return 'Male'
if 'actress' in c.lower():
return 'Female'
return None
g_sf['Gender'] = g_sf['Category'].apply(lambda c: get_gender(c))
sns.distplot(g_sf[g_sf['Gender'] == 'Male']['Chars Number'], axlabel="Speech Length (Chars)", color='g')
sns.distplot(g_sf[g_sf['Gender'] == 'Female']['Chars Number'], axlabel="Speech Length (Chars)", color='r')
g_sf['bow'] = tc.text_analytics.count_words(g_sf['Speech'])
train,test = g_sf.random_split(0.7)
cls = tc.logistic_classifier.create(train, features=['bow'], target='Gender')
cls.evaluate(test)
Let's create a classifier to predict the speech's decade:
import spacy
nlp = spacy.load('en_core_web_lg')
sf['Decade'] = sf['Year'].apply(lambda y: y - y %10)
vectors = []
for s in sf['Speech']:
vectors.append(nlp(s).vector)
sf['Vector'] = vectors
sf = sf.dropna()
train,test = sf.random_split(0.8)
cls = tc.classifier.create(train, features=['Vector'], target='Decade')
e = cls.evaluate(test)
e
e['confusion_matrix'].sort('count', ascending=False)
from sklearn.decomposition import PCA
import numpy as np
#Important Note: This is non-final code that may have mistakes
X = []
for v in sf['Vector']:
X.append(v)
X = np.array(X)
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X)
df = pd.DataFrame(data=principalComponents, columns=['principal component 1', 'principal component 2'])
df
df = pd.concat([df, sf[['Decade']].to_dataframe()], axis = 1)
px.scatter(df, x='principal component 1', y='principal component 2', color="Decade")
pca = PCA(n_components=3)
pcaComp = pca.fit_transform(X)
df = pd.DataFrame(data = pcaComp, columns = ['PCA1', 'PCA2', 'PCA3'])
df = pd.concat([df, sf[['Decade']].to_dataframe()], axis = 1)
px.scatter_3d(df, x="PCA1", y="PCA2",z="PCA3", color="Decade")
In this example, we are going to identify aggressive tweets using the Tweets Dataset for Detection of Cyber-Trolls. Let's start by loading the dataset into an SFrame object:
!mkdir ./datasets
!mkdir ./datasets/trolls
# download the dataset from Kaggle and unzip it
!kaggle datasets download dataturks/dataset-for-detection-of-cybertrolls -p ./datasets/trolls
!unzip ./datasets/trolls/*.zip -d ./datasets/trolls/
import turicreate as tc
import turicreate.aggregate as agg
dataset_path = "./datasets/trolls/Dataset for Detection of Cyber-Trolls.json"
sf = tc.SFrame.read_json(dataset_path, orient="lines")
sf = sf.unpack('annotation')
sf = sf.rename({'annotation.label':'label'})
sf
sf['label'] = sf['label'].apply(lambda l: int(l[0]))
sf.groupby('label', {'count':agg.COUNT()})
Let's find the most common words in aggressive tweets:
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
stop_words_set = set(stopwords.words("english"))
stemmer = PorterStemmer()
from functools import lru_cache
# Using caching for faster performance
@lru_cache(maxsize=None)
def word_stemming(w):
    return stemmer.stem(w)
def skip_word(w):
w = w.lower()
if len(w) <2:
return True
if w.isdigit():
return True
if w in stop_words_set or stemmer.stem(w) in stop_words_set:
return True
return False
txt = "\n".join(sf[sf['label'] == 1]['content'])
l = [w.lower() for w in word_tokenize(txt) if not skip_word(w)]
c = Counter(l)
c.most_common(20)
Let's use spaCy to classify the tweets:
import spacy
from tqdm import tqdm
nlp = spacy.load('en_core_web_lg')
vectors = []
for t in tqdm(sf['content']):
vectors.append(nlp(t).vector)
sf['vector'] = vectors
train,test = sf.random_split(0.8)
cls = tc.classifier.create(train, features=['vector'],target='label')
cls.evaluate(test)
Let's try to improve the results by using pre-trained word vectors from Twitter:
import gensim.downloader as api
#loading Twitter pretrained model
model = api.load("glove-twitter-100") # download the model and return as object ready for use
model.most_similar("cat")
import numpy as np
def txt2vector(txt):
words = word_tokenize(txt)
words = [w for w in words if w in model]
if len(words) != 0:
return np.mean([model[w] for w in words], axis=0)
return None
vectors = []
for txt in sf['content']:
vectors.append(txt2vector(txt))
sf['twitter_vector'] = vectors
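Note that txt2vector returns None when none of a tweet's tokens appear in the GloVe vocabulary, and missing values would trip up the classifiers below. A simple sketch of one way to handle this is to fall back to a zero vector (other imputation strategies are possible):
# fall back to a zero vector for tweets with no tokens in the GloVe vocabulary
dim = model.vector_size
sf['twitter_vector'] = [v if v is not None else np.zeros(dim) for v in vectors]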
Let's use BERT to calculate the word embeddings of the tweets:
!pip install spacy-transformers # we need to install this package before using the transformer models
!python -m spacy download en_trf_bertbaseuncased_lg
import cupy as cp
spacy.require_gpu()
nlp = spacy.load('en_trf_bertbaseuncased_lg')
l = []
for t in tqdm(sf['content']):
l.append(nlp(t).vector)
sf['bert_vector'] = [cp.asnumpy(v) for v in l ]
train,test = sf.random_split(0.8)
cls1 = tc.random_forest_classifier.create(train, features=['vector'],target='label', max_iterations=25)
cls2 = tc.random_forest_classifier.create(train, features=['twitter_vector'],target='label', max_iterations=25)
cls3 = tc.random_forest_classifier.create(train, features=['bert_vector'],target='label', max_iterations=25)
cls4 = tc.random_forest_classifier.create(train, features=['bert_vector', 'vector', 'twitter_vector'],target='label', max_iterations=25)
cls1.evaluate(test)
cls2.evaluate(test)
cls3.evaluate(test)
cls4.evaluate(test)
In this example, we are going to construct an article category classifier based on the article's title. To construct the classifier, we will utilize the News Category Dataset. Let's load the dataset into an SFrame object:
!mkdir ./datasets
!mkdir ./datasets/news
# download the dataset from Kaggle and unzip it
!kaggle datasets download rmisra/news-category-dataset -p ./datasets/news
!unzip ./datasets/news/*.zip -d ./datasets/news/
import turicreate as tc
dataset_path = "./datasets/news/News_Category_Dataset_v2.json"
sf = tc.SFrame.read_json(dataset_path, orient="lines")
sf
Let's learn more about the data by visualizing it:
import turicreate as tc
import turicreate.aggregate as agg
def get_year(s):
try:
return int(s.split("-")[0])
except:
return None
sf['length'] = sf['headline'].apply(lambda l: len(l))
sf['year'] = sf['date'].apply(lambda s: get_year(s))
g = sf.groupby(['year', 'category'], {'Count': agg.COUNT(), 'Avg. Length': agg.AVG('length')})
g
import plotly.express as px
px.scatter(g.to_dataframe(), x="year", y="Avg. Length", color="category", size="Count", size_max=20)
From the above chart, we can see that since 2014 there has been an increase in the average length of titles, as well as a sharp increase in the number of political items. Furthermore, we can observe that the dataset probably doesn't contain all of the 2018 news items. Let's create a classifier that can predict an item's category based on its title:
import gensim.downloader as api
# loading the pretrained Google News word2vec model
model = api.load("word2vec-google-news-300") # download a Google-News word2vec model 1.6GB
model.most_similar("clinton")
import numpy as np
from nltk import word_tokenize
import spacy
from tqdm import tqdm
nlp = spacy.load('en_core_web_lg')
def txt2vector(txt):
txt = txt.lower()
words = word_tokenize(txt)
words = [w for w in words if w in model]
if len(words) != 0:
return np.mean([model[w] for w in words], axis=0)
return None
head_line_vectors = []
vectors = []
for txt in tqdm(sf['headline']):
head_line_vectors.append(txt2vector(txt))
vectors.append(nlp(txt).vector)
sf['headline_vector'] = head_line_vectors
sf['vector'] = vectors
train, test = sf.random_split(0.8)
cls = tc.random_forest_classifier.create(train, features=['vector', 'headline_vector'], target="category")
e = cls.evaluate(test)
e
e['confusion_matrix'].sort('count', ascending=False).print_rows(100)
Let's go back to the fake news dataset from the first example. Now we can use the constructed classifier to predict the category of each news item:
!mkdir ./datasets/fake-news
# download the dataset from Kaggle and unzip it
!kaggle datasets download jruvika/fake-news-detection -p ./datasets/fake-news
!unzip ./datasets/fake-news/*.zip -d ./datasets/fake-news/
import pandas as pd
%matplotlib inline
fake_news_dataset_path = "./datasets/fake-news/data.csv"
df = pd.read_csv(fake_news_dataset_path)
# make sure all headlines are strings (some rows may contain missing values)
df['Headline'] = df['Headline'].apply(lambda t: str(t))
f_sf = tc.SFrame(df[['Headline','Label']])
f_sf
head_line_vectors = []
vectors = []
for txt in tqdm(f_sf['Headline']):
head_line_vectors.append(txt2vector(txt))
vectors.append(nlp(txt).vector)
f_sf['headline_vector'] = head_line_vectors
f_sf['vector'] = vectors
f_sf['Category'] = cls.predict(f_sf)
f_sf
f_sf.groupby(['Label','Category'], {'count': agg.COUNT()}).sort('count', ascending=False)
train, test = f_sf.random_split(0.8)
cls1 = tc.random_forest_classifier.create(train, features=['headline_vector', 'vector'], target='Label', max_iterations=50)
cls1.evaluate(test)
cls2 = tc.random_forest_classifier.create(train, features=['headline_vector', 'vector', 'Category'], target='Label', max_iterations=50)
cls2.evaluate(test)
We can see that performing transfer learning, by adding a predicted category to each news item, may help increase the classifier's performance.