SPAM vs. NOT_SPAM Data Analysis
import pandas as pd
import numpy as np
import matplotlib .pyplot as plt
import matplotlib as matplot
import matplotlib .image as mpimg
import seaborn as sns
% matplotlib inline
from wordcloud import WordCloud
import scattertext as st
import spacy
from IPython .display import HTML
import spacy
from gensim .models import word2vec
from scattertext import SampleCorpora , word_similarity_explorer_gensim , Word2VecFromParsedCorpus
from scattertext .CorpusFromParsedDocuments import CorpusFromParsedDocuments
# Load the e-mail corpus from JSON; the preview rendered below shows the
# columns `body`, `label` and `subject`.
dataset = pd.read_json(path_or_buf="dataset.json")
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
body
label
subject
0
hello , offer fantastic 100 % free access most...
SPAM
Subject: re : free !\n
1
* * * * * * * * * * * * * * * * * * * * * * * ...
SPAM
Subject: bulk email profit\n
2
stock invest interest , please carefully revie...
SPAM
Subject: possible + 900 % stock investment ret...
3
syntax project innovationskolleg " formal mode...
NOT_SPAM
Subject: minus workshop split constituent\n
4
multidisciplinary periodical : call comment * ...
NOT_SPAM
Subject: multidisciplinary periodical : call c...
body False
label False
subject False
dtype: bool
# Class balance: one bar per distinct value of the `label` column.
fig, axs = plt.subplots(ncols=1, figsize=(12, 6))
# Name the column and the frame explicitly: recent seaborn releases no longer
# treat a positionally passed vector as `x`. Passing `ax=axs` draws on the
# axes created above instead of relying on the implicit current axes.
g = sns.countplot(x="label", data=dataset, ax=axs)
plt.tight_layout()
plt.show()
# Build a scattertext corpus from the e-mail bodies, categorised by `label`,
# and export the interactive SPAM vs. NOT_SPAM explorer as a standalone HTML file.
# NOTE(review): `spacy.en.English()` looks like the spaCy 1.x entry point; newer
# spaCy releases load pipelines via `spacy.load(...)` - confirm the installed
# version before changing this line.
nlp = spacy.en.English()
corpus = st.CorpusFromPandas(
    dataset,
    category_col='label',
    text_col='body',
    nlp=nlp,
).build()
html = st.produce_scattertext_explorer(
    corpus,
    category='SPAM',
    category_name='SPAM',
    not_category_name='NOT_SPAM',
    width_in_pixels=1000,
)
# Write through a context manager so the file handle is flushed and closed
# deterministically instead of being left open until garbage collection.
with open("Convention-Visualization.html", 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
# The notebook server crashes while loading the HTML file, so the file was rendered in a browser and a
# snapshot of it was saved; the snapshot is displayed below for visualization purposes only.
# Display the saved PNG snapshot of the explorer page.
snapshot_path = 'Convention-Visualization.png'
img = mpimg.imread(snapshot_path)
# This changes the process-wide default figure size, so figures created
# after this point pick it up as well.
plt.rcParams['figure.figsize'] = (30.0, 10.0)
plt.imshow(img)
plt.show()
# Split the corpus by class. Selecting with the list ["body"] keeps each
# result a one-column DataFrame rather than a Series.
is_spam = dataset["label"] == "SPAM"
is_not_spam = dataset["label"] == "NOT_SPAM"
dataset_spam = dataset.loc[is_spam, ["body"]]
dataset_not_spam = dataset.loc[is_not_spam, ["body"]]
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
body
0
hello , offer fantastic 100 % free access most...
1
* * * * * * * * * * * * * * * * * * * * * * * ...
2
stock invest interest , please carefully revie...
5
locate anyone anywhere usa * * * * * * * old f...
6
hope n't object complete stranger mail , belie...
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
body
3
syntax project innovationskolleg " formal mode...
4
multidisciplinary periodical : call comment * ...
7
inform untimely death jochem schindler , prof ...
9
week ago , post query language moo site . rece...
10
cycorp seek enthusiastic , highly-motivate mul...
def _show_wordcloud(texts):
    """Generate a word cloud from *texts* and display it without axes.

    Args:
        texts: Iterable of strings; the items are joined with single spaces
            to form the text the cloud is generated from.

    Returns:
        The generated ``WordCloud`` instance.
    """
    # str.join accepts any iterable of strings, so no intermediate list is needed.
    cloud = WordCloud(max_font_size=40).generate(' '.join(texts))
    plt.figure()
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    return cloud


# One cloud per class, rendered with identical settings so they are comparable.
wordcloud_spam = _show_wordcloud(dataset_spam['body'])
wordcloud_not_spam = _show_wordcloud(dataset_not_spam['body'])