SPAM / NOT_SPAM Data Analysis

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as matplot
import matplotlib.image as mpimg
import seaborn as sns
%matplotlib inline
from wordcloud import WordCloud

import scattertext as st
import spacy
from IPython.display import HTML


import spacy
from gensim.models import word2vec
from scattertext import SampleCorpora, word_similarity_explorer_gensim, Word2VecFromParsedCorpus
from scattertext.CorpusFromParsedDocuments import CorpusFromParsedDocuments

# Load the e-mail dataset from the local JSON file into a DataFrame.
dataset = pd.read_json("dataset.json")

# Preview the first rows of the loaded data.
dataset.head()

<style>
.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	body	label	subject
0	hello , offer fantastic 100 % free access most...	SPAM	Subject: re : free !\n
1	* * * * * * * * * * * * * * * * * * * * * * * ...	SPAM	Subject: bulk email profit\n
2	stock invest interest , please carefully revie...	SPAM	Subject: possible + 900 % stock investment ret...
3	syntax project innovationskolleg " formal mode...	NOT_SPAM	Subject: minus workshop split constituent\n
4	multidisciplinary periodical : call comment * ...	NOT_SPAM	Subject: multidisciplinary periodical : call c...

# Report the DataFrame's dimensions as (rows, columns).
dataset.shape

(702, 3)

# Check, column by column, whether any value is missing.
dataset.isnull().any()

body       False
label      False
subject    False
dtype: bool

# Bar chart of how many messages fall under each value of the "label" column.
fig, axs = plt.subplots(ncols=1, figsize=(12, 6))
# Pass the column by keyword: recent seaborn releases no longer accept the
# data vector as a bare positional argument. Drawing on `axs` explicitly ties
# the plot to the figure created on the line above.
g = sns.countplot(x="label", data=dataset, ax=axs)
plt.tight_layout()
plt.show()

# Build a scattertext corpus from the e-mail bodies, categorised by label.
# NOTE(review): `spacy.en.English()` is the legacy spaCy entry point; newer
# spaCy releases load pipelines via `spacy.load(...)` — confirm against the
# installed version before changing it.
nlp = spacy.en.English()
corpus = st.CorpusFromPandas(
    dataset, category_col='label', text_col='body', nlp=nlp
).build()

# Render the interactive SPAM vs NOT_SPAM term explorer as an HTML string.
html = st.produce_scattertext_explorer(
    corpus,
    category='SPAM',
    category_name='SPAM',
    not_category_name='NOT_SPAM',
    width_in_pixels=1000,
)

# Write the page through a context manager so the file handle is flushed and
# closed deterministically instead of being left to the garbage collector.
with open("Convention-Visualization.html", 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

# Loading the generated HTML file crashes the notebook server, so the page was
# rendered in a browser and a snapshot of it is displayed here instead
# (for visualization purposes only).
img = mpimg.imread('Convention-Visualization.png')
# Enlarge the default figure size before the image figure is created.
matplot.rcParams.update({'figure.figsize': (30.0, 10.0)})
plt.imshow(img)
plt.show()

# Keep only the "body" column, split into one frame per class label.
dataset_spam = dataset.loc[dataset['label'].eq('SPAM'), ['body']]
dataset_not_spam = dataset.loc[dataset['label'].eq('NOT_SPAM'), ['body']]

# Preview the first rows of the SPAM subset.
dataset_spam.head()

<style>
.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	body
0	hello , offer fantastic 100 % free access most...
1	* * * * * * * * * * * * * * * * * * * * * * * ...
2	stock invest interest , please carefully revie...
5	locate anyone anywhere usa * * * * * * * old f...
6	hope n't object complete stranger mail , belie...

# Preview the first rows of the NOT_SPAM subset.
dataset_not_spam.head()

<style>
.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	body
3	syntax project innovationskolleg " formal mode...
4	multidisciplinary periodical : call comment * ...
7	inform untimely death jochem schindler , prof ...
9	week ago , post query language moo site . rece...
10	cycorp seek enthusiastic , highly-motivate mul...

def _show_wordcloud(texts, max_font_size=40):
    """Build a word cloud from *texts*, display it, and return it.

    Args:
        texts: Iterable of strings; they are joined with single spaces to
            form the text the cloud is generated from.
        max_font_size: Passed through to ``WordCloud(max_font_size=...)``.

    Returns:
        The generated ``WordCloud`` object.
    """
    # str.join consumes any iterable of strings, so no intermediate list
    # copy of the column is needed.
    cloud = WordCloud(max_font_size=max_font_size).generate(' '.join(texts))
    plt.figure()
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")  # hide the axes around the image
    plt.show()
    return cloud


# One word cloud per class, built from the corresponding message bodies.
wordcloud_spam = _show_wordcloud(dataset_spam['body'])
wordcloud_not_spam = _show_wordcloud(dataset_not_spam['body'])

mahendrathapa / spam-detector Goto Github PK

spam-detector's Introduction

SPAM / NOT_SPAM Data Analysis

spam-detector's People

Contributors

Watchers

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent