# Data is from https://spamassassin.apache.org/old/publiccorpus/
import random
from pathlib import Path

import numpy as np
import pandas as pd
# Each subfolder of datasets/emailData/ holds one raw email per file.
emailFolder = Path("datasets/emailData/")
parent = "datasets/emailData/"
spamFolders = ["spam", "spam_2"]
hamFolders = ["easy_ham", "hard_ham", "easy_ham_2"]

# Collect every path in the spam and ham folders (directories are
# filtered out below via is_file()).
spamFiles = []
hamFiles = []
for s in spamFolders:
    folder = Path(parent + s)
    spamFiles.extend(folder.glob("*"))
for h in hamFolders:
    folder = Path(parent + h)
    hamFiles.extend(folder.glob("*"))

# iso-8859-1 maps every byte to a character, so raw mail bodies never
# raise decode errors.  read_text() also closes each file handle,
# unlike the original bare open(...).read() calls.
hamData = [f.read_text(encoding="iso-8859-1") for f in hamFiles if f.is_file()]
spamData = [f.read_text(encoding="iso-8859-1") for f in spamFiles if f.is_file()]
spamData = spamData[1:]  # my data left an extra file in the emails
# be careful about running this multiple times in a row
# Fixed seed so the shuffle (and hence the train/test split) is reproducible.
random.seed(12345)
random.shuffle(spamData)
random.shuffle(hamData)

# Attach labels: 1 = spam, 0 = ham.
spamData = [(x, 1) for x in spamData]
hamData = [(x, 0) for x in hamData]

# 80/20 split, done per class so train and test keep the same
# spam/ham proportions (len//10*8 is a floor of 80%).
trainData = spamData[:len(spamData) // 10 * 8]
trainData.extend(hamData[:len(hamData) // 10 * 8])
testData = spamData[len(spamData) // 10 * 8:]
testData.extend(hamData[len(hamData) // 10 * 8:])

# Shuffle again so spam and ham examples are interleaved, then peel
# the labels off the (text, label) pairs.
random.shuffle(trainData)
random.shuffle(testData)
trainLabels = [x[1] for x in trainData]
trainData = [x[0] for x in trainData]
testLabels = [x[1] for x in testData]
testData = [x[0] for x in testData]


def preprocessEmail(email):
    """Normalize one raw email string for vectorization.

    Tokens are whitespace-split; any token containing "http"/"www" or
    ending in .com/.net/.org is collapsed to the placeholder "URL",
    any token containing both "@" and "." is collapsed to "EMAIL",
    and every other token is lowercased.  Returns the tokens rejoined
    with single spaces.
    """
    cleaned = []
    # str.split() never yields empty tokens, so no emptiness check is needed.
    for w in email.split():
        if "http" in w or "www" in w or w.endswith((".com", ".net", ".org")):
            cleaned.append("URL")
        elif "@" in w and "." in w:
            cleaned.append("EMAIL")
        else:
            cleaned.append(w.lower())
    return " ".join(cleaned)
def preprocessEmails(emails):
    """Return a list with preprocessEmail applied to every email in *emails*."""
    return list(map(preprocessEmail, emails))
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Two otherwise-identical pipelines, with and without the custom text
# preprocessing step, so cross-validation can measure whether the
# URL/EMAIL normalization actually helps.
emailPipeline = Pipeline([
    ("preprocessing", FunctionTransformer(preprocessEmails)),
    ("vectorizer", CountVectorizer(max_features=500)),
    ("classifier", RandomForestClassifier()),
])
noPreprocessing = Pipeline([
    ("vectorizer", CountVectorizer(max_features=500)),
    ("classifier", RandomForestClassifier()),
])
# 5-fold CV accuracy for each pipeline on the training set.
display(cross_val_score(emailPipeline, trainData, trainLabels, cv=5, n_jobs=-1, scoring="accuracy"))
display(cross_val_score(noPreprocessing, trainData, trainLabels, cv=5, n_jobs=-1, scoring="accuracy"))
# It looks like our model does mildly better in some cases without preprocessing
# (notebook output)
# array([0.97669256, 0.99112098, 0.98890122, 0.98113208, 0.98222222])
# array([0.9800222 , 0.99112098, 0.98890122, 0.98779134, 0.98444444])
from sklearn.model_selection import RandomizedSearchCV

# Random search over vectorizer and forest hyperparameters: 20 random
# draws (n_iter) from the 4*3*3*4*4 = 576 combinations, 3-fold CV,
# seeded for reproducibility.
params = {
    "vectorizer__max_features": [500, 1000, 3000, 5000],
    "vectorizer__min_df": [1, 2, 5],
    "vectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "classifier__n_estimators": [50, 100, 200, 300],
    "classifier__max_depth": [5, 10, 15, 20],
}
randomSearch = RandomizedSearchCV(emailPipeline, params, cv=3, n_iter=20, n_jobs=-1, random_state=12345)
randomSearch.fit(trainData, trainLabels)
# (Jupyter output — fitted estimator repr, preserved for reference)
# RandomizedSearchCV(cv=3,
#     estimator=Pipeline(steps=[('preprocessing', FunctionTransformer(func=preprocessEmails)),
#                               ('vectorizer', CountVectorizer(max_features=500)),
#                               ('classifier', RandomForestClassifier())]),
#     n_iter=20, n_jobs=-1,
#     param_distributions={'classifier__max_depth': [5, 10, 15, 20],
#                          'classifier__n_estimators': [50, 100, 200, 300],
#                          'vectorizer__max_features': [500, 1000, 3000, 5000],
#                          'vectorizer__min_df': [1, 2, 5],
#                          'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)]},
#     random_state=12345)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Best pipeline found by the random search, refit on the full training set.
finalModel = randomSearch.best_estimator_

# Held-out evaluation; spam (label 1) is the positive class.
predictions = finalModel.predict(testData)
print("Accuracy Score: ", accuracy_score(testLabels, predictions))
print("Precision Score:", precision_score(testLabels, predictions))
print("Recall Score:   ", recall_score(testLabels, predictions))
print("F1 Score:       ", f1_score(testLabels, predictions))
# (notebook output)
# Accuracy Score:  0.9920071047957372
# Precision Score: 0.9897610921501706
# Recall Score:    0.9797297297297297
# F1 Score:        0.9847198641765704