import pandas as pd
import numpy as np
from pathlib import Path
import random

The data is the SpamAssassin public corpus, available at https://spamassassin.apache.org/old/publiccorpus/
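If the corpus isn't already on disk, a minimal download-and-extract sketch is below. The archive filenames are assumptions taken from the index page above; check that page for the current listing before running.

import tarfile
import urllib.request

BASE_URL = "https://spamassassin.apache.org/old/publiccorpus/"
# archive names assumed from the corpus index page; verify against the listing
ARCHIVES = ["20030228_spam.tar.bz2", "20050311_spam_2.tar.bz2",
            "20030228_easy_ham.tar.bz2", "20030228_easy_ham_2.tar.bz2",
            "20030228_hard_ham.tar.bz2"]

def fetchCorpus(dest="datasets/emailData/"):
    destPath = Path(dest)
    destPath.mkdir(parents=True, exist_ok=True)
    for name in ARCHIVES:
        archive = destPath / name
        if not archive.exists():
            urllib.request.urlretrieve(BASE_URL + name, str(archive))
        # each archive unpacks into its own subfolder (spam/, easy_ham/, ...)
        with tarfile.open(archive) as tar:
            tar.extractall(path=destPath)

fetchCorpus()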

emailFolder = Path("datasets/emailData/")
spamFolders = ["spam", "spam_2"]
hamFolders = ["easy_ham", "hard_ham", "easy_ham_2"]
spamFiles = []
hamFiles = []
for s in spamFolders:
    spamFiles.extend((emailFolder / s).glob("*"))
for h in hamFolders:
    hamFiles.extend((emailFolder / h).glob("*"))
# read_text closes each file for us; the corpus isn't valid UTF-8, so use ISO-8859-1
hamData = [f.read_text(encoding="iso-8859-1") for f in hamFiles if f.is_file()]
spamData = [f.read_text(encoding="iso-8859-1") for f in spamFiles if f.is_file()]
spamData = spamData[1:]  # my extraction left one extra non-email file at the start of the spam list
# careful: re-running this cell reshuffles the already-shuffled lists, so the
# train/test split below changes even with the fixed seed
random.seed(12345)
random.shuffle(spamData)
random.shuffle(hamData)
spamData = [(x, 1) for x in spamData]  # label spam as 1
hamData = [(x, 0) for x in hamData]    # label ham as 0
# 80/20 train/test split, taken per class so both sets keep the same class balance
trainData = spamData[:len(spamData)//10*8]
trainData.extend(hamData[:len(hamData)//10*8])
testData = spamData[len(spamData)//10*8:]
testData.extend(hamData[len(hamData)//10*8:])
random.shuffle(trainData)
random.shuffle(testData)
trainLabels = [x[1] for x in trainData]
trainData = [x[0] for x in trainData]
testLabels = [x[1] for x in testData]
testData = [x[0] for x in testData]
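A quick sanity check on the split sizes and class balance (exact counts depend on your copy of the corpus):

print("train:", len(trainData), "emails,", sum(trainLabels), "spam")
print("test: ", len(testData), "emails,", sum(testLabels), "spam")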
def preprocessEmail(email):
    """Lowercase tokens and collapse URLs and email addresses into placeholder tokens."""
    cleaned = []
    for w in email.split():
        if "http" in w or "www" in w or w.endswith((".com", ".net", ".org")):
            cleaned.append("URL")
        elif "@" in w and "." in w:
            cleaned.append("EMAIL")
        else:
            cleaned.append(w.lower())
    return " ".join(cleaned)

def preprocessEmails(emails):
    return [preprocessEmail(email) for email in emails]
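A quick illustration of the preprocessing on a made-up snippet:

sample = "Visit http://example.com or email someone@uni.edu TODAY"
print(preprocessEmail(sample))  # visit URL or email EMAIL today

Note that tokens ending in .com, .net, or .org hit the URL branch before the EMAIL check, so an address like bob@example.com is tagged URL; the condition order matters.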

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

emailPipeline = Pipeline([("preprocessing", FunctionTransformer(preprocessEmails)),
                          ("vectorizer", CountVectorizer(max_features=500)),
                          ("classifier", RandomForestClassifier())])
noPreprocessing = Pipeline([("vectorizer", CountVectorizer(max_features=500)),
                             ("classifier", RandomForestClassifier())])
from sklearn.model_selection import cross_val_score

display(cross_val_score(emailPipeline, trainData, trainLabels, cv=5, n_jobs=-1, scoring="accuracy"))
display(cross_val_score(noPreprocessing, trainData, trainLabels, cv=5, n_jobs=-1, scoring="accuracy"))

array([0.97669256, 0.99112098, 0.98890122, 0.98113208, 0.98222222])
array([0.9800222 , 0.99112098, 0.98890122, 0.98779134, 0.98444444])
# The first array is the pipeline with preprocessing, the second without;
# the model does mildly better on some folds without preprocessing
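Since the gap between the two score arrays is small, a mean-and-spread summary helps judge whether preprocessing actually hurts; the numbers will differ slightly on a re-run because each forest trains with a fresh random state:

withPre = cross_val_score(emailPipeline, trainData, trainLabels, cv=5, n_jobs=-1, scoring="accuracy")
withoutPre = cross_val_score(noPreprocessing, trainData, trainLabels, cv=5, n_jobs=-1, scoring="accuracy")
print(f"with preprocessing:    {withPre.mean():.4f} +/- {withPre.std():.4f}")
print(f"without preprocessing: {withoutPre.mean():.4f} +/- {withoutPre.std():.4f}")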
from sklearn.model_selection import RandomizedSearchCV

params = {"vectorizer__max_features":[500,1000,3000,5000],
          "vectorizer__min_df":[1,2,5],
          "vectorizer__ngram_range": [(1,1),(1,2),(1,3)],
          "classifier__n_estimators": [50,100,200,300],
          "classifier__max_depth": [5,10,15,20]
          }

randomSearch = RandomizedSearchCV(emailPipeline, params, cv=3, n_iter=20, n_jobs=-1, random_state=12345)
randomSearch.fit(trainData, trainLabels)
RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('preprocessing',
                                              FunctionTransformer(func=<function preprocessEmails at 0x00000257450CB600>)),
                                             ('vectorizer',
                                              CountVectorizer(max_features=500)),
                                             ('classifier',
                                              RandomForestClassifier())]),
                   n_iter=20, n_jobs=-1,
                   param_distributions={'classifier__max_depth': [5, 10, 15,
                                                                  20],
                                        'classifier__n_estimators': [50, 100,
                                                                     200, 300],
                                        'vectorizer__max_features': [500, 1000,
                                                                     3000,
                                                                     5000],
                                        'vectorizer__min_df': [1, 2, 5],
                                        'vectorizer__ngram_range': [(1, 1),
                                                                    (1, 2),
                                                                    (1, 3)]},
                   random_state=12345)
finalModel = randomSearch.best_estimator_
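Before evaluating on the test set, it's worth seeing which configuration the search picked; best_score_ and best_params_ come straight off the fitted search object:

print("Best CV accuracy:", randomSearch.best_score_)
print("Best parameters: ", randomSearch.best_params_)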
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

predictions = finalModel.predict(testData)

print("Accuracy Score: ", accuracy_score(testLabels, predictions))
print("Precision Score:", precision_score(testLabels, predictions))
print("Recall Score:   ", recall_score(testLabels, predictions))
print("F1 Score:       ", f1_score(testLabels, predictions))
Accuracy Score:  0.9920071047957372
Precision Score: 0.9897610921501706
Recall Score:    0.9797297297297297
F1 Score:        0.9847198641765704
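For spam filtering, false positives (ham flagged as spam) are usually costlier than missed spam, so the confusion matrix is worth checking alongside the summary scores:

from sklearn.metrics import confusion_matrix

# rows are the true classes (ham, spam), columns the predicted classes
tn, fp, fn, tp = confusion_matrix(testLabels, predictions).ravel()
print(f"ham kept: {tn}   ham flagged as spam: {fp}")
print(f"spam missed: {fn}   spam caught: {tp}")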