diff --git a/muttest.py b/muttest.py
index 292cf05..ec63e3a 100644
--- a/muttest.py
+++ b/muttest.py
@@ -1,20 +1,86 @@
+import math
 import os
 import re
 import subprocess
 import sys
+from math import sqrt
+from statistics import mean, variance
 from typing import List, Dict
 
+import matplotlib.pyplot as plt
 import pandas as pd
+import seaborn as sns
+from scipy.stats import wilcoxon
 from tqdm import tqdm
 
 ROOT_DIR = os.path.dirname(__file__)
 IN_SOURCE_DIR = os.path.join(ROOT_DIR, "benchmark")
 IN_TEST_DIR = os.path.join(ROOT_DIR, "tests")
 IN_FUZZER_TEST_DIR = os.path.join(ROOT_DIR, "fuzzer_tests")
+OUT_DIR = os.path.join(ROOT_DIR, "out")
 MUT_PY_PATH = os.path.join(ROOT_DIR, 'env37', 'bin', 'mut.py')
 REPS: int = 10
 
 
+def cohen_d(d1: List[float], d2: List[float]) -> float:
+    """Return Cohen's d of d1 versus d2 (pooled SD): signed, and 0.0 for two identical constant samples."""
+    mean_diff = mean(d1) - mean(d2)
+    pooled_sd = sqrt(((len(d1) - 1) * variance(d1) + (len(d2) - 1) * variance(d2)) /
+                     (len(d1) + len(d2) - 2))
+    if pooled_sd == 0:  # no spread at all: the effect is either absent or unbounded, in the mean's direction
+        return 0.0 if mean_diff == 0 else math.copysign(math.inf, mean_diff)
+    return mean_diff / pooled_sd
+
+
+def effect_size(eff: float) -> str:
+    """Label the magnitude of a Cohen's d value.
+
+    The sign of eff is ignored: it only tells which sample has the larger mean, so a strongly
+    negative effect is labelled like the positive effect of the same size.
+    """
+    magnitude = abs(eff)
+    # Inclusive upper bounds, checked in ascending order.
+    bounds = ((0.01, 'Very small'), (0.2, 'Small'), (0.5, 'Medium'), (0.8, 'Large'), (1.2, 'Very large'))
+    for upper, label in bounds:
+        if magnitude <= upper:
+            return label
+    return 'Huge'  # anything above 1.2, including inf
+
+
+def compute_stats(df_gen: pd.DataFrame, df_fuz: pd.DataFrame, output_file: str, avg_output_file: str, stat_csv: str):
+    """Plot genetic vs. fuzzer scores and write per-file means, Cohen's d and Wilcoxon p-values to stat_csv."""
+    combined_df = pd.concat([df_gen, df_fuz], keys=["genetic", "fuzzer"]).reset_index()
+    combined_df.columns = ['source', *combined_df.columns[1:]]  # the concat key becomes the 'source' column
+    del combined_df[combined_df.columns[1]]  # drop the per-frame row index left over by reset_index()
+    # Box plot of all scores, grouped by file and coloured by source -> output_file.
+    box_fig = plt.figure(figsize=(18, 8))
+    sns.set(style="whitegrid")
+    sns.boxplot(data=combined_df, x="file", y="score", hue="source")
+    plt.yticks(range(0, 101, 10))
+    box_fig.savefig(output_file)
+    plt.close(box_fig)  # pyplot keeps a figure registered until it is closed explicitly
+    # Bar plot of the mean score per (file, source) pair -> avg_output_file.
+    bar_fig = plt.figure(figsize=(18, 8))
+    df_avg = combined_df.groupby(['file', 'source']).mean().reset_index()
+    sns.set(style="whitegrid")
+    sns.barplot(data=df_avg, x="file", y="score", hue="source")
+    plt.yticks(range(0, 101, 10))
+    bar_fig.savefig(avg_output_file)
+    plt.close(bar_fig)
+    # One row per file, one mean-score column per source, plus the statistics filled in below.
+    df_avg = df_avg.pivot(index='file', columns='source', values='score').rename_axis(None, axis=1)
+    df_avg['cohen-d'] = [math.nan] * len(df_avg.index)
+    df_avg['interpretation'] = [math.nan] * len(df_avg.index)
+    df_avg['wilcoxon'] = [math.nan] * len(df_avg.index)
+    for f in combined_df['file'].drop_duplicates():
+        list_gen = df_gen.loc[(df_gen.file == f), 'score'].tolist()
+        list_fuz = df_fuz.loc[(df_fuz.file == f), 'score'].tolist()
+        df_avg.loc[f, 'cohen-d'] = cohen_d(list_gen, list_fuz)
+        df_avg.loc[f, 'interpretation'] = effect_size(df_avg.loc[f, 'cohen-d'])
+        df_avg.loc[f, 'wilcoxon'] = wilcoxon(list_gen, list_fuz, zero_method='zsplit').pvalue
+    df_avg.to_csv(stat_csv)
+
+
 def run_mutpy(test_path: str, source_path: str) -> float:
     output = subprocess.check_output(
         [sys.executable, MUT_PY_PATH, '-t', source_path, '-u', test_path]).decode('utf-8')
@@ -26,7 +92,7 @@ def mutate_suite(out_file: str, in_test_dir: str, to_test: List[str]):
     scores: List[Dict[str, any]] = []
 
     if os.path.isfile(out_file):  # do not re-generate if file exists
-        return
+        return pd.read_csv(out_file, index_col=0)
 
     for filename in tqdm(to_test, desc=f"mut.py [{os.path.basename(out_file)}]"):
         source_path = os.path.join(IN_SOURCE_DIR, f"{filename}.py")
@@ -38,6 +104,7 @@ def mutate_suite(out_file: str, in_test_dir: str, to_test: List[str]):
 
     df = pd.DataFrame.from_records(scores)
     df.to_csv(out_file)
+    return df
 
 
 def main():
@@ -45,8 +112,13 @@ def main():
     to_test = [file[0] for file in files if file[1] == ".py"]
     to_test = [e for t in to_test for e in ([t] * REPS)]
 
-    mutate_suite(os.path.join(IN_TEST_DIR, 'mutation_results_genetic.csv'), IN_TEST_DIR, to_test)
-    mutate_suite(os.path.join(IN_FUZZER_TEST_DIR, 'mutation_results_fuzzer.csv'), IN_FUZZER_TEST_DIR, to_test)
+    df_gen = mutate_suite(os.path.join(OUT_DIR, 'mutation_results_genetic.csv'), IN_TEST_DIR, to_test)
+    df_fuz = mutate_suite(os.path.join(OUT_DIR, 'mutation_results_fuzzer.csv'), IN_FUZZER_TEST_DIR, to_test)
+
+    compute_stats(df_gen, df_fuz,
+                  os.path.join(OUT_DIR, "mutation_scores.png"),
+                  os.path.join(OUT_DIR, "mutation_scores_mean.png"),
+                  os.path.join(OUT_DIR, "stats.csv"))
 
 
 if __name__ == "__main__":
diff --git a/fuzzer_tests/mutation_results_fuzzer.csv b/out/mutation_results_fuzzer.csv
similarity index 100%
rename from fuzzer_tests/mutation_results_fuzzer.csv
rename to out/mutation_results_fuzzer.csv
diff --git a/tests/mutation_results_genetic.csv b/out/mutation_results_genetic.csv
similarity index 100%
rename from tests/mutation_results_genetic.csv
rename to out/mutation_results_genetic.csv
diff --git a/out/mutation_scores.png b/out/mutation_scores.png
new file mode 100644
index 0000000..cc82e2f
Binary files /dev/null and b/out/mutation_scores.png differ
diff --git a/out/mutation_scores_mean.png b/out/mutation_scores_mean.png
new file mode 100644
index 0000000..bb87989
Binary files /dev/null and b/out/mutation_scores_mean.png differ
diff --git a/out/stats.csv b/out/stats.csv
new file mode 100644
index 0000000..b4b01a7
--- /dev/null
+++ b/out/stats.csv
@@ -0,0 +1,11 @@
+file,fuzzer,genetic,cohen-d,interpretation,wilcoxon
+anagram_check,23.1,38.5,inf,Huge,0.001953125
+caesar_cipher,58.8,64.7,inf,Huge,0.001953125
+check_armstrong,90.3,93.5,inf,Huge,0.001953125
+common_divisor_count,72.3,80.9,inf,Huge,0.001953125
+exponentiation,71.4,71.4,inf,Huge,1.0
+gcd,47.8,60.9,inf,Huge,0.001953125
+longest_substring,82.6,69.6,inf,Huge,0.001953125
+rabin_karp,64.9,50.9,inf,Huge,0.001953125
+railfence_cipher,89.4,86.2,inf,Huge,0.001953125
+zellers_birthday,68.3,65.0,inf,Huge,0.001953125
diff --git a/requirements.txt b/requirements.txt
index 3c1c5cb..9cd6f74 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,7 @@ deap==1.4.1
 astunparse==1.6.3
 frozendict==2.3.8
 tqdm==4.66.1
-pandas==1.3.5
\ No newline at end of file
+pandas==1.3.5
+matplotlib!=3.6.1,>=3.1
+seaborn==0.12.2
+scipy==1.7.3
\ No newline at end of file