diff --git a/muttest.py b/muttest.py
index 292cf05..ec63e3a 100644
--- a/muttest.py
+++ b/muttest.py
@@ -1,20 +1,115 @@
+import math
 import os
 import re
 import subprocess
 import sys
+from math import sqrt
+from statistics import mean, variance
 from typing import List, Dict
 
+import matplotlib.pyplot as plt
 import pandas as pd
+import seaborn as sns
+from scipy.stats import wilcoxon
 from tqdm import tqdm
 
 ROOT_DIR = os.path.dirname(__file__)
 IN_SOURCE_DIR = os.path.join(ROOT_DIR, "benchmark")
 IN_TEST_DIR = os.path.join(ROOT_DIR, "tests")
 IN_FUZZER_TEST_DIR = os.path.join(ROOT_DIR, "fuzzer_tests")
+OUT_DIR = os.path.join(ROOT_DIR, "out")
 MUT_PY_PATH = os.path.join(ROOT_DIR, 'env37', 'bin', 'mut.py')
 REPS: int = 10
 
 
+def cohen_d(d1: List[float], d2: List[float]) -> float:
+    """Return Cohen's d for two samples, using the pooled standard deviation.
+
+    The sign follows ``mean(d1) - mean(d2)``. Each sample needs at least two
+    values, otherwise ``statistics.variance`` raises ``StatisticsError``.
+
+    With a pooled standard deviation of zero the ratio is undefined: return
+    0.0 when the means are equal (there is no effect at all) and an infinity
+    carrying the sign of the difference otherwise.
+    """
+    mean_diff = mean(d1) - mean(d2)
+    pooled_sd = sqrt(((len(d1) - 1) * variance(d1) + (len(d2) - 1) * variance(d2)) /
+                     (len(d1) + len(d2) - 2))
+
+    if pooled_sd == 0:
+        if mean_diff == 0:
+            return 0.0
+        return math.copysign(math.inf, mean_diff)
+
+    return mean_diff / pooled_sd
+
+
+def effect_size(eff: float) -> str:
+    """Map a Cohen's d value to a qualitative label.
+
+    Only the magnitude is classified, so a negative d gets the same label as
+    the positive d of equal size.
+    """
+    magnitude = abs(eff)
+    if magnitude <= 0.01:
+        return 'Very small'
+    elif magnitude <= 0.2:
+        return 'Small'
+    elif magnitude <= 0.5:
+        return 'Medium'
+    elif magnitude <= 0.8:
+        return 'Large'
+    elif magnitude <= 1.2:
+        return 'Very large'
+    else:
+        return 'Huge'
+
+
+def compute_stats(df_gen: pd.DataFrame, df_fuz: pd.DataFrame, output_file: str, avg_output_file: str, stat_csv: str):
+    """Plot the mutation scores of both suites and write summary statistics.
+
+    Saves a per-file box plot to ``output_file`` and a bar plot of the per-file
+    mean scores to ``avg_output_file``. ``stat_csv`` receives, for every file,
+    the mean score of each source plus Cohen's d, its interpretation and the
+    Wilcoxon p-value. Both frames are read through their ``file`` and
+    ``score`` columns.
+    """
+    combined_df = pd.concat([df_gen, df_fuz], keys=["genetic", "fuzzer"]).reset_index()
+    combined_df.columns = ['source', *combined_df.columns[1:]]
+    del combined_df[combined_df.columns[1]]
+
+    plt.figure(figsize=(18, 8))
+    sns.set(style="whitegrid")
+    sns.boxplot(data=combined_df, x="file", y="score", hue="source")
+    plt.yticks(range(0, 101, 10))
+    plt.savefig(output_file)
+    plt.close()  # the figure is on disk; do not keep it open
+
+    plt.figure(figsize=(18, 8))
+    # Average the score column only; it is the only value plotted and pivoted below.
+    df_avg = combined_df.groupby(['file', 'source'])['score'].mean().reset_index()
+    sns.set(style="whitegrid")
+    sns.barplot(data=df_avg, x="file", y="score", hue="source")
+    plt.yticks(range(0, 101, 10))
+    plt.savefig(avg_output_file)
+    plt.close()
+
+    df_avg = df_avg.pivot(index='file', columns='source', values='score').rename_axis(None, axis=1)
+    df_avg['cohen-d'] = [math.nan] * len(df_avg.index)
+    df_avg['interpretation'] = [math.nan] * len(df_avg.index)
+    df_avg['wilcoxon'] = [math.nan] * len(df_avg.index)
+
+    for f in combined_df['file'].drop_duplicates():
+        list_gen = df_gen.loc[(df_gen.file == f), 'score'].tolist()
+        list_fuz = df_fuz.loc[(df_fuz.file == f), 'score'].tolist()
+
+        df_avg.loc[f, 'cohen-d'] = cohen_d(list_gen, list_fuz)
+        df_avg.loc[f, 'interpretation'] = effect_size(df_avg.loc[f, 'cohen-d'])
+        df_avg.loc[f, 'wilcoxon'] = wilcoxon(list_gen, list_fuz, zero_method='zsplit').pvalue
+
+    df_avg.to_csv(stat_csv)
+
+
 def run_mutpy(test_path: str, source_path: str) -> float:
     output = subprocess.check_output(
         [sys.executable, MUT_PY_PATH, '-t', source_path, '-u', test_path]).decode('utf-8')
@@ -26,7 +121,7 @@ def mutate_suite(out_file: str, in_test_dir: str, to_test: List[str]):
     scores: List[Dict[str, any]] = []
 
     if os.path.isfile(out_file):  # do not re-generate if file exists
-        return
+        return pd.read_csv(out_file, index_col=0)
 
     for filename in tqdm(to_test, desc=f"mut.py [{os.path.basename(out_file)}]"):
         source_path = os.path.join(IN_SOURCE_DIR, f"{filename}.py")
@@ -38,6 +133,7 @@ def mutate_suite(out_file: str, in_test_dir: str, to_test: List[str]):
 
     df = pd.DataFrame.from_records(scores)
     df.to_csv(out_file)
+    return df
 
 
 def main():
@@ -45,8 +141,13 @@ def main():
     to_test = [file[0] for file in files if file[1] == ".py"]
     to_test = [e for t in to_test for e in ([t] * REPS)]
 
-    mutate_suite(os.path.join(IN_TEST_DIR, 'mutation_results_genetic.csv'), IN_TEST_DIR, to_test)
-    mutate_suite(os.path.join(IN_FUZZER_TEST_DIR, 'mutation_results_fuzzer.csv'), IN_FUZZER_TEST_DIR, to_test)
+    df_gen = mutate_suite(os.path.join(OUT_DIR, 'mutation_results_genetic.csv'), IN_TEST_DIR, to_test)
+    df_fuz = mutate_suite(os.path.join(OUT_DIR, 'mutation_results_fuzzer.csv'), IN_FUZZER_TEST_DIR, to_test)
+
+    compute_stats(df_gen, df_fuz,
+                  os.path.join(OUT_DIR, "mutation_scores.png"),
+                  os.path.join(OUT_DIR, "mutation_scores_mean.png"),
+                  os.path.join(OUT_DIR, "stats.csv"))
 
 
 if __name__ == "__main__":
diff --git a/fuzzer_tests/mutation_results_fuzzer.csv b/out/mutation_results_fuzzer.csv
similarity index 100%
rename from fuzzer_tests/mutation_results_fuzzer.csv
rename to out/mutation_results_fuzzer.csv
diff --git a/tests/mutation_results_genetic.csv b/out/mutation_results_genetic.csv
similarity index 100%
rename from tests/mutation_results_genetic.csv
rename to out/mutation_results_genetic.csv
diff --git a/out/mutation_scores.png b/out/mutation_scores.png
new file mode 100644
index 0000000..cc82e2f
Binary files /dev/null and b/out/mutation_scores.png differ
diff --git a/out/mutation_scores_mean.png b/out/mutation_scores_mean.png
new file mode 100644
index 0000000..bb87989
Binary files /dev/null and b/out/mutation_scores_mean.png differ
diff --git a/out/stats.csv b/out/stats.csv
new file mode 100644
index 0000000..b4b01a7
--- /dev/null
+++ b/out/stats.csv
@@ -0,0 +1,11 @@
+file,fuzzer,genetic,cohen-d,interpretation,wilcoxon
+anagram_check,23.1,38.5,inf,Huge,0.001953125
+caesar_cipher,58.8,64.7,inf,Huge,0.001953125
+check_armstrong,90.3,93.5,inf,Huge,0.001953125
+common_divisor_count,72.3,80.9,inf,Huge,0.001953125
+exponentiation,71.4,71.4,inf,Huge,1.0
+gcd,47.8,60.9,inf,Huge,0.001953125
+longest_substring,82.6,69.6,inf,Huge,0.001953125
+rabin_karp,64.9,50.9,inf,Huge,0.001953125
+railfence_cipher,89.4,86.2,inf,Huge,0.001953125
+zellers_birthday,68.3,65.0,inf,Huge,0.001953125
diff --git a/requirements.txt b/requirements.txt
index 3c1c5cb..9cd6f74 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,7 @@ deap==1.4.1
 astunparse==1.6.3
 frozendict==2.3.8
 tqdm==4.66.1
-pandas==1.3.5
\ No newline at end of file
+pandas==1.3.5
+matplotlib!=3.6.1,>=3.1
+seaborn==0.12.2
+scipy==1.7.3
\ No newline at end of file