#!/usr/bin/env python3
"""Find "god classes" in the Xerces2-J Java sources.

A class C is reported as a god class when

    |M(C)| > E(M) + 6 * V(M)

i.e. when its number of methods is greater than the average number of methods
across all parsed classes plus 6 times the standard deviation.  The matching
classes are written to ``god_classes/god_classes.csv`` next to this script.

Third-party requirements: ``javalang`` and ``pandas``.
"""
import glob
import os

import javalang
import pandas as pd

# All paths are resolved relative to the directory containing this script.
DIR: str = os.path.dirname(os.path.realpath(__file__))
SOURCES: str = DIR + '/resources/xerces2-j-src'
OUT_DIR: str = DIR + '/god_classes'


def clean_output() -> None:
    """Delete every ``*.csv`` file located directly inside ``OUT_DIR``."""
    for csv_path in glob.glob(OUT_DIR + '/*.csv'):
        os.remove(csv_path)


def parse(path: str) -> list[tuple[str, int]]:
    """Return ``(qualified_class_name, method_count)`` for each class in a file.

    Args:
        path: Path of the Java source file to parse.

    Returns:
        One tuple per ``ClassDeclaration`` node found in the file's AST.  The
        name is ``<package>.<ClassName>``; classes in the default package
        (no package declaration) are reported by their bare name.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
        Exception: Whatever ``javalang.parse.parse`` raises on invalid Java.
    """
    # Explicit encoding so results do not depend on the machine's locale.
    with open(path, encoding='utf-8') as file:
        tree = javalang.parse.parse(file.read())

    # Use the first package declaration; '' means the default package.
    package_name = next(
        (node.name
         for _, node in tree.filter(javalang.tree.PackageDeclaration)),
        '',
    )
    # Only add the separator when there is a package, otherwise default-package
    # classes would be reported with a leading dot ('.Foo').
    prefix = package_name + '.' if package_name else ''

    # NOTE(review): tree.filter appears to visit nested class declarations as
    # well; those are reported as '<package>.<Inner>' without the enclosing
    # class name -- confirm this is the intended naming.
    return [
        (prefix + node.name, len(node.methods))
        for _, node in tree.filter(javalang.tree.ClassDeclaration)
    ]


def create_df(root) -> pd.DataFrame:
    """Build a table of all classes found in the ``.java`` files under *root*.

    Args:
        root: Directory that is walked recursively with ``os.walk``.

    Returns:
        A DataFrame with columns ``class_name`` and ``method_num`` and one row
        per class, indexed 0..n-1 in the order the files were visited.
    """
    rows: list[tuple[str, int]] = []
    for path, _dirs, files in os.walk(root):
        for f in files:
            if f.endswith('.java'):
                rows.extend(parse(os.path.join(path, f)))

    # Build the frame once from all rows instead of enlarging it row by row,
    # which is much cheaper and gives 'method_num' a numeric dtype.
    return pd.DataFrame(rows, columns=['class_name', 'method_num'])


def main() -> None:
    """Compute the god-class threshold and write the matching classes to CSV."""
    # Make sure the output directory exists before cleaning/writing into it.
    os.makedirs(OUT_DIR, exist_ok=True)
    clean_output()
    df = create_df(SOURCES)

    method_counts = df['method_num']
    # Threshold: mean number of methods plus 6 standard deviations.
    threshold = method_counts.mean() + 6 * method_counts.std()

    god_classes_df = df[method_counts > threshold]
    god_classes_df.to_csv(OUT_DIR + '/god_classes.csv')


if __name__ == '__main__':
    main()