#!/usr/bin/env python3
import glob
import os

import javalang
import pandas as pd

# A class is flagged as a "god class" when
#     |M(C)| > E(M) + 6*V(M)
# i.e. its number of methods is greater than the average across all classes
# plus 6 times the standard deviation.

DIR: str = os.path.dirname(os.path.realpath(__file__))
SOURCES: str = DIR + '/xerces2/src'
OUT_DIR: str = DIR + '/god_classes'


def clean_output() -> None:
    """Delete every ``*.csv`` file found directly inside ``OUT_DIR``."""
    # glob returns an empty list when OUT_DIR does not exist, so this is a
    # no-op on a fresh checkout.
    for csv_path in glob.glob(os.path.join(OUT_DIR, '*.csv')):
        os.remove(csv_path)


def parse(path: str) -> list[tuple[str, int]]:
    """Return ``(qualified_class_name, method_count)`` for a Java file.

    Args:
        path: Path of the Java source file to parse.

    Returns:
        One tuple per class declaration yielded by ``tree.filter``. The name
        is ``<package>.<class>``, or just ``<class>`` when the file has no
        package declaration. The count is ``len(node.methods)``.

    Raises:
        Whatever ``javalang.parse.parse`` raises for input it cannot parse;
        such errors are deliberately not swallowed here.
    """
    # Decode explicitly so results do not depend on the platform's default
    # encoding; undecodable bytes are replaced instead of aborting the run.
    with open(path, encoding='utf-8', errors='replace') as file:
        tree = javalang.parse.parse(file.read())

    # Fetch package name from the first package declaration;
    # if the node is missing, assume the default package ('').
    package_name = ''
    for _, node in tree.filter(javalang.tree.PackageDeclaration):
        package_name = node.name
        break

    # Avoid a leading '.' for classes that live in the default package.
    prefix = package_name + '.' if package_name else ''

    # Get all classes and the number of methods of each one.
    return [
        (prefix + node.name, len(node.methods))
        for _, node in tree.filter(javalang.tree.ClassDeclaration)
    ]


def create_df(root: str) -> pd.DataFrame:
    """Collect the method count of every class found under ``root``.

    Args:
        root: Directory that is walked recursively for ``.java`` files.

    Returns:
        A DataFrame with columns ``class_name`` and ``method_num`` and a
        0-based integer index, one row per class in ``os.walk`` order.
    """
    rows: list[tuple[str, int]] = []
    for path, _dirs, files in os.walk(root):
        for f in files:
            if f.endswith('.java'):
                rows.extend(parse(os.path.join(path, f)))

    # Build the frame once: appending row by row with ``frame.loc[i] = row``
    # re-allocates on every insert and leaves the columns as object dtype.
    frame = pd.DataFrame(rows, columns=['class_name', 'method_num'])
    # Keep the count column numeric even when no class was found.
    return frame.astype({'method_num': 'int64'})


def main() -> None:
    """Write the god classes found under ``SOURCES`` to a CSV in ``OUT_DIR``."""
    clean_output()
    df = create_df(SOURCES)

    method_counts = df['method_num']
    threshold = method_counts.mean() + 6 * method_counts.std()
    god_classes_df = df[method_counts > threshold]

    # to_csv does not create missing directories.
    os.makedirs(OUT_DIR, exist_ok=True)
    god_classes_df.to_csv(OUT_DIR + '/god_classes.csv')


if __name__ == '__main__':
    main()