#!/usr/bin/env python3
"""Build per-method feature vectors for a list of Java "god classes".

For every class listed in ``IN_DIR/god_classes.csv`` the matching Java source
file under ``SOURCES`` is parsed with javalang. One CSV per class is written
to ``OUT_DIR``: rows are method names, columns are the fields, methods and
qualifier names a method refers to, and a cell is 1 when the method refers to
that member and 0 otherwise.
"""
import glob
import os
import warnings
from typing import Any

import javalang
import pandas as pd
import tabulate  # noqa: F401 -- pandas' DataFrame.to_markdown() needs this package

DIR: str = os.path.dirname(os.path.realpath(__file__))
SOURCES: str = DIR + '/resources/xerces2-j-src'
OUT_DIR: str = DIR + '/feature_vectors'
IN_DIR: str = DIR + '/god_classes'


def clean_output() -> None:
    """Delete every ``*.csv`` file in ``OUT_DIR``, creating the directory if needed."""
    # Creating the directory up front keeps the later to_csv() calls from
    # failing on a fresh checkout where OUT_DIR does not exist yet.
    os.makedirs(OUT_DIR, exist_ok=True)
    for stale_csv in glob.glob(OUT_DIR + '/*.csv'):
        os.remove(stale_csv)


def get_fields(java_class: javalang.tree.ClassDeclaration) -> set[str]:
    """Return the names of all fields declared directly in ``java_class``."""
    # A single declaration may introduce several fields ("int a, b;"), so
    # every declarator is collected, not just the first one.
    return {
        declarator.name
        for field in java_class.fields
        for declarator in field.declarators
    }


def get_methods(java_class: javalang.tree.ClassDeclaration) -> set[str]:
    """Return the names of all methods declared directly in ``java_class``.

    Overloads share a name and therefore collapse into one entry.
    """
    return {method.name for method in java_class.methods}


def empty_qualifier(node: Any) -> bool:
    """Return True when ``node.qualifier`` is ``None``, ``''`` or ``'this'``."""
    return node.qualifier in (None, '', 'this')


def get_fields_accessed_by_method(method: javalang.tree.MethodDeclaration,
                                  fields: set[str]) -> set[str]:
    """Return the member names referenced by ``method``.

    Unqualified references (or references qualified with ``this``) are kept
    only when they name one of ``fields``. For a qualified reference such as
    ``a.b.c.x`` the first qualifier segment (``a``) is reported instead of the
    member, unless ``a`` is a local variable or a formal parameter.
    """
    # Ignore formal parameters and local variable declarations
    local_variables: set[str] = set()
    for _, node in method.filter(javalang.tree.LocalVariableDeclaration):
        # "int a, b;" declares two locals; register all of them.
        local_variables.update(d.name for d in node.declarators)
    for _, node in method.filter(javalang.tree.FormalParameter):
        local_variables.add(node.name)

    accessed: set[str] = set()
    for _, node in method.filter(javalang.tree.MemberReference):
        if empty_qualifier(node):
            if node.member in fields:
                accessed.add(node.member)
            continue
        # qualifier of 'a.b.c.x' is 'a.b.c', we only care about 'a'
        head = node.qualifier.split('.', maxsplit=1)[0]
        # cannot use the field set here, since a class may be referenced
        # (through a static property); hence the local variables and
        # parameters of the method are collected and explicitly skipped.
        # If a MemberReference includes a non-empty qualifier (e.g., a.x),
        # consider the qualifier (a), not the member (x).
        if head not in local_variables:
            accessed.add(head)
    return accessed


def get_methods_accessed_by_method(method: javalang.tree.MethodDeclaration,
                                   methods: set[str]) -> set[str]:
    """Return the names in ``methods`` that ``method`` invokes on its own class.

    Only invocations without a qualifier (or qualified with ``this``) count.
    """
    return {
        node.member
        for _, node in method.filter(javalang.tree.MethodInvocation)
        if empty_qualifier(node) and node.member in methods
    }


def parse(path: str, name: str, df_table: pd.DataFrame) -> None:
    """Write the feature-vector CSV for the class stored in ``path``.

    Args:
        path: Path of the ``.java`` file to analyse.
        name: Row label used for this class in ``df_table``.
        df_table: Summary table, updated in place with the number of feature
            vectors (rows) and attributes (columns) of the generated CSV.

    Nothing is written and ``df_table`` is left untouched when the file
    contains no class named after the file.
    """
    # Get the AST of the file
    # NOTE(review): the file is decoded with the platform default encoding.
    with open(path) as file:
        tree = javalang.parse.parse(file.read())

    # Fetch package name from package declaration
    # if node is missing, assuming default package ('')
    package_name = ''
    for _, node in tree.filter(javalang.tree.PackageDeclaration):
        package_name = node.name
        break

    file_name = os.path.basename(path)
    for _, node in tree.filter(javalang.tree.ClassDeclaration):
        # Consider only the class matching the input file name, to skip inner
        # classes. The whole file name is compared so that a class whose name
        # is merely a suffix of it (Scanner vs XMLDTDScanner.java) is skipped.
        if file_name != node.name + '.java':
            continue

        # Avoid a leading dot (hidden '.Name.csv') for the default package.
        fqdn = package_name + '.' + node.name if package_name else node.name
        fields = get_fields(node)
        methods = get_methods(node)

        # Collect accesses per method name. Overloads share one row, so their
        # accesses are merged instead of the last overload overwriting the rest.
        accessed: dict[str, set[str]] = {}
        for method in node.methods:
            members = accessed.setdefault(method.name, set())
            members |= get_fields_accessed_by_method(method, fields)
            members |= get_methods_accessed_by_method(method, methods)

        # Declared members come first, sorted; qualifier names that are not
        # members of the class follow, in a deterministic order.
        columns = sorted(fields | methods)
        known = set(columns)
        for members in accessed.values():
            for member in sorted(members - known):
                columns.append(member)
                known.add(member)

        # Every method gets a row, even when it accesses nothing.
        df = pd.DataFrame(0, index=list(accessed), columns=columns, dtype=int)
        for method_name, members in accessed.items():
            for member in members:
                df.at[method_name, member] = 1

        # Drop attributes that no method refers to.
        df = df.loc[:, (df != 0).any(axis=0)]

        df_table.loc[name, '# Feature Vectors'] = df.shape[0]
        df_table.loc[name, '# Attributes'] = df.shape[1]
        df.to_csv(OUT_DIR + '/' + fqdn + '.csv')
        break


def main() -> None:
    """Regenerate all feature-vector CSVs and print a Markdown summary table."""
    warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
    clean_output()

    df = pd.read_csv(IN_DIR + '/god_classes.csv')
    df_table = pd.DataFrame(columns=['# Feature Vectors', '# Attributes'])

    for clazz in df['class_name'].to_list():
        # Map the fully qualified class name onto its source file path.
        clazz_path = SOURCES + '/' + clazz.replace('.', '/') + '.java'
        print(clazz_path)
        parse(clazz_path, clazz, df_table)

    df_table.index.name = "Class Name"
    print(df_table.to_markdown())


if __name__ == '__main__':
    main()