#!/usr/bin/env python3
import glob
import os
import re

import javalang
import pandas as pd

# God class if:
#   |M(C)| > E(M) + 6 * V(M)
# i.e. the number of methods is greater than the average across all classes
# plus 6 times the standard deviation.

DIR: str = os.path.dirname(os.path.realpath(__file__))
SOURCES: str = os.path.join(DIR, 'sources/src/com/google/javascript/jscomp/')
OUT_DIR: str = os.path.join(DIR, 'metrics')


def clean_output():
    """Make sure the output directory exists and remove CSVs from previous runs."""
    os.makedirs(OUT_DIR, exist_ok=True)
    for f in glob.glob(os.path.join(OUT_DIR, '*.csv')):
        os.remove(f)


def count(node: javalang.tree.Node, the_filter) -> int:
    """Count the descendants of `node` matching the node type `the_filter`."""
    return sum(1 for _ in node.filter(the_filter))


def metrics(clazz: javalang.tree.ClassDeclaration) -> dict[str, float]:
    m = {}
    m['MTH'] = len(clazz.methods)  # MTH: #Methods
    m['FLD'] = len(clazz.fields)   # FLD: #Fields

    # RFC: #Public methods + #Method invocations
    m['RFC'] = sum(1 for method in clazz.methods if 'public' in method.modifiers)
    m['RFC'] += count(clazz, javalang.tree.MethodInvocation)

    # INT: #Implemented interfaces
    m['INT'] = 0 if clazz.implements is None else len(clazz.implements)

    # Method metrics (maximum across all methods of the class)
    sz = cpx = ex = ret = 0
    sum_m_name_len = 0
    for method in clazz.methods:
        sum_m_name_len += len(method.name)
        # SZ: #Statements
        sz = max(sz, count(method, javalang.tree.Statement))
        # CPX: #Conditional + #Loop statements
        cpx = max(cpx, count(method, javalang.tree.IfStatement)
                  + count(method, javalang.tree.TernaryExpression)
                  + count(method, javalang.tree.WhileStatement)
                  + count(method, javalang.tree.ForStatement))
        # EX: #Exceptions in the throws clause
        ex = max(ex, 0 if method.throws is None else len(method.throws))
        # RET: #Return points
        ret = max(ret, count(method, javalang.tree.ReturnStatement))
    m['SZ'] = sz
    m['CPX'] = cpx
    m['EX'] = ex
    m['RET'] = ret

    # NLP metrics
    # NML: average length of method names
    n_methods = len(clazz.methods)
    m['NML'] = 0 if n_methods == 0 else sum_m_name_len / n_methods

    m['BCM'] = 0
    m['WRD'] = 0
    for _, node in clazz.filter(javalang.tree.Documented):
        if node.documentation is not None:
            # BCM: #Block comments
            m['BCM'] += 1
            # WRD: #Words (longest alphanumeric substrings) in block comments
            m['WRD'] += len(re.findall(r'\w+', node.documentation))
    # DCM: #Words in comments / #Statements
    s = count(clazz, javalang.tree.Statement)
    m['DCM'] = 0 if s == 0 else m['WRD'] / s

    return m


def create_df(root) -> pd.DataFrame:
    """Walk `root`, parse every .java file and build one feature row per class."""
    df = pd.DataFrame(columns=['class_name', 'MTH', 'FLD', 'RFC', 'INT', 'SZ',
                               'CPX', 'EX', 'RET', 'BCM', 'NML', 'WRD', 'DCM'])
    i: int = 0
    for path, _, files in os.walk(root):
        for f in files:
            if not f.endswith('.java'):
                continue
            # Get the AST of the file
            with open(os.path.join(path, f)) as file:
                tree = javalang.parse.parse(file.read())

            # Fetch the package name from the package declaration;
            # if the node is missing, assume the default package ('')
            package_name = ''
            for _, node in tree.filter(javalang.tree.PackageDeclaration):
                package_name = node.name
                break

            # Compute the metrics for every class declared in the file
            for _, node in tree.filter(javalang.tree.ClassDeclaration):
                df.loc[i, 'class_name'] = package_name + '.' + node.name
                for metric, value in metrics(node).items():
                    df.loc[i, metric] = value
                i += 1
    return df


def main():
    clean_output()
    df = create_df(SOURCES)
    df.to_csv(os.path.join(OUT_DIR, 'feature_vectors.csv'), index=False)


if __name__ == '__main__':
    main()
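

# A minimal sketch (not invoked above) of how the god-class rule from the
# header comment could be applied to the CSV written by main(): a class is
# flagged when its method count exceeds the mean plus 6 standard deviations
# across all classes. The column names match those produced by create_df();
# the helper itself is an illustrative assumption, not part of this script's
# extraction pipeline.
def find_god_classes(csv_path: str) -> list[str]:
    df = pd.read_csv(csv_path)
    threshold = df['MTH'].mean() + 6 * df['MTH'].std()
    return df.loc[df['MTH'] > threshold, 'class_name'].tolist()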