#!/usr/bin/env python3
import glob
import os
import re

import javalang
import pandas as pd

DIR: str = os.path.dirname(os.path.realpath(__file__))
SOURCES: str = DIR + '/resources/defects4j-checkout-closure-1f/src/com/google/javascript/jscomp/'
OUT_DIR: str = DIR + '/metrics'


def clean_output():
    """Remove any CSV files left over from previous runs."""
    # Make sure the output directory exists before cleaning / writing to it
    os.makedirs(OUT_DIR, exist_ok=True)
    filelist = glob.glob(OUT_DIR + '/*.csv')
    for f in filelist:
        os.remove(f)


def count(node: javalang.tree.Node, the_filter) -> int:
    """Count the descendants of `node` that match the given node type."""
    i = 0
    for _, _ in node.filter(the_filter):
        i += 1
    return i


def metrics(clazz: javalang.tree.ClassDeclaration) -> dict[str, float]:
    # Class-level metrics
    m = {
        'MTH': len(clazz.methods),
        'FLD': len(clazz.fields),
        'RFC': 0
    }
    # RFC: #public methods + #method invocations
    for method in clazz.methods:
        if 'public' in method.modifiers:
            m['RFC'] += 1
    m['RFC'] += count(clazz, javalang.tree.MethodInvocation)
    # INT: #implemented interfaces
    m['INT'] = 0 if clazz.implements is None else len(clazz.implements)

    # Method-level metrics (maximum across all methods of the class)
    sz = 0
    cpx = 0
    ex = 0
    ret = 0
    sum_m_name_len = 0
    for method in clazz.methods:
        sum_m_name_len += len(method.name)
        # SZ: #statements
        # BlockStatement nodes are curly braces (i.e. scopes), therefore they
        # must be discarded
        sz = max(sz, count(method, javalang.tree.Statement)
                 - count(method, javalang.tree.BlockStatement))
        # CPX: #conditional + #loop statements
        cpx = max(cpx, count(method, javalang.tree.IfStatement)
                  + count(method, javalang.tree.TernaryExpression)
                  + count(method, javalang.tree.WhileStatement)
                  + count(method, javalang.tree.ForStatement))
        # EX: #exceptions in the throws clause
        ex = max(ex, 0 if method.throws is None else len(method.throws))
        # RET: #return points
        ret = max(ret, count(method, javalang.tree.ReturnStatement))
    m['SZ'] = sz
    m['CPX'] = cpx
    m['EX'] = ex
    m['RET'] = ret

    # NLP metrics
    # NML: average length of method names
    n_methods = len(clazz.methods)
    m['NML'] = 0 if n_methods == 0 else sum_m_name_len / n_methods
    m['BCM'] = 0
    m['WRD'] = 0
    for _, node in clazz.filter(javalang.tree.Documented):
        if node.documentation is not None:
            # BCM: #block comments
            m['BCM'] += 1
            # WRD: #words (maximal alphanumeric substrings) in block comments
            m['WRD'] += len(re.findall(r'\w+', node.documentation))
    # DCM: #words in block comments / #statements
    s = count(clazz, javalang.tree.Statement)
    m['DCM'] = 0 if s == 0 else m['WRD'] / s

    return m


def create_df(root) -> pd.DataFrame:
    df = pd.DataFrame(columns=['class_name', 'MTH', 'FLD', 'RFC', 'INT', 'SZ',
                               'CPX', 'EX', 'RET', 'BCM', 'NML', 'WRD', 'DCM'])
    i: int = 0
    for path, dirs, files in os.walk(root):
        for f in files:
            if f.endswith('.java'):
                # Get the AST of the file
                with open(path + '/' + f) as file:
                    data = file.read()
                tree = javalang.parse.parse(data)

                # Fetch the package name from the package declaration;
                # if the node is missing, assume the default package ('')
                package_name = ''
                for _, node in tree.filter(javalang.tree.PackageDeclaration):
                    package_name = node.name
                    break

                # Compute the metrics for every top-level (non-inner) class
                for parents, node in tree.filter(javalang.tree.ClassDeclaration):
                    is_inner_class = False
                    for p in parents[:-1]:
                        t = type(p)
                        # for some reason, all the parents other than the root
                        # are wrapped in a list with only the type as element,
                        # so I extract the type if this is the case
                        if t == list:
                            t = type(p[0])
                        if t == javalang.tree.ClassDeclaration:
                            is_inner_class = True
                            break
                    if not is_inner_class:
                        fqdn = package_name + '.' + node.name
                        df.loc[i, 'class_name'] = fqdn
                        m = metrics(node)
                        for metric in m:
                            df.loc[i, metric] = m[metric]
                        i += 1
    return df


def main():
    clean_output()
    df = create_df(SOURCES)
    df.to_csv(OUT_DIR + '/feature_vectors.csv', index=False)


if __name__ == '__main__':
    main()
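# Example downstream usage (an assumption, not part of this script): the
# generated feature vectors can be loaded back with pandas for inspection or
# further analysis, e.g. to list the classes with the highest complexity:
#
#     import pandas as pd
#     df = pd.read_csv('metrics/feature_vectors.csv')
#     print(df.sort_values('CPX', ascending=False).head())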