ima02/extract_feature_vectors.py

#!/usr/bin/env python3
import javalang
import os
import pandas as pd
import glob
import re

# Absolute directory that contains this script; every other path hangs off it.
DIR: str = os.path.dirname(os.path.realpath(__file__))
# Root of the Java sources to analyse (walked recursively by create_df).
SOURCES: str = f'{DIR}/resources/defects4j-checkout-closure-1f/src/com/google/javascript/jscomp/'
# Directory that receives the generated CSV file(s).
OUT_DIR: str = f'{DIR}/metrics'


def clean_output():
    """Delete every ``*.csv`` file located directly inside ``OUT_DIR``."""
    for stale_csv in glob.glob(OUT_DIR + '/*.csv'):
        os.remove(stale_csv)


def count(node: javalang.tree.Node, the_filter) -> int:
    """Return how many entries ``node.filter(the_filter)`` yields.

    Each yielded entry is unpacked as a pair, exactly one is counted per
    entry, and the pair's contents are ignored.
    """
    return sum(1 for _, _ in node.filter(the_filter))


def metrics(clazz: javalang.tree.ClassDeclaration) -> dict[str, float]:
    """Compute the feature vector of a single Java class declaration.

    Class metrics:
        MTH: number of methods in ``clazz.methods``.
        FLD: number of fields in ``clazz.fields``.
        RFC: number of methods with the ``public`` modifier plus the number
            of method invocations found anywhere under the class node.
        INT: number of implemented interfaces (0 when there are none).

    Method metrics (maximum over ``clazz.methods``, 0 when there are none):
        SZ: statements, not counting block statements (bare scopes).
        CPX: if statements + ternary expressions + while + for statements.
        EX: exceptions listed in the ``throws`` clause.
        RET: return statements.

    NLP metrics:
        NML: average length of the method names (0 when there are none).
        BCM: number of documented nodes under the class whose
            documentation is present.
        WRD: number of words (maximal runs of word characters) in those
            documentation strings.
        DCM: WRD divided by the number of statements under the class,
            block statements included (0 when there are no statements).

    Args:
        clazz: the class declaration node to measure.

    Returns:
        A dict mapping each metric name to its numeric value. Counts are
        ints; NML and DCM are floats whenever their denominator is non-zero.
    """
    tree = javalang.tree
    methods = clazz.methods

    public_methods = sum(
        1 for method in methods if 'public' in method.modifiers)
    m = {
        'MTH': len(methods),
        'FLD': len(clazz.fields),
        'RFC': public_methods + count(clazz, tree.MethodInvocation),
        'INT': 0 if clazz.implements is None else len(clazz.implements),
    }

    # NOTE(review): do-while and switch statements are not counted towards
    # CPX - confirm this matches the intended metric definition.
    branching = (tree.IfStatement, tree.TernaryExpression,
                 tree.WhileStatement, tree.ForStatement)

    # Per-method metrics: keep the maximum observed across all methods.
    sz = cpx = ex = ret = 0
    for method in methods:
        # BlockStatement nodes are just curly-brace scopes, so they are
        # subtracted from the statement count.
        sz = max(sz, count(method, tree.Statement) -
                 count(method, tree.BlockStatement))
        cpx = max(cpx, sum(count(method, kind) for kind in branching))
        ex = max(ex, 0 if method.throws is None else len(method.throws))
        ret = max(ret, count(method, tree.ReturnStatement))
    m.update(SZ=sz, CPX=cpx, EX=ex, RET=ret)

    # NML: average length of method names.
    total_name_len = sum(len(method.name) for method in methods)
    m['NML'] = total_name_len / len(methods) if methods else 0

    # BCM / WRD: documentation attached to any documented node of the class.
    docs = [node.documentation
            for _, node in clazz.filter(tree.Documented)
            if node.documentation is not None]
    m['BCM'] = len(docs)
    m['WRD'] = sum(len(re.findall(r'\w+', doc)) for doc in docs)

    # DCM: words in comments per statement.
    statements = count(clazz, tree.Statement)
    m['DCM'] = m['WRD'] / statements if statements else 0

    return m


def create_df(root) -> pd.DataFrame:
    """Build the feature-vector table for every Java class found under *root*.

    Walks *root* recursively, parses each ``.java`` file with javalang and
    adds one row per class declaration that is not nested inside another
    class declaration. Each row holds the class's fully qualified name plus
    the values returned by ``metrics``.

    Args:
        root: directory to scan for ``.java`` files.

    Returns:
        A DataFrame with the columns ``class_name`` and one column per
        metric; it has no rows when no class is found.
    """
    df = pd.DataFrame(columns=['class_name', 'MTH', 'FLD', 'RFC',
                      'INT', 'SZ', 'CPX', 'EX', 'RET', 'BCM', 'NML', 'WRD', 'DCM'])

    row = 0
    for path, _, files in os.walk(root):
        for f in files:
            if not f.endswith('.java'):
                continue

            # Get the AST of the file. The encoding is pinned so parsing
            # does not depend on the platform's default locale.
            with open(os.path.join(path, f), encoding='utf-8') as file:
                data = file.read()
            tree = javalang.parse.parse(data)

            # Fetch the package name from the package declaration; when the
            # node is missing the class lives in the default package ('').
            package_name = ''
            for _, node in tree.filter(javalang.tree.PackageDeclaration):
                package_name = node.name
                break

            for parents, node in tree.filter(javalang.tree.ClassDeclaration):
                # A javalang path interleaves the ancestor nodes with the
                # child lists that hold them. Only the nodes are real
                # ancestors, so only they are inspected; peeking at the
                # first element of a list would make the outcome depend on
                # which sibling happens to be declared first.
                if any(isinstance(p, javalang.tree.ClassDeclaration)
                       for p in parents):
                    continue

                # No leading dot for classes in the default package.
                if package_name:
                    fqdn = package_name + '.' + node.name
                else:
                    fqdn = node.name

                df.loc[row, 'class_name'] = fqdn
                for metric, value in metrics(node).items():
                    df.loc[row, metric] = value
                row += 1

    return df


def main():
    """Regenerate ``feature_vectors.csv`` inside ``OUT_DIR``.

    Creates the output directory when it is missing, removes CSV files left
    over from a previous run, extracts the metrics of every class under
    ``SOURCES`` and writes them out without the index column.
    """
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(OUT_DIR, exist_ok=True)
    clean_output()
    df = create_df(SOURCES)
    df.to_csv(OUT_DIR + '/feature_vectors.csv', index=False)


if __name__ == '__main__':
    main()