ima02/extract_feature_vectors.py

#!/usr/bin/env python3
import javalang
import os
import pandas as pd
import glob
import re

# Absolute directory that contains this script (realpath resolves symlinks).
DIR: str = os.path.dirname(os.path.realpath(__file__))
# Root folder that is walked recursively for '.java' files to analyse.
SOURCES: str = DIR + '/resources/defects4j-checkout-closure-1f/src/com/google/javascript/jscomp/'
# Folder whose '*.csv' files are removed and where the result CSV is written.
OUT_DIR: str = DIR + '/metrics'


def clean_output():
    """Delete every ``*.csv`` file located directly inside ``OUT_DIR``."""
    for csv_path in glob.glob(OUT_DIR + '/*.csv'):
        os.remove(csv_path)


def count(node: javalang.tree.Node, the_filter) -> int:
    """Return how many ``(path, match)`` pairs ``node.filter(the_filter)`` yields.

    :param node: AST node whose ``filter`` method is queried.
    :param the_filter: value handed unchanged to ``node.filter``.
    :return: number of yielded pairs (0 when nothing matches).
    """
    return sum(1 for _path, _match in node.filter(the_filter))


def metrics(clazz: javalang.tree.ClassDeclaration) -> dict[str, float]:
    """Compute the feature vector of one Java class declaration.

    The returned dict maps metric names to numbers (counts are ``int``,
    ratios are ``float``), inserted in this order:

    * ``MTH`` - number of methods declared by the class.
    * ``FLD`` - number of entries in ``clazz.fields``.
    * ``RFC`` - public methods + method invocations found in the class.
    * ``INT`` - number of implemented interfaces.
    * ``SZ``  - max over methods of the statement count (blocks excluded).
    * ``CPX`` - max over methods of if + ternary + while + for nodes.
    * ``EX``  - max over methods of the ``throws`` clause length.
    * ``RET`` - max over methods of the number of return statements.
    * ``NML`` - average method-name length (0 when there are no methods).
    * ``BCM`` - number of documented nodes (non-``None`` documentation).
    * ``WRD`` - number of ``\\w+`` words across those documentation strings.
    * ``DCM`` - ``WRD`` divided by the class statement count (0 if none).

    :param clazz: the class declaration node to measure.
    :return: metric name -> value mapping described above.
    """
    methods = clazz.methods
    method_count = len(methods)

    # Class metrics
    # RFC: #Public methods + #Method invocations
    public_methods = sum(1 for method in methods
                         if 'public' in method.modifiers)
    m = {
        'MTH': method_count,
        'FLD': len(clazz.fields),
        'RFC': public_methods + count(clazz, javalang.tree.MethodInvocation),
        # INT: #Implemented interfaces (attribute is None when absent)
        'INT': 0 if clazz.implements is None else len(clazz.implements),
    }

    # Method metrics (max across methods; all stay 0 for a method-less class)
    sz = 0
    cpx = 0
    ex = 0
    ret = 0
    name_len_total = 0
    for method in methods:
        name_len_total += len(method.name)

        # SZ: #Statements
        # BlockStatement nodes are curly braces (i.e. scopes), therefore they
        # must be discarded
        sz = max(sz,
                 count(method, javalang.tree.Statement)
                 - count(method, javalang.tree.BlockStatement))

        # CPX: #CONDITIONAL + #LOOP statements
        # NOTE(review): do-while and switch nodes are not counted here -
        # confirm this matches the intended metric definition.
        cpx = max(cpx,
                  count(method, javalang.tree.IfStatement)
                  + count(method, javalang.tree.TernaryExpression)
                  + count(method, javalang.tree.WhileStatement)
                  + count(method, javalang.tree.ForStatement))

        # EX: #Exceptions in throws clause
        ex = max(ex, 0 if method.throws is None else len(method.throws))

        # RET: #Return points
        ret = max(ret, count(method, javalang.tree.ReturnStatement))
    m['SZ'] = sz
    m['CPX'] = cpx
    m['EX'] = ex
    m['RET'] = ret

    # NLP metrics

    # NML: average length of method names
    m['NML'] = 0 if method_count == 0 else name_len_total / method_count

    block_comments = 0
    words = 0
    for _, node in clazz.filter(javalang.tree.Documented):
        if node.documentation is not None:
            # BCM: #block comments
            block_comments += 1

            # #Words (the longest alphanumeric substrings) in block comments
            words += len(re.findall(r'\w+', node.documentation))
    m['BCM'] = block_comments
    m['WRD'] = words

    # DCM: #Words in comments / #Statements
    # NOTE(review): unlike SZ, this denominator keeps BlockStatement nodes -
    # confirm that is intended.
    statements = count(clazz, javalang.tree.Statement)
    m['DCM'] = 0 if statements == 0 else words / statements

    return m


def create_df(root) -> pd.DataFrame:
    """Build one feature-vector row per non-nested Java class under *root*.

    Every ``.java`` file below *root* is parsed with javalang. For each class
    declaration that is not nested inside another class declaration, a row
    is produced holding the fully qualified class name (``class_name``) and
    the values returned by ``metrics``.

    :param root: directory that is walked recursively.
    :return: DataFrame with columns ``class_name, MTH, FLD, RFC, INT, SZ,
        CPX, EX, RET, BCM, NML, WRD, DCM``; empty (but with those columns)
        when no class is found.
    :raises: whatever ``open`` / ``javalang.parse.parse`` raise for an
        unreadable or unparsable file; such errors are not swallowed.
    """
    columns = ['class_name', 'MTH', 'FLD', 'RFC',
               'INT', 'SZ', 'CPX', 'EX', 'RET', 'BCM', 'NML', 'WRD', 'DCM']

    # Rows are collected first and the frame is built once at the end;
    # growing a DataFrame cell by cell through ``df.loc`` copies data on
    # every new row.
    rows = []
    for path, dirs, files in os.walk(root):
        # os.walk yields entries in filesystem order; sorting in place (which
        # also steers the traversal) makes the row order reproducible.
        dirs.sort()
        for f in sorted(files):
            if not f.endswith('.java'):
                continue

            # Get the AST of the file. The encoding is explicit so the result
            # does not depend on the platform's default locale.
            with open(os.path.join(path, f), encoding='utf-8') as file:
                tree = javalang.parse.parse(file.read())

            # Fetch package name from the first package declaration;
            # if the node is missing, assume the default package ('')
            package_name = next(
                (node.name
                 for _, node in tree.filter(javalang.tree.PackageDeclaration)),
                '')

            for parents, node in tree.filter(javalang.tree.ClassDeclaration):
                # ``parents`` is the path from the root to ``node``: it mixes
                # ancestor nodes with the child lists they were reached
                # through. Only the ancestor nodes say whether this class is
                # nested, so list entries are ignored by the isinstance test.
                if any(isinstance(p, javalang.tree.ClassDeclaration)
                       for p in parents):
                    continue

                # Avoid a leading '.' for classes in the default package.
                fqdn = (package_name + '.' + node.name
                        if package_name else node.name)
                rows.append({'class_name': fqdn, **metrics(node)})

    return pd.DataFrame(rows, columns=columns)


def main():
    """Regenerate ``feature_vectors.csv`` inside ``OUT_DIR``.

    Ensures the output directory exists, removes CSV files left there by
    earlier runs, extracts the feature vectors from ``SOURCES`` and writes
    them without the index column.
    """
    # to_csv does not create missing directories, so make sure the target
    # folder is there (e.g. on a fresh checkout) before writing.
    os.makedirs(OUT_DIR, exist_ok=True)
    clean_output()
    df = create_df(SOURCES)
    df.to_csv(OUT_DIR + '/feature_vectors.csv', index=False)


# Run the extraction only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()
done part 5, part 6, part 7, and 20-times CV for part 8 2023-04-25 12:23:41 +00:00			`#!/usr/bin/env python3`
			`import javalang`
			`import os`
			`import pandas as pd`
			`import glob`
			`import re`

			`DIR: str = os.path.dirname(os.path.realpath(__file__))`
			`SOURCES: str = DIR + '/resources/defects4j-checkout-closure-1f/src/com/google/javascript/jscomp/'`
			`OUT_DIR: str = DIR + '/metrics'`


			`def clean_output():`
			`filelist = glob.glob(OUT_DIR + '/*.csv')`
			`for f in filelist:`
			`os.remove(f)`


			`def count(node: javalang.tree.Node, the_filter) -> int:`
			`i = 0`
			`for _, _ in node.filter(the_filter):`
			`i += 1`
			`return i`


			`def metrics(clazz: javalang.tree.ClassDeclaration) -> dict[str, str]:`
			`# Method metrics`
			`m = {`
			`'MTH': len(clazz.methods),`
			`'FLD': len(clazz.fields),`
			`'RFC': 0`
			`}`

			`# RFC: #Public methods + #Method invocations`
			`for method in clazz.methods:`
			`if 'public' in method.modifiers:`
			`m['RFC'] += 1`
			`m['RFC'] += count(clazz, javalang.tree.MethodInvocation)`

			`# INT: #Implemented interfaces`
			`m['INT'] = 0 if clazz.implements is None else len(clazz.implements)`

			`# Method metrics (max across methods)`
			`sz = 0`
			`cpx = 0`
			`ex = 0`
			`ret = 0`
			`sum_m_name_len = 0`
			`for method in clazz.methods:`
			`sum_m_name_len += len(method.name)`

			`# SZ: #Statements`
Balanced classes for classifier training 2023-05-22 15:39:51 +00:00			`# BlockStatements nodes are curly braces (i.e. scopes), therefore they`
			`# must be discarded`
			`sz = max(sz, count(method, javalang.tree.Statement) -`
			`count(method, javalang.tree.BlockStatement))`
done part 5, part 6, part 7, and 20-times CV for part 8 2023-04-25 12:23:41 +00:00
			`# CPX: #CONDITIONAL + #LOOP statements`
			`cpx = max(cpx,`
			`count(method, javalang.tree.IfStatement) +`
			`count(method, javalang.tree.TernaryExpression) +`
			`count(method, javalang.tree.WhileStatement) +`
			`count(method, javalang.tree.ForStatement))`

			`# EX: #Exceptions in throws clause`
			`ex = max(ex, 0 if method.throws is None else len(method.throws))`

			`# RET: #Return points`
			`ret = max(ret, count(method, javalang.tree.ReturnStatement))`
			`m['SZ'] = sz`
			`m['CPX'] = cpx`
			`m['EX'] = ex`
			`m['RET'] = ret`

			`# NLP metrics`

			`# Average length of method names`
			`l = len(clazz.methods)`
			`m['NML'] = 0 if l == 0 else sum_m_name_len / l`

			`m['BCM'] = 0`
			`m['WRD'] = 0`
			`for _, node in clazz.filter(javalang.tree.Documented):`
			`if node.documentation is not None:`
			`# BCM: #block comments`
			`m['BCM'] += 1`

			`# #Words (the longest alphanumeric substrings) in block comments`
			`m['WRD'] += len(re.findall('\\w+', node.documentation))`

			`# #Words in comments / #Statements`
			`s = count(clazz, javalang.tree.Statement)`
			`m['DCM'] = 0 if s == 0 else m['WRD'] / s`

			`return m`


			`def create_df(root) -> pd.DataFrame:`
			`df = pd.DataFrame(columns=['class_name', 'MTH', 'FLD', 'RFC',`
			`'INT', 'SZ', 'CPX', 'EX', 'RET', 'BCM', 'NML', 'WRD', 'DCM'])`

			`i: int = 0`
			`for path, dirs, files in os.walk(root):`
			`for f in files:`
			`if f.endswith('.java'):`
			`# Get the AST of the file`
			`with open(path + '/' + f) as file:`
			`data = file.read()`
			`tree = javalang.parse.parse(data)`

			`# Fetch package name from package declaration`
			`# if node is missing, assuming default package ('')`
			`package_name = ''`
			`for _, node in tree.filter(javalang.tree.PackageDeclaration):`
			`package_name = node.name`
			`break`

			`# Get all classes and number of methods for each one`
Balanced classes for classifier training 2023-05-22 15:39:51 +00:00			`for parents, node in tree.filter(javalang.tree.ClassDeclaration):`

			`is_inner_class = False`
			`for p in parents[:-1]:`
			`t = type(p)`

			`# for some reason, all the parents other than the root`
			`# are wrapped in a list with only the type as element,`
			`# so I extract the type if this is the case`
			`if t == list:`
			`t = type(p[0])`

			`if t == javalang.tree.ClassDeclaration:`
			`is_inner_class = True`
			`break`

			`if not is_inner_class:`
			`fqdn = package_name + '.' + node.name`

			`df.loc[i, 'class_name'] = fqdn`
			`m = metrics(node)`
			`for metric in m:`
			`df.loc[i, metric] = m[metric]`
			`i += 1`
done part 5, part 6, part 7, and 20-times CV for part 8 2023-04-25 12:23:41 +00:00
			`return df`


			`def main():`
			`clean_output()`
			`df = create_df(SOURCES)`
			`df.to_csv(OUT_DIR + '/feature_vectors.csv', index=False)`


			`if __name__ == '__main__':`
			`main()`