This repository was archived on 2023-06-18. You can view files and clone it, but you cannot push or open issues or pull requests.
ima02/extract_feature_vectors.py

155 lines
4.7 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
import javalang
import os
import pandas as pd
import glob
import re
# Directory containing this script, with symlinks resolved by realpath.
DIR: str = os.path.dirname(os.path.realpath(__file__))
# Root of the Java source tree that main() passes to create_df().
SOURCES: str = DIR + '/resources/defects4j-checkout-closure-1f/src/com/google/javascript/jscomp/'
# Directory where the CSV output is written and from which clean_output()
# removes existing *.csv files.
OUT_DIR: str = DIR + '/metrics'
def clean_output():
    """Delete every ``*.csv`` file located directly inside ``OUT_DIR``."""
    csv_pattern = OUT_DIR + '/*.csv'
    for stale_csv in glob.glob(csv_pattern):
        os.remove(stale_csv)
def count(node: javalang.tree.Node, the_filter) -> int:
    """Return how many entries ``node.filter(the_filter)`` yields.

    Args:
        node: AST node whose ``filter`` method is queried.
        the_filter: Pattern forwarded unchanged to ``node.filter``.

    Returns:
        The number of ``(path, match)`` pairs produced by the filter.
    """
    return sum(1 for _path, _match in node.filter(the_filter))
def metrics(clazz: javalang.tree.ClassDeclaration) -> dict[str, float]:
    """Compute the feature vector of one Java class.

    Args:
        clazz: javalang AST node of the class to measure.

    Returns:
        A dict mapping each metric name to a number (int, or float for the
        two ratios):

        * ``MTH``: number of entries in ``clazz.methods``.
        * ``FLD``: number of entries in ``clazz.fields``.
        * ``RFC``: number of public methods plus number of method
          invocations found anywhere in the class.
        * ``INT``: number of implemented interfaces (0 when there is no
          ``implements`` clause).
        * ``SZ``: maximum across methods of the statement count, block
          statements excluded.
        * ``CPX``: maximum across methods of ``if`` + ternary + ``while`` +
          ``for`` occurrences.
        * ``EX``: maximum across methods of the ``throws`` clause length.
        * ``RET``: maximum across methods of the ``return`` statement count.
        * ``NML``: average method-name length (0 when there are no methods).
        * ``BCM``: number of documented nodes that carry documentation.
        * ``WRD``: number of ``\\w+`` words in that documentation.
        * ``DCM``: ``WRD`` divided by the number of statements in the class
          (0 when there are no statements).
    """
    methods = clazz.methods

    m = {
        'MTH': len(methods),
        'FLD': len(clazz.fields),
        # RFC: #Public methods + #Method invocations
        'RFC': (sum(1 for method in methods if 'public' in method.modifiers)
                + count(clazz, javalang.tree.MethodInvocation)),
        # INT: #Implemented interfaces
        'INT': 0 if clazz.implements is None else len(clazz.implements),
    }

    # Node types that contribute to CPX (#CONDITIONAL + #LOOP).
    # NOTE(review): do-while (DoStatement) and switch (SwitchStatement) are
    # not counted here -- confirm this matches the intended metric.
    branching_kinds = (
        javalang.tree.IfStatement,
        javalang.tree.TernaryExpression,
        javalang.tree.WhileStatement,
        javalang.tree.ForStatement,
    )

    # Method metrics: each one is the maximum observed across all methods.
    sz = cpx = ex = ret = 0
    name_length_total = 0
    for method in methods:
        name_length_total += len(method.name)

        # SZ: #Statements. BlockStatement nodes are curly braces (i.e.
        # scopes), therefore they are subtracted from the total.
        sz = max(sz, count(method, javalang.tree.Statement) -
                 count(method, javalang.tree.BlockStatement))

        # CPX: #CONDITIONAL + #LOOP statements
        cpx = max(cpx, sum(count(method, kind) for kind in branching_kinds))

        # EX: #Exceptions in throws clause
        ex = max(ex, 0 if method.throws is None else len(method.throws))

        # RET: #Return points
        ret = max(ret, count(method, javalang.tree.ReturnStatement))

    m['SZ'] = sz
    m['CPX'] = cpx
    m['EX'] = ex
    m['RET'] = ret

    # NLP metrics
    # NML: average length of method names
    m['NML'] = name_length_total / len(methods) if methods else 0

    m['BCM'] = 0
    m['WRD'] = 0
    for _, node in clazz.filter(javalang.tree.Documented):
        if node.documentation is not None:
            # BCM: #block comments
            m['BCM'] += 1
            # WRD: #Words (the longest alphanumeric substrings) in block
            # comments
            m['WRD'] += len(re.findall(r'\w+', node.documentation))

    # DCM: #Words in comments / #Statements
    # NOTE(review): unlike SZ, this denominator does not subtract block
    # statements -- confirm whether that difference is intentional.
    statement_total = count(clazz, javalang.tree.Statement)
    m['DCM'] = 0 if statement_total == 0 else m['WRD'] / statement_total
    return m
def _is_inner_class(parents) -> bool:
    """Tell whether the path to a class passes through another class.

    ``parents`` is the path that ``tree.filter`` yields together with a
    class node. Its last entry is skipped; every other entry is checked for
    being a ``ClassDeclaration``.
    """
    for parent in parents[:-1]:
        parent_type = type(parent)
        # Some path entries are lists instead of nodes; such an entry is
        # judged by the type of its first element.
        # NOTE(review): only the first element of a list entry is inspected,
        # so the verdict for a class nested in a non-class type may depend on
        # what that first element happens to be -- confirm this is intended.
        if parent_type is list:
            parent_type = type(parent[0])
        if parent_type is javalang.tree.ClassDeclaration:
            return True
    return False


def create_df(root) -> pd.DataFrame:
    """Build the feature-vector table for every non-inner class under ``root``.

    ``root`` is walked recursively; each file whose name ends in ``.java`` is
    parsed with javalang and every class declaration that is not detected as
    an inner class contributes one row.

    Args:
        root: Directory to walk for ``.java`` files.

    Returns:
        A DataFrame with one row per class: ``class_name`` holds the
        package-qualified class name, the remaining columns hold the values
        returned by ``metrics``. The frame is empty (columns only) when no
        class is found.
    """
    columns = ['class_name', 'MTH', 'FLD', 'RFC',
               'INT', 'SZ', 'CPX', 'EX', 'RET', 'BCM', 'NML', 'WRD', 'DCM']

    # Rows are collected first and the frame is built once at the end:
    # growing a DataFrame one cell at a time re-allocates it over and over.
    rows = []
    for path, _dirs, files in os.walk(root):
        for f in files:
            if not f.endswith('.java'):
                continue

            # Get the AST of the file.
            # NOTE(review): decodes as UTF-8 instead of the platform default;
            # assumes the Java sources are UTF-8 encoded -- confirm.
            with open(os.path.join(path, f), encoding='utf-8') as file:
                tree = javalang.parse.parse(file.read())

            # Fetch package name from the first package declaration; if the
            # node is missing, assume the default package ('').
            package_name = next(
                (node.name
                 for _, node in tree.filter(javalang.tree.PackageDeclaration)),
                '')

            # One row per class declaration that is not an inner class.
            for parents, node in tree.filter(javalang.tree.ClassDeclaration):
                if _is_inner_class(parents):
                    continue
                # A class in the default package has no package prefix, so
                # no leading '.' must be emitted for it.
                if package_name:
                    fqdn = package_name + '.' + node.name
                else:
                    fqdn = node.name
                rows.append({'class_name': fqdn, **metrics(node)})

    # dtype=object stores the values exactly as metrics() returned them
    # instead of letting pandas coerce each column to an inferred dtype.
    return pd.DataFrame(rows, columns=columns, dtype=object)
def main():
    """Regenerate ``feature_vectors.csv`` inside ``OUT_DIR``.

    Removes the CSV files left in ``OUT_DIR``, extracts the feature vectors
    of the classes under ``SOURCES`` and writes them, without the index
    column, to ``OUT_DIR/feature_vectors.csv``.
    """
    # DataFrame.to_csv does not create missing directories, so make sure the
    # output directory exists before the (long) extraction starts.
    os.makedirs(OUT_DIR, exist_ok=True)
    clean_output()
    df = create_df(SOURCES)
    df.to_csv(OUT_DIR + '/feature_vectors.csv', index=False)
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()