This repository has been archived on 2023-06-18. You can view files and clone it, but cannot push or open issues or pull requests.
ima01/extract_feature_vectors.py

144 lines
4.6 KiB
Python
Executable File

#!/usr/bin/env python3
import javalang
import os
import pandas as pd
import glob
import warnings
import tabulate
# Absolute directory that contains this script; every other path is derived from it.
DIR: str = os.path.dirname(os.path.realpath(__file__))
# Root of the Java source tree in which the listed classes are looked up.
SOURCES: str = f'{DIR}/resources/xerces2-j-src'
# Directory receiving one feature-vector CSV file per analysed class.
OUT_DIR: str = f'{DIR}/feature_vectors'
# Directory holding the god_classes.csv input list.
IN_DIR: str = f'{DIR}/god_classes'
def clean_output():
    """Delete every ``*.csv`` file located directly inside ``OUT_DIR``."""
    for csv_path in glob.glob(OUT_DIR + '/*.csv'):
        os.remove(csv_path)
def get_fields(java_class: javalang.tree.ClassDeclaration) -> set[str]:
    """Return the names of all fields declared in ``java_class``.

    A single Java field declaration can introduce several variables
    (``int a, b;``), in which case the declaration node carries one
    declarator per variable. Every declarator is collected, not only the
    first one, so that no field name is lost.

    :param java_class: class node whose ``fields`` are inspected
    :return: set of declared field names (empty if the class has no fields)
    """
    names: set[str] = set()
    for field in java_class.fields:
        names.update(declarator.name for declarator in field.declarators)
    return names
def get_methods(java_class: javalang.tree.ClassDeclaration) -> set[str]:
    """Return the set of method names declared in ``java_class``.

    Names are stored in a set, so overloaded methods (same name, different
    signature) contribute a single entry.
    """
    return {method.name for method in java_class.methods}
def empty_qualifier(node: any) -> bool:
    """Tell whether ``node`` is referenced without a meaningful qualifier.

    Returns True when ``node.qualifier`` is ``None``, the empty string or
    exactly ``'this'``; False for any other qualifier.
    """
    qualifier = node.qualifier
    if qualifier is None:
        return True
    return qualifier == '' or qualifier == 'this'
def get_fields_accessed_by_method(method: javalang.tree.MethodDeclaration, fields: set[str]) -> set[str]:
    """Return the names that ``method`` accesses through member references.

    Two kinds of names are collected:

    * for an unqualified (or ``this``-qualified) reference, the member name,
      provided it is one of ``fields``;
    * for a qualified reference such as ``a.b.c.x``, the first component of
      the qualifier (``a``), unless ``a`` is a local variable or a formal
      parameter of the method.

    :param method: method node whose body is scanned
    :param fields: names of the fields declared by the enclosing class
    :return: set of accessed field / qualifier names
    """
    # Names bound inside the method itself (local variables and formal
    # parameters); qualified references starting with one of them are ignored.
    local_variables = set()
    for _, node in method.filter(javalang.tree.LocalVariableDeclaration):
        # `Foo a, b;` is one declaration with several declarators: record
        # all of them, otherwise `b.x` would be mistaken for a class-level
        # reference.
        local_variables.update(declarator.name for declarator in node.declarators)
    for _, node in method.filter(javalang.tree.FormalParameter):
        local_variables.add(node.name)

    nodes = set()
    for _, node in method.filter(javalang.tree.MemberReference):
        if empty_qualifier(node):
            if node.member in fields:
                nodes.add(node.member)
            continue

        # qualifier of 'a.b.c.x' is 'a.b.c', we only care about 'a'
        name = node.qualifier.split('.', maxsplit=1)[0]
        # The field set cannot be used as a filter here, since the qualifier
        # may be a class (referenced through a static property). Instead the
        # names known to be local to the method are explicitly excluded.
        if name not in local_variables:
            # if a MemberReference includes a non empty qualifier (e.g., a.x),
            # consider the qualifier (a), not the member (x)
            nodes.add(name)
    return nodes
def get_methods_accessed_by_method(method: javalang.tree.MethodDeclaration, methods: set[str]) -> set[str]:
    """Return the names of the class's own methods that ``method`` invokes.

    An invocation is counted only when it has an empty qualifier (see
    ``empty_qualifier``) and the invoked name is one of ``methods``.
    """
    return {
        call.member
        for _, call in method.filter(javalang.tree.MethodInvocation)
        if empty_qualifier(call) and call.member in methods
    }
def parse(path: str, name: str, df_table: pd.DataFrame) -> None:
    """Build the feature-vector CSV of the class defined in the Java file ``path``.

    The file is parsed with javalang; the class whose name matches the file
    name is located and a 0/1 matrix is built with one row per method name
    and one column per accessed field, qualifier or method. Columns that are
    all zero are dropped, the matrix is written to
    ``OUT_DIR/<fully qualified class name>.csv`` and its dimensions are
    recorded in ``df_table``.

    :param path: path of the ``.java`` source file to analyse
    :param name: row label under which the results are stored in ``df_table``
    :param df_table: summary table, modified in place: the cells
        ``'# Feature Vectors'`` and ``'# Attributes'`` of row ``name`` receive
        the number of rows and columns of the written matrix
    """
    # Get the AST of the file
    with open(path) as file:
        tree = javalang.parse.parse(file.read())

    # Fetch package name from package declaration
    # if node is missing, assuming default package ('')
    package_name = ''
    for _, node in tree.filter(javalang.tree.PackageDeclaration):
        package_name = node.name
        break

    file_name = os.path.basename(path)
    for _, node in tree.filter(javalang.tree.ClassDeclaration):
        # Consider only the class matching the input file name, to skip inner
        # classes. The whole base name is compared so that a class whose name
        # is merely a suffix of the file name cannot match.
        if file_name != node.name + '.java':
            continue

        # No leading dot for classes living in the default package.
        fqdn = package_name + '.' + node.name if package_name else node.name

        fields = get_fields(node)
        methods = get_methods(node)
        df = pd.DataFrame(columns=sorted(fields.union(methods)), dtype=int)

        for m in node.methods:
            # Make sure the method is included in the csv file, but do not
            # reset a row that already exists: overloaded methods share one
            # row, which must accumulate the accesses of every overload.
            if m.name not in df.index:
                df.loc[m.name, :] = 0

            m_fields = get_fields_accessed_by_method(m, fields)
            m_methods = get_methods_accessed_by_method(m, methods)
            # Sorted so that columns created on the fly (qualifiers that are
            # not fields of the class) are appended in a reproducible order.
            for member in sorted(m_fields.union(m_methods)):
                if member not in df:
                    df[member] = 0
                df.loc[m.name, member] = 1

        # Row enlargement may leave NaN / non-integer cells behind.
        df = df.fillna(0).astype(int)
        # Drop the columns that no method accesses.
        df = df.loc[:, (df != 0).any(axis=0)]

        df_table.loc[name, '# Feature Vectors'] = df.shape[0]
        df_table.loc[name, '# Attributes'] = df.shape[1]
        df.to_csv(OUT_DIR + '/' + fqdn + '.csv')
        break
def main():
    """Generate the feature-vector CSV files for every listed god class.

    Reads the ``class_name`` column of ``IN_DIR/god_classes.csv``, maps each
    fully qualified class name to its ``.java`` file under ``SOURCES``, runs
    ``parse`` on it and finally prints a Markdown summary table with the
    number of feature vectors and attributes per class.
    """
    warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

    # DataFrame.to_csv does not create missing directories, so make sure the
    # output directory exists before any file is written into it.
    os.makedirs(OUT_DIR, exist_ok=True)
    clean_output()

    df = pd.read_csv(IN_DIR + '/god_classes.csv')
    df_table = pd.DataFrame(columns=['# Feature Vectors', '# Attributes'])

    for clazz in df['class_name'].to_list():
        # 'a.b.C' -> '<SOURCES>/a/b/C.java'
        clazz_path = SOURCES + '/' + clazz.replace('.', '/') + '.java'
        print(clazz_path)
        parse(clazz_path, clazz, df_table)

    df_table.index.name = "Class Name"
    print(df_table.to_markdown())
# Run the extraction only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()