2023-02-16 20:25:12 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
import javalang
|
|
|
|
import os
|
|
|
|
import pandas as pd
|
|
|
|
import glob
|
|
|
|
import re
|
|
|
|
|
|
|
|
# Absolute directory that contains this script (symlinks resolved).
DIR: str = os.path.dirname(os.path.realpath(__file__))
# Directory tree that is walked for .java files.
SOURCES: str = f'{DIR}/sources/src/com/google/javascript/jscomp/'
# Directory that receives the generated CSV files.
OUT_DIR: str = f'{DIR}/metrics'
|
|
|
|
|
|
|
|
|
|
|
|
def clean_output():
    """Delete every ``*.csv`` file located directly inside ``OUT_DIR``."""
    for csv_path in glob.glob(OUT_DIR + '/*.csv'):
        os.remove(csv_path)
|
|
|
|
|
|
|
|
|
|
|
|
def count(node: javalang.tree.Node, the_filter) -> int:
    """Return the number of items yielded by ``node.filter(the_filter)``.

    Args:
        node: AST node whose ``filter`` method is queried.
        the_filter: Pattern handed unchanged to ``node.filter`` (the callers
            in this file pass javalang node classes).

    Returns:
        How many matches the filter produced; ``0`` when there are none.
    """
    return sum(1 for _ in node.filter(the_filter))
|
|
|
|
|
|
|
|
|
|
|
|
def metrics(clazz: javalang.tree.ClassDeclaration) -> dict[str, float]:
    """Compute the feature vector of one Java class declaration.

    Args:
        clazz: javalang AST node of the class to measure.

    Returns:
        A dict mapping each metric name to a number (ints, except ``NML``
        and ``DCM`` which are floats when their divisor is non-zero):

        * ``MTH`` -- ``len(clazz.methods)``
        * ``FLD`` -- ``len(clazz.fields)``
        * ``RFC`` -- number of methods with the ``public`` modifier plus the
          number of method invocations found in the class
        * ``INT`` -- number of implemented interfaces
        * ``SZ``  -- maximum number of statements found in a method
        * ``CPX`` -- maximum number of if / ternary / while / for nodes found
          in a method
        * ``EX``  -- maximum number of exceptions in a ``throws`` clause
        * ``RET`` -- maximum number of return statements found in a method
        * ``NML`` -- average length of the method names
        * ``BCM`` -- number of documented nodes that carry documentation
        * ``WRD`` -- number of ``\\w+`` words in that documentation
        * ``DCM`` -- ``WRD`` divided by the number of statements in the class

        Every "maximum" and every ratio is ``0`` when there is nothing to
        measure (no methods, no statements).
    """
    tree = javalang.tree
    methods = clazz.methods
    method_count = len(methods)

    # Class-level metrics
    m = {
        'MTH': method_count,
        'FLD': len(clazz.fields),
        # RFC: #Public methods + #Method invocations
        'RFC': (sum(1 for method in methods if 'public' in method.modifiers)
                + count(clazz, tree.MethodInvocation)),
        # INT: #Implemented interfaces
        'INT': 0 if clazz.implements is None else len(clazz.implements),
    }

    # Method metrics (max across methods); default=0 covers method-less classes
    # SZ: #Statements
    m['SZ'] = max((count(method, tree.Statement) for method in methods),
                  default=0)

    # CPX: #CONDITIONAL + #LOOP statements
    branching_kinds = (tree.IfStatement, tree.TernaryExpression,
                       tree.WhileStatement, tree.ForStatement)
    m['CPX'] = max(
        (sum(count(method, kind) for kind in branching_kinds)
         for method in methods),
        default=0)

    # EX: #Exceptions in throws clause (a missing clause counts as 0)
    m['EX'] = max(
        (len(method.throws) for method in methods
         if method.throws is not None),
        default=0)

    # RET: #Return points
    m['RET'] = max((count(method, tree.ReturnStatement) for method in methods),
                   default=0)

    # NLP metrics

    # NML: average length of method names
    total_name_length = sum(len(method.name) for method in methods)
    m['NML'] = 0 if method_count == 0 else total_name_length / method_count

    m['BCM'] = 0
    m['WRD'] = 0
    for _, node in clazz.filter(tree.Documented):
        if node.documentation is not None:
            # BCM: #block comments
            m['BCM'] += 1
            # WRD: #Words (the longest alphanumeric substrings) in block
            # comments
            m['WRD'] += len(re.findall(r'\w+', node.documentation))

    # DCM: #Words in comments / #Statements
    statement_count = count(clazz, tree.Statement)
    m['DCM'] = 0 if statement_count == 0 else m['WRD'] / statement_count

    return m
|
|
|
|
|
|
|
|
|
|
|
|
def create_df(root) -> pd.DataFrame:
    """Build the metrics table for every class declared under *root*.

    Walks *root* recursively, parses each ``.java`` file with javalang and
    emits one row per class declaration found in it.

    Args:
        root: Directory to search for ``.java`` files.

    Returns:
        A DataFrame with a ``class_name`` column (``package.ClassName``, or
        just ``ClassName`` when the file has no package declaration) followed
        by one column per metric returned by ``metrics``. The frame is empty
        (columns only) when no class is found.
    """
    columns = ['class_name', 'MTH', 'FLD', 'RFC',
               'INT', 'SZ', 'CPX', 'EX', 'RET', 'BCM', 'NML', 'WRD', 'DCM']

    # Collect rows first and build the frame once; growing a DataFrame cell by
    # cell through .loc is much slower for large source trees.
    rows = []
    for path, _dirs, files in os.walk(root):
        for f in files:
            if not f.endswith('.java'):
                continue

            # Get the AST of the file. Decode explicitly as UTF-8 instead of
            # the platform's locale encoding so results do not vary by machine
            # (assumes the sources are UTF-8 -- confirm for other corpora).
            with open(os.path.join(path, f), encoding='utf-8') as file:
                tree = javalang.parse.parse(file.read())

            # Fetch package name from the first package declaration;
            # if node is missing, assuming default package ('')
            package_name = next(
                (node.name
                 for _, node in tree.filter(javalang.tree.PackageDeclaration)),
                '')

            # One row per class declaration found in the file
            for _, node in tree.filter(javalang.tree.ClassDeclaration):
                # Avoid a leading '.' for classes in the default package
                if package_name:
                    fqdn = package_name + '.' + node.name
                else:
                    fqdn = node.name
                rows.append({'class_name': fqdn, **metrics(node)})

    # dtype=object keeps the ints and floats exactly as computed (e.g. an NML
    # of 0 is written as 0, not 0.0, when the frame is saved to CSV).
    return pd.DataFrame(rows, columns=columns, dtype=object)
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Regenerate ``feature_vectors.csv`` inside ``OUT_DIR``.

    Removes the CSV files left by a previous run, extracts the metrics of
    every class under ``SOURCES`` and writes them without the index column.
    """
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the output directory exists before writing into it.
    os.makedirs(OUT_DIR, exist_ok=True)

    clean_output()
    df = create_df(SOURCES)
    df.to_csv(OUT_DIR + '/feature_vectors.csv', index=False)
|
|
|
|
|
2023-02-19 13:20:13 +00:00
|
|
|
|
2023-02-16 20:25:12 +00:00
|
|
|
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|