#!/usr/bin/env python3
import glob
import os
import re

import javalang
import pandas as pd

# God class if:
#   |M(C)| > E(M) + 6 * V(M)
# i.e. the number of methods is greater than the average across all classes
# plus 6 times the standard deviation.

DIR: str = os.path.dirname(os.path.realpath(__file__))
SOURCES: str = os.path.join(DIR, 'sources/src/com/google/javascript/jscomp/')
OUT_DIR: str = os.path.join(DIR, 'metrics')


def clean_output():
    """Make sure the output directory exists and remove CSVs from previous runs."""
    os.makedirs(OUT_DIR, exist_ok=True)
    for f in glob.glob(os.path.join(OUT_DIR, '*.csv')):
        os.remove(f)


def count(node: javalang.tree.Node, the_filter) -> int:
    """Count the descendants of `node` matching the node type `the_filter`."""
    return sum(1 for _ in node.filter(the_filter))


def metrics(clazz: javalang.tree.ClassDeclaration) -> dict[str, float]:
    m = {}
    m['MTH'] = len(clazz.methods)  # MTH: #Methods
    m['FLD'] = len(clazz.fields)   # FLD: #Fields

    # RFC: #Public methods + #Method invocations
    m['RFC'] = sum(1 for method in clazz.methods if 'public' in method.modifiers)
    m['RFC'] += count(clazz, javalang.tree.MethodInvocation)

    # INT: #Implemented interfaces
    m['INT'] = 0 if clazz.implements is None else len(clazz.implements)

    # Method metrics (maximum across all methods of the class)
    sz = cpx = ex = ret = 0
    sum_m_name_len = 0
    for method in clazz.methods:
        sum_m_name_len += len(method.name)
        # SZ: #Statements
        sz = max(sz, count(method, javalang.tree.Statement))
        # CPX: #Conditional + #Loop statements
        cpx = max(cpx, count(method, javalang.tree.IfStatement)
                  + count(method, javalang.tree.TernaryExpression)
                  + count(method, javalang.tree.WhileStatement)
                  + count(method, javalang.tree.ForStatement))
        # EX: #Exceptions in the throws clause
        ex = max(ex, 0 if method.throws is None else len(method.throws))
        # RET: #Return points
        ret = max(ret, count(method, javalang.tree.ReturnStatement))
    m['SZ'] = sz
    m['CPX'] = cpx
    m['EX'] = ex
    m['RET'] = ret

    # NLP metrics
    # NML: average length of method names
    n_methods = len(clazz.methods)
    m['NML'] = 0 if n_methods == 0 else sum_m_name_len / n_methods

    m['BCM'] = 0
    m['WRD'] = 0
    for _, node in clazz.filter(javalang.tree.Documented):
        if node.documentation is not None:
            # BCM: #Block comments
            m['BCM'] += 1
            # WRD: #Words (longest alphanumeric substrings) in block comments
            m['WRD'] += len(re.findall(r'\w+', node.documentation))
    # DCM: #Words in comments / #Statements
    s = count(clazz, javalang.tree.Statement)
    m['DCM'] = 0 if s == 0 else m['WRD'] / s

    return m


def create_df(root) -> pd.DataFrame:
    """Walk `root`, parse every .java file and build one feature row per class."""
    df = pd.DataFrame(columns=['class_name', 'MTH', 'FLD', 'RFC', 'INT', 'SZ',
                               'CPX', 'EX', 'RET', 'BCM', 'NML', 'WRD', 'DCM'])
    i: int = 0
    for path, _, files in os.walk(root):
        for f in files:
            if not f.endswith('.java'):
                continue
            # Get the AST of the file
            with open(os.path.join(path, f)) as file:
                tree = javalang.parse.parse(file.read())

            # Fetch the package name from the package declaration;
            # if the node is missing, assume the default package ('')
            package_name = ''
            for _, node in tree.filter(javalang.tree.PackageDeclaration):
                package_name = node.name
                break

            # Compute the metrics for every class declared in the file
            for _, node in tree.filter(javalang.tree.ClassDeclaration):
                df.loc[i, 'class_name'] = package_name + '.' + node.name
                for metric, value in metrics(node).items():
                    df.loc[i, metric] = value
                i += 1
    return df


def main():
    clean_output()
    df = create_df(SOURCES)
    df.to_csv(os.path.join(OUT_DIR, 'feature_vectors.csv'), index=False)


if __name__ == '__main__':
    main()
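

# A minimal sketch (not invoked above) of how the god-class rule from the
# header comment could be applied to the CSV written by main(): a class is
# flagged when its method count exceeds the mean plus 6 standard deviations
# across all classes. The column names match those produced by create_df();
# the helper itself is an illustrative assumption, not part of this script's
# extraction pipeline.
def find_god_classes(csv_path: str) -> list[str]:
    df = pd.read_csv(csv_path)
    threshold = df['MTH'].mean() + 6 * df['MTH'].std()
    return df.loc[df['MTH'] > threshold, 'class_name'].tolist()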