project work part 1 done

2023-03-06 16:00:00 +01:00 · 2023-03-06 16:00:00 +01:00 · 6ecca28c13
commit 6ecca28c13
parent 5af7724e0d
5 changed files with 85 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,2 @@
-virtualenv/
+env/
+__pycache__/
--- a/find_god_classes.py
+++ b/find_god_classes.py
@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+import javalang
+import os
+import pandas as pd
+import glob
+
+# God class if:
+# |M(C)| > E(M) + 6*V(M)
+# (number of methods greater than average across all classes plus 6 times the
+# standard deviation)
+
+
+# Absolute directory containing this script; the paths below are built on it.
+DIR: str = os.path.dirname(os.path.realpath(__file__))
+# Root of the Java source tree that main() hands to create_df().
+SOURCES: str = DIR + '/resources/xerces2-j-src'
+# Directory whose *.csv files are purged and where the result CSV is written.
+OUT_DIR: str = DIR + '/god_classes'
+
+
+def clean_output():
+    """Delete every ``.csv`` file located directly inside ``OUT_DIR``."""
+    for stale_csv in glob.glob(OUT_DIR + '/*.csv'):
+        os.remove(stale_csv)
+
+
+def parse(path: str) -> list[tuple[str, int]]:
+    """Count the methods of every class declared in one Java source file.
+
+    Args:
+        path: location of the ``.java`` file to analyse.
+
+    Returns:
+        One ``(qualified_class_name, method_count)`` tuple per
+        ``ClassDeclaration`` node yielded by the AST filter. The qualified
+        name is ``<package>.<class>``, or just ``<class>`` when the file has
+        no package declaration.
+    """
+    # Get the AST of the file
+    with open(path) as file:
+        tree = javalang.parse.parse(file.read())
+
+    # Fetch package name from the first package declaration;
+    # if the node is missing, assume the default package ('')
+    package_name = next(
+        (node.name
+         for _, node in tree.filter(javalang.tree.PackageDeclaration)),
+        '',
+    )
+    # Classes in the default package must not get a leading '.'
+    prefix = package_name + '.' if package_name else ''
+
+    # Get all classes and number of methods for each one
+    return [
+        (prefix + node.name, len(node.methods))
+        for _, node in tree.filter(javalang.tree.ClassDeclaration)
+    ]
+
+
+def create_df(root) -> pd.DataFrame:
+    """Build a frame with one row per class found under ``root``.
+
+    Args:
+        root: directory walked recursively for files ending in ``.java``.
+
+    Returns:
+        A DataFrame with columns ``class_name`` and ``method_num``, indexed
+        0..n-1 in the order the classes were discovered. It is empty (but
+        keeps both columns) when no Java file is found.
+    """
+    rows = []
+    for path, _dirs, files in os.walk(root):
+        for f in files:
+            if f.endswith('.java'):
+                # for each java file, collect all entries found
+                rows.extend(parse(os.path.join(path, f)))
+
+    # Build the frame in one go: growing it row by row through ``.loc`` is
+    # slow on large source trees and needs a hand-maintained row counter.
+    return pd.DataFrame(rows, columns=['class_name', 'method_num'])
+
+
+def main():
+    """Write the classes whose method count exceeds mean + 6 * std to CSV.
+
+    Old CSV output is removed first, then every class under ``SOURCES`` is
+    measured and the outliers are saved to ``OUT_DIR``.
+    """
+    clean_output()
+    df = create_df(SOURCES)
+
+    method_counts = df['method_num']
+    threshold = method_counts.mean() + 6 * method_counts.std()
+
+    outliers = df[method_counts > threshold]
+    outliers.to_csv(OUT_DIR + '/god_classes.csv')
+
+
+if __name__ == '__main__':
+    main()
--- a/god_classes/.gitkeep
+++ b/god_classes/.gitkeep
--- a/god_classes/god_classes.csv
+++ b/god_classes/god_classes.csv
@ -0,0 +1,5 @@
+,class_name,method_num
+250,org.apache.xerces.impl.xs.traversers.XSDHandler,118
+300,org.apache.xerces.impl.dtd.DTDGrammar,101
+406,org.apache.xerces.xinclude.XIncludeHandler,116
+602,org.apache.xerces.dom.CoreDocumentImpl,125
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,4 @@
+javalang==0.13.0
+pandas==1.5.2
+scikit_learn==1.2.1
+shrek==0.0.2