part 1 almost done (no filtering on filenames / identifiers)

This commit is contained in:
Claudio Maggioni 2023-10-09 15:08:47 +02:00
parent 1122cdd8b0
commit 57cf6164f4
3 changed files with 116359 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
env/

116289
data.csv Normal file

File diff suppressed because it is too large Load diff

69
extract-data.py Normal file
View file

@ -0,0 +1,69 @@
import ast
import pandas as pd
import os
# Absolute path of the directory containing this script; the input and output
# locations below are resolved relative to it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Directory tree that is walked for ``.py`` files.
IN_DIR = os.path.join(SCRIPT_DIR, "tensorflow")

# CSV file the extracted rows are written to.
OUT_FILE = os.path.join(SCRIPT_DIR, "data.csv")
def find_py_files(dir):
    """Yield the path of every ``.py`` file under ``dir``, recursively.

    Directories and files are visited in sorted name order, so walking the
    same tree twice yields the same sequence of paths regardless of the
    order in which the file system lists its entries.

    Args:
        dir: Root directory to walk. (The name shadows the builtin; it is
            kept so that existing keyword callers keep working.)

    Yields:
        For each file whose name ends in ``.py``, the containing directory
        joined with the file name via ``os.path.join``.
    """
    for cwd, dirs, files in os.walk(dir):
        # os.walk descends into whatever is left in ``dirs`` (top-down mode),
        # in list order; sorting it in place fixes the descent order.
        dirs.sort()
        for file in sorted(files):
            if file.endswith(".py"):
                yield os.path.join(cwd, file)
class FeatureVisitor(ast.NodeVisitor):
    """Collect one row per function, method and class defined in a module.

    After ``visit`` has been called on a parsed module, ``rows`` holds a list
    of dicts with the keys ``name``, ``file``, ``line``, ``type`` and
    ``comment``. ``type`` is ``"function"``, ``"method"`` or ``"class"``;
    ``comment`` is whatever ``ast.get_docstring`` returns for the node
    (``None`` when the definition has no docstring).

    A ``def`` or ``async def`` placed directly in a class body is recorded as
    a method; any other ``def``/``async def`` reached by the traversal is
    recorded as a function. Classes nested directly in a class body are
    recorded too. Definitions nested inside a function or method body are not
    recorded.
    """

    def __init__(self, filename):
        """Create a visitor for one source file.

        Args:
            filename: Path of the file being analysed. It is stored relative
                to ``SCRIPT_DIR`` and copied into the ``file`` field of every
                row.
        """
        self.filename = os.path.relpath(filename, SCRIPT_DIR)
        self.rows = []

    def _record(self, node, kind):
        """Append a row describing the definition ``node`` with type ``kind``."""
        self.rows.append({
            "name": node.name,
            "file": self.filename,
            "line": node.lineno,
            "type": kind,
            "comment": ast.get_docstring(node),
        })

    def visit_FunctionDef(self, node):
        """Record ``node`` as a function; its body is not traversed."""
        self._record(node, "function")

    def visit_AsyncFunctionDef(self, node):
        """Record an ``async def`` the same way as a plain ``def``."""
        # ``async def`` parses to its own node type, so without this handler
        # NodeVisitor would never dispatch it to visit_FunctionDef.
        self.visit_FunctionDef(node)

    def visit_MethodDef(self, node):
        """Record ``node`` as a method; its body is not traversed.

        The ``ast`` module has no ``MethodDef`` node type, so NodeVisitor
        never dispatches here by itself; ``visit_ClassDef`` calls this for
        the function definitions found directly in a class body.
        """
        self._record(node, "method")

    def visit_ClassDef(self, node):
        """Record the class, then its direct methods and nested classes."""
        self._record(node, "class")
        for child in node.body:
            if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)):
                self.visit_MethodDef(child)
            elif isinstance(child, ast.ClassDef):
                self.visit_ClassDef(child)
def main():
    """Extract the definitions found under ``IN_DIR`` and write them to ``OUT_FILE``.

    Every file yielded by ``find_py_files(IN_DIR)`` is parsed with
    ``ast.parse`` and traversed by a ``FeatureVisitor``. The rows collected
    from all files are written as one CSV with the columns ``name``,
    ``file``, ``line``, ``type`` and ``comment``, preceded by the default
    integer index column.

    Raises:
        SyntaxError: Propagated from ``ast.parse`` when a file is not valid
            Python source; no CSV is written in that case.
    """
    columns = ["name", "file", "line", "type", "comment"]
    rows = []
    for file in find_py_files(IN_DIR):
        # Read raw bytes and let the parser decode them: it honours a BOM or
        # coding declaration and otherwise assumes UTF-8, instead of relying
        # on the platform's default text encoding.
        with open(file, "rb") as f:
            py_source = f.read()
        # Passing the filename makes parse errors name the offending file.
        py_ast = ast.parse(py_source, filename=file)
        visitor = FeatureVisitor(file)
        visitor.visit(py_ast)
        rows.extend(visitor.rows)
    # Build the frame once from all rows rather than concatenating per file,
    # which re-copies the accumulated data on every iteration.
    pd.DataFrame(rows, columns=columns).to_csv(OUT_FILE)
# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()