Final version of the bug-triaging project
Commit history has been discarded to remove large files from the repo.
1
.env.template
Normal file
@@ -0,0 +1 @@
GITHUB_TOKEN=

19
.gitattributes
vendored
Normal file
@@ -0,0 +1,19 @@
/issues.csv filter=lfs diff=lfs merge=lfs -text
/issues_new.csv filter=lfs diff=lfs merge=lfs -text
/src/model-dl/bbc-text.csv filter=lfs diff=lfs merge=lfs -text
/issues_000001_170000.csv filter=lfs diff=lfs merge=lfs -text
/issues_150000_170000.csv filter=lfs diff=lfs merge=lfs -text
/issues_170001_180000.csv filter=lfs diff=lfs merge=lfs -text
/issues_test_170001_180000.csv filter=lfs diff=lfs merge=lfs -text
/issues_train_000001_170000.csv filter=lfs diff=lfs merge=lfs -text
/issues_train_recent_150000_170000.csv filter=lfs diff=lfs merge=lfs -text
/out/csv/issues_test_170001_180000.csv filter=lfs diff=lfs merge=lfs -text
/out/csv/issues_train_000001_170000.csv filter=lfs diff=lfs merge=lfs -text
/out/csv/issues_train_recent_150000_170000.csv filter=lfs diff=lfs merge=lfs -text
/out/csv/issues_test_2_170001_180000.csv filter=lfs diff=lfs merge=lfs -text
/out/csv/issues_train_2_000001_170000.csv filter=lfs diff=lfs merge=lfs -text
/out/csv/issues_train_recent_2_150000_170000.csv filter=lfs diff=lfs merge=lfs -text
/out/json/issues.tar.gz filter=lfs diff=lfs merge=lfs -text
/out/model/bug_triaging_all_10e_1e-06lr_relu.pt filter=lfs diff=lfs merge=lfs -text
/out/model/bug_triaging_recent_10e_1e-06lr.pt filter=lfs diff=lfs merge=lfs -text
/out/model/bug_triaging_recent_10e_1e-06lr_relu.pt filter=lfs diff=lfs merge=lfs -text

461
.gitignore
vendored
Normal file
@@ -0,0 +1,461 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
**/latex/

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
**/.DS_Store
out/model/*.pt

## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb

## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf

## Generated if empty string is given at "Please type another file name for output:"
**/*.pdf

## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml

## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync

## Build tool directories for auxiliary files
# latexrun
latex.out/

## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa

# achemso
acs-*.bib

# amsthm
*.thm

# beamer
*.nav
*.pre
*.snm
*.vrb

# changes
*.soc

# comment
*.cut

# cprotect
*.cpt

# elsarticle (documentclass of Elsevier journals)
*.spl

# endnotes
*.ent

*.lox

# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm

#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R

# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs
*.slg
*.slo
*.sls

# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist

# gnuplot
*.gnuplot
*.table

# gnuplottex
*-gnuplottex-*

# gregoriotex
*.gaux
*.glog
*.gtex

# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref

# hyperref
*.brf

# knitr
*-concordance.tex
# *.tikz
*-tikzDictionary

# listings
*.lol

# luatexja-ruby
*.ltjruby

# makeidx
*.idx
*.ilg
*.ind

# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*

# minted
_minted*
*.pyg

# morewrites
*.mw

# newpax
*.newpax

# nomencl
*.nlg
*.nlo
*.nls

# pax
*.pax

# pdfpcnotes
*.pdfpc

# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd

# scrwfile
*.wrt

# svg
svg-inkscape/

# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/

# pdfcomment
*.upa
*.upb

# pythontex
*.pytxcode
pythontex-files-*/

# tcolorbox
*.listing

# thmtools
*.loe

# TikZ & PGF
*.dpth
*.md5
*.auxlock

# titletoc
*.ptc

# todonotes
*.tdo

# vhistory
*.hst
*.ver

*.lod

# xcolor
*.xcp

# xmpincl
*.xmpi

# xindy
*.xdy

# xypic precompiled matrices and outlines
*.xyc
*.xyd

# endfloat
*.ttt
*.fff

# Latexian
TSWLatexianTemp*

## Editors:
# WinEdt
*.bak
*.sav

# Texpad
.texpadtmp

# LyX
*.lyx~

# Kile
*.backup

# gummi
.*.swp

# KBibTeX
*~[0-9]*

# TeXnicCenter
*.tps

# auto folder when using emacs and auctex
./auto/*
*.el

# expex forward references with \gathertags
*-tags.tex

# standalone packages
*.sta

# Makeindex log files
*.lpz

# xwatermark package
*.xwm

# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are the stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib

55
.gitlab-ci.yml
Normal file
@@ -0,0 +1,55 @@
image: python:3.10.7

stages:
  - test
  - sonarqube
  - code_quality
  - deploy

tests:
  stage: test
  script:
    - python -m venv venv
    - source venv/bin/activate
    - pip install -r requirements.txt
    - python -m pytest --cov=. --junitxml=coverage/junit-report.xml tests
    - coverage xml
  artifacts:
    when: always
    paths:
      - coverage.xml
      - coverage/
    reports:
      #cobertura: coverage/cobertura-coverage.xml
      junit: coverage/junit-report.xml

sonarqube-check:
  only:
    - main # Code quality runs only on main
  stage: code_quality
  allow_failure: true
  image:
    name: ${CI_DEPENDENCY_PROXY_DIRECT_GROUP_IMAGE_PREFIX}/sonarsource/sonar-scanner-cli:latest
    entrypoint: ['']
  variables:
    SONAR_USER_HOME: '${CI_PROJECT_DIR}/.sonar'
    GIT_DEPTH: '0' # Tells git to fetch all the branches of the project, required by the analysis task
  cache:
    key: '${CI_JOB_NAME}'
    paths:
      - .sonar/cache
  script:
    - sonar-scanner

#docker-build:
#  image: docker:latest
#  stage: deploy
#  services:
#    - docker:dind
#  before_script:
#    - docker login -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD"
#  script:
#    - docker build -t "$CI_REGISTRY_USER/sa-triage" .
#    - docker push "$CI_REGISTRY_USER/sa-triage"

243
README.md
Normal file
@@ -0,0 +1,243 @@
# Assignment 1: Automated Bug Triaging

**Group 2: Baris Aksakal, Edoardo Riggio, Claudio Maggioni**

# Repository structure

- `/docs`: LaTeX report code;
- `/out`
  - `/csv`: Cleaner output;
  - `/json`: Scraper output;
  - `/model`: Pickled models (model training output) and model evaluation output;
  - `/plots`: Plots for the dataset statistical analysis;
- `/src`
  - `/analysis`: Notebook for the dataset statistical analysis;
  - `/model-dl`
    - `/bert_medium.ipynb`: Original implementation of the classifier model, since broken down into Python files;
    - `/model*.ipynb`: Alternative model implementation by Baris Aksakal. Not used in the final implementation;
  - `/{cleaner,modelimpl,scraper}`: Python modules used for the scraper, cleaner, and model script implementations;
  - `/auc.py`: ROC curve generation script;
  - `/clean.py`: Cleaner script;
  - `/runmodel.py`: Model execution script;
  - `/scrape.py`: Scraper script;
  - `/trainmodel.py`: Model training script;
- `/environment-dev.yml`: Conda environment file for the development environment;
- `/environment-server.yml`: Conda environment file for model training and execution (to be used with `gym.si.usi.ch`).

# Setup

## Conda Environment

Training and running models is only supported on a CUDA 11.6 compatible environment like `gym.si.usi.ch`. The following
instructions create and activate a Conda environment with all required dependencies to scrape, clean,
train and run the model:

```shell
conda env remove -n bug-triaging-env || true # delete environment if already present
conda env create --name bug-triaging-env --file=environment-server.yml
conda activate bug-triaging-env
```

### Development environment

*(may not work on all platforms/architectures)*

A pytorch-free version of the environment can be installed for development purposes. Only the scraper and cleaner
scripts may be run using this environment. To install the development environment run:

```shell
conda env remove -n bug-triaging-env-dev || true # delete environment if already present
conda env create --name bug-triaging-env-dev --file=environment-dev.yml
conda activate bug-triaging-env-dev
```

## GitHub API token

In order to run the scraper and the model executor, a GitHub API token is needed. The token must be placed in
a `.env` file in this directory in a variable named `GITHUB_TOKEN`. The contents of the file should look like this:

```
GITHUB_TOKEN=<insert-token-here>
```
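
As a quick sanity check of the token setup, a minimal sketch like the following (illustrative only, not one of the
repository scripts; it uses `python-dotenv` and `requests` from the project requirements) can load the `.env` file and
verify the token against the GitHub API:

```python
# check_token.py - illustrative sketch, not part of the repository scripts
import os

import requests
from dotenv import load_dotenv

load_dotenv()  # reads GITHUB_TOKEN from the .env file in this directory
token = os.environ["GITHUB_TOKEN"]

# The rate-limit endpoint is a cheap way to verify that the token works.
resp = requests.get(
    "https://api.github.com/rate_limit",
    headers={"Authorization": f"Bearer {token}"},
    timeout=30,
)
resp.raise_for_status()
print("Remaining core API calls:", resp.json()["resources"]["core"]["remaining"])
```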

# Scraper

The scraper script is located in `src/scrape.py` and takes no arguments. It downloads all issues of the
`microsoft/vscode` repository and saves them in a gzip-compressed archive of JSON files, one per issue. The archive is
saved in `out/json/issues.tar.gz` and **is deleted** first if it already exists.

To run the scraper run:

```shell
python3 src/scrape.py
```
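
To peek at the scraper output without re-running it, a short standard-library sketch along these lines can list the
per-issue JSON files inside the archive (the JSON field layout is whatever the GitHub API returned and is not assumed
here):

```python
# inspect_issues.py - illustrative sketch for inspecting the scraper output
import json
import tarfile

with tarfile.open("out/json/issues.tar.gz", "r:gz") as archive:
    for member in archive:
        if not member.isfile():
            continue
        payload = archive.extractfile(member).read()
        if not payload:  # some scraped issues are blank (deleted upstream)
            continue
        issue = json.loads(payload)
        print(f"{member.name}: {len(payload)} bytes, {len(issue)} top-level fields")
        break  # only peek at the first non-empty issue
```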

# Cleaner

The cleaner script is located in `src/clean.py` and takes no arguments. It reads `out/json/issues.tar.gz`,
performs the cleaning process, and performs the train-test split according to the instructions given in the assignment
document. The output of the cleaning process is saved in three CSV files and one text file:

- `out/csv/issues_train_000001_170000.csv`, including all issues that belong to the complete training set;
- `out/csv/issues_train_recent_150000_170000.csv`, including all issues that belong to the training set made up of
  "recent" issues;
- `out/csv/issues_test_170001_180000.csv`, including all issues that belong to the test set;
- `out/csv/issues_removed_count.txt`, including the count of issues (excluding PRs) that were discarded by the cleaning
  process in the entire dataset.

The script **will overwrite** these files if they exist. To run the cleaner script run:

```shell
python3 src/clean.py
```
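
To sanity-check the cleaner output, the generated CSVs can be loaded with `pandas` (a project dependency); this sketch
only assumes the file paths listed above, not any particular column layout:

```python
# check_csv.py - illustrative sketch; paths taken from the list above
import pandas as pd

paths = [
    "out/csv/issues_train_000001_170000.csv",
    "out/csv/issues_train_recent_150000_170000.csv",
    "out/csv/issues_test_170001_180000.csv",
]

for path in paths:
    df = pd.read_csv(path)
    # Report how many cleaned issues each split contains and its columns.
    print(f"{path}: {len(df)} rows, columns: {list(df.columns)}")
```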

# Training script

The script used to train the model is located in `src/trainmodel.py`. The script takes the following arguments:

```
usage: trainmodel.py [-h] [-r LEARNING_RATE] [-c] [-f] {all,recent} epochs

Training and evaluation script. The script will train and save the obtained model and then perform test set evaluation.
If the given parameters match with a model that was already saved, the script only runs the evaluation procedure.

positional arguments:
  {all,recent}          The dataset to train with
  epochs                Number of epochs of the training process

options:
  -h, --help            show this help message and exit
  -r LEARNING_RATE, --learning-rate LEARNING_RATE
                        The learning rate fed in the Adam optimizer
  -c, --force-cpu       disables CUDA support. Useful when debugging
  -f, --force-retraining
                        forces training of a new model even if a matching model is already found within the saved
                        models
```

The script loads the generated CSV datasets in `out/csv` and outputs three files in `out/model`:

- `out/model/bug_triaging_{all,recent}_{epochs}e_{LEARNING_RATE}lr_final.pt`, the pytorch "pickled" model;
- `out/model/bug_triaging_{all,recent}_{epochs}e_{LEARNING_RATE}lr_final.label_range.txt`, a text file containing two
  lines which determine the numeric range of classification labels output by the model (this file is used by the ROC
  and model execution scripts);
- `out/model/bug_triaging_{all,recent}_{epochs}e_{LEARNING_RATE}lr_final.labels.csv`, a CSV file matching the assignee
  usernames with the numeric encoding used to train and execute the model (this file is used by the ROC and model
  execution scripts).

(`{all,recent}`, `{epochs}` and `{LEARNING_RATE}` are placeholders whose values match the parameters given to the
script)

To train the configurations that were chosen for the report execute:

```shell
python3 src/trainmodel.py all 4 -r '5e-6'
python3 src/trainmodel.py recent 4 -r '5e-6'
```
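
For reference, the documented output naming scheme can be reproduced with plain string formatting; this sketch
illustrates the convention only and is not the actual code of `src/trainmodel.py`:

```python
# model_path.py - sketch of the documented output naming scheme
def model_filename(dataset: str, epochs: int, learning_rate: float) -> str:
    """Build the documented model filename for the given training parameters."""
    assert dataset in ("all", "recent")
    return f"bug_triaging_{dataset}_{epochs}e_{learning_rate}lr_final.pt"

# Matches the model trained by `python3 src/trainmodel.py all 4 -r '5e-6'`:
print(model_filename("all", 4, 5e-6))  # bug_triaging_all_4e_5e-06lr_final.pt
```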

**NOTE:** The pickled pytorch model files have not been committed to this repo due to file size restrictions. They are
however saved in `gym.si.usi.ch:/home/SA23-G2/bug-triaging/out/model`.

# ROC curve generation script

The script used to generate ROC curves is located in `src/auc.py`. The script takes the following arguments:

```
usage: auc.py [-h] [-c] modelfile

ROC curve and AUC computation script. The script evaluates the given model against the test set and generates an OvR ROC
curve plot with one curve per class, a micro-averaged OvR ROC plot and the corresponding AUC value.

positional arguments:
  modelfile        Path to the pickled pytorch model to classify the issue with

options:
  -h, --help       show this help message and exit
  -c, --force-cpu  disables CUDA support. Useful when debugging
```

`modelfile` must contain a path to one of the `.pt` files generated with the training script. The label range text file
and the labels CSV file are assumed to be in the same directory as the pickled model.

The script outputs two PNG plots and a text file:

- `out/model/{model}.ovr_curves.png` contains a plot of the One-vs-Rest ROC curves for each class (assignee) appearing
  both in the train and test set;
- `out/model/{model}.ovr_avg.png` contains a plot of the micro-averaged One-vs-Rest ROC curve;
- `out/model/{model}.auc.txt` contains the AUC for the micro-averaged ROC curve.

(`{model}` is a placeholder for the filename without extension - the output of the shell command
`basename {modelfile} .pt` - for the pickled pytorch model given as argument)

To generate the curves for the two trained models run:

```shell
python3 src/auc.py out/model/bug_triaging_all_4e_5e-06lr_final.pt
python3 src/auc.py out/model/bug_triaging_recent_4e_5e-06lr_final.pt
```
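
The micro-averaged One-vs-Rest ROC computation can be sketched with `scikit-learn` (a project dependency); `y_true` and
`scores` below are placeholders and the snippet does not reproduce the exact code of `src/auc.py`:

```python
# micro_ovr_roc.py - minimal sketch of a micro-averaged OvR ROC curve
import numpy as np
from sklearn.metrics import auc, roc_curve
from sklearn.preprocessing import label_binarize

n_classes = 3  # placeholder; the real script uses one class per assignee
y_true = np.array([0, 1, 2, 1, 0])  # placeholder truth labels
scores = np.random.default_rng(0).random((5, n_classes))  # placeholder confidences

# Binarize the labels: one column per class (One-vs-Rest).
y_bin = label_binarize(y_true, classes=list(range(n_classes)))

# Micro-averaging flattens all (instance, class) decisions into one curve.
fpr, tpr, _ = roc_curve(y_bin.ravel(), scores.ravel())
print("Micro-averaged OvR AUC:", auc(fpr, tpr))
```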

# Execution script

The script used to execute the model is located in `src/runmodel.py`. The script takes the following arguments:

```
usage: runmodel.py [-h] [-t TOP] [-c] modelfile issue_id

Model execution script. Downloads a given issue id from the microsoft/vscode repository, performs the cleaning process
and recommends an assignee using the given model. The script may fail if the issue title and body do not contain any
latin characters.

positional arguments:
  modelfile          Path to the pickled pytorch model to classify the issue with
  issue_id           The microsoft/vscode GitHub issue id to classify

options:
  -h, --help         show this help message and exit
  -t TOP, --top TOP  Number of recommendations to output
  -c, --force-cpu    disables CUDA support. Useful when debugging
```

The script outputs the top-5 assignee recommendations for the given issue, and the actual assignee if the issue has
already been assigned.

Alongside each assignee, the script outputs the corresponding numerical embedding. A numerical
embedding equal to `-1` in the truth label denotes that the assignee does not appear in the training set
(after the train/validation split).

The script also outputs the number of commits each assignee authored in the repository.

This is an example of the script output for issue `192213`:

```
1: 'roblourens' (44) (confidence: 16.37%) (3932 commits authored)
2: 'lramos15' (36) (confidence: 12.62%) (829 commits authored)
3: 'bpasero' (16) (confidence: 7.29%) (11589 commits authored)
4: 'jrieken' (32) (confidence: 4.53%) (9726 commits authored)
5: 'hediet' (28) (confidence: 3.84%) (1231 commits authored)
Truth: 'alexdima' (9) (6564 commits authored)
```

To execute the model trained on the `all` dataset on issue 192213 run:

```shell
python3 src/runmodel.py out/model/bug_triaging_all_4e_5e-06lr_final.pt 192213
```

To execute the model trained on the `recent` dataset on issue 192213 run:

```shell
python3 src/runmodel.py out/model/bug_triaging_recent_4e_5e-06lr_final.pt 192213
```
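
The ranking in the example above can be obtained from classifier logits with a softmax followed by a top-k selection;
the following is a minimal sketch of that step (placeholder logits, not the actual code of `src/runmodel.py`):

```python
# topk_sketch.py - sketch of turning logits into ranked recommendations
import torch

logits = torch.tensor([1.2, 0.3, 2.5, -0.7])  # placeholder: one logit per assignee
confidences = torch.softmax(logits, dim=0)  # normalize logits to probabilities

top = torch.topk(confidences, k=3)
for rank, (conf, label) in enumerate(zip(top.values, top.indices), start=1):
    # `label` is the numerical embedding that the labels CSV maps to a username.
    print(f"{rank}: label {label.item()} (confidence: {conf.item() * 100:.2f}%)")
```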

# Report

To compile the report run (pdflatex is invoked twice so that cross-references are resolved):

```shell
cd docs
pdflatex -interaction=nonstopmode -output-directory=. main.tex
pdflatex -interaction=nonstopmode -output-directory=. main.tex
```

8
coveragerc
Normal file
@@ -0,0 +1,8 @@
[run]
omit = tests/*

[paths]
source = scripts/*

[xml]
output = coverage/junit-report.xml

72
docs/main.tex
Normal file
@@ -0,0 +1,72 @@
\documentclass{scrartcl}
\setlength\paperwidth{20.999cm}
\setlength\paperheight{29.699cm}
\setlength\voffset{-1in}
\setlength\hoffset{-1in}
\setlength\topmargin{1.499cm}
\setlength\headheight{12pt}
\setlength\headsep{.7cm}
\setlength\footskip{1.131cm}
\setlength\textheight{25cm}
\setlength\oddsidemargin{2.499cm}
\setlength\textwidth{15.999cm}
\setlength\parindent{0cm}
\setlength\parskip{0.3em}

\usepackage{amsmath}
\usepackage{listings}
\usepackage{xcolor}
\usepackage{fancyvrb}
\usepackage{newverbs}
\usepackage{fancyhdr}
\usepackage{extramarks}
\usepackage{graphicx}
\usepackage{mathtools}
\usepackage{multicol}
\usepackage{hyperref}
\usepackage{booktabs}
\usepackage{float}
\usepackage{subcaption}

\pagestyle{fancy}
\lhead{Aksakal, Maggioni, Riggio - Bug Triaging}
\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
\DeclarePairedDelimiter\floor{\lfloor}{\rfloor}

\newcommand\vartextvisiblespace[1][.6em]{%
    \makebox[#1]{%
        \kern.07em
        \vrule height.4ex
        \hrulefill
        \vrule height.4ex
        \kern.07em
    }%
}

\begin{document}
\thispagestyle{plain}

\begin{center}
    \hrule

    \vspace{.4cm}
    {\textbf{\Huge Bug Triaging}} \\
    \vspace{.2cm}
    \textbf{Software Analytics}
    \vspace{.2cm}
\end{center}
\textbf{Baris Aksakal} (baris.aksakal@usi.ch) \hspace{\fill} \\
\textbf{Claudio Maggioni} (claudio.maggioni@usi.ch) \hspace{\fill} \\
\textbf{Edoardo Riggio} (edoardo.riggio@usi.ch) \hspace{\fill} \today \\
\hrule
\vspace{.2cm}

\input{./sections/introduction}
\input{./sections/scraping}
\input{./sections/cleaning}
\input{./sections/statistics}
\input{./sections/prototype_model}
\input{./sections/model}
\input{./sections/references}

\end{document}

23
docs/sections/cleaning.tex
Normal file
@@ -0,0 +1,23 @@
\section*{Data Cleaning}
Regarding data cleaning, we employed a series of procedures to eliminate, as much as possible, noisy data that could hinder the learning process of the DL model.

The first thing we do is transform the body from Markdown to HTML\@.
Thanks to this conversion, we are able to work directly on HTML tags and use Python's Beautiful Soup to remove some HTML blocks.
In particular, we remove everything that is contained inside the \verb|<details> </details>| tag.
This is done because everything contained in that tag relates to the system details of the user and is not useful for the classification task our model needs to perform.
Moreover, we remove the HTML comments, since they are part of some boilerplate code that gets generated when submitting an issue.

Now that all the irrelevant sections have been removed, we convert the HTML back to plain text.
From here, we use a Python library to remove all emojis contained in the body and in the title (since they would not help the training).
We also remove all URLs and newlines.

Finally, we check whether the remaining body and title are written in a language that uses Latin characters.
If we encounter an issue written in another language (such as Russian or Chinese), the whole issue is discarded.
This is done because our DL model has been pre-trained on English documents, so it would not make sense to train it on Chinese or Russian data.

We also tried two techniques typically used in this setting: stemming and stopword removal.
By applying stemming, we noticed that the body lost the fluidity of natural language.
Since our model has been pre-trained to recognize natural language, we felt that removing stopwords would not make sense either.
In addition, we had planned to use stopword removal to decrease the number of tokens to feed to BERT (the limit for our base model is set to 512).
However, a statistical analysis of the data showed that 101468 out of 102065 ($99.4\%$) of the issues are composed of fewer than 512 tokens (the next section describes our statistical analysis in greater detail), and stopword removal gave results similar to stemming, meaning that the text lost its fluidity.
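
As an illustration of the pipeline described above, a minimal Python sketch could look as follows (using the \texttt{markdown}, \texttt{beautifulsoup4}, and \texttt{demoji} packages from our requirements; names are illustrative and do not mirror the actual code in \texttt{src/cleaner}):

\begin{verbatim}
import re

import demoji
import markdown
from bs4 import BeautifulSoup, Comment

def clean_body(body_md: str) -> str:
    # Markdown -> HTML, so that we can work directly on tags.
    html = markdown.markdown(body_md)
    soup = BeautifulSoup(html, "html.parser")
    # Drop <details> blocks (user system details) and HTML comments.
    for details in soup.find_all("details"):
        details.decompose()
    for comment in soup.find_all(string=lambda s: isinstance(s, Comment)):
        comment.extract()
    # HTML -> plain text, then strip emojis, URLs and newlines.
    text = demoji.replace(soup.get_text(), "")
    text = re.sub(r"https?://\S+", "", text)
    return text.replace("\n", " ").strip()
\end{verbatim}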
8
docs/sections/introduction.tex
Normal file
@@ -0,0 +1,8 @@
\section*{Introduction}
The goal of this assignment was to create a machine learning model able to assign a user to a GitHub issue.
The very first step towards this goal was to scrape the past issues from the VSCode GitHub repository.
These issues are used to train the machine learning model (a deep neural network called BERT).
The next logical step was to perform cleaning on the raw scraped data.
We noticed that some parts of the issue body or title introduced noise that could negatively affect the training process.
For this reason, the data was cleaned before being fed to BERT\@.
Finally, a base model of BERT, pre-trained on English documents, was fine-tuned on our cleaned data; it returns a ranking of the top 5 most probable users to be assigned to the queried issue.
162
docs/sections/model.tex
Normal file
@@ -0,0 +1,162 @@
\section*{Model implementation}

The BERT model was implemented by loosely following a Medium article named
``Text Classification with BERT in PyTorch - Towards Data Science'' by Ruben Winastwan%
\footnote{\url{https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f}}.

Our implementation uses the \texttt{BertForSequenceClassification} model from the HuggingFace \texttt{transformers}
library. The model architecture simply joins the pre-trained deep learning weights from BERT-medium with a feed-forward
output layer consisting of one neuron per class to predict.

We train this model over two datasets, where one contains all issues in the range $[1, 170000]$ and another contains
a more ``recent'' set of issues, namely in the range $[150000, 170000]$. In the training and evaluation scripts, these
datasets are named \texttt{all} and \texttt{recent} respectively. The test set is made of issues in the range
$[170001, 180000]$, and it is used to evaluate the model with both training datasets. Each of the \texttt{all} and
\texttt{recent} datasets is split chronologically into a train set and a validation set with
90\% / 10\% proportions.

In order not to bias the model implementation with knowledge from ``future'' data, the classifier has as many output
neurons as distinct assignees appearing in the training set. Additionally, instances in the validation set where the
assignee does not match one of the assignees in the training set are excluded. However, in order not to bias the model
evaluation, those instances are not excluded from the test set.

The training script encodes assignees with a numerical embedding between 0 and the number of assignees minus 1. The order
of the values in this embedding reflects the chronological order of the first issue assigned to each assignee. The only
predictor variables considered by the model are the cleaned issue title and body, which are concatenated without
adding any additional tokens or markers, tokenized, and mapped into a 768-wide vector.

The size of the train, validation and test split for each dataset is illustrated in table~\ref{tab:set_size}.

\begin{table}[H]
    \centering
    \begin{tabular}{lrr}
        \toprule
        Split & \texttt{recent} & \texttt{all} \\
        \midrule
        Training & 8303 & 91858 \\
        Validation & 921 & 10167 \\
        Test & 4787 & 4787 \\
        \bottomrule
    \end{tabular}
    \caption{Number of instances in the training, validation and test set for model training on the \texttt{recent}
    and \texttt{all} datasets.}
    \label{tab:set_size}
\end{table}

Our training procedure runs over the data for 4 epochs for both datasets. In each epoch, the model is trained on a
shuffled copy of the training set while average loss and accuracy are tracked. After backward propagation,
the \textit{Adam} optimizer is applied to the weights of the model with a learning
rate of $5 \cdot 10^{-6}$ and \textit{beta} values equal to $(0.9, 0.9999)$.
After each epoch, validation loss and accuracy are computed.
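
For concreteness, the setup just described can be sketched as follows; this is a minimal illustration assuming the hyperparameters above (\texttt{num\_assignees} and the checkpoint name are placeholders, and the snippet does not reproduce \texttt{src/modelimpl} verbatim):

\begin{verbatim}
import torch
from transformers import BertForSequenceClassification

# Placeholder: one output neuron per assignee in the training set.
num_assignees = 94
# Placeholder checkpoint: a publicly available BERT-medium snapshot.
model = BertForSequenceClassification.from_pretrained(
    "prajjwal1/bert-medium", num_labels=num_assignees)

# Adam with the learning rate and betas used in our training runs.
optimizer = torch.optim.Adam(
    model.parameters(), lr=5e-6, betas=(0.9, 0.9999))
\end{verbatim}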

Due to lack of time,
no automatic early stopping procedure has been implemented in the model training script. Therefore, the validation output
has been used manually for hyperparameter tuning. For example, the number of epochs has been chosen so that for both
models the validation metrics improve (allowing for some tolerance) between epochs and do
not diverge too much from the values observed during training.

Another instance where the validation set has been useful is in the choice of the embedding process for the issue title
and body. We chose to use \texttt{distilbert-base-uncased}, a non-cased tokenizer, after empirically determining that it
provides better performance than a cased counterpart (namely \texttt{bert-base-cased}) over the validation set. However,
we do not claim that our hyperparameter tuning procedure has been completely exhaustive. For instance, due to lack of
time and computing power, both tokenizers have been tested only with a token length of 512 and truncation enabled.
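
A sketch of the corresponding tokenization step under these settings (placeholder strings; the actual concatenation logic lives in our scripts):

\begin{verbatim}
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
title, body = "Editor crashes", "Steps to reproduce: ..."  # placeholders
encoded = tokenizer(title + " " + body, max_length=512,
                    truncation=True, return_tensors="pt")
\end{verbatim}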

In table~\ref{tab:metrics-recent} we report loss and accuracy for the train and validation set during training of the
model over the \texttt{recent} dataset, while in table~\ref{tab:metrics-all} we report the same values for the model
trained over the \texttt{all} dataset. By comparing the validation accuracy of both models we can say that the
\texttt{recent} model performs better over the test set.

\begin{table}[H]
    \centering
    \begin{tabular}{lrrrr}
        \toprule
        Epoch & Train loss & Validation loss & Train accuracy & Validation accuracy \\
        \midrule
        1 & 0.204 & 0.174 & 0.171 & 0.343 \\
        2 & 0.156 & 0.140 & 0.386 & 0.467 \\
        3 & 0.124 & 0.125 & 0.542 & 0.545 \\
        4 & 0.100 & 0.120 & 0.642 & 0.557 \\
        \bottomrule
    \end{tabular}
    \caption{Train set and validation set loss and accuracy during model training over the \texttt{recent} dataset.}
    \label{tab:metrics-recent}
\end{table}

\begin{table}[H]
    \centering
    \begin{tabular}{lrrrr}
        \toprule
        Epoch & Train loss & Validation loss & Train accuracy & Validation accuracy \\
        \midrule
        1 & 0.137 & 0.164 & 0.453 & 0.357 \\
        2 & 0.095 & 0.154 & 0.601 & 0.405 \\
        3 & 0.077 & 0.157 & 0.676 & 0.427 \\
        4 & 0.060 & 0.160 & 0.751 & 0.435 \\
        \bottomrule
    \end{tabular}
    \caption{Train set and validation set loss and accuracy during model training over the \texttt{all} dataset.}
    \label{tab:metrics-all}
\end{table}

The performance of the models trained on the \texttt{all} and \texttt{recent} datasets is reported in
table~\ref{tab:test-results}. We notice that both models are significantly better at outputting the correct assignee
within the top 2 or top 3 results rather than picking the most confident output only. For all accuracies observed, the
\texttt{recent} model still performs better than the \texttt{all} model.

\begin{table}[H]
    \centering
    \begin{tabular}{lrr}
        \toprule
        Truth label found & \texttt{recent} & \texttt{all} \\
        \midrule
        In top recommendation & 0.4980 & 0.4034 \\
        Within top 2 recommendations & 0.6179 & 0.5408 \\
        Within top 3 recommendations & 0.6651 & 0.5916 \\
        Within top 4 recommendations & 0.6940 & 0.6359 \\
        Within top 5 recommendations & 0.7174 & 0.6658 \\
        \bottomrule
    \end{tabular}
    \caption{Model accuracy on the test set for training with the \texttt{all} and \texttt{recent} datasets. Accuracy
    is reported for the recommendations given by the model output ordered by confidence.}
    \label{tab:test-results}
\end{table}

The receiver operating characteristic (ROC) curve is reported according to the One-vs-Rest method by computing
one curve for each class (i.e.\ assignee) in the training set. The curve for the \texttt{recent} model is reported in
figure~\ref{fig:roc-recent}, while the curve for the \texttt{all} model is reported in figure~\ref{fig:roc-all}. As the
numeric label for each assignee is given in chronological order of first issue assignment, we can observe a difference
between long-standing and more recent contributors. Long-standing contributors have lower AUC than recent contributors
for both models. This may indicate that the models are more effective at predicting recent contributors, as they are the
most active on issues in the test set, which is by construction made of recent issues. This may be caused by
long-standing authors eventually leaving the project.

\begin{figure}
    \includegraphics[width=\linewidth]{../out/model/bug_triaging_recent_4e_5e-06lr_final.ovr_curves}
    \caption{One-vs-Rest ROC curves for each class in the \texttt{recent} dataset for the model trained on the same dataset.}
    \label{fig:roc-recent}
\end{figure}

\begin{figure}
    \includegraphics[width=\linewidth]{../out/model/bug_triaging_all_4e_5e-06lr_final.ovr_curves}
    \caption{One-vs-Rest ROC curves for each class in the \texttt{all} dataset for the model trained on the same dataset.}
    \label{fig:roc-all}
\end{figure}

Additionally, we report a micro-averaged ROC curve to understand each model's overall performance, and we report the
corresponding area under curve (AUC) value. These curves can be found in figure~\ref{fig:roc-avg}. The \texttt{recent} model
is the one with the higher overall AUC.

\begin{figure}
    \centering
    \begin{subfigure}[t]{\linewidth}
        \centering\includegraphics[width=.7\linewidth]{../out/model/bug_triaging_recent_4e_5e-06lr_final.ovr_avg}
        \caption{ROC curve for the model trained on the \texttt{recent} dataset. The AUC score is $0.9228$.}
    \end{subfigure}
    \begin{subfigure}[t]{\linewidth}
        \centering\includegraphics[width=.7\linewidth]{../out/model/bug_triaging_all_4e_5e-06lr_final.ovr_avg}
        \caption{ROC curve for the model trained on the \texttt{all} dataset. The AUC score is $0.9121$.}
    \end{subfigure}
    \caption{Micro-averaged One-vs-Rest ROC curves for the trained models over the test set.}
    \label{fig:roc-avg}
\end{figure}

31
docs/sections/prototype_model.tex
Normal file
@@ -0,0 +1,31 @@
\section*{Initial Model Testing with Minimally Pre-Processed Data}

{\textsc{Note:} the code for the models discussed in this section can be found in the Jupyter notebook saved in
the repository path \texttt{src/model-dl/model.ipynb}.}

\subsection*{Model Choice and Hyperparameters Used}
In addition to running our models on the USI architecture, which produced our final model explained later, we initially prototyped on a rawer version of our dataset using some pre-trained models, following their established architectures. We made two runs on the recent training samples, with a minor difference in how the data is fed into the Transformer.

The hyperparameters used in both training runs were as follows:

\begin{itemize}
    \item \verb|BATCH_SIZE| = 32
    \item \verb|LEARNING_RATE| = 1e-5
    \item \verb|EPOCHS| = 3
    \item \verb|MAX_LEN| = 512
\end{itemize}

Similarly to our final model, in both prototype runs the input is a concatenated string of the title and the body of the item (with a minor change in the formation of the string); at the time of running this prototype our pre-processing was not as refined.

Our model of choice was ``BERT-base-uncased'', a pre-trained model with 110M parameters, trained on the English language using a masked language modeling (MLM) objective by Hugging Face~\cite{bert}. We chose the fairly standard AdamW optimizer, a stochastic optimization method that only differs from the Adam algorithm by adjusting the weight decay term to appear in the gradient update~\cite{loshchilov2017decoupled}.

\subsection*{String Creation for the Transformer}
The previously mentioned minor difference between the runs was in the creation of the single string to be fed into the model. In the first run, we trained on the simple concatenation of the title and body strings, whereas in the second run we added the ``Title:'' and ``Body:'' prefixes before the start of the respective parts of the string. We reasoned that adding these strings, almost like meta-words, would help the model contextualize the string better, at the cost of slightly reducing the maximum length of the string (the two meta-words take up 11 characters). Despite these adjustments, we found no significant difference in any of our evaluation metrics (accuracy, precision, recall, and F1-score) between the two runs, suggesting that the inclusion of meta-word prefixes did not substantially change the model's ability to understand and process the data.

\subsection*{Preliminary Results and Subsequent Modifications}
In our prototype runs, we utilized Nvidia A100 GPUs with extra memory. We were able to achieve nearly 50\% accuracy in both runs after three epochs. We tried using the MobileBERT model on the USI architecture, which is the version of BERT mainly aimed at mobile use, with the benefit of less demanding memory requirements; however, it yielded poor results. Further modifications to our encoding and tokenizing processes enabled us to use the ``BERT-base-uncased'' model effectively with the computing resources provided to us by USI.
9
docs/sections/references.tex
Normal file
@@ -0,0 +1,9 @@
\begin{thebibliography}{9}

\bibitem{bert}
BERT-base-uncased -- Hugging Face. (n.d.). \url{https://huggingface.co/bert-base-uncased}

\bibitem{loshchilov2017decoupled}
I. Loshchilov and F. Hutter, ``Decoupled Weight Decay Regularization,'' University of Freiburg, Freiburg, Germany, 2017. [Online]. Available: \url{https://arxiv.org/pdf/1711.05101v3.pdf}

\end{thebibliography}

7
docs/sections/scraping.tex
Normal file
@@ -0,0 +1,7 @@
\section*{Issue Scraping}
To scrape the data from GitHub, we used the API that GitHub exposes to its users.
By using our GitHub token, we managed to make the appropriate requests to retrieve the issues.
The raw issues were saved as single JSON files (one per issue) and zipped into a \verb|.tar.gz| archive.
Some downloaded issues, however, were blank JSON files.
We suspect that these issues were available at the time of listing but have since been deleted and are no longer available through the GitHub API; therefore, we chose to ignore them.
The internal issue IDs for these issues were: \verb|111293876|, \verb|116791101|, \verb|116805010|, \verb|116805553|, \verb|116805977|, \verb|116901067|, \verb|117010737|, \verb|117065474|, \verb|117067419|, \verb|117068152|, \verb|117069931|, \verb|116803071|, \verb|116923175|, \verb|116989517|, \verb|117063475|, and \verb|117067644|.
34
docs/sections/statistics.tex
Normal file
@@ -0,0 +1,34 @@
\section*{Data Analysis}
Given the CSV exported by the cleaning pipeline, we managed to extract some interesting statistics about the training set (from issue 1 to 170000).
In particular, we analyzed the issue word count distribution, the author distribution, and the distribution of opened issues during the week.

For the word count distribution, we tried to understand how many issues had fewer than 512 words (as we said before, 512 is the maximum number of tokens that we can pass to BERT).
From our analysis, we saw that of 102065 cleaned and valid issues, $99.4\%$ of them (101468 issues) have a length of fewer than 512 words.
On the other hand, only $0.6\%$ of the issues (597 issues) have a length greater than 512 words.
This result makes the use of stopword removal pointless (for our goal of reducing the number of tokens).
The image below represents the distribution of all issues with a word count of fewer than 512 words.

\begin{center}
    \includegraphics[width=10cm]{../out/plots/length_dist}
\end{center}

From this distribution, we can see that the most frequent length is 42 words, which is the case for $1.1\%$ of the issues (1115 issues).

Regarding the author distribution -- meaning the number of issues per author -- we managed to find out that, out of a total of 102 authors, $39.3\%$ of authors (40 authors) contributed to fewer than 10 issues.
On the other hand, $60.7\%$ of authors (62 authors) contributed to more than 10 issues.
The issues per author can be seen in the graph below.

\begin{center}
    \includegraphics[width=10cm]{../out/plots/author_dist}
\end{center}

From this graph, we can extract the top 5 authors based on issue assignment.
The result is the following:

\begin{enumerate}
    \item mjbvz: $11.6\%$ (11882 issues)
    \item bpasero: $8.11\%$ (8280 issues)
    \item Tyriar: $7.91\%$ (8075 issues)
    \item joaomoreno: $7.61\%$ (7775 issues)
    \item isidorn: $6.77\%$ (6914 issues)
\end{enumerate}
8
environment-dev.yml
Normal file
@@ -0,0 +1,8 @@
name: bug-triaging-env-dev
channels:
  - defaults
  - conda-forge
dependencies:
  - pip=23.1.2=py311hecd8cb5_0
  - pip:
      - -r ./requirements.txt

263
environment-server.yml
Normal file
@@ -0,0 +1,263 @@
name: bug-triaging-env
channels:
  - pytorch
  - nvidia
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - aiofiles=22.1.0=py311h06a4308_0
  - aiosqlite=0.18.0=py311h06a4308_0
  - anyio=3.5.0=py311h06a4308_0
  - argon2-cffi=21.3.0=pyhd3eb1b0_0
  - argon2-cffi-bindings=21.2.0=py311h5eee18b_0
  - asttokens=2.0.5=pyhd3eb1b0_0
  - babel=2.11.0=py311h06a4308_0
  - backcall=0.2.0=pyhd3eb1b0_0
  - beautifulsoup4=4.12.2=py311h06a4308_0
  - blas=1.0=mkl
  - bleach=4.1.0=pyhd3eb1b0_0
  - brotlipy=0.7.0=py311h5eee18b_1002
  - bzip2=1.0.8=h7b6447c_0
  - ca-certificates=2023.08.22=h06a4308_0
  - cffi=1.15.1=py311h5eee18b_3
  - comm=0.1.2=py311h06a4308_0
  - cryptography=41.0.3=py311hdda0065_0
  - cuda=11.6.1=0
  - cuda-cccl=11.6.55=hf6102b2_0
  - cuda-command-line-tools=11.6.2=0
  - cuda-compiler=11.6.2=0
  - cuda-cudart=11.8.89=0
  - cuda-cudart-dev=11.6.55=h42ad0f4_0
  - cuda-cuobjdump=11.6.124=h2eeebcb_0
  - cuda-cupti=11.8.87=0
  - cuda-cuxxfilt=11.6.124=hecbf4f6_0
  - cuda-driver-dev=11.6.55=0
  - cuda-gdb=12.2.140=0
  - cuda-libraries=11.8.0=0
  - cuda-libraries-dev=11.6.1=0
  - cuda-memcheck=11.8.86=0
  - cuda-nsight=12.2.144=0
  - cuda-nsight-compute=12.2.2=0
  - cuda-nvcc=11.6.124=hbba6d2d_0
  - cuda-nvdisasm=12.2.140=0
  - cuda-nvml-dev=11.6.55=haa9ef22_0
  - cuda-nvprof=12.2.142=0
  - cuda-nvprune=11.6.124=he22ec0a_0
  - cuda-nvrtc=11.8.89=0
  - cuda-nvrtc-dev=11.6.124=h249d397_0
  - cuda-nvtx=11.8.86=0
  - cuda-nvvp=12.2.142=0
  - cuda-runtime=11.8.0=0
  - cuda-samples=11.6.101=h8efea70_0
  - cuda-sanitizer-api=12.2.140=0
  - cuda-toolkit=11.6.1=0
  - cuda-tools=11.6.1=0
  - cuda-visual-tools=11.6.1=0
  - debugpy=1.6.7=py311h6a678d5_0
  - decorator=5.1.1=pyhd3eb1b0_0
  - defusedxml=0.7.1=pyhd3eb1b0_0
  - entrypoints=0.4=py311h06a4308_0
  - executing=0.8.3=pyhd3eb1b0_0
  - ffmpeg=4.3=hf484d3e_0
  - filelock=3.9.0=py311h06a4308_0
  - freetype=2.12.1=h4a9f257_0
  - future=0.18.3=py311h06a4308_0
  - gds-tools=1.7.2.10=0
  - giflib=5.2.1=h5eee18b_3
  - gmp=6.2.1=h295c915_3
  - gmpy2=2.1.2=py311hc9b5ff0_0
  - gnutls=3.6.15=he1e5248_0
  - icu=73.1=h6a678d5_0
  - idna=3.4=py311h06a4308_0
  - intel-openmp=2023.1.0=hdb19cb5_46305
  - ipykernel=6.25.0=py311h92b7b1e_0
  - ipython=8.15.0=py311h06a4308_0
  - ipython_genutils=0.2.0=pyhd3eb1b0_1
  - jedi=0.18.1=py311h06a4308_1
  - jinja2=3.1.2=py311h06a4308_0
  - jpeg=9e=h5eee18b_1
  - json5=0.9.6=pyhd3eb1b0_0
  - jsonschema=4.17.3=py311h06a4308_0
  - jupyter_client=7.4.9=py311h06a4308_0
  - jupyter_core=5.3.0=py311h06a4308_0
  - jupyter_events=0.6.3=py311h06a4308_0
  - jupyter_server=1.23.4=py311h06a4308_0
  - jupyter_server_fileid=0.9.0=py311h06a4308_0
  - jupyter_server_ydoc=0.8.0=py311h06a4308_1
  - jupyter_ydoc=0.2.4=py311h06a4308_0
  - jupyterlab=3.6.3=py311h06a4308_0
  - jupyterlab_pygments=0.1.2=py_0
  - jupyterlab_server=2.22.0=py311h06a4308_0
  - lame=3.100=h7b6447c_0
  - lcms2=2.12=h3be6417_0
  - ld_impl_linux-64=2.38=h1181459_1
  - lerc=3.0=h295c915_0
  - libcublas=11.11.3.6=0
  - libcublas-dev=11.9.2.110=h5c901ab_0
  - libcufft=10.9.0.58=0
  - libcufft-dev=10.7.1.112=ha5ce4c0_0
  - libcufile=1.7.2.10=0
  - libcufile-dev=1.7.2.10=0
  - libcurand=10.3.3.141=0
  - libcurand-dev=10.3.3.141=0
  - libcusolver=11.4.1.48=0
  - libcusparse=11.7.5.86=0
  - libcusparse-dev=11.7.2.124=hbbe9722_0
  - libdeflate=1.17=h5eee18b_1
  - libffi=3.4.4=h6a678d5_0
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libiconv=1.16=h7f8727e_2
  - libidn2=2.3.4=h5eee18b_0
  - libjpeg-turbo=2.0.0=h9bf148f_0
  - libnpp=11.8.0.86=0
  - libnpp-dev=11.6.3.124=h3c42840_0
  - libnvjpeg=11.9.0.86=0
  - libnvjpeg-dev=11.6.2.124=hb5906b9_0
  - libpng=1.6.39=h5eee18b_0
  - libprotobuf=3.20.3=he621ea3_0
  - libsodium=1.0.18=h7b6447c_0
  - libstdcxx-ng=11.2.0=h1234567_1
  - libtasn1=4.19.0=h5eee18b_0
  - libtiff=4.5.1=h6a678d5_0
  - libunistring=0.9.10=h27cfd23_0
  - libuuid=1.41.5=h5eee18b_0
  - libwebp=1.3.2=h11a3e52_0
  - libwebp-base=1.3.2=h5eee18b_0
  - libxml2=2.10.4=hf1b16e4_1
  - libxslt=1.1.37=h5eee18b_1
  - llvm-openmp=14.0.6=h9e868ea_0
  - lxml=4.9.3=py311hdbbb534_0
  - lz4-c=1.9.4=h6a678d5_0
  - markupsafe=2.1.1=py311h5eee18b_0
  - matplotlib-inline=0.1.6=py311h06a4308_0
  - mistune=0.8.4=py311h5eee18b_1000
  - mkl=2023.1.0=h213fc3f_46343
  - mkl-service=2.4.0=py311h5eee18b_1
  - mkl_fft=1.3.8=py311h5eee18b_0
  - mkl_random=1.2.4=py311hdb19cb5_0
  - mpc=1.1.0=h10f8cd9_1
  - mpfr=4.0.2=hb69a4c5_1
  - mpmath=1.3.0=py311h06a4308_0
  - nbclassic=0.5.5=py311h06a4308_0
  - nbclient=0.5.13=py311h06a4308_0
  - nbconvert=6.5.4=py311h06a4308_0
  - nbformat=5.9.2=py311h06a4308_0
  - ncurses=6.4=h6a678d5_0
  - nest-asyncio=1.5.6=py311h06a4308_0
  - nettle=3.7.3=hbbd107a_1
  - networkx=3.1=py311h06a4308_0
  - ninja=1.10.2=h06a4308_5
  - ninja-base=1.10.2=hd09550d_5
  - notebook=6.5.4=py311h06a4308_1
  - notebook-shim=0.2.2=py311h06a4308_0
  - nsight-compute=2023.2.2.3=0
  - numpy=1.26.0=py311h08b1b3b_0
  - numpy-base=1.26.0=py311hf175353_0
  - openh264=2.1.1=h4ff587b_0
  - openjpeg=2.4.0=h3ad879b_0
  - openssl=3.0.11=h7f8727e_2
  - pandocfilters=1.5.0=pyhd3eb1b0_0
  - parso=0.8.3=pyhd3eb1b0_0
  - pexpect=4.8.0=pyhd3eb1b0_3
  - pickleshare=0.7.5=pyhd3eb1b0_1003
  - pillow=10.0.1=py311ha6cbd5a_0
  - pip=23.2.1=py311h06a4308_0
  - platformdirs=3.10.0=py311h06a4308_0
  - prometheus_client=0.14.1=py311h06a4308_0
  - prompt-toolkit=3.0.36=py311h06a4308_0
  - psutil=5.9.0=py311h5eee18b_0
  - ptyprocess=0.7.0=pyhd3eb1b0_2
  - pure_eval=0.2.2=pyhd3eb1b0_0
  - pycparser=2.21=pyhd3eb1b0_0
  - pygments=2.15.1=py311h06a4308_1
  - pyopenssl=23.2.0=py311h06a4308_0
  - pyrsistent=0.18.0=py311h5eee18b_0
  - pysocks=1.7.1=py311h06a4308_0
  - python=3.11.5=h955ad1f_0
  - python-dateutil=2.8.2=pyhd3eb1b0_0
  - python-fastjsonschema=2.16.2=py311h06a4308_0
  - python-json-logger=2.0.7=py311h06a4308_0
  - pytorch=2.1.0=py3.11_cuda11.8_cudnn8.7.0_0
  - pytorch-cuda=11.8=h7e8668a_5
  - pytorch-mutex=1.0=cuda
  - pytz=2023.3.post1=py311h06a4308_0
  - pyyaml=6.0=py311h5eee18b_1
  - pyzmq=23.2.0=py311h6a678d5_0
  - readline=8.2=h5eee18b_0
  - rfc3339-validator=0.1.4=py311h06a4308_0
  - rfc3986-validator=0.1.1=py311h06a4308_0
  - send2trash=1.8.0=pyhd3eb1b0_1
  - setuptools=68.0.0=py311h06a4308_0
  - six=1.16.0=pyhd3eb1b0_1
  - sniffio=1.2.0=py311h06a4308_1
  - soupsieve=2.5=py311h06a4308_0
  - sqlite=3.41.2=h5eee18b_0
  - stack_data=0.2.0=pyhd3eb1b0_0
  - sympy=1.11.1=py311h06a4308_0
  - tbb=2021.8.0=hdb19cb5_0
  - terminado=0.17.1=py311h06a4308_0
  - tinycss2=1.2.1=py311h06a4308_0
  - tk=8.6.12=h1ccaba5_0
  - torchaudio=2.1.0=py311_cu118
  - torchtriton=2.1.0=py311
  - torchvision=0.16.0=py311_cu118
  - tornado=6.3.2=py311h5eee18b_0
  - traitlets=5.7.1=py311h06a4308_0
  - typing-extensions=4.7.1=py311h06a4308_0
  - typing_extensions=4.7.1=py311h06a4308_0
  - wcwidth=0.2.5=pyhd3eb1b0_0
  - webencodings=0.5.1=py311h06a4308_1
  - websocket-client=0.58.0=py311h06a4308_4
  - wheel=0.41.2=py311h06a4308_0
  - xz=5.4.2=h5eee18b_0
  - y-py=0.5.9=py311h52d8a92_0
  - yaml=0.2.5=h7b6447c_0
  - ypy-websocket=0.8.2=py311h06a4308_0
  - zeromq=4.3.4=h2531618_0
  - zlib=1.2.13=h5eee18b_0
  - zstd=1.5.5=hc292b87_0
  - pip:
      - accelerate==0.23.0
      - attrs==22.1.0
      - certifi==2022.9.24
      - charset-normalizer==2.1.1
      - click==8.1.3
      - contourpy==1.1.1
      - coverage==6.4.4
      - cycler==0.12.1
      - demoji==1.1.0
      - fonttools==4.43.1
      - fsspec==2023.9.2
      - huggingface-hub==0.17.3
      - iniconfig==1.1.1
      - joblib==1.2.0
      - kiwisolver==1.4.5
      - markdown==3.4.4
      - matplotlib==3.8.0
      - nltk==3.7
      - packaging==21.3
      - pandas==2.1.1
      - pluggy==1.0.0
      - py==1.11.0
      - pyarrow==13.0.0
      - pyparsing==3.0.9
      - pytest==7.1.3
      - pytest-cov==3.0.0
      - python-dotenv==1.0.0
      - regex==2022.9.13
      - requests==2.28.1
      - safetensors==0.4.0
      - scikit-learn==1.3.1
      - scipy==1.11.3
      - seaborn==0.13.0
      - sentence-transformers==2.2.2
      - sentencepiece==0.1.99
      - threadpoolctl==3.2.0
      - tokenizers==0.14.1
      - tomli==2.0.1
      - tqdm==4.64.1
      - transformers==4.34.0
      - tzdata==2023.3
      - urllib3==1.26.12

1
out/csv/issues_removed_count.txt
Normal file
@@ -0,0 +1 @@
43363

2
out/model/bug_triaging_all_4e_5e-06lr_final.auc.txt
Normal file
@@ -0,0 +1,2 @@
Micro-averaged One-vs-Rest ROC AUC score:
0.9121

2
out/model/bug_triaging_all_4e_5e-06lr_final.label_range.txt
Normal file
@@ -0,0 +1,2 @@
0
93

104
out/model/bug_triaging_all_4e_5e-06lr_final.labels.csv
Normal file
|
@ -0,0 +1,104 @@
|
|||
token,label
|
||||
9at8,0
|
||||
DonJayamanne,1
|
||||
IanMatthewHuff,2
|
||||
ItalyPaleAle,3
|
||||
JacksonKearl,4
|
||||
Lixire,5
|
||||
RMacfarlane,6
|
||||
Steam-Rabbit,7
|
||||
TylerLeonhardt,8
|
||||
Tyriar,9
|
||||
aefernandes,10
|
||||
aeschli,11
|
||||
aiday-mar,12
|
||||
alexdima,13
|
||||
alexr00,14
|
||||
amunger,15
|
||||
andreamah,16
|
||||
auchenberg,17
|
||||
awvalenti,18
|
||||
bamurtaugh,19
|
||||
benibenj,20
|
||||
bgashler1,21
|
||||
bhavyaus,22
|
||||
bowdenk7,23
|
||||
bpasero,24
|
||||
btholt,25
|
||||
chrisdias,26
|
||||
chrmarti,27
|
||||
claudiaregio,28
|
||||
cleidigh,29
|
||||
connor4312,30
|
||||
danyeh,31
|
||||
daviddossett,32
|
||||
daviwil,33
|
||||
dbaeumer,34
|
||||
deepak1556,35
|
||||
delmyers,36
|
||||
digitarald,37
|
||||
dynamicwebpaige,38
|
||||
eamodio,39
|
||||
egamma,40
|
||||
esonnino,41
|
||||
fiveisprime,42
|
||||
foucdeg,43
|
||||
gregvanl,44
|
||||
gushuro,45
|
||||
hediet,46
|
||||
isidorn,47
|
||||
janbaltus,48
|
||||
joaomoreno,49
|
||||
johnliu369,50
|
||||
joyceerhl,51
|
||||
jrieken,52
|
||||
karthiknadig,53
|
||||
kieferrm,54
|
||||
kimadeline,55
|
||||
lramos15,56
|
||||
lszomoru,57
|
||||
lukaschal,58
|
||||
lychung7,59
|
||||
meganrogge,60
|
||||
michelkaporin,61
|
||||
miguelsolorio,62
|
||||
minsa110,63
|
||||
mjbvz,64
|
||||
mousetraps,65
|
||||
nexue2020,66
|
||||
octref,67
|
||||
ornelladotcom,68
|
||||
orta,69
|
||||
paulacamargo25,70
|
||||
pjmeyer,71
|
||||
ramya-rao-a,72
|
||||
rchiodo,73
|
||||
rebornix,74
|
||||
roblourens,75
|
||||
rzhao271,76
|
||||
sana-ajani,77
|
||||
sandy081,78
|
||||
sanket856,79
|
||||
sbatten,80
|
||||
seanmcbreen,81
|
||||
shawndon,82
|
||||
sofianhn,83
|
||||
stevencl,84
|
||||
tanhakabir,85
|
||||
tsalinger,86
|
||||
ulugbekna,87
|
||||
v-pavanp,88
|
||||
vsccarl,89
|
||||
waderyan,90
|
||||
weeteckt,91
|
||||
weinand,92
|
||||
DanielRosenwasser,93
|
||||
Yoyokrazy,94
|
||||
brettcannon,95
|
||||
devinvalenciano,96
|
||||
eleanorjboyd,97
|
||||
greazer,98
|
||||
justschen,99
|
||||
karrtikr,100
|
||||
sadasant,101
|
||||
hbons,102
|
|
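The labels CSV above is the lookup table between assignee logins and the integer class indices the classifier emits. A minimal sketch for reading it back into encode/decode dictionaries (assumes pandas; the path is the *.labels.csv file listed above):

import pandas as pd

labels_path = 'out/model/bug_triaging_all_4e_5e-06lr_final.labels.csv'
df = pd.read_csv(labels_path)
token_to_label = dict(zip(df['token'], df['label']))  # assignee login -> class index
label_to_token = dict(zip(df['label'], df['token']))  # class index -> assignee login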
BIN
out/model/bug_triaging_all_4e_5e-06lr_final.ovr_avg.png
Normal file
After Width: | Height: | Size: 30 KiB |
BIN
out/model/bug_triaging_all_4e_5e-06lr_final.ovr_curves.png
Normal file
After Width: | Height: | Size: 289 KiB |
2
out/model/bug_triaging_recent_4e_5e-06lr_final.auc.txt
Normal file
|
@ -0,0 +1,2 @@
|
|||
Micro-averaged One-vs-Rest ROC AUC score:
|
||||
0.9228
|
|
@ -0,0 +1,2 @@
|
|||
0
|
||||
51
|
57
out/model/bug_triaging_recent_4e_5e-06lr_final.labels.csv
Normal file
|
@ -0,0 +1,57 @@
|
|||
token,label
|
||||
DanielRosenwasser,0
|
||||
DonJayamanne,1
|
||||
IanMatthewHuff,2
|
||||
JacksonKearl,3
|
||||
TylerLeonhardt,4
|
||||
Tyriar,5
|
||||
Yoyokrazy,6
|
||||
aeschli,7
|
||||
aiday-mar,8
|
||||
alexdima,9
|
||||
alexr00,10
|
||||
amunger,11
|
||||
andreamah,12
|
||||
bamurtaugh,13
|
||||
benibenj,14
|
||||
bhavyaus,15
|
||||
bpasero,16
|
||||
chrisdias,17
|
||||
chrmarti,18
|
||||
connor4312,19
|
||||
daviddossett,20
|
||||
dbaeumer,21
|
||||
deepak1556,22
|
||||
devinvalenciano,23
|
||||
egamma,24
|
||||
eleanorjboyd,25
|
||||
greazer,26
|
||||
gregvanl,27
|
||||
hediet,28
|
||||
isidorn,29
|
||||
joaomoreno,30
|
||||
joyceerhl,31
|
||||
jrieken,32
|
||||
justschen,33
|
||||
karrtikr,34
|
||||
karthiknadig,35
|
||||
lramos15,36
|
||||
lszomoru,37
|
||||
meganrogge,38
|
||||
miguelsolorio,39
|
||||
minsa110,40
|
||||
mjbvz,41
|
||||
rchiodo,42
|
||||
rebornix,43
|
||||
roblourens,44
|
||||
rzhao271,45
|
||||
sadasant,46
|
||||
sandy081,47
|
||||
sbatten,48
|
||||
tanhakabir,49
|
||||
weinand,50
|
||||
brettcannon,51
|
||||
digitarald,52
|
||||
esonnino,53
|
||||
hbons,54
|
||||
ulugbekna,55
|
|
15
out/model/bug_triaging_recent_4e_5e-06lr_final.output.txt
Normal file
|
@ -0,0 +1,15 @@
|
|||
/home/SA23-G2/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.
|
||||
return bound(*args, **kwds)
|
||||
Training for dataset kind: recent
|
||||
Train set instance size: 8303
|
||||
Validation set instance size: 921
|
||||
Test set instance size: 4787
|
||||
Using device # 0
|
||||
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
|
||||
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
|
||||
Training model then saving in /home/SA23-G2/bug-triaging/src/modelimpl/../../out/model/bug_triaging_recent_4e_5e-06lr_final.pt
|
||||
Epochs: 1 | Train Loss: 0.204 | Train Accuracy: 0.171 | Val Loss: 0.174 | Val Accuracy: 0.343
|
||||
Epochs: 2 | Train Loss: 0.156 | Train Accuracy: 0.386 | Val Loss: 0.140 | Val Accuracy: 0.467
|
||||
Epochs: 3 | Train Loss: 0.124 | Train Accuracy: 0.542 | Val Loss: 0.125 | Val Accuracy: 0.545
|
||||
Epochs: 4 | Train Loss: 0.100 | Train Accuracy: 0.642 | Val Loss: 0.120 | Val Accuracy: 0.557
|
||||
Test Accuracy: 0.498
|
BIN
out/model/bug_triaging_recent_4e_5e-06lr_final.ovr_avg.png
Normal file
After Width: | Height: | Size: 30 KiB |
BIN
out/model/bug_triaging_recent_4e_5e-06lr_final.ovr_curves.png
Normal file
After Width: | Height: | Size: 279 KiB |
BIN
out/plots/author_dist.png
Normal file
After Width: | Height: | Size: 121 KiB |
BIN
out/plots/length_dist.png
Normal file
After Width: | Height: | Size: 26 KiB |
BIN
out/plots/weekday_dist.png
Normal file
After Width: | Height: | Size: 22 KiB |
31
requirements.txt
Normal file
|
@ -0,0 +1,31 @@
|
|||
Markdown==3.5
|
||||
attrs==22.1.0
|
||||
beautifulsoup4==4.12.2
|
||||
certifi==2022.9.24
|
||||
charset-normalizer==2.1.1
|
||||
click==8.1.3
|
||||
coverage==6.4.4
|
||||
demoji==1.1.0
|
||||
idna==3.4
|
||||
iniconfig==1.1.1
|
||||
joblib==1.2.0
|
||||
matplotlib==3.8.0
|
||||
nltk==3.7
|
||||
numpy==1.26.1
|
||||
packaging==21.3
|
||||
pandas==2.1.2
|
||||
pluggy==1.0.0
|
||||
py==1.11.0
|
||||
pyarrow==13.0.0
|
||||
pyparsing==3.0.9
|
||||
pytest-cov==3.0.0
|
||||
pytest==7.1.3
|
||||
python-dotenv==1.0.0
|
||||
regex==2022.9.13
|
||||
requests==2.28.1
|
||||
scikit_learn==1.3.2
|
||||
tomli==2.0.1
|
||||
torch==2.1.0
|
||||
tqdm==4.66.1
|
||||
transformers==4.34.1
|
||||
urllib3==1.26.12
|
17
scripts/consts.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
# Define regular expressions to remove unwanted content from issue text
|
||||
|
||||
extra_stopwords = [
|
||||
r'http(s)?://\S+', # Any http(s) url
|
||||
r'[^a-zA-Z0-9_ \n]',  # Any character other than letters, digits, underscore, space or newline
|
||||
]
|
||||
|
||||
# Define labels to remove unwanted issues
|
||||
|
||||
labels_blacklist = [
|
||||
r'^\*english-please$',
|
||||
r'^caused-by-extension$',
|
||||
r'^info-needed$',
|
||||
r'^invalid$',
|
||||
r'^\*off-topic$',
|
||||
r'^translation-required(-\w+)+$',
|
||||
]
|
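A minimal sketch of how these patterns might be consumed; strip_stopwords and is_blacklisted are illustrative helper names, not functions from the repo:

import re

from consts import extra_stopwords, labels_blacklist

def strip_stopwords(text: str) -> str:
    # Blank out URLs and stray non-word characters matched by the patterns above
    for pattern in extra_stopwords:
        text = re.sub(pattern, ' ', text)
    return text

def is_blacklisted(issue_labels: list[str]) -> bool:
    # True if any GitHub label on the issue matches a blacklist pattern
    return any(re.match(p, lbl) for lbl in issue_labels for p in labels_blacklist)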
6
scripts/utils.py
Normal file
|
@ -0,0 +1,6 @@
|
|||
from datetime import datetime
|
||||
|
||||
|
||||
# Prints a line on the stdout prepended with the time
|
||||
def log(msg):
|
||||
print(f'[{datetime.now()}] {msg}')
|
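Usage, with an example timestamp:

from utils import log

log('download complete')  # -> [2023-10-31 14:02:11.123456] download complete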
24
sonar-project.properties
Normal file
|
@ -0,0 +1,24 @@
|
|||
# --- required for ci template ---
|
||||
|
||||
# must be unique in a given SonarQube instance
|
||||
sonar.projectKey=sa-2023-g2-${env.CI_PROJECT_ID}
|
||||
sonar.qualitygate.wait=true
|
||||
|
||||
# defaults to project key
|
||||
sonar.projectName=${env.CI_PROJECT_PATH}
|
||||
|
||||
# defaults to 'not provided'
|
||||
# sonar.projectVersion=${env.CI_COMMIT_TAG}
|
||||
|
||||
# --- additional properties ---
|
||||
sonar.sources=.
|
||||
sonar.tests=tests
|
||||
sonar.exclusions=tests/**/*
|
||||
sonar.python.version=3.10.7
|
||||
|
||||
# test coverage
|
||||
#sonar.python.coverage.reportPaths=coverage/cobertura-coverage.xml
|
||||
sonar.python.coverage.reportPaths=coverage.xml
|
||||
|
||||
# Encoding of the source code. Default is default system encoding
|
||||
sonar.sourceEncoding=UTF-8
|
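sonar.python.coverage.reportPaths points at the Cobertura XML that pytest-cov (pinned in requirements.txt) can produce; a likely invocation from the repo root is pytest --cov=src --cov-report=xml:coverage.xml tests.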
375
src/analysis/statistics.ipynb
Normal file
25
src/auc.py
Executable file
|
@ -0,0 +1,25 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
|
||||
|
||||
import torch
|
||||
|
||||
from modelimpl.auc import build_curve
|
||||
|
||||
if __name__ == '__main__':
|
||||
assert torch is not None # make sure pytorch is imported and loaded with correct CUDA env variable
|
||||
|
||||
parser = argparse.ArgumentParser(prog='auc.py',
|
||||
description='ROC curve and AUC computation script. The script evaluates the given '
|
||||
'model against the test set and generates a OvR ROC curve '
|
||||
'plot with one curve per class, a micro-averaged OvR ROC plot '
|
||||
'and the corresponding AUC value.')
|
||||
parser.add_argument('modelfile', type=str, help="Path to the pickled pytorch model to classify the issue with")
|
||||
parser.add_argument('-c', '--force-cpu', action='store_true',
|
||||
help="disables CUDA support. Useful when debugging")
|
||||
|
||||
args = parser.parse_args()
|
||||
build_curve(args.modelfile, args.force_cpu)
|
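Example invocations from the repo root, against the model path printed in the training log above:

python src/auc.py out/model/bug_triaging_recent_4e_5e-06lr_final.pt     # with CUDA
python src/auc.py out/model/bug_triaging_recent_4e_5e-06lr_final.pt -c  # CPU-only debug run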
28
src/clean.py
Executable file
|
@ -0,0 +1,28 @@
|
|||
import os
|
||||
|
||||
from cleaner.clean import clean_all, save_set
|
||||
from cleaner.dataframe import build_df
|
||||
|
||||
ROOT = os.path.join(os.path.dirname(__file__), '..')
|
||||
IN_FILE = os.path.join(ROOT, 'out/json/issues.tar.gz')
|
||||
OUT_FILE_PREFIX = os.path.join(ROOT, 'out/csv/issues')
|
||||
|
||||
|
||||
def main():
|
||||
objs = []
|
||||
|
||||
counter = clean_all(objs, IN_FILE)
|
||||
|
||||
print(f'Removed Issues: {counter}')
|
||||
with open(OUT_FILE_PREFIX + '_removed_count.txt', 'w') as f:
|
||||
f.write(str(counter) + "\n")
|
||||
|
||||
df = build_df(objs)
|
||||
|
||||
save_set(df, 1, 170_000, '_train', OUT_FILE_PREFIX)
|
||||
save_set(df, 150_000, 170_000, '_train_recent', OUT_FILE_PREFIX)
|
||||
save_set(df, 170_001, 180_000, '_test', OUT_FILE_PREFIX)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
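Run from the repo root as python src/clean.py. It reads the scraped archive at out/json/issues.tar.gz, writes the removed-issue count to out/csv/issues_removed_count.txt, and emits the three splits used throughout the project: issues_train_000001_170000.csv, issues_train_recent_150000_170000.csv and issues_test_170001_180000.csv.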
0
src/cleaner/__init__.py
Normal file
141
src/cleaner/clean.py
Normal file
|
@ -0,0 +1,141 @@
|
|||
import json
|
||||
import os
|
||||
import re
|
||||
import tarfile
|
||||
|
||||
import tqdm
|
||||
from bs4 import BeautifulSoup, Comment
|
||||
from demoji import replace
|
||||
from markdown import markdown
|
||||
|
||||
counter = 0
|
||||
|
||||
|
||||
def clean_body(body):
|
||||
# Check if body is present
|
||||
if body is None:
|
||||
return None
|
||||
|
||||
html = markdown(body)
|
||||
soup = BeautifulSoup(html, features='html.parser')
|
||||
|
||||
# Remove everything in the <details> tag (noisy data)
|
||||
for s in soup.findAll('details'):
|
||||
s.extract()
|
||||
|
||||
# Remove HTML comments
|
||||
for s in soup(text=lambda comment: isinstance(comment, Comment)):
|
||||
s.extract()
|
||||
|
||||
# Remove emojis
|
||||
body = replace(soup.get_text('\n')).strip()
|
||||
|
||||
# Remove all newlines and replace with single space
|
||||
body = re.sub(r'\s+', ' ', body, flags=re.MULTILINE)
|
||||
|
||||
# Remove all links
|
||||
body = re.sub(r'http(s)?://\S+', ' ', body, flags=re.MULTILINE)
|
||||
|
||||
# Check if body only contains ASCII characters
|
||||
if not body.isascii():
|
||||
return None
|
||||
|
||||
return body
|
||||
|
||||
|
||||
def clean_title(title):
|
||||
if title is None:
|
||||
return None
|
||||
|
||||
# Remove emojis
|
||||
title = replace(title).strip()
|
||||
|
||||
# Check if title only contains ASCII characters
|
||||
if not title.isascii():
|
||||
return None
|
||||
|
||||
return title
|
||||
|
||||
|
||||
def read_issue_obj(obj, enable_filter=True):
|
||||
global counter
|
||||
|
||||
if enable_filter:
|
||||
if 'PR' in obj['node_id'] or ('pull_request' in obj and obj['pull_request'] is not None):
|
||||
return None # skip since it is a pull request
|
||||
|
||||
if 'assignees' not in obj or obj['assignees'] is None:
|
||||
counter += 1
|
||||
return None # skip since it has not been assigned
|
||||
|
||||
if len(obj['assignees']) != 1:
|
||||
counter += 1
|
||||
return None # skip since it has multiple assignees
|
||||
|
||||
body = clean_body(obj['body'])
|
||||
dirty_body = obj['body']
|
||||
title = clean_title(obj['title'])
|
||||
title_count = 0 if title is None else len(title.split())
|
||||
|
||||
issue = {
|
||||
'id': obj['number'],
|
||||
'title': title,
|
||||
'body': body,
|
||||
'dirty_body': dirty_body,
|
||||
'word_count': 0 if body is None else len(body.split()) + title_count,
|
||||
'word_count_dirty': 0 if dirty_body is None else len(dirty_body.split()) + title_count,
|
||||
'state': obj['state'],
|
||||
'assignee': None if len(obj['assignees']) == 0 else obj['assignees'][0]['login'],
|
||||
'created_at': obj['created_at']
|
||||
}
|
||||
|
||||
if issue['title'] is None and issue['body'] is None:
|
||||
counter += 1
|
||||
return None
|
||||
|
||||
return issue
|
||||
|
||||
|
||||
def read_issue(file):
|
||||
global counter
|
||||
|
||||
try:
|
||||
obj = json.load(file)
|
||||
issue = read_issue_obj(obj)
|
||||
except json.decoder.JSONDecodeError:
|
||||
# Some downloaded issues are a blank JSON file. We suspect that these issues were available at the time of
|
||||
# listing, but they have been since deleted and are not available anymore through the GitHub API, therefore
|
||||
# we choose to ignore them. The internal issue IDs for these issues were:
|
||||
# 111293876, 116791101, 116805010, 116805553, 116805977, 116901067, 117010737, 117065474, 117067419, 117068152,
|
||||
# 117069931, 116803071, 116923175, 116989517, 117063475, 117067644
|
||||
counter += 1
|
||||
return None
|
||||
|
||||
return issue
|
||||
|
||||
|
||||
def save_set(df, id_from, id_to, name, file_prefix: str):
|
||||
dir_name = os.path.dirname(file_prefix)
|
||||
if not os.path.isdir(dir_name):
|
||||
os.makedirs(dir_name)
|
||||
|
||||
df.loc[id_from:id_to].to_csv(file_prefix + f'{name}_{id_from:06d}_{id_to:06d}.csv')
|
||||
|
||||
|
||||
def clean_all(objs, in_file: str):
|
||||
global counter
|
||||
|
||||
counter = 0
|
||||
tar = tarfile.open(in_file, 'r:gz')
|
||||
|
||||
for member in tqdm.tqdm(tar.getmembers()):
|
||||
if member.name.endswith('.json'):
|
||||
f = tar.extractfile(member)
|
||||
|
||||
if f is not None:
|
||||
issue = read_issue(f)
|
||||
|
||||
if issue is not None:
|
||||
objs.append(issue)
|
||||
|
||||
return counter
|
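A quick illustration of clean_body on a made-up markdown body (exact spacing may differ slightly):

from cleaner.clean import clean_body

raw = '**Crash** on save 😞 <details>very long log dump</details> see https://example.com/trace'
print(clean_body(raw))
# The markdown is rendered and re-extracted; the <details> block, the emoji
# and the URL are dropped; whitespace is collapsed -> roughly 'Crash on save see'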
8
src/cleaner/dataframe.py
Normal file
|
@ -0,0 +1,8 @@
|
|||
from typing import Any
|
||||

||||
import pandas as pd
|
||||
|
||||
|
||||
def build_df(objs: list[dict[str, Any]]) -> pd.DataFrame:
|
||||
df = pd.DataFrame.from_records(objs)
|
||||
df.set_index('id', drop=True, inplace=True)
|
||||
df.sort_index(inplace=True)
|
||||
return df
|
629
src/model-dl/bert_medium.ipynb
Normal file
|
@ -0,0 +1,629 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "6fed133d-61b7-4ce6-8a44-fe98acf0eed2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture\n",
|
||||
"!pip install transformers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "2f3a8fd1-25a9-426d-a6be-c93b750cbcb8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/SA23-G2/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CUDA is available, Training on GPU ...\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import torch\n",
|
||||
"import numpy as np\n",
|
||||
"from transformers import BertTokenizer, BertModel\n",
|
||||
"from torch import nn\n",
|
||||
"from torch.optim import Adam\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import os\n",
|
||||
"from collections import defaultdict\n",
|
||||
"\n",
|
||||
"force_cpu = False\n",
|
||||
"\n",
|
||||
"if not force_cpu: \n",
|
||||
" # Use GPU #2\n",
|
||||
" os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"2\"\n",
|
||||
" \n",
|
||||
"train_on_gpu = torch.cuda.is_available()\n",
|
||||
"\n",
|
||||
"if train_on_gpu:\n",
|
||||
" print('CUDA is available, Training on GPU ...')\n",
|
||||
"else:\n",
|
||||
" print('CUDA is not available! Training on CPU ...')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "47a53036-31ab-4374-bf15-a4dca17a7cbf",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>title_body</th>\n",
|
||||
" <th>assignee</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>HTML: Not auto-closing quotes when typing attr...</td>\n",
|
||||
" <td>alexdima</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>Python: Block comment is three single quotes a...</td>\n",
|
||||
" <td>joaomoreno</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>PHP: extension console.logs when completing fu...</td>\n",
|
||||
" <td>jrieken</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>[ruby] the mode failed to tokenize the input\\n...</td>\n",
|
||||
" <td>aeschli</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>[vb] Block comment is not colored\\nWhile line ...</td>\n",
|
||||
" <td>bpasero</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" title_body assignee\n",
|
||||
"0 HTML: Not auto-closing quotes when typing attr... alexdima\n",
|
||||
"1 Python: Block comment is three single quotes a... joaomoreno\n",
|
||||
"2 PHP: extension console.logs when completing fu... jrieken\n",
|
||||
"3 [ruby] the mode failed to tokenize the input\\n... aeschli\n",
|
||||
"4 [vb] Block comment is not colored\\nWhile line ... bpasero"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"OUT_COLUMN = 'assignee'\n",
|
||||
"IN_COLUMN = 'title_body'\n",
|
||||
"\n",
|
||||
"def load_df(csv_path):\n",
|
||||
" df = pd.read_csv(csv_path)\n",
|
||||
" df['title_body'] = df[['title', 'body']].apply(lambda row: '\\n'.join(row.values.astype(str)), axis=1)\n",
|
||||
" return df.loc[:, ['title_body', 'assignee']]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"df_train_all = load_df(f'../../out/csv/issues_train_000001_170000.csv')\n",
|
||||
"df_train_recent = load_df(f'../../out/csv/issues_train_recent_150000_170000.csv')\n",
|
||||
"df_test = load_df(f'../../out/csv/issues_test_170001_180000.csv')\n",
|
||||
"\n",
|
||||
"df_train_all.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "ab965eff-e1eb-416f-b80c-850554d8026c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"defaultdict(int,\n",
|
||||
" {'alexdima': 1,\n",
|
||||
" 'joaomoreno': 2,\n",
|
||||
" 'jrieken': 3,\n",
|
||||
" 'aeschli': 4,\n",
|
||||
" 'bpasero': 5,\n",
|
||||
" 'isidorn': 6,\n",
|
||||
" 'seanmcbreen': 7,\n",
|
||||
" 'weinand': 8,\n",
|
||||
" 'dbaeumer': 9,\n",
|
||||
" 'sofianhn': 10,\n",
|
||||
" 'chrmarti': 11,\n",
|
||||
" 'chrisdias': 12,\n",
|
||||
" 'Tyriar': 13,\n",
|
||||
" 'roblourens': 14,\n",
|
||||
" 'gregvanl': 15,\n",
|
||||
" 'kieferrm': 16,\n",
|
||||
" 'egamma': 17,\n",
|
||||
" 'bgashler1': 18,\n",
|
||||
" 'mjbvz': 19,\n",
|
||||
" 'alexr00': 20,\n",
|
||||
" 'stevencl': 21,\n",
|
||||
" 'sbatten': 22,\n",
|
||||
" 'rebornix': 23,\n",
|
||||
" 'ramya-rao-a': 24,\n",
|
||||
" 'waderyan': 25,\n",
|
||||
" 'RMacfarlane': 26,\n",
|
||||
" 'sandy081': 27,\n",
|
||||
" 'pjmeyer': 28,\n",
|
||||
" 'DonJayamanne': 29,\n",
|
||||
" 'miguelsolorio': 30,\n",
|
||||
" 'octref': 31,\n",
|
||||
" 'daviwil': 32,\n",
|
||||
" 'hediet': 33,\n",
|
||||
" 'mousetraps': 34,\n",
|
||||
" 'v-pavanp': 35,\n",
|
||||
" 'johnliu369': 36,\n",
|
||||
" 'vsccarl': 37,\n",
|
||||
" 'delmyers': 38,\n",
|
||||
" 'lukaschal': 39,\n",
|
||||
" 'lszomoru': 40,\n",
|
||||
" 'JacksonKearl': 41,\n",
|
||||
" 'ulugbekna': 42,\n",
|
||||
" 'esonnino': 43,\n",
|
||||
" 'connor4312': 44,\n",
|
||||
" 'michelkaporin': 45,\n",
|
||||
" 'aiday-mar': 46,\n",
|
||||
" 'Lixire': 47,\n",
|
||||
" 'lramos15': 48,\n",
|
||||
" 'andreamah': 49,\n",
|
||||
" 'meganrogge': 50,\n",
|
||||
" 'danyeh': 51,\n",
|
||||
" 'cleidigh': 52,\n",
|
||||
" 'deepak1556': 53,\n",
|
||||
" 'janbaltus': 54,\n",
|
||||
" 'gushuro': 55,\n",
|
||||
" 'aefernandes': 56,\n",
|
||||
" 'auchenberg': 57,\n",
|
||||
" 'TylerLeonhardt': 58,\n",
|
||||
" 'benibenj': 59,\n",
|
||||
" 'tsalinger': 60,\n",
|
||||
" 'rzhao271': 61,\n",
|
||||
" 'shawndon': 62,\n",
|
||||
" 'eamodio': 63,\n",
|
||||
" 'fiveisprime': 64,\n",
|
||||
" 'Steam-Rabbit': 65,\n",
|
||||
" 'foucdeg': 66,\n",
|
||||
" 'awvalenti': 67,\n",
|
||||
" 'weeteckt': 68,\n",
|
||||
" 'daviddossett': 69,\n",
|
||||
" 'bowdenk7': 70,\n",
|
||||
" 'sana-ajani': 71,\n",
|
||||
" '9at8': 72,\n",
|
||||
" 'btholt': 73,\n",
|
||||
" 'bamurtaugh': 74,\n",
|
||||
" 'ornelladotcom': 75,\n",
|
||||
" 'digitarald': 76,\n",
|
||||
" 'nexue2020': 77,\n",
|
||||
" 'bhavyaus': 78,\n",
|
||||
" 'joyceerhl': 79,\n",
|
||||
" 'amunger': 80,\n",
|
||||
" 'IanMatthewHuff': 81,\n",
|
||||
" 'claudiaregio': 82,\n",
|
||||
" 'rchiodo': 83,\n",
|
||||
" 'ItalyPaleAle': 84,\n",
|
||||
" 'kimadeline': 85,\n",
|
||||
" 'tanhakabir': 86,\n",
|
||||
" 'karthiknadig': 87,\n",
|
||||
" 'dynamicwebpaige': 88,\n",
|
||||
" 'minsa110': 89,\n",
|
||||
" 'sanket856': 90,\n",
|
||||
" 'orta': 91,\n",
|
||||
" 'paulacamargo25': 92,\n",
|
||||
" 'lychung7': 93,\n",
|
||||
" 'greazer': 94,\n",
|
||||
" 'justschen': 95,\n",
|
||||
" 'karrtikr': 96,\n",
|
||||
" 'eleanorjboyd': 97,\n",
|
||||
" 'sadasant': 98,\n",
|
||||
" 'Yoyokrazy': 99,\n",
|
||||
" 'devinvalenciano': 100,\n",
|
||||
" 'DanielRosenwasser': 101,\n",
|
||||
" 'brettcannon': 102,\n",
|
||||
" 'hbons': 103})"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"labels_dict = pd.concat([df_train_all[OUT_COLUMN], df_train_recent[OUT_COLUMN], df_test[OUT_COLUMN]]) \\\n",
|
||||
" .drop_duplicates(keep='first') \\\n",
|
||||
" .reset_index(drop=True) \\\n",
|
||||
" .to_dict()\n",
|
||||
"labels = defaultdict(int)\n",
|
||||
" \n",
|
||||
"for k,v in labels_dict.items():\n",
|
||||
" labels[v] = k + 1\n",
|
||||
"\n",
|
||||
"labels"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "5074c270-ed3e-4e1a-863d-71737c743cb8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer = BertTokenizer.from_pretrained('bert-base-cased')\n",
|
||||
"\n",
|
||||
"class Dataset(torch.utils.data.Dataset):\n",
|
||||
"\n",
|
||||
" def __init__(self, df):\n",
|
||||
"\n",
|
||||
" self.labels = [labels[label] for label in df[OUT_COLUMN]]\n",
|
||||
" self.texts = [tokenizer(text, padding='max_length', max_length = 512, truncation=True,\n",
|
||||
" return_tensors=\"pt\") for text in df[IN_COLUMN]]\n",
|
||||
"\n",
|
||||
" def classes(self):\n",
|
||||
" return self.labels\n",
|
||||
"\n",
|
||||
" def __len__(self):\n",
|
||||
" return len(self.labels)\n",
|
||||
"\n",
|
||||
" def get_batch_labels(self, idx):\n",
|
||||
" # Fetch a batch of labels\n",
|
||||
" return np.array(self.labels[idx])\n",
|
||||
"\n",
|
||||
" def get_batch_texts(self, idx):\n",
|
||||
" # Fetch a batch of inputs\n",
|
||||
" return self.texts[idx]\n",
|
||||
"\n",
|
||||
" def __getitem__(self, idx):\n",
|
||||
"\n",
|
||||
" batch_texts = self.get_batch_texts(idx)\n",
|
||||
" batch_y = self.get_batch_labels(idx)\n",
|
||||
"\n",
|
||||
" return batch_texts, batch_y"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "0c8a5d0f-80c3-42b3-9f06-ecfc3a21f395",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class BertClassifier(nn.Module):\n",
|
||||
"\n",
|
||||
" def __init__(self, dropout=0.5):\n",
|
||||
"\n",
|
||||
" super(BertClassifier, self).__init__()\n",
|
||||
"\n",
|
||||
" self.bert = BertModel.from_pretrained('bert-base-cased')\n",
|
||||
" self.dropout = nn.Dropout(dropout)\n",
|
||||
" self.linear = nn.Linear(768, len(labels))\n",
|
||||
" self.relu = nn.ReLU()\n",
|
||||
"\n",
|
||||
" def forward(self, input_id, mask):\n",
|
||||
"\n",
|
||||
" _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)\n",
|
||||
" dropout_output = self.dropout(pooled_output)\n",
|
||||
" linear_output = self.linear(dropout_output)\n",
|
||||
" final_layer = self.relu(linear_output)\n",
|
||||
"\n",
|
||||
" return final_layer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "fa1f1cf7-65db-4966-9a55-ba26bd22ed6c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def train(model, train_data, val_data, learning_rate, epochs):\n",
|
||||
"\n",
|
||||
" train, val = Dataset(train_data), Dataset(val_data)\n",
|
||||
"\n",
|
||||
" train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)\n",
|
||||
" val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)\n",
|
||||
"\n",
|
||||
" use_cuda = torch.cuda.is_available() and not force_cpu\n",
|
||||
" device = torch.device(\"cuda\" if use_cuda and not force_cpu else \"cpu\")\n",
|
||||
"\n",
|
||||
" criterion = nn.CrossEntropyLoss()\n",
|
||||
" optimizer = Adam(model.parameters(), lr= learning_rate)\n",
|
||||
"\n",
|
||||
" if use_cuda:\n",
|
||||
"\n",
|
||||
" model = model.cuda()\n",
|
||||
" criterion = criterion.cuda()\n",
|
||||
"\n",
|
||||
" for epoch_num in range(epochs):\n",
|
||||
"\n",
|
||||
" total_acc_train = 0\n",
|
||||
" total_loss_train = 0\n",
|
||||
"\n",
|
||||
" for train_input, train_label in tqdm(train_dataloader):\n",
|
||||
"\n",
|
||||
" train_label = train_label.to(device)\n",
|
||||
" mask = train_input['attention_mask'].to(device)\n",
|
||||
" input_id = train_input['input_ids'].squeeze(1).to(device)\n",
|
||||
"\n",
|
||||
" output = model(input_id, mask)\n",
|
||||
" \n",
|
||||
" batch_loss = criterion(output, train_label.long())\n",
|
||||
" total_loss_train += batch_loss.item()\n",
|
||||
" \n",
|
||||
" acc = (output.argmax(dim=1) == train_label).sum().item()\n",
|
||||
" total_acc_train += acc\n",
|
||||
"\n",
|
||||
" model.zero_grad()\n",
|
||||
" batch_loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
" \n",
|
||||
" total_acc_val = 0\n",
|
||||
" total_loss_val = 0\n",
|
||||
"\n",
|
||||
" with torch.no_grad():\n",
|
||||
"\n",
|
||||
" for val_input, val_label in val_dataloader:\n",
|
||||
"\n",
|
||||
" val_label = val_label.to(device)\n",
|
||||
" mask = val_input['attention_mask'].to(device)\n",
|
||||
" input_id = val_input['input_ids'].squeeze(1).to(device)\n",
|
||||
"\n",
|
||||
" output = model(input_id, mask)\n",
|
||||
"\n",
|
||||
" batch_loss = criterion(output, val_label.long())\n",
|
||||
" total_loss_val += batch_loss.item()\n",
|
||||
" \n",
|
||||
" acc = (output.argmax(dim=1) == val_label).sum().item()\n",
|
||||
" total_acc_val += acc\n",
|
||||
" \n",
|
||||
" print(\n",
|
||||
" f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} | Train Accuracy: {total_acc_train / len(train_data): .3f} | Val Loss: {total_loss_val / len(val_data): .3f} | Val Accuracy: {total_acc_val / len(val_data): .3f}')\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "cd8a670d-c449-45fe-8f4c-9a5fb27855c1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def evaluate(model, test_data):\n",
|
||||
"\n",
|
||||
" test = Dataset(test_data)\n",
|
||||
"\n",
|
||||
" test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)\n",
|
||||
"\n",
|
||||
" use_cuda = torch.cuda.is_available() and not force_cpu\n",
|
||||
" device = torch.device(\"cuda\" if use_cuda and not force_cpu else \"cpu\")\n",
|
||||
" \n",
|
||||
" if use_cuda:\n",
|
||||
"\n",
|
||||
" model = model.cuda()\n",
|
||||
"\n",
|
||||
" total_acc_test = 0\n",
|
||||
" with torch.no_grad():\n",
|
||||
"\n",
|
||||
" for test_input, test_label in test_dataloader:\n",
|
||||
"\n",
|
||||
" test_label = test_label.to(device)\n",
|
||||
" mask = test_input['attention_mask'].to(device)\n",
|
||||
" input_id = test_input['input_ids'].squeeze(1).to(device)\n",
|
||||
"\n",
|
||||
" output = model(input_id, mask)\n",
|
||||
"\n",
|
||||
" acc = (output.argmax(dim=1) == test_label).sum().item()\n",
|
||||
" total_acc_test += acc\n",
|
||||
" \n",
|
||||
" print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "25d2231d-fef1-42cf-a73e-188cac932727",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"8303 923 4787\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/SA23-G2/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.\n",
|
||||
" return bound(*args, **kwds)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"np.random.seed(112)\n",
|
||||
"\n",
|
||||
"df = df_train_recent\n",
|
||||
"num_samples = len(df.index)\n",
|
||||
"val_split_idx = \n",
|
||||
"\n",
|
||||
"df_train, df_val = np.split(df.sample(frac=1, random_state=42), [int(.9*len(df))])\n",
|
||||
"\n",
|
||||
"print(len(df_train),len(df_val), len(df_test))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "30242239-de70-4c03-8f56-9f5ade43518d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4152/4152 [05:47<00:00, 11.96it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Epochs: 1 | Train Loss: 2.022 | Train Accuracy: 0.106 | Val Loss: 1.854 | Val Accuracy: 0.115\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4152/4152 [05:47<00:00, 11.93it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Epochs: 2 | Train Loss: 1.764 | Train Accuracy: 0.161 | Val Loss: 1.664 | Val Accuracy: 0.232\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4152/4152 [05:48<00:00, 11.93it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Epochs: 3 | Train Loss: 1.553 | Train Accuracy: 0.291 | Val Loss: 1.477 | Val Accuracy: 0.315\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4152/4152 [05:48<00:00, 11.93it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Epochs: 4 | Train Loss: 1.371 | Train Accuracy: 0.377 | Val Loss: 1.358 | Val Accuracy: 0.358\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4152/4152 [05:48<00:00, 11.93it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Epochs: 5 | Train Loss: 1.232 | Train Accuracy: 0.450 | Val Loss: 1.258 | Val Accuracy: 0.426\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"EPOCHS = 5\n",
|
||||
"model = BertClassifier()\n",
|
||||
"LR = 1e-6\n",
|
||||
" \n",
|
||||
"train(model, df_train, df_val, LR, EPOCHS)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "ccc00f0a-9a15-4942-9c9b-2f9789c8dd22",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Test Accuracy: 0.413\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluate(model, df_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "68be0b13-968c-437e-817a-6c12e0823091",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
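One subtlety in the labelling cell above: the defaultdict mapping starts at 1, so output index 0 of the nn.Linear(768, len(labels)) head is never targeted, while the largest label equals len(labels) and falls outside the head's valid range 0..len(labels)-1. A zero-based sketch of the same mapping (all_assignees stands for the concatenated assignee columns and is an assumed name, not the notebook's recorded run):

# Sketch only: map assignees to 0..n-1 so every CrossEntropyLoss
# target indexes into the classifier head.
unique_assignees = all_assignees.drop_duplicates(keep='first').tolist()
labels = {name: i for i, name in enumerate(unique_assignees)}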
408
src/model-dl/model-Copy1.ipynb
Normal file
365
src/model-dl/model.ipynb
Normal file
|
@ -0,0 +1,365 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "WGlZj4UC74VR",
|
||||
"outputId": "0522302e-cd6c-44ff-c7c0-3cf3829a8943"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/SA23-G2/bug-triaging/src/model-dl\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n",
|
||||
"The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. \n",
|
||||
"The class this function is called from is 'BertTokenizer'.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"cuda\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Tokenizing Training Data: 100%|████████| 1000/1000 [00:00<00:00, 2641249.37it/s]\n",
|
||||
"Tokenizing Test Data: 100%|████████████| 1000/1000 [00:00<00:00, 5548021.16it/s]\n",
|
||||
"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
|
||||
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "OutOfMemoryError",
|
||||
"evalue": "CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacty of 10.76 GiB of which 17.69 MiB is free. Process 1977743 has 1.07 GiB memory in use. Process 1980219 has 1.07 GiB memory in use. Process 1922558 has 3.07 GiB memory in use. Including non-PyTorch memory, this process has 5.53 GiB memory in use. Of the allocated memory 4.67 GiB is allocated by PyTorch, and 55.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mOutOfMemoryError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[2], line 118\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;66;03m# Load model\u001b[39;00m\n\u001b[1;32m 117\u001b[0m model \u001b[38;5;241m=\u001b[39m BertForSequenceClassification\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbert-base-uncased\u001b[39m\u001b[38;5;124m'\u001b[39m, num_labels\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(assignee_mapping))\n\u001b[0;32m--> 118\u001b[0m model\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[1;32m 120\u001b[0m \u001b[38;5;66;03m# Initialize optimizer\u001b[39;00m\n\u001b[1;32m 121\u001b[0m optimizer \u001b[38;5;241m=\u001b[39m AdamW(model\u001b[38;5;241m.\u001b[39mparameters(), lr\u001b[38;5;241m=\u001b[39mLEARNING_RATE)\n",
|
||||
"File \u001b[0;32m~/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/transformers/modeling_utils.py:2179\u001b[0m, in \u001b[0;36mPreTrainedModel.to\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 2174\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 2175\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m`.to` is not supported for `4-bit` or `8-bit` bitsandbytes models. Please use the model as it is, since the\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2176\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m model has already been set to the correct devices and casted to the correct `dtype`.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2177\u001b[0m )\n\u001b[1;32m 2178\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 2179\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
|
||||
"File \u001b[0;32m~/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/torch/nn/modules/module.py:1160\u001b[0m, in \u001b[0;36mModule.to\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1156\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m t\u001b[38;5;241m.\u001b[39mto(device, dtype \u001b[38;5;28;01mif\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_floating_point() \u001b[38;5;129;01mor\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_complex() \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1157\u001b[0m non_blocking, memory_format\u001b[38;5;241m=\u001b[39mconvert_to_format)\n\u001b[1;32m 1158\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m t\u001b[38;5;241m.\u001b[39mto(device, dtype \u001b[38;5;28;01mif\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_floating_point() \u001b[38;5;129;01mor\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_complex() \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m, non_blocking)\n\u001b[0;32m-> 1160\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_apply(convert)\n",
|
||||
"File \u001b[0;32m~/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/torch/nn/modules/module.py:810\u001b[0m, in \u001b[0;36mModule._apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m 808\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[1;32m 809\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchildren():\n\u001b[0;32m--> 810\u001b[0m module\u001b[38;5;241m.\u001b[39m_apply(fn)\n\u001b[1;32m 812\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied):\n\u001b[1;32m 813\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39m_has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[1;32m 814\u001b[0m \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[1;32m 815\u001b[0m \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 820\u001b[0m \u001b[38;5;66;03m# global flag to let the user control whether they want the future\u001b[39;00m\n\u001b[1;32m 821\u001b[0m \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n",
|
||||
"File \u001b[0;32m~/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/torch/nn/modules/module.py:810\u001b[0m, in \u001b[0;36mModule._apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m 808\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[1;32m 809\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchildren():\n\u001b[0;32m--> 810\u001b[0m module\u001b[38;5;241m.\u001b[39m_apply(fn)\n\u001b[1;32m 812\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied):\n\u001b[1;32m 813\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39m_has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[1;32m 814\u001b[0m \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[1;32m 815\u001b[0m \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 820\u001b[0m \u001b[38;5;66;03m# global flag to let the user control whether they want the future\u001b[39;00m\n\u001b[1;32m 821\u001b[0m \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n",
|
||||
" \u001b[0;31m[... skipping similar frames: Module._apply at line 810 (4 times)]\u001b[0m\n",
|
||||
"File \u001b[0;32m~/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/torch/nn/modules/module.py:810\u001b[0m, in \u001b[0;36mModule._apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m 808\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[1;32m 809\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchildren():\n\u001b[0;32m--> 810\u001b[0m module\u001b[38;5;241m.\u001b[39m_apply(fn)\n\u001b[1;32m 812\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied):\n\u001b[1;32m 813\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39m_has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[1;32m 814\u001b[0m \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[1;32m 815\u001b[0m \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 820\u001b[0m \u001b[38;5;66;03m# global flag to let the user control whether they want the future\u001b[39;00m\n\u001b[1;32m 821\u001b[0m \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n",
|
||||
"File \u001b[0;32m~/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/torch/nn/modules/module.py:833\u001b[0m, in \u001b[0;36mModule._apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m 829\u001b[0m \u001b[38;5;66;03m# Tensors stored in modules are graph leaves, and we don't want to\u001b[39;00m\n\u001b[1;32m 830\u001b[0m \u001b[38;5;66;03m# track autograd history of `param_applied`, so we have to use\u001b[39;00m\n\u001b[1;32m 831\u001b[0m \u001b[38;5;66;03m# `with torch.no_grad():`\u001b[39;00m\n\u001b[1;32m 832\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m--> 833\u001b[0m param_applied \u001b[38;5;241m=\u001b[39m fn(param)\n\u001b[1;32m 834\u001b[0m should_use_set_data \u001b[38;5;241m=\u001b[39m compute_should_use_set_data(param, param_applied)\n\u001b[1;32m 835\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m should_use_set_data:\n",
|
||||
"File \u001b[0;32m~/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/torch/nn/modules/module.py:1158\u001b[0m, in \u001b[0;36mModule.to.<locals>.convert\u001b[0;34m(t)\u001b[0m\n\u001b[1;32m 1155\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m convert_to_format \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m t\u001b[38;5;241m.\u001b[39mdim() \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;241m4\u001b[39m, \u001b[38;5;241m5\u001b[39m):\n\u001b[1;32m 1156\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m t\u001b[38;5;241m.\u001b[39mto(device, dtype \u001b[38;5;28;01mif\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_floating_point() \u001b[38;5;129;01mor\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_complex() \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1157\u001b[0m non_blocking, memory_format\u001b[38;5;241m=\u001b[39mconvert_to_format)\n\u001b[0;32m-> 1158\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m t\u001b[38;5;241m.\u001b[39mto(device, dtype \u001b[38;5;28;01mif\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_floating_point() \u001b[38;5;129;01mor\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_complex() \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m, non_blocking)\n",
|
||||
"\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacty of 10.76 GiB of which 17.69 MiB is free. Process 1977743 has 1.07 GiB memory in use. Process 1980219 has 1.07 GiB memory in use. Process 1922558 has 3.07 GiB memory in use. Including non-PyTorch memory, this process has 5.53 GiB memory in use. Of the allocated memory 4.67 GiB is allocated by PyTorch, and 55.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"from torch.utils.data import Dataset, DataLoader\n",
|
||||
"from transformers import BertTokenizer, BertForSequenceClassification, AdamW\n",
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.metrics import accuracy_score\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.metrics import accuracy_score\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"# Hyperparameters\n",
|
||||
"BATCH_SIZE = 32\n",
|
||||
"LEARNING_RATE = 1e-5\n",
|
||||
"EPOCHS = 3\n",
|
||||
"MAX_LEN = 512\n",
|
||||
"\n",
|
||||
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"2\"\n",
|
||||
"path = os.getcwd()\n",
|
||||
"\n",
|
||||
"print(path)\n",
|
||||
"\n",
|
||||
"# Load dataset\n",
|
||||
"train_df = pd.read_csv('/home/SA23-G2/bug-triaging/out/csv/issues_train_000001_170000.csv')\n",
|
||||
"test_df = pd.read_csv('/home/SA23-G2/bug-triaging/out/csv/issues_test_170001_180000.csv')\n",
|
||||
"\n",
|
||||
"train_df = train_df.head(1000)\n",
|
||||
"test_df = test_df.head(1000)\n",
|
||||
"\n",
|
||||
"# Combine train and test datasets\n",
|
||||
"combined_df = pd.concat([train_df, test_df], ignore_index=True)\n",
|
||||
"\n",
|
||||
"# Create a dictionary to map unique assignee names to integer labels\n",
|
||||
"assignee_mapping = {assignee: label for label, assignee in enumerate(combined_df['assignee'].unique())}\n",
|
||||
"\n",
|
||||
"# Update assignee labels in both train and test datasets\n",
|
||||
"train_df['assignee'] = train_df['assignee'].map(assignee_mapping)\n",
|
||||
"test_df['assignee'] = test_df['assignee'].map(assignee_mapping)\n",
|
||||
"train_df = train_df.dropna(subset=['assignee']).copy()\n",
|
||||
"test_df = test_df.dropna(subset=['assignee']).copy()\n",
|
||||
"train_df.reset_index(drop=True, inplace=True)\n",
|
||||
"test_df.reset_index(drop=True, inplace=True)\n",
|
||||
"train_df['assignee'] = train_df['assignee'].astype(int)\n",
|
||||
"test_df['assignee'] = test_df['assignee'].astype(int)\n",
|
||||
"train_df.drop(columns=['dirty_body'], inplace=True)\n",
|
||||
"test_df.drop(columns=['dirty_body'], inplace=True)\n",
|
||||
"\n",
|
||||
"# Initialize tokenizer\n",
|
||||
"tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased')\n",
|
||||
"\n",
|
||||
"def calculate_max_seq_length(dataframe, tokenizer):\n",
|
||||
" max_len = 0\n",
|
||||
" for index, row in dataframe.iterrows():\n",
|
||||
" title = str(row['title'])\n",
|
||||
" body = str(row['body'])\n",
|
||||
" tokens = tokenizer.encode(title, body, add_special_tokens=True)\n",
|
||||
" max_len = max(max_len, len(tokens))\n",
|
||||
" return max_len\n",
|
||||
"\n",
|
||||
"# Calculate max sequence length for both training and test datasets\n",
|
||||
"#max_len_train = calculate_max_seq_length(train_df, tokenizer)\n",
|
||||
"#max_len_test = calculate_max_seq_length(test_df, tokenizer)\n",
|
||||
"#print(f\"Max sequence length in training dataset: {max_len_train}\")\n",
|
||||
"#print(f\"Max sequence length in test dataset: {max_len_test}\")\n",
|
||||
"\n",
|
||||
"# Custom dataset class\n",
|
||||
"class CustomDataset(Dataset):\n",
|
||||
" def __init__(self, dataframe, tokenizer, max_len):\n",
|
||||
" self.tokenizer = tokenizer\n",
|
||||
" self.data = dataframe\n",
|
||||
" self.title = dataframe.title\n",
|
||||
" self.body = dataframe.body\n",
|
||||
" self.targets = dataframe.assignee\n",
|
||||
" self.max_len = max_len\n",
|
||||
"\n",
|
||||
" def __len__(self):\n",
|
||||
" return len(self.title)\n",
|
||||
"\n",
|
||||
" def __getitem__(self, index):\n",
|
||||
" title = str(self.title[index])\n",
|
||||
" body = str(self.body[index])\n",
|
||||
" inputs = self.tokenizer.encode_plus(\n",
|
||||
" \"TITLE_START\" + title + \"BODY_START\" + body,\n",
|
||||
" add_special_tokens=True,\n",
|
||||
" max_length=self.max_len,\n",
|
||||
" padding='max_length',\n",
|
||||
" return_token_type_ids=True,\n",
|
||||
" truncation=True\n",
|
||||
" )\n",
|
||||
" ids = inputs['input_ids']\n",
|
||||
" mask = inputs['attention_mask']\n",
|
||||
"\n",
|
||||
" return {\n",
|
||||
" 'ids': torch.tensor(ids, dtype=torch.long),\n",
|
||||
" 'mask': torch.tensor(mask, dtype=torch.long),\n",
|
||||
" 'targets': torch.tensor(self.targets[index], dtype=torch.long)\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
"# Check if CUDA is available\n",
|
||||
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
|
||||
"print(device)\n",
|
||||
"\n",
|
||||
"# Create datasets with tqdm progress bar\n",
|
||||
"with tqdm(total=len(train_df), desc=\"Tokenizing Training Data\") as pbar:\n",
|
||||
" train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)\n",
|
||||
" pbar.update(len(train_df))\n",
|
||||
"\n",
|
||||
"with tqdm(total=len(test_df), desc=\"Tokenizing Test Data\") as pbar:\n",
|
||||
" test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)\n",
|
||||
" pbar.update(len(test_df))\n",
|
||||
"\n",
|
||||
"# Create data loaders\n",
|
||||
"train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)\n",
|
||||
"test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)\n",
|
||||
"\n",
|
||||
"# Load model\n",
|
||||
"model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(assignee_mapping))\n",
|
||||
"model.to(device)\n",
|
||||
"\n",
|
||||
"# Initialize optimizer\n",
|
||||
"optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)\n",
|
||||
"\n",
|
||||
"# Training loop with tqdm progress bar for epochs\n",
|
||||
"for epoch in range(EPOCHS):\n",
|
||||
" model.train()\n",
|
||||
" progress_bar = tqdm(train_loader, desc=f\"Epoch {epoch + 1}/{EPOCHS}\")\n",
|
||||
"\n",
|
||||
" for batch in progress_bar:\n",
|
||||
" ids = batch['ids'].to(device)\n",
|
||||
" mask = batch['mask'].to(device)\n",
|
||||
" targets = batch['targets'].to(device)\n",
|
||||
"\n",
|
||||
" outputs = model(ids, attention_mask=mask, labels=targets)\n",
|
||||
" loss = outputs.loss\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
"\n",
|
||||
" progress_bar.set_postfix({'Loss': f'{loss.item():.4f}'})\n",
|
||||
"\n",
|
||||
" # Evaluation\n",
|
||||
" model.eval()\n",
|
||||
" predictions = []\n",
|
||||
" true_labels = []\n",
|
||||
"\n",
|
||||
" with torch.no_grad():\n",
|
||||
" for batch in tqdm(test_loader, desc=\"Evaluating\"):\n",
|
||||
" ids = batch['ids'].to(device)\n",
|
||||
" mask = batch['mask'].to(device)\n",
|
||||
" targets = batch['targets'].to(device)\n",
|
||||
"\n",
|
||||
" outputs = model(ids, attention_mask=mask)\n",
|
||||
" predictions.extend(torch.argmax(outputs.logits, 1).cpu().numpy())\n",
|
||||
" true_labels.extend(targets.cpu().numpy())\n",
|
||||
"\n",
|
||||
" accuracy = accuracy_score(true_labels, predictions)\n",
|
||||
" print(f'\\nEpoch: {epoch + 1}, Accuracy: {accuracy:.4f}')\n",
|
||||
"\n",
|
||||
"# Save model\n",
|
||||
"# torch.save(model.state_dict(), 'model.pth')\n",
|
||||
"# print('Model saved to model.pth')\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "8Inp7tF27nXX",
|
||||
"outputId": "190ab7cd-3e21-44da-d97f-4b71abbc3cec"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Reversing the assignee_mapping dictionary\n",
|
||||
"reverse_assignee_mapping = {v: k for k, v in assignee_mapping.items()}\n",
|
||||
"\n",
|
||||
"# Get unique labels from true_labels and predictions\n",
|
||||
"unique_labels = np.unique(true_labels + predictions)\n",
|
||||
"\n",
|
||||
"# Convert numerical labels to names\n",
|
||||
"target_names = [reverse_assignee_mapping[label] for label in unique_labels]\n",
|
||||
"\n",
|
||||
"print(classification_report(true_labels, predictions, target_names=target_names, labels=unique_labels))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 1000
|
||||
},
|
||||
"id": "vOBhgOm29II_",
|
||||
"outputId": "883d43a3-c248-4855-9644-4a10a3ca5234",
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc\n",
|
||||
"from sklearn.preprocessing import label_binarize\n",
|
||||
"import seaborn as sns\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from itertools import cycle\n",
|
||||
"\n",
|
||||
"# Assuming predictions and true_labels lists are already filled\n",
|
||||
"unique_labels = np.unique(true_labels + predictions)\n",
|
||||
"\n",
|
||||
"# 1. Accuracy\n",
|
||||
"accuracy = accuracy_score(true_labels, predictions)\n",
|
||||
"print(f\"Accuracy: {accuracy:.4f}\")\n",
|
||||
"\n",
|
||||
"# 2. Precision\n",
|
||||
"precision = precision_score(true_labels, predictions, average='weighted')\n",
|
||||
"print(f\"Precision: {precision:.4f}\")\n",
|
||||
"\n",
|
||||
"# 3. Recall\n",
|
||||
"recall = recall_score(true_labels, predictions, average='weighted')\n",
|
||||
"print(f\"Recall: {recall:.4f}\")\n",
|
||||
"\n",
|
||||
"# 4. F1-Score\n",
|
||||
"f1 = f1_score(true_labels, predictions, average='weighted')\n",
|
||||
"print(f\"F1-Score: {f1:.4f}\")\n",
|
||||
"\n",
|
||||
"# 5. Confusion Matrix\n",
|
||||
"cm = confusion_matrix(true_labels, predictions)\n",
|
||||
"plt.figure(figsize=(10, 7))\n",
|
||||
"sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', xticklabels=unique_labels, yticklabels=unique_labels)\n",
|
||||
"plt.xlabel('Predicted')\n",
|
||||
"plt.ylabel('True')\n",
|
||||
"plt.title('Confusion Matrix')\n",
|
||||
"plt.show()\n",
|
||||
"\n",
|
||||
"# 6. ROC Curve and AUC\n",
|
||||
"# Binarize the labels\n",
|
||||
"binarized_true_labels = label_binarize(true_labels, classes=unique_labels)\n",
|
||||
"binarized_predictions = label_binarize(predictions, classes=unique_labels)\n",
|
||||
"\n",
|
||||
"n_classes = binarized_true_labels.shape[1]\n",
|
||||
"\n",
|
||||
"# Compute ROC curve and ROC area for each class\n",
|
||||
"fpr = dict()\n",
|
||||
"tpr = dict()\n",
|
||||
"roc_auc = dict()\n",
|
||||
"for i in range(n_classes):\n",
|
||||
" fpr[i], tpr[i], _ = roc_curve(binarized_true_labels[:, i], binarized_predictions[:, i])\n",
|
||||
" roc_auc[i] = auc(fpr[i], tpr[i])\n",
|
||||
"\n",
|
||||
"# Plot all ROC curves\n",
|
||||
"plt.figure(figsize=(10, 7))\n",
|
||||
"for i, color in zip(range(n_classes), cycle(['aqua', 'darkorange', 'cornflowerblue'])):\n",
|
||||
" plt.plot(fpr[i], tpr[i], color=color, lw=2,\n",
|
||||
" label='ROC curve of class {0} (area = {1:0.2f})'\n",
|
||||
" ''.format(i, roc_auc[i]))\n",
|
||||
"\n",
|
||||
"plt.plot([0, 1], [0, 1], 'k--', lw=2)\n",
|
||||
"plt.xlim([0.0, 1.0])\n",
|
||||
"plt.ylim([0.0, 1.05])\n",
|
||||
"plt.xlabel('False Positive Rate')\n",
|
||||
"plt.ylabel('True Positive Rate')\n",
|
||||
"plt.title('Receiver Operating Characteristic to Multi-Class')\n",
|
||||
"plt.legend(loc=\"lower right\")\n",
|
||||
"plt.show()\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"accelerator": "GPU",
|
||||
"colab": {
|
||||
"gpuType": "A100",
|
||||
"machine_shape": "hm",
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
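The ROC section of the notebook above binarizes the hard argmax predictions, which collapses each per-class ROC curve to a single operating point. A minimal sketch of the score-based alternative, assuming the same model, test_loader and device as in the evaluation cell:

# Sketch only: feed roc_curve class scores rather than hard predictions.
import numpy as np
import torch

all_scores = []
with torch.no_grad():
    for batch in test_loader:
        outputs = model(batch['ids'].to(device), attention_mask=batch['mask'].to(device))
        all_scores.append(torch.softmax(outputs.logits, dim=1).cpu().numpy())
scores = np.concatenate(all_scores)  # shape: (n_samples, n_classes)
# scores[:, i] can then replace binarized_predictions[:, i] in roc_curve,
# giving a proper threshold sweep per class.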
0
src/modelimpl/__init__.py
Normal file
106
src/modelimpl/auc.py
Normal file
|
@ -0,0 +1,106 @@
|
|||
import matplotlib as mpl
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import tqdm
|
||||
from sklearn.metrics import RocCurveDisplay, roc_auc_score
|
||||
from sklearn.preprocessing import LabelBinarizer
|
||||
|
||||
from .classifier import Classifier
|
||||
from .dataset import Labelling, SplitData, load_df
|
||||
from .evaluate import predict
|
||||
from .load import load_model
|
||||
from .torch_dataset import Dataset
|
||||
|
||||
|
||||
def predict_proba(model: Classifier, test: Dataset, n_classes: int, force_cpu: bool) -> np.ndarray:
|
||||
test_output = predict(model, test, -1, force_cpu)
|
||||
out = np.zeros((len(test.texts), n_classes))  # np.zeros instead of np.ndarray: avoid uninitialized memory
|
||||
|
||||
for i, res in enumerate(tqdm.tqdm(test_output, desc="Predicting outputs for ROC/AUC")):
|
||||
for c, pred_class in enumerate(res.top_indices):
|
||||
if pred_class >= 0:
|
||||
pred_prob = res.top_values[c]
|
||||
out[i, pred_class] = pred_prob
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def compute_auc_roc(model: Classifier, test_data: SplitData, n_classes, labelling: Labelling, force_cpu: bool,
|
||||
out_prefix: str):
|
||||
# Encode classes in one-hot encoding for one-vs-rest AUC
|
||||
lb = LabelBinarizer()
|
||||
lb.fit(range(n_classes))
|
||||
y_onehot = lb.transform(test_data.labels)
|
||||
y_score = predict_proba(model, Dataset(test_data), n_classes, force_cpu)
|
||||
|
||||
# Color map for classes
|
||||
colormap = mpl.colormaps['Spectral'].resampled(n_classes)
|
||||
|
||||
fig, ax = plt.subplots(figsize=(14, 14))
|
||||
|
||||
for assignee, class_n in tqdm.tqdm(labelling.labels.items(), desc="Computing ROC curves"):
|
||||
if 0 <= class_n < n_classes:
|
||||
lb_class = np.flatnonzero(lb.classes_ == class_n)[0]
|
||||
y_true = y_onehot[:, lb_class]
|
||||
|
||||
if len(np.flatnonzero(y_true)) > 0: # if this class is in the test set
|
||||
RocCurveDisplay.from_predictions(
|
||||
y_true,
|
||||
y_score[:, lb_class],
|
||||
ax=ax,
|
||||
name=f"{assignee} ({class_n})",
|
||||
color=colormap(class_n)
|
||||
)
|
||||
|
||||
plt.axis("square")
|
||||
plt.xlabel("False Positive Rate")
|
||||
plt.ylabel("True Positive Rate")
|
||||
plt.title("One-vs-Rest ROC curves")
|
||||
plt.legend()
|
||||
plt.savefig(out_prefix + ".ovr_curves.png")
|
||||
|
||||
fig, ax = plt.subplots(figsize=(7, 7))
|
||||
RocCurveDisplay.from_predictions(
|
||||
y_onehot.ravel(),
|
||||
y_score.ravel(),
|
||||
ax=ax,
|
||||
name="micro-average OvR",
|
||||
color="darkorange",
|
||||
)
|
||||
plt.axis("square")
|
||||
plt.xlabel("False Positive Rate")
|
||||
plt.ylabel("True Positive Rate")
|
||||
plt.title("Micro-averaged One-vs-Rest\nReceiver Operating Characteristic")
|
||||
ax.get_legend().remove()
|
||||
plt.savefig(out_prefix + ".ovr_avg.png")
|
||||
|
||||
micro_roc_auc_ovr = roc_auc_score(
|
||||
y_onehot,
|
||||
y_score,
|
||||
multi_class="ovr",
|
||||
average="micro",
|
||||
)
|
||||
|
||||
message = f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.4f}"
|
||||
|
||||
with open(out_prefix + ".auc.txt", "w") as f:
|
||||
f.write(message)
|
||||
|
||||
print(message)
|
||||
|
||||
|
||||
def build_curve(path: str, force_cpu: bool):
|
||||
if not path.endswith('.pt'):
|
||||
raise ValueError("path should point to a pytorch model file")
|
||||
|
||||
pretrained, model, classes = load_model(path, None, force_cpu, False)
|
||||
|
||||
if not pretrained:
|
||||
raise FileNotFoundError("Trained model is needed to run predict script")
|
||||
|
||||
out_prefix = path[:-3]
|
||||
csv_path = out_prefix + '.labels.csv'
|
||||
labelling = Labelling.load(csv_path)
|
||||
test_data = SplitData.from_df(load_df('issues_test_170001_180000.csv'), labelling, classes)
|
||||
|
||||
compute_auc_roc(model, test_data, classes, labelling, force_cpu, out_prefix)
|
8
src/modelimpl/classifier.py
Normal file
|
@ -0,0 +1,8 @@
|
|||
from transformers import BertForSequenceClassification
|
||||
|
||||
Classifier = BertForSequenceClassification
|
||||
|
||||
|
||||
def bert_classifier(n_classes: int) -> Classifier:
|
||||
return BertForSequenceClassification \
|
||||
.from_pretrained('bert-base-uncased', num_labels=n_classes)
|
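For orientation, a short usage sketch (not part of the module): the return value is a stock BertForSequenceClassification whose classification head is randomly initialized, so it only produces meaningful logits after fine-tuning.

# Illustrative only: one forward pass through an untrained classifier.
from transformers import BertTokenizer

model = bert_classifier(n_classes=3)
tok = BertTokenizer.from_pretrained('bert-base-uncased')
enc = tok("sample issue title and body", return_tensors='pt')
logits = model(**enc).logits  # shape (1, 3); head weights are untrained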
148
src/modelimpl/dataset.py
Normal file
|
@ -0,0 +1,148 @@
|
|||
import os.path
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from transformers import DistilBertTokenizer
|
||||
from transformers.tokenization_utils_base import BatchEncoding
|
||||
|
||||
DIR: str = os.path.join(os.path.dirname(__file__), '..', '..', 'out', 'csv')
|
||||
OUT_COLUMN: str = 'assignee'
|
||||
IN_COLUMNS: list[str] = ['title', 'body']
|
||||
IN_JOINED_COLUMN: str = 'title_body'
|
||||
VALIDATION_PERC = 0.1
|
||||
|
||||
|
||||
def prepare_input(df: pd.DataFrame) -> pd.DataFrame:
|
||||
df[IN_JOINED_COLUMN] = df[IN_COLUMNS].apply(lambda row: '\n'.join(row.values.astype(str)), axis=1)
|
||||
return df.loc[:, [IN_JOINED_COLUMN, OUT_COLUMN]]
|
||||
|
||||
|
||||
def load_df(csv_filename: str) -> pd.DataFrame:
|
||||
df: pd.DataFrame = pd.read_csv(os.path.join(DIR, csv_filename))
|
||||
df = df.set_index('id', drop=True)
|
||||
return prepare_input(df)
|
||||
|
||||
|
||||
def compute_labels(frames: list[pd.DataFrame]) -> tuple[dict[str, int], list[int]]:
|
||||
n: int = 0
|
||||
labels_dict: dict[str, int] = {}
|
||||
num_bounds: list[int] = [0]
|
||||
|
||||
for frame in frames:
|
||||
labels: list[str] = frame[OUT_COLUMN] \
|
||||
.drop_duplicates(keep='first') \
|
||||
.sort_values() \
|
||||
.to_list()
|
||||
|
||||
for label in labels:
|
||||
if label not in labels_dict:
|
||||
labels_dict[label] = n
|
||||
n += 1
|
||||
|
||||
num_bounds.append(n)
|
||||
|
||||
return labels_dict, num_bounds
|
||||
|
||||
|
||||
# DistilBERT's uncased tokenizer shares its WordPiece vocabulary with
# bert-base-uncased, so it can feed the BERT classifier from classifier.py
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', return_token_type_ids=False)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Labelling:
|
||||
labels: dict[str, int]
|
||||
labels_rev: dict[int, str]
|
||||
|
||||
def __init__(self, labels: dict[str, int]):
|
||||
self.labels = labels
|
||||
self.labels_rev = {v: k for k, v in self.labels.items()}
|
||||
self.labels_rev[-2] = 'Unassigned'
|
||||
|
||||
def save(self, filename: str):
|
||||
df = pd.DataFrame(list(self.labels.items()), columns=['token', 'label'])
|
||||
df.to_csv(filename, index=False)
|
||||
|
||||
@staticmethod
|
||||
def load(filename: str) -> 'Labelling':
|
||||
df = pd.read_csv(filename)
|
||||
|
||||
labels: dict[str, int] = {}
|
||||
for _, row in df.iterrows():
|
||||
labels[str(row['token'])] = int(row['label'])
|
||||
|
||||
return Labelling(labels)
|
||||
|
||||
|
||||
@dataclass
|
||||
class SplitData:
|
||||
index: dict[int, int]
|
||||
labels: list[int]
|
||||
texts: list[BatchEncoding]
|
||||
|
||||
@staticmethod
|
||||
def from_df(df: pd.DataFrame, labels: Labelling, label_threshold: int) -> 'SplitData':
|
||||
index = {e: i for i, e in enumerate(df.index.tolist())}
|
||||
label_ids = [-2 if pd.isna(label) else labels.labels[label] for label in df[OUT_COLUMN]]  # pd.isna covers both None and NaN; avoid shadowing the labels parameter
|
||||
label_ids = [-1 if label >= label_threshold else label for label in label_ids]
|
||||
texts = [tokenizer(text, padding='max_length', max_length=512, truncation=True,
|
||||
return_tensors='pt') for text in df[IN_JOINED_COLUMN]]
|
||||
return SplitData(index, label_ids, texts)
|
||||
|
||||
def __init__(self, index: dict[int, int], labels: list[int], texts: list[BatchEncoding]):
|
||||
self.index = index
|
||||
self.labels = labels
|
||||
self.texts = texts
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self.texts)
|
||||
|
||||
def only_issue(self, issue_id: int) -> Optional['SplitData']:
|
||||
if issue_id not in self.index:
|
||||
return None
|
||||
|
||||
i = self.index[issue_id]
|
||||
label = self.labels[i]
|
||||
text = self.texts[i]
|
||||
|
||||
return SplitData({issue_id: 0}, [label], [text])
|
||||
|
||||
|
||||
def df_validation_split(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
|
||||
df_train, df_val = np.split(df, [int((1 - VALIDATION_PERC) * len(df))])
|
||||
return df_train, df_val
|
||||
|
||||
|
||||
@dataclass
|
||||
class Datasets:
|
||||
train: SplitData
|
||||
validation: SplitData
|
||||
test: SplitData
|
||||
classifier_label_range: tuple[int, int]
|
||||
labelling: Labelling
|
||||
|
||||
def __init__(self, kind: str):
|
||||
if kind == 'all':
|
||||
df = load_df('issues_train_000001_170000.csv')
|
||||
elif kind == 'recent':
|
||||
df = load_df('issues_train_recent_150000_170000.csv')
|
||||
else:
|
||||
raise ValueError('kind must be one of \'all\' or \'recent\'')
|
||||
|
||||
df_test: pd.DataFrame = load_df('issues_test_170001_180000.csv')
|
||||
df_train, df_val = df_validation_split(df)
|
||||
|
||||
labels, splits = compute_labels([df_train, df_val, df_test])
|
||||
|
||||
self.labelling = Labelling(labels)
|
||||
self.classifier_label_range = (splits[0], splits[1])
|
||||
|
||||
self.train = SplitData.from_df(df_train, self.labelling, splits[1])
|
||||
|
||||
# Remove unknown labels from validation set
|
||||
df_val['label_num'] = df_val[OUT_COLUMN].apply(lambda ass: self.labelling.labels[ass])
|
||||
df_val = df_val[df_val.label_num < self.classifier_label_range[1]]
|
||||
|
||||
self.validation = SplitData.from_df(df_val, self.labelling, splits[1])
|
||||
|
||||
self.test = SplitData.from_df(df_test, self.labelling, splits[1])
|
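The interaction between compute_labels and the label_threshold later passed to SplitData.from_df is subtle; a worked example with illustrative values:

# Illustrative only: ids are assigned frame by frame, and num_bounds records
# the running count of distinct labels after each frame.
frames = [pd.DataFrame({'assignee': ['alice', 'bob']}),
          pd.DataFrame({'assignee': ['bob', 'carol']})]
labels_dict, num_bounds = compute_labels(frames)
# labels_dict == {'alice': 0, 'bob': 1, 'carol': 2}
# num_bounds  == [0, 2, 3]
# With label_threshold=2 (the bound after the first frame), SplitData.from_df
# maps 'carol' (id 2) to -1: a known assignee that never occurs in training.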
90
src/modelimpl/evaluate.py
Normal file
|
@ -0,0 +1,90 @@
|
|||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import tqdm
|
||||
from torch.utils import data
|
||||
|
||||
from .classifier import Classifier
|
||||
from .dataset import SplitData
|
||||
from .torch_dataset import Dataset
|
||||
|
||||
|
||||
@dataclass
|
||||
class PredictionResult:
|
||||
top_values: list[float]
|
||||
top_indices: list[int]
|
||||
truth_idx: Optional[int]
|
||||
|
||||
def __init__(self, top_values: list[float], top_indices: list[int], truth_idx: Optional[int]):
|
||||
self.top_values = top_values
|
||||
self.top_indices = top_indices
|
||||
self.truth_idx = truth_idx
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self.top_values)
|
||||
|
||||
|
||||
def predict(model: Classifier, test: Dataset, top_n: int, force_cpu: bool) -> list[PredictionResult]:
|
||||
batch_size = 16
|
||||
test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size)
|
||||
|
||||
use_cuda = torch.cuda.is_available() and not force_cpu
|
||||
device = torch.device("cuda" if use_cuda else "cpu")
|
||||
|
||||
if use_cuda:
|
||||
model = model.cuda()
|
||||
|
||||
res = []
|
||||
|
||||
with torch.no_grad():
|
||||
for test_x, test_y in tqdm.tqdm(test_dataloader, desc="Test"):
|
||||
test_y = test_y.to(device)
|
||||
mask = test_x['attention_mask'].to(device)
|
||||
input_id = test_x['input_ids'].squeeze(1).to(device)
|
||||
|
||||
output = model(input_id, attention_mask=mask)
|
||||
norm_output = torch.softmax(output.logits, dim=1)
|
||||
|
||||
dim_n = norm_output.size(dim=1)
|
||||
top_n = min(dim_n if top_n == -1 else top_n, dim_n)
|
||||
|
||||
top = torch.topk(norm_output, top_n, dim=1)
|
||||
|
||||
for i in range(top.values.size(dim=0)):
|
||||
res.append(PredictionResult(top.values[i, :].tolist(), top.indices[i, :].tolist(), test_y[i].item()))
|
||||
|
||||
return res
|
||||
|
||||
|
||||
def evaluate(model, test: Dataset, force_cpu: bool, top_k: int = 5):
|
||||
test_output = predict(model, test, top_k, force_cpu)
|
||||
|
||||
if len(test_output) > 0:
|
||||
top_k = min(top_k, len(test_output[0].top_indices))
|
||||
|
||||
accuracies = [0] * top_k
|
||||
|
||||
for res in test_output:
|
||||
for i, index in enumerate(res.top_indices):
|
||||
if index == res.truth_idx:
|
||||
for j in range(i, len(accuracies)):
|
||||
accuracies[j] += 1
|
||||
|
||||
for i, acc in enumerate(accuracies):
|
||||
acc = acc / len(test.texts)
|
||||
print(f'Test Accuracy for {i + 1} recommendations: {acc: .4f}')
|
||||
|
||||
|
||||
def predict_top_k(model: Classifier, test_data: SplitData, issue_id: int, top_n: int,
|
||||
force_cpu: bool) -> PredictionResult:
|
||||
issue_data = test_data.only_issue(issue_id)
|
||||
|
||||
if issue_data is None:
|
||||
raise ValueError("Issue id {0} is not present as an issue in the test set".format(issue_id))
|
||||
|
||||
issue_dataset = Dataset(issue_data)
|
||||
result = predict(model, issue_dataset, top_n, force_cpu)
|
||||
|
||||
assert len(result) == 1
|
||||
return result[0]
|
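The accuracy bookkeeping in evaluate is cumulative: when the true assignee first appears at position i of the recommendation list, it counts as a hit for every cut-off k > i. A worked example with made-up values:

# Illustrative only: mirrors the counting loop in evaluate.
accuracies = [0, 0, 0]
top_indices, truth_idx = [7, 3, 9], 3  # truth ranked second
for i, index in enumerate(top_indices):
    if index == truth_idx:
        for j in range(i, len(accuracies)):
            accuracies[j] += 1
assert accuracies == [0, 1, 1]  # top-1 miss, top-2 and top-3 hit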
58
src/modelimpl/load.py
Normal file
|
@ -0,0 +1,58 @@
|
|||
import os
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from .classifier import bert_classifier, Classifier
|
||||
|
||||
OUT_DIR = os.path.join(os.path.dirname(__file__), '..', '..', 'out', 'model')
|
||||
|
||||
|
||||
def get_model_path(dataset_kind: str, epochs: int, learning_rate: float, suffix_ext: str) -> str:
|
||||
filename = 'bug_triaging_{0}_{1}e_{2}lr_final.{3}'.format(
|
||||
dataset_kind,
|
||||
str(epochs),
|
||||
str(learning_rate).replace('.', '_'),
|
||||
suffix_ext
|
||||
)
|
||||
return os.path.join(OUT_DIR, filename)
|
||||
|
||||
|
||||
def load_model(path: str, label_range: Optional[tuple[int, int]], force_cpu: bool,
|
||||
force_retrain: bool) -> tuple[bool, Classifier, int]:
|
||||
if not path.endswith('.pt'):
|
||||
raise ValueError("path should point to a pytorch model file")
|
||||
|
||||
label_range_path = path[:-3] + '.label_range.txt'
|
||||
|
||||
np.random.seed(0)
|
||||
|
||||
use_gpu = torch.cuda.is_available() and not force_cpu
|
||||
|
||||
if use_gpu:
|
||||
print('Using device #', torch.cuda.current_device())
|
||||
else:
|
||||
print('CUDA is not available! Working on CPU...')
|
||||
|
||||
if label_range is None:
|
||||
with open(label_range_path, "r") as f:
|
||||
start_range = int(f.readline())
|
||||
end_range = int(f.readline())
|
||||
else:
|
||||
start_range = label_range[0]
|
||||
end_range = label_range[1]
|
||||
|
||||
classes = end_range - start_range
|
||||
model = bert_classifier(classes)
|
||||
|
||||
if os.path.isfile(path) and not force_retrain:
|
||||
print('Using already trained model')
|
||||
if use_gpu:
|
||||
model.load_state_dict(torch.load(path))
|
||||
else:
|
||||
model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))
|
||||
model.eval()
|
||||
return True, model, classes
|
||||
else:
|
||||
return False, model, classes
|
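When label_range is None, load_model reads the sidecar .label_range.txt written by src/trainmodel.py: two integers on separate lines, the start and end of the classifier's label range. With example values

0
42

the classifier is built with classes = 42 - 0 = 42 output labels.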
27
src/modelimpl/torch_dataset.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
import numpy as np
|
||||
from torch.utils import data
|
||||
|
||||
from .dataset import SplitData
|
||||
|
||||
|
||||
class Dataset(data.Dataset):
|
||||
def __init__(self, split_data: SplitData):
|
||||
self.labels = split_data.labels
|
||||
self.texts = split_data.texts
|
||||
|
||||
def classes(self):
|
||||
return self.labels
|
||||
|
||||
def __len__(self):
|
||||
return len(self.labels)
|
||||
|
||||
def get_batch_labels(self, idx):
|
||||
return np.array(self.labels[idx])
|
||||
|
||||
def get_batch_texts(self, idx):
|
||||
return self.texts[idx]
|
||||
|
||||
def __getitem__(self, idx):
|
||||
batch_texts = self.get_batch_texts(idx)
|
||||
batch_y = self.get_batch_labels(idx)
|
||||
return batch_texts, batch_y
|
93
src/modelimpl/torch_train.py
Normal file
|
@ -0,0 +1,93 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
from torch.optim import Adam
|
||||
from torch.utils import data
|
||||
from tqdm import tqdm
|
||||
|
||||
from .torch_dataset import Dataset
|
||||
|
||||
|
||||
def print_message(epoch_num: int, train_loss: float, train_acc: float, train_ds: Dataset, val_loss: float,
|
||||
val_acc: float, val_ds: Dataset):
|
||||
messages = [
|
||||
f'Epochs: {epoch_num + 1}',
|
||||
f'Train Loss: {train_loss / len(train_ds.texts): .3f}',
|
||||
f'Train Accuracy: {train_acc / len(train_ds.texts): .3f}',
|
||||
f'Val Loss: {val_loss / len(val_ds.texts): .3f}',
|
||||
f'Val Accuracy: {val_acc / len(val_ds.texts): .3f}'
|
||||
]
|
||||
print(' | '.join(messages))
|
||||
|
||||
|
||||
def compute_loss_and_acc(label, input_data, device, model) -> tuple[float, int, torch.Tensor]:
|
||||
label = label.to(device)
|
||||
mask = input_data['attention_mask'].to(device)
|
||||
input_id = input_data['input_ids'].squeeze(1).to(device)
|
||||
|
||||
output = model(input_id, attention_mask=mask, labels=label)
|
||||
batch_loss = output.loss
|
||||
|
||||
acc = (torch.argmax(output.logits, 1) == label).sum().item()
|
||||
|
||||
return batch_loss.item(), acc, batch_loss
|
||||
|
||||
|
||||
def train(model, train_ds: Dataset, val_ds: Dataset, learning_rate: float, epochs: int, force_cpu: bool):
|
||||
batch_size = 16
|
||||
|
||||
train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
|
||||
val_dataloader = torch.utils.data.DataLoader(val_ds, batch_size=batch_size)
|
||||
|
||||
use_cuda = torch.cuda.is_available() and not force_cpu
|
||||
device = torch.device("cuda" if use_cuda else "cpu")
|
||||
|
||||
if use_cuda:
|
||||
model = model.cuda()
|
||||
|
||||
optimizer = Adam(model.parameters(), lr=learning_rate)
|
||||
|
||||
for epoch_num in range(epochs):
|
||||
total_acc_train = 0
|
||||
total_loss_train = 0
|
||||
|
||||
train_dl = tqdm(train_dataloader, desc=f"Train E{epoch_num + 1}")
|
||||
|
||||
i = 0
|
||||
for train_input, train_label in train_dl:
|
||||
|
||||
delta_loss, delta_acc, batch_loss = compute_loss_and_acc(train_label, train_input, device, model)
|
||||
|
||||
total_loss_train += delta_loss
|
||||
total_acc_train += delta_acc
|
||||
|
||||
batch_loss.backward()
|
||||
optimizer.step()
|
||||
model.zero_grad()
|
||||
|
||||
i += train_label.size(dim=0)
|
||||
loss_avg = total_loss_train / i
|
||||
acc_avg = total_acc_train / i
|
||||
|
||||
train_dl.set_description(f"Train E{epoch_num + 1} loss={loss_avg:.6f} acc={acc_avg:.4f}")
|
||||
|
||||
total_acc_val = 0
|
||||
total_loss_val = 0
|
||||
|
||||
val_dl = tqdm(val_dataloader, desc="Val E" + str(epoch_num + 1))
|
||||
|
||||
with torch.no_grad():
|
||||
|
||||
i = 0
|
||||
for val_input, val_label in val_dl:
|
||||
delta_loss, delta_acc, batch_loss = compute_loss_and_acc(val_label, val_input, device, model)
|
||||
|
||||
total_loss_val += delta_loss
|
||||
total_acc_val += delta_acc
|
||||
|
||||
i += val_label.size(dim=0)
|
||||
loss_avg = total_loss_val / i
|
||||
acc_avg = total_acc_val / i
|
||||
|
||||
val_dl.set_description(f"Val E{epoch_num + 1} loss={loss_avg:.6f} acc={acc_avg:.4f}")
|
||||
|
||||
print_message(epoch_num, total_loss_train, total_acc_train, train_ds, total_loss_val, total_acc_val, val_ds)
|
77
src/runmodel.py
Executable file
|
@ -0,0 +1,77 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
|
||||
|
||||
import torch
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from cleaner.clean import read_issue_obj
|
||||
from cleaner.dataframe import build_df
|
||||
from modelimpl.dataset import SplitData, prepare_input, Labelling
|
||||
from modelimpl.evaluate import predict_top_k
|
||||
from modelimpl.load import load_model
|
||||
from scraper.download import download_issue, download_commit_activity
|
||||
|
||||
load_dotenv()
|
||||
TOKEN = os.getenv('GITHUB_TOKEN')
|
||||
|
||||
|
||||
def main(path: str, issue_id: int, force_cpu: bool, top: int):
|
||||
stats = download_commit_activity(TOKEN)
|
||||
|
||||
if not path.endswith('.pt'):
|
||||
raise ValueError("path should point to a pytorch model file")
|
||||
|
||||
pretrained, model, classes = load_model(path, None, force_cpu, False)
|
||||
|
||||
csv_path = path[:-3] + '.labels.csv'
|
||||
|
||||
labelling = Labelling.load(csv_path)
|
||||
|
||||
if not pretrained:
|
||||
raise FileNotFoundError("Trained model is needed to run predict script")
|
||||
|
||||
issue_json = download_issue(issue_id, TOKEN)
|
||||
issue_clean = read_issue_obj(issue_json, enable_filter=False)
|
||||
|
||||
if issue_clean is None:
|
||||
raise ValueError("Issue does not contain latin characters in title or body, cannot classify")
|
||||
|
||||
issue_df = prepare_input(build_df([issue_clean]))
|
||||
issue_dataset = SplitData.from_df(issue_df, labelling, classes)
|
||||
|
||||
res = predict_top_k(model, issue_dataset, issue_id, top, force_cpu)
|
||||
for i in range(len(res)):
|
||||
value = res.top_values[i]
|
||||
idx = res.top_indices[i]
|
||||
assignee = labelling.labels_rev[idx]
|
||||
|
||||
print("{0}: '{1}' ({3}) (confidence: {2:.2f}%) ({4} commits authored)"
|
||||
.format(i + 1, assignee, value * 100, idx, stats[assignee]))
|
||||
|
||||
if res.truth_idx != -2:
|
||||
truth = labelling.labels_rev[res.truth_idx]
|
||||
print("Truth: '{0}' ({1}) ({2} commits authored)".format(truth, res.truth_idx, stats[truth]))
|
||||
else:
|
||||
print("Issue is unassigned on GitHub")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
assert torch is not None # make sure pytorch is imported and loaded with correct CUDA env variable
|
||||
|
||||
parser = argparse.ArgumentParser(prog='runmodel.py',
|
||||
description='Model execution script. Downloads a given issue id from the '
|
||||
'microsoft/vscode repository, performs the cleaning process and '
|
||||
'recommends an assignee using the given model. The script may fail if '
|
||||
'the issue title and body do not contain any latin characters.')
|
||||
parser.add_argument('modelfile', type=str, help="Path to the pickled pytorch model to classify the issue with")
|
||||
parser.add_argument('issue_id', type=int, help="The microsoft/vscode GitHub issue id to classify")
|
||||
parser.add_argument('-t', '--top', type=int, default=5, help="Number of recommendations to output")
|
||||
parser.add_argument('-c', '--force-cpu', action='store_true', help="disables CUDA support. Useful when debugging")
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args.modelfile, args.issue_id, args.force_cpu, args.top)
|
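An example invocation, with a hypothetical model path following the get_model_path naming scheme and the issue number used elsewhere in the test suite:

python src/runmodel.py out/model/bug_triaging_recent_10e_1e-06lr_final.pt 192213 --top 5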
35
src/scrape.py
Executable file
|
@ -0,0 +1,35 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import tarfile
|
||||
import time
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from scraper.download import download_page
|
||||
|
||||
load_dotenv()
|
||||
TOKEN = os.getenv('GITHUB_TOKEN')
|
||||
RATE_LIMIT_HR = 5000
|
||||
OUT_DIR = os.path.join(os.path.dirname(__file__), '..', 'out', 'json')
|
||||
OUT_ARCHIVE = os.path.join(OUT_DIR, 'issues.tar.gz')
|
||||
|
||||
|
||||
def main():
|
||||
if not os.path.isdir(OUT_DIR):
|
||||
os.makedirs(OUT_DIR)
|
||||
elif os.path.isfile(OUT_ARCHIVE):
|
||||
os.remove(OUT_ARCHIVE)
|
||||
|
||||
with tarfile.open(OUT_ARCHIVE, "w:gz") as tar:
|
||||
page = 1
|
||||
more = True
|
||||
|
||||
while more:
|
||||
more = download_page(page, TOKEN, tar)
|
||||
page += 1
|
||||
time.sleep(3600 / RATE_LIMIT_HR)  # 0.72 s between requests stays within 5000 requests/hour
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
0
src/scraper/__init__.py
Normal file
86
src/scraper/download.py
Normal file
|
@ -0,0 +1,86 @@
|
|||
import http.client
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from tarfile import TarFile, TarInfo
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import tqdm
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
GITHUB_JSON_MIME: str = 'application/vnd.github+json'
|
||||
TOKEN_PREFIX: str = 'Bearer '
|
||||
GITHUB_API: str = 'api.github.com'
|
||||
REPO_ENDPOINT: str = '/repos/microsoft/vscode'
|
||||
ISSUE_ENDPOINT: str = REPO_ENDPOINT + '/issues'
|
||||
JSON_OUT_DIR: str = os.path.join(os.path.dirname(__file__), '..', 'out')
|
||||
|
||||
|
||||
def get_res_body_or_fail(connection: http.client.HTTPSConnection) -> any:
|
||||
res = connection.getresponse()
|
||||
if res.status // 100 != 2:
|
||||
raise IOError("Response status from Github is " + str(res.status) + "\n" + res.read().decode())
|
||||
|
||||
res_text = res.read().decode()
|
||||
return json.loads(res_text)
|
||||
|
||||
|
||||
def download_page(page: int, token: str, tar_file: TarFile) -> bool:
|
||||
if page < 1:
|
||||
raise ValueError("page must be >= 1")
|
||||
|
||||
per_page = 100
|
||||
total_estimate = 200000 # rough estimate of the number of issues to download
|
||||
|
||||
connection = http.client.HTTPSConnection(GITHUB_API)
|
||||
headers = {'Accept': GITHUB_JSON_MIME,
|
||||
'User-Agent': 'usi-msde-2023-soft-analytics-bug-triaging-g2',
|
||||
'Authorization': TOKEN_PREFIX + token,
|
||||
'X-GitHub-Api-Version': '2022-11-28'}
|
||||
query = {'state': 'closed', 'page': str(page), 'per_page': str(per_page)}
|
||||
connection.request('GET', ISSUE_ENDPOINT + '?' + urlencode(query),
|
||||
headers=headers)
|
||||
res_json = get_res_body_or_fail(connection)
|
||||
|
||||
for issue in tqdm.tqdm(res_json, desc="Downloading page " + str(page), initial=(page - 1) * per_page,
|
||||
total=total_estimate):
|
||||
issue_id: int = issue['id']
|
||||
filename: str = os.path.join(JSON_OUT_DIR, str(issue_id) + '.json')
|
||||
contents: bytes = json.dumps(issue).encode()  # encode once so tar_info.size is the byte length, not the character count
|
||||
|
||||
tar_info = TarInfo(name=filename)
|
||||
tar_info.size = len(contents)
|
||||
|
||||
file_object = io.BytesIO(contents)
|
||||
tar_file.addfile(tar_info, fileobj=file_object)
|
||||
|
||||
return len(res_json) > 0
|
||||
|
||||
|
||||
def download_issue(issue_id: int, token: str) -> dict[str, any]:
|
||||
connection = http.client.HTTPSConnection(GITHUB_API)
|
||||
headers = {'Accept': GITHUB_JSON_MIME,
|
||||
'User-Agent': 'usi-msde-2023-soft-analytics-bug-triaging-g2',
|
||||
'Authorization': TOKEN_PREFIX + token,
|
||||
'X-GitHub-Api-Version': '2022-11-28'}
|
||||
connection.request('GET', ISSUE_ENDPOINT + '/' + str(issue_id), headers=headers)
|
||||
return get_res_body_or_fail(connection)
|
||||
|
||||
|
||||
def download_commit_activity(token: str) -> defaultdict[str, int]:
|
||||
connection = http.client.HTTPSConnection(GITHUB_API)
|
||||
headers = {'Accept': GITHUB_JSON_MIME,
|
||||
'User-Agent': 'usi-msde-2023-soft-analytics-bug-triaging-g2',
|
||||
'Authorization': TOKEN_PREFIX + token,
|
||||
'X-GitHub-Api-Version': '2022-11-28'}
|
||||
connection.request('GET', REPO_ENDPOINT + '/stats/contributors', headers=headers)
|
||||
res_obj = get_res_body_or_fail(connection)
|
||||
ret: defaultdict[str, int] = defaultdict(int)
|
||||
|
||||
for obj in res_obj:
|
||||
ret[str(obj["author"]["login"])] = int(obj["total"])
|
||||
|
||||
return ret
|
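A sketch of reading the archive back, assuming the path produced by src/scrape.py and the member layout written above:

# Sketch only: iterate over the downloaded issue JSON documents.
import json
import tarfile

with tarfile.open('out/json/issues.tar.gz', 'r:gz') as tar:
    for member in tar.getmembers():
        issue = json.load(tar.extractfile(member))
        print(issue['number'], issue['title'])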
66
src/trainmodel.py
Executable file
|
@ -0,0 +1,66 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
# Pin the process to one GPU before torch is loaded: importing modelimpl pulls
# in torch transitively, so the variable must be set before that import
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
|
||||
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
|
||||
|
||||
from modelimpl.dataset import Datasets
|
||||
|
||||
from modelimpl.evaluate import evaluate
|
||||
from modelimpl.load import load_model, get_model_path
|
||||
from modelimpl.torch_dataset import Dataset
|
||||
from modelimpl.torch_train import train
|
||||
|
||||
OUT_DIR = os.path.join(os.path.dirname(__file__), '..', 'out', 'model')
|
||||
|
||||
|
||||
def main(dataset_kind: str, epochs: int, learning_rate: float, force_cpu: bool, force_retrain: bool):
|
||||
datasets = Datasets(dataset_kind)
|
||||
print('Training for dataset kind:', dataset_kind)
|
||||
print('Train set instance size: ', len(datasets.train))
|
||||
print('Validation set instance size: ', len(datasets.validation))
|
||||
print('Test set instance size: ', len(datasets.test))
|
||||
|
||||
path = get_model_path(dataset_kind, epochs, learning_rate, 'pt')
|
||||
pretrained, model, _ = load_model(path, datasets.classifier_label_range, force_cpu, force_retrain)
|
||||
|
||||
if pretrained:
|
||||
print('Using already trained model')
|
||||
else:
|
||||
if not os.path.isdir(OUT_DIR):
|
||||
os.makedirs(OUT_DIR)
|
||||
|
||||
print('Training model then saving in ' + path)
|
||||
train(model, Dataset(datasets.train), Dataset(datasets.validation), learning_rate, epochs, force_cpu)
|
||||
|
||||
torch.save(model.state_dict(), path)
|
||||
datasets.labelling.save(get_model_path(dataset_kind, epochs, learning_rate, 'labels.csv'))
|
||||
with open(get_model_path(dataset_kind, epochs, learning_rate, 'label_range.txt'), "w") as f:
|
||||
f.writelines([str(x) + "\n" for x in datasets.classifier_label_range])
|
||||
|
||||
evaluate(model, Dataset(datasets.test), force_cpu)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
assert torch is not None # make sure pytorch is imported and loaded with correct CUDA env variable
|
||||
|
||||
parser = argparse.ArgumentParser(prog='trainmodel.py',
|
||||
description='Training and evaluation script. The script will train and save the '
|
||||
'obtained model and then perform test set evaluation. If the given '
|
||||
'parameters match with a model that was already saved, the script '
|
||||
'only runs the evaluation procedure.')
|
||||
parser.add_argument('dataset', choices=['all', 'recent'], type=str, help="The dataset to train with")
|
||||
parser.add_argument('epochs', type=int, help="Number of epochs of the training process")
|
||||
parser.add_argument('-r', '--learning-rate', type=float, default=1e-6,
|
||||
help="The learning rate fed in the Adam optimizer")
|
||||
parser.add_argument('-c', '--force-cpu', action='store_true',
|
||||
help="disables CUDA support. Useful when debugging")
|
||||
parser.add_argument('-f', '--force-retraining', action='store_true',
|
||||
help="forces training of a new model even if a matching model is already found within the "
|
||||
"saved models")
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args.dataset, args.epochs, args.learning_rate, args.force_cpu, args.force_retraining)
|
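Example invocations (dataset kind and epochs are positional; the learning rate defaults to 1e-6):

python src/trainmodel.py recent 10
python src/trainmodel.py all 10 --learning-rate 1e-6 --force-retraining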
6
tests/.gitignore
vendored
Normal file
|
@ -0,0 +1,6 @@
|
|||
*
|
||||
**/*
|
||||
|
||||
!.gitignore
|
||||
!__main__.py
|
||||
!test_*.py
|
4
tests/__main__.py
Normal file
|
@ -0,0 +1,4 @@
|
|||
import pytest
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main()
|
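This makes the suite runnable as a module; run from the repository root (so the src imports in the tests resolve), with either of:

python -m tests
pytest tests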
83
tests/test_cleaner_clean.py
Normal file
|
@ -0,0 +1,83 @@
|
|||
import io
|
||||
import json
|
||||
import os
|
||||
import tarfile
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.cleaner.clean import clean_all, save_set
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_tar_file(tmp_path):
|
||||
# Create a sample tar file for testing
|
||||
tar_file_path = os.path.join(tmp_path, "sample_issues.tar.gz")
|
||||
with tarfile.open(tar_file_path, 'w:gz') as tar:
|
||||
# Add a sample JSON file to the tar archive
|
||||
sample_issue = {
|
||||
"id": 10001,
|
||||
"node_id": "giovanni",
|
||||
"number": 1,
|
||||
"title": "Sample Issue",
|
||||
"user": {
|
||||
"login": "test_user",
|
||||
"id": 2,
|
||||
},
|
||||
"labels": [],
|
||||
"state": "open",
|
||||
"assignee": {
|
||||
"login": "sample_user",
|
||||
"id": 3,
|
||||
},
|
||||
"assignees": [
|
||||
{
|
||||
"login": "sample_user",
|
||||
"id": 3,
|
||||
}
|
||||
],
|
||||
"created_at": "2022-01-01T00:00:00Z",
|
||||
"body": "This is a sample issue body.",
|
||||
}
|
||||
tarinfo = tarfile.TarInfo('sample_issue.json')
|
||||
contents: bytes = json.dumps(sample_issue).encode()
|
||||
tarinfo.size = len(contents)
|
||||
|
||||
file_object = io.BytesIO(contents)
|
||||
tar.addfile(tarinfo, fileobj=file_object)
|
||||
|
||||
return tar_file_path
|
||||
|
||||
|
||||
def test_clean_all(sample_tar_file):
|
||||
objs = []
|
||||
counter = clean_all(objs, sample_tar_file)
|
||||
assert counter == 0 # No issues should be skipped
|
||||
|
||||
# Assuming you have some assertions for the content of objs based on the sample data
|
||||
assert len(objs) == 1
|
||||
assert objs[0]['id'] == 1
|
||||
assert objs[0]['title'] == 'Sample Issue'
|
||||
assert objs[0]['body'] == 'This is a sample issue body.'
|
||||
assert objs[0]['state'] == 'open'
|
||||
assert objs[0]['assignee'] == 'sample_user'
|
||||
assert objs[0]['created_at'] == '2022-01-01T00:00:00Z'
|
||||
|
||||
|
||||
def test_save_set(tmp_path):
|
||||
# Assuming you have a DataFrame (df) with some sample data
|
||||
df = pd.DataFrame({
|
||||
'title': ['Issue 1', 'Issue 2', 'Issue 3'],
|
||||
'body': ['Body 1', 'Body 2', 'Body 3'],
|
||||
'state': ['open', 'closed', 'open'],
|
||||
'assignee': ['user1', 'user2', 'user3'],
|
||||
'created_at': ['2022-01-01T00:00:00Z', '2022-01-02T00:00:00Z', '2022-01-03T00:00:00Z']
|
||||
}, index=[1, 2, 3])
|
||||
|
||||
# Save the DataFrame to a CSV file using save_set
|
||||
save_set(df, 1, 3, 'test', os.path.join(tmp_path, 'test_file_'))
|
||||
|
||||
# Load the saved CSV file and assert its content
|
||||
loaded_df = pd.read_csv(os.path.join(tmp_path, 'test_file_test_000001_000003.csv'), index_col=0)
|
||||
|
||||
assert loaded_df.equals(df)
|
25
tests/test_cleaner_dataframe.py
Normal file
|
@ -0,0 +1,25 @@
|
|||
import pandas as pd
|
||||
import pytest
|
||||
from src.cleaner.dataframe import build_df
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_objs():
|
||||
return [
|
||||
{'id': 1, 'name': 'Alice', 'age': 25},
|
||||
{'id': 2, 'name': 'Bob', 'age': 30},
|
||||
{'id': 3, 'name': 'Charlie', 'age': 22}
|
||||
]
|
||||
|
||||
|
||||
def test_build_df(sample_objs):
|
||||
result_df = build_df(sample_objs)
|
||||
|
||||
assert isinstance(result_df, pd.DataFrame)
|
||||
assert set(result_df.columns) == {'name', 'age'}
|
||||
|
||||
|
||||
def test_build_df_missing_id_column():
|
||||
objs_missing_id = [{'name': 'Alice', 'age': 25}, {'name': 'Bob', 'age': 30}]
|
||||
with pytest.raises(KeyError, match="'id'"):
|
||||
build_df(objs_missing_id)
|
39
tests/test_modelimpl_auc.py
Normal file
|
@ -0,0 +1,39 @@
|
|||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from src.modelimpl.auc import build_curve, compute_auc_roc
|
||||
from test_modelimpl_torch_train import mocked_model, mocked_split_data, mocked_labelling
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_classifier():
|
||||
return MagicMock()
|
||||
|
||||
|
||||
def test_build_curve_invalid_path():
|
||||
with pytest.raises(ValueError, match="path should point to a pytorch model file"):
|
||||
build_curve("invalid_path", force_cpu=True)
|
||||
|
||||
|
||||
@patch('src.modelimpl.auc.load_model', return_value=(True, MagicMock(), 3))
|
||||
@patch('src.modelimpl.auc.Labelling.load', return_value=MagicMock())
|
||||
@patch('src.modelimpl.auc.SplitData.from_df', return_value=MagicMock())
|
||||
@patch('src.modelimpl.auc.compute_auc_roc')
|
||||
def test_build_curve_valid_path(mock_compute_auc_roc, mock_from_df, mock_labelling, mock_load_model):
|
||||
build_curve("valid_path.pt", force_cpu=True)
|
||||
mock_load_model.assert_called_once_with("valid_path.pt", None, True, False)
|
||||
mock_compute_auc_roc.assert_called_once()
|
||||
|
||||
|
||||
def test_compute_auc_roc(mocked_model, mocked_split_data, mocked_labelling, tmp_path):
|
||||
compute_auc_roc(mocked_model, mocked_split_data[0], 3, mocked_labelling, True,
|
||||
f"{tmp_path}/test_file")
|
||||
|
||||
assert (tmp_path / "test_file.ovr_curves.png").exists()
|
||||
assert (tmp_path / "test_file.ovr_avg.png").exists()
|
||||
assert (tmp_path / "test_file.auc.txt").exists()
|
||||
|
||||
(tmp_path / "test_file.ovr_curves.png").unlink()
|
||||
(tmp_path / "test_file.ovr_avg.png").unlink()
|
||||
(tmp_path / "test_file.auc.txt").unlink()
|
13
tests/test_modelimpl_classifier.py
Normal file
|
@ -0,0 +1,13 @@
|
|||
from transformers import BertForSequenceClassification
|
||||
|
||||
from src.modelimpl.classifier import bert_classifier
|
||||
|
||||
|
||||
def test_bert_classifier():
|
||||
# Test that the function returns an instance of BertForSequenceClassification
|
||||
n_classes = 5
|
||||
model = bert_classifier(n_classes)
|
||||
assert isinstance(model, BertForSequenceClassification)
|
||||
|
||||
# Test that the model has the correct number of labels
|
||||
assert model.config.num_labels == n_classes
|
66
tests/test_modelimpl_dataset.py
Normal file
|
@ -0,0 +1,66 @@
|
|||
import os
|
||||
import pandas as pd
|
||||
import pytest
|
||||
from src.modelimpl.dataset import prepare_input, load_df, compute_labels, Labelling, SplitData, df_validation_split, \
|
||||
Datasets
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_dataframe():
|
||||
return pd.DataFrame({
|
||||
'id': [1, 2],
|
||||
'title': ['Title1', 'Title2'],
|
||||
'body': ['Body1', 'Body2'],
|
||||
'title_body': ['Title1\nBody1', 'Title2\nBody2'],
|
||||
'assignee': ['A', 'B']
|
||||
})
|
||||
|
||||
|
||||
def test_prepare_input(sample_dataframe):
|
||||
result_df = prepare_input(sample_dataframe)
|
||||
assert list(result_df.columns) == ['title_body', 'assignee']
|
||||
expected_title_body = ['Title1\nBody1', 'Title2\nBody2']
|
||||
assert result_df['title_body'].tolist() == expected_title_body
|
||||
assert result_df['assignee'].tolist() == ['A', 'B']
|
||||
|
||||
|
||||
def test_load_df(sample_dataframe, tmpdir):
|
||||
# Save sample DataFrame to a CSV file
|
||||
csv_filename = os.path.join(tmpdir, 'sample_issues.csv')
|
||||
sample_dataframe.to_csv(csv_filename, index=False)
|
||||
|
||||
result_df = load_df(csv_filename)
|
||||
assert list(result_df.columns) == ['title_body', 'assignee']
|
||||
assert len(result_df) == len(sample_dataframe)
|
||||
|
||||
|
||||
def test_compute_labels():
|
||||
sample_frames = [pd.DataFrame({'assignee': ['A', 'B', 'C']}), pd.DataFrame({'assignee': ['B', 'C', 'D']})]
|
||||
|
||||
labels_dict, num_bounds = compute_labels(sample_frames)
|
||||
assert labels_dict == {'A': 0, 'B': 1, 'C': 2, 'D': 3}
|
||||
assert num_bounds == [0, 3, 4]
|
||||
|
||||
|
||||
def test_labelling_methods(tmpdir):
|
||||
labels = {'A': 0, 'B': 1, 'C': 2}
|
||||
labelling = Labelling(labels)
|
||||
|
||||
filename = os.path.join(tmpdir, 'test_labels.csv')
|
||||
labelling.save(filename)
|
||||
loaded_labelling = Labelling.load(filename)
|
||||
assert labelling.labels == loaded_labelling.labels
|
||||
|
||||
|
||||
def test_split_data_methods(sample_dataframe):
|
||||
labels = Labelling({'A': 0, 'B': 1})
|
||||
split_data = SplitData.from_df(sample_dataframe, labels, 1)
|
||||
|
||||
assert len(split_data) == len(sample_dataframe)
|
||||
|
||||
|
||||
def test_df_validation_split(sample_dataframe):
|
||||
df_train, df_val = df_validation_split(sample_dataframe)
|
||||
assert len(df_train) > 0
|
||||
assert len(df_val) > 0
|
||||
assert len(df_train) + len(df_val) == len(sample_dataframe)
|
107
tests/test_modelimpl_evaluate.py
Normal file
|
@ -0,0 +1,107 @@
|
|||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.modelimpl.classifier import bert_classifier
|
||||
from src.modelimpl.dataset import tokenizer, SplitData, Labelling
|
||||
from src.modelimpl.evaluate import predict, evaluate, predict_top_k, PredictionResult
|
||||
from src.modelimpl.torch_dataset import Dataset
|
||||
|
||||
|
||||
class MockSplitData:
|
||||
def __init__(self, labels, texts):
|
||||
self.labels = labels
|
||||
self.texts = texts
|
||||
|
||||
|
||||
def test_predict():
|
||||
# Create a sample model and dataset
|
||||
model = bert_classifier(n_classes=2)
|
||||
labels = [0, 1, 1, 0]
|
||||
|
||||
texts = [
|
||||
"cats chase playful fuzzy mice",
|
||||
"big red ball bounces high",
|
||||
"happy sun warms cool breeze",
|
||||
"jumping kids laugh on playground",
|
||||
]
|
||||
texts = [tokenizer(text, padding='max_length', max_length=512, truncation=True,
|
||||
return_tensors='pt') for text in texts]
|
||||
|
||||
split_data = MockSplitData(labels, texts)
|
||||
dataset = Dataset(split_data)
|
||||
|
||||
# Test predict function
|
||||
predictions = predict(model, dataset, top_n=2, force_cpu=True)
|
||||
|
||||
# Check the length of predictions
|
||||
assert len(predictions) == len(labels)
|
||||
|
||||
# Check the format of PredictionResult instances
|
||||
for result in predictions:
|
||||
assert isinstance(result, PredictionResult)
|
||||
assert len(result.top_values) == 2
|
||||
assert len(result.top_indices) == 2
|
||||
assert isinstance(result.truth_idx, int)
|
||||
|
||||
|
||||
# Test case for evaluate function
|
||||
def test_evaluate(capsys):
|
||||
# Create a sample model and dataset
|
||||
model = bert_classifier(n_classes=2)
|
||||
labels = [0, 1, 1, 0]
|
||||
|
||||
texts = [
|
||||
"cats chase playful fuzzy mice",
|
||||
"big red ball bounces high",
|
||||
"happy sun warms cool breeze",
|
||||
"jumping kids laugh on playground",
|
||||
]
|
||||
texts = [tokenizer(text, padding='max_length', max_length=512, truncation=True,
|
||||
return_tensors='pt') for text in texts]
|
||||
|
||||
split_data = MockSplitData(labels, texts)
|
||||
dataset = Dataset(split_data)
|
||||
|
||||
# Test evaluate function
|
||||
evaluate(model, dataset, force_cpu=True)
|
||||
|
||||
# Capture the printed output and check the format
|
||||
captured = capsys.readouterr()
|
||||
assert "recommendations:" in captured.out
|
||||
|
||||
|
||||
# Test case for predict_top_k function
|
||||
def test_predict_top_k():
|
||||
# Create a sample model and dataset
|
||||
model = bert_classifier(n_classes=2)
|
||||
|
||||
df = pd.DataFrame({
|
||||
"assignee": ["author_0", "author_1", "author_1", "author_0"],
|
||||
"title_body": [
|
||||
"cats chase playful fuzzy mice",
|
||||
"big red ball bounces high",
|
||||
"happy sun warms cool breeze",
|
||||
"jumping kids laugh on playground",
|
||||
],
|
||||
}, index=[1, 2, 3, 4])
|
||||
|
||||
labels = Labelling({
|
||||
"author_0": 0,
|
||||
"author_1": 1
|
||||
})
|
||||
|
||||
split_data = SplitData.from_df(df, labels, 2)
|
||||
issue_id = 1
|
||||
|
||||
# Test predict_top_k function
|
||||
result = predict_top_k(model, split_data, issue_id, top_n=2, force_cpu=True)
|
||||
|
||||
# Check the format of PredictionResult instance
|
||||
assert isinstance(result, PredictionResult)
|
||||
assert len(result.top_values) == 2
|
||||
assert len(result.top_indices) == 2
|
||||
assert isinstance(result.truth_idx, int)
|
||||
|
||||
# Check the correctness of assert statement in the function
|
||||
with pytest.raises(ValueError):
|
||||
predict_top_k(model, split_data, issue_id=99, top_n=2, force_cpu=True)
|
50
tests/test_modelimpl_load.py
Normal file
|
@ -0,0 +1,50 @@
|
|||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from src.modelimpl.classifier import bert_classifier, Classifier
|
||||
from src.modelimpl.load import load_model
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def model_instance():
|
||||
return bert_classifier(n_classes=4)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def model_path(tmpdir):
|
||||
temp_model_path = os.path.join(tmpdir, "test_model.pt")
|
||||
return temp_model_path
|
||||
|
||||
|
||||
def test_load_model_with_valid_path(model_path):
|
||||
result, model, classes = load_model(model_path, label_range=(1, 5), force_cpu=False, force_retrain=False)
|
||||
assert result is False # The model should not be already trained
|
||||
assert isinstance(model, Classifier)
|
||||
assert classes == 4 # The range (1, 5) implies 4 classes
|
||||
|
||||
|
||||
def test_load_model_with_invalid_path():
|
||||
with pytest.raises(ValueError, match="path should point to a pytorch model file"):
|
||||
load_model("invalid_path.txt", label_range=(1, 5), force_cpu=False, force_retrain=False)
|
||||
|
||||
|
||||
def test_load_model_with_force_retrain(model_path):
|
||||
result, model, classes = load_model(model_path, label_range=(1, 5), force_cpu=False, force_retrain=True)
|
||||
assert result is False # The model should not be already trained, but force_retrain is True
|
||||
|
||||
|
||||
def test_load_model_with_force_cpu(model_path):
|
||||
result, model, classes = load_model(model_path, label_range=(1, 5), force_cpu=True, force_retrain=False)
|
||||
assert result is False # The model should not be already trained
|
||||
assert isinstance(model, Classifier)
|
||||
assert next(model.parameters()).device.type == 'cpu'  # force_cpu loads on CPU even when CUDA is available
|
||||
|
||||
|
||||
def test_load_model_with_already_trained_model(model_path, model_instance):
|
||||
torch.save(model_instance.state_dict(), model_path)
|
||||
result, model, classes = load_model(model_path, label_range=(1, 5), force_cpu=False, force_retrain=False)
|
||||
assert result is True # The model should be already trained
|
||||
assert isinstance(model, Classifier)
|
||||
assert classes == 4 # The range (1, 5) implies 4 classes
|
43
tests/test_modelimpl_torch_dataset.py
Normal file
|
@ -0,0 +1,43 @@
|
|||
import pytest
|
||||
import torch
|
||||
from src.modelimpl.torch_dataset import Dataset
|
||||
|
||||
|
||||
class MockSplitData:
|
||||
def __init__(self, labels, texts):
|
||||
self.labels = labels
|
||||
self.texts = texts
|
||||
|
||||
|
||||
def test_dataset():
|
||||
# Create a sample SplitData instance
|
||||
labels = [0, 1, 1, 0]
|
||||
texts = [torch.rand((5,)), torch.rand((5,)), torch.rand((5,)), torch.rand((5,))]
|
||||
split_data = MockSplitData(labels, texts)
|
||||
|
||||
# Initialize the Dataset instance
|
||||
dataset = Dataset(split_data)
|
||||
|
||||
assert len(dataset) == len(labels)
|
||||
assert dataset.classes() == labels
|
||||
|
||||
idx = 2
|
||||
assert torch.all(torch.eq(torch.from_numpy(dataset.get_batch_labels(idx)), torch.tensor(labels[idx])))
|
||||
|
||||
assert torch.equal(dataset.get_batch_texts(idx), texts[idx])
|
||||
|
||||
batch_texts, batch_y = dataset[idx]
|
||||
assert torch.equal(batch_texts, texts[idx])
|
||||
assert torch.tensor(batch_y) == torch.tensor(labels[idx])
|
||||
|
||||
|
||||
def test_dataset_empty_split_data():
|
||||
empty_split_data = MockSplitData([], [])
|
||||
|
||||
dataset = Dataset(empty_split_data)
|
||||
|
||||
assert len(dataset) == 0
|
||||
assert dataset.classes() == []
|
||||
|
||||
with pytest.raises(IndexError):
|
||||
_ = dataset[0]
|
83
tests/test_modelimpl_torch_train.py
Normal file
|
@ -0,0 +1,83 @@
|
|||
import pandas as pd
|
||||
import pytest
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from src.modelimpl.classifier import bert_classifier
|
||||
from src.modelimpl.dataset import Labelling, SplitData
|
||||
from src.modelimpl.torch_dataset import Dataset
|
||||
from src.modelimpl.torch_train import train, print_message, compute_loss_and_acc
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mocked_labelling():
|
||||
return Labelling({"author_0": 0, "author_1": 1, "author_2": 2})
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mocked_split_data(mocked_labelling) -> tuple[SplitData, SplitData]:
|
||||
df = pd.DataFrame({
|
||||
"assignee": ["author_0", "author_1", "author_2", "author_1", "author_0"],
|
||||
"title_body": [
|
||||
"cats chase playful fuzzy mice",
|
||||
"big red ball bounces high",
|
||||
"happy sun warms cool breeze",
|
||||
"jumping kids laugh on playground",
|
||||
"test sentence number 5",
|
||||
],
|
||||
}, index=[1, 2, 3, 4, 5])
|
||||
|
||||
return (SplitData.from_df(df.loc[[1, 2, 3]], mocked_labelling, 3),
|
||||
SplitData.from_df(df.loc[[4, 5]], mocked_labelling, 3))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mocked_data(mocked_split_data: tuple[SplitData, SplitData]):
|
||||
train_set, val_set = mocked_split_data
|
||||
return DataLoader(Dataset(train_set), batch_size=2), DataLoader(Dataset(val_set), batch_size=2)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mocked_model():
|
||||
return bert_classifier(n_classes=3)
|
||||
|
||||
|
||||
def test_train_without_errors(capfd, mocked_model, mocked_data):
|
||||
train(mocked_model, mocked_data[0].dataset, mocked_data[1].dataset, learning_rate=0.001, epochs=2, force_cpu=True)
|
||||
captured = capfd.readouterr()
|
||||
assert "Epochs: 1" in captured.out
|
||||
assert "Epochs: 2" in captured.out
|
||||
|
||||
|
||||
def test_print_message(capsys):
|
||||
class MockDataset:
|
||||
texts: list[any]
|
||||
|
||||
def __init__(self, length: int):
|
||||
self.texts = [None] * length
|
||||
|
||||
# noinspection PyTypeChecker
|
||||
print_message(epoch_num=1, train_loss=2.0, train_acc=0.7, train_ds=MockDataset(1), val_loss=1.0, val_acc=0.8,
|
||||
val_ds=MockDataset(1))
|
||||
|
||||
captured = capsys.readouterr()
|
||||
assert "Epochs: 2" in captured.out
|
||||
assert "Train Loss: 2.000" in captured.out
|
||||
assert "Train Accuracy: 0.700" in captured.out
|
||||
assert "Val Loss: 1.000" in captured.out
|
||||
assert "Val Accuracy: 0.800" in captured.out
|
||||
|
||||
|
||||
def test_compute_loss_and_acc(mocked_model, mocked_data):
|
||||
train_data, val_data = mocked_data
|
||||
|
||||
device = torch.device("cpu")
|
||||
model = mocked_model
|
||||
|
||||
val_input, val_label = next(iter(train_data))
|
||||
loss, acc, batch_loss = compute_loss_and_acc(val_label, val_input, device, model)
|
||||
|
||||
assert isinstance(loss, float)
|
||||
assert isinstance(acc, int)
|
||||
assert isinstance(batch_loss, torch.Tensor)
|
73
tests/test_scraper_download.py
Normal file
|
@ -0,0 +1,73 @@
|
|||
import os
import tarfile
|
||||
from collections import defaultdict
|
||||
|
||||
import pytest
|
||||
from src.scraper.download import (
|
||||
download_page,
|
||||
download_issue,
|
||||
download_commit_activity,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def valid_github_token():
|
||||
return "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"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def invalid_github_token():
|
||||
return "ghp_invalid"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def valid_issue_id():
|
||||
return 192213  # an existing microsoft/vscode issue number
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def invalid_issue_id():
|
||||
return -1
|
||||
|
||||
|
||||
def test_download_page_normal_execution(valid_github_token):
|
||||
page = 1
|
||||
tar_file_name = "test_archive.tar"
|
||||
with tarfile.open(tar_file_name, "w") as tar_file:
|
||||
result = download_page(page, valid_github_token, tar_file)
|
||||
assert result is True
|
||||
|
||||
|
||||
def test_download_page_invalid_page(valid_github_token):
|
||||
page = -1 # Invalid page number
|
||||
tar_file_name = "test_archive.tar"
|
||||
with pytest.raises(ValueError):
|
||||
with tarfile.open(tar_file_name, "w") as tar_file:
|
||||
download_page(page, valid_github_token, tar_file)
|
||||
|
||||
|
||||
def test_download_issue_valid_issue_id(valid_issue_id, valid_github_token):
|
||||
result = download_issue(valid_issue_id, valid_github_token)
|
||||
assert type(result) is dict
|
||||
|
||||
|
||||
def test_download_issue_invalid_issue_id(invalid_issue_id, valid_github_token):
|
||||
with pytest.raises(IOError):
|
||||
download_issue(invalid_issue_id, valid_github_token)
|
||||
|
||||
|
||||
def test_download_commit_activity_valid_token(valid_github_token):
|
||||
result = download_commit_activity(valid_github_token)
|
||||
assert type(result) is defaultdict
|
||||
|
||||
|
||||
def test_download_commit_activity_invalid_token(invalid_github_token):
|
||||
with pytest.raises(IOError):
|
||||
download_commit_activity(invalid_github_token)
|