Final version of the bug-triaging project

Commit history has been discarded to remove large files from the repo.
Claudio Maggioni 2024-01-03 15:20:45 +01:00
commit 07232eddcc
69 changed files with 5032 additions and 0 deletions

1
.env.template Normal file

@ -0,0 +1 @@
GITHUB_TOKEN=

19
.gitattributes vendored Normal file

@ -0,0 +1,19 @@
/issues.csv filter=lfs diff=lfs merge=lfs -text
/issues_new.csv filter=lfs diff=lfs merge=lfs -text
/src/model-dl/bbc-text.csv filter=lfs diff=lfs merge=lfs -text
/issues_000001_170000.csv filter=lfs diff=lfs merge=lfs -text
/issues_150000_170000.csv filter=lfs diff=lfs merge=lfs -text
/issues_170001_180000.csv filter=lfs diff=lfs merge=lfs -text
/issues_test_170001_180000.csv filter=lfs diff=lfs merge=lfs -text
/issues_train_000001_170000.csv filter=lfs diff=lfs merge=lfs -text
/issues_train_recent_150000_170000.csv filter=lfs diff=lfs merge=lfs -text
/out/csv/issues_test_170001_180000.csv filter=lfs diff=lfs merge=lfs -text
/out/csv/issues_train_000001_170000.csv filter=lfs diff=lfs merge=lfs -text
/out/csv/issues_train_recent_150000_170000.csv filter=lfs diff=lfs merge=lfs -text
/out/csv/issues_test_2_170001_180000.csv filter=lfs diff=lfs merge=lfs -text
/out/csv/issues_train_2_000001_170000.csv filter=lfs diff=lfs merge=lfs -text
/out/csv/issues_train_recent_2_150000_170000.csv filter=lfs diff=lfs merge=lfs -text
/out/json/issues.tar.gz filter=lfs diff=lfs merge=lfs -text
/out/model/bug_triaging_all_10e_1e-06lr_relu.pt filter=lfs diff=lfs merge=lfs -text
/out/model/bug_triaging_recent_10e_1e-06lr.pt filter=lfs diff=lfs merge=lfs -text
/out/model/bug_triaging_recent_10e_1e-06lr_relu.pt filter=lfs diff=lfs merge=lfs -text

461
.gitignore vendored Normal file

@ -0,0 +1,461 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
**/latex/
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
**/.DS_Store
out/model/*.pt
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb
## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf
## Generated if empty string is given at "Please type another file name for output:"
**/*.pdf
## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml
## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync
## Build tool directories for auxiliary files
# latexrun
latex.out/
## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa
# achemso
acs-*.bib
# amsthm
*.thm
# beamer
*.nav
*.pre
*.snm
*.vrb
# changes
*.soc
# comment
*.cut
# cprotect
*.cpt
# elsarticle (documentclass of Elsevier journals)
*.spl
# endnotes
*.ent
*.lox
# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm
#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R
# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs
*.slg
*.slo
*.sls
# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist
# gnuplot
*.gnuplot
*.table
# gnuplottex
*-gnuplottex-*
# gregoriotex
*.gaux
*.glog
*.gtex
# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref
# hyperref
*.brf
# knitr
*-concordance.tex
# *.tikz
*-tikzDictionary
# listings
*.lol
# luatexja-ruby
*.ltjruby
# makeidx
*.idx
*.ilg
*.ind
# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*
# minted
_minted*
*.pyg
# morewrites
*.mw
# newpax
*.newpax
# nomencl
*.nlg
*.nlo
*.nls
# pax
*.pax
# pdfpcnotes
*.pdfpc
# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd
# scrwfile
*.wrt
# svg
svg-inkscape/
# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/
# pdfcomment
*.upa
*.upb
# pythontex
*.pytxcode
pythontex-files-*/
# tcolorbox
*.listing
# thmtools
*.loe
# TikZ & PGF
*.dpth
*.md5
*.auxlock
# titletoc
*.ptc
# todonotes
*.tdo
# vhistory
*.hst
*.ver
*.lod
# xcolor
*.xcp
# xmpincl
*.xmpi
# xindy
*.xdy
# xypic precompiled matrices and outlines
*.xyc
*.xyd
# endfloat
*.ttt
*.fff
# Latexian
TSWLatexianTemp*
## Editors:
# WinEdt
*.bak
*.sav
# Texpad
.texpadtmp
# LyX
*.lyx~
# Kile
*.backup
# gummi
.*.swp
# KBibTeX
*~[0-9]*
# TeXnicCenter
*.tps
# auto folder when using emacs and auctex
./auto/*
*.el
# expex forward references with \gathertags
*-tags.tex
# standalone packages
*.sta
# Makeindex log files
*.lpz
# xwatermark package
*.xwm
# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are the stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib

55
.gitlab-ci.yml Normal file

@ -0,0 +1,55 @@
image: python:3.10.7

stages:
  - test
  - sonarqube
  - code_quality
  - deploy

tests:
  stage: test
  script:
    - python -m venv venv
    - source venv/bin/activate
    - pip install -r requirements.txt
    - python -m pytest --cov=. --junitxml=coverage/junit-report.xml tests
    - coverage xml
  artifacts:
    when: always
    paths:
      - coverage.xml
      - coverage/
    reports:
      #cobertura: coverage/cobertura-coverage.xml
      junit: coverage/junit-report.xml

sonarqube-check:
  only:
    - main # Code quality runs only on main
  stage: code_quality
  allow_failure: true
  image:
    name: ${CI_DEPENDENCY_PROXY_DIRECT_GROUP_IMAGE_PREFIX}/sonarsource/sonar-scanner-cli:latest
    entrypoint: ['']
  variables:
    SONAR_USER_HOME: '${CI_PROJECT_DIR}/.sonar'
    GIT_DEPTH: '0' # Tells git to fetch all the branches of the project, required by the analysis task
  cache:
    key: '${CI_JOB_NAME}'
    paths:
      - .sonar/cache
  script:
    - sonar-scanner

#docker-build:
#  image: docker:latest
#  stage: deploy
#  services:
#    - docker:dind
#  before_script:
#    - docker login -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD"
#  script:
#    - docker build -t "$CI_REGISTRY_USER/sa-triage" .
#    - docker push "$CI_REGISTRY_USER/sa-triage"

243
README.md Normal file

@ -0,0 +1,243 @@
# Assignment 1: Automated Bug Triaging
**Group 2: Baris Aksakal, Edoardo Riggio, Claudio Maggioni**
# Repository structure
- `/docs`: LaTeX report code;
- `/out`
  - `/csv`: Cleaner output;
  - `/json`: Scraper output;
  - `/model`: Pickled models (model training output) and model evaluation output;
  - `/plots`: Plots for the dataset statistical analysis;
- `/src`
  - `/analysis`: Notebook for the dataset statistical analysis;
  - `/model-dl`
    - `/bert_medium.ipynb`: Original implementation of the classifier model, now broken down into Python files;
    - `/model*.ipynb`: Alternative model implementation by Baris Aksakal. Not used in the final implementation;
  - `/{cleaner,modelimpl,scraper}`: Python modules used for the scraper, cleaner, and model script implementations;
  - `/auc.py`: ROC curve generation script;
  - `/clean.py`: Cleaner script;
  - `/runmodel.py`: Model execution script;
  - `/scrape.py`: Scraper script;
  - `/trainmodel.py`: Model training script;
- `/environment-dev.yml`: Conda environment file for the development environment;
- `/environment-server.yml`: Conda environment file for model training and execution (to be used with `gym.si.usi.ch`).
# Setup
## Conda Environment
Training and running models is only supported in a CUDA 11.6-compatible environment such as `gym.si.usi.ch`. The following
instructions create and activate a Conda environment with all required dependencies to scrape, clean,
train, and run the model:
```shell
conda env remove -n bug-triaging-env || true # delete environment if already present
conda env create --name bug-triaging-env --file=environment-server.yml
conda activate bug-triaging-env
```
### Development environment
*(may not work on all platforms/architectures)*
A pytorch-free version of the environment can be installed for development purposes. Only the scraper and cleaner scripts
can be run using this environment. To install the development environment run:
```shell
conda env remove -n bug-triaging-env-dev || true # delete environment if already present
conda env create --name bug-triaging-env-dev --file=environment-dev.yml
conda activate bug-triaging-env-dev
```
## GitHub API token
Running the scraper and the model executor requires a GitHub API token. The token must be placed in
a `.env` file in this directory, in a variable named `GITHUB_TOKEN`. The contents of the file should look like this:
```
GITHUB_TOKEN=<insert-token-here>
```
# Scraper
The scraper script is located in `src/scrape.py` and takes no arguments. It will download and save all issues in the
`microsoft/vscode` repository in a gzip-compressed archive of JSON files, one per issue. The file will be saved in
`out/json/issues.tar.gz`. The file **is deleted** if it already exists.
To run the scraper run:
```shell
python3 src/scrape.py
```
# Cleaner
The cleaner script is located in `src/clean.py` and takes no arguments. It reads `out/json/issues.tar.gz`,
performs the cleaning process, and performs the train-test split according to the instructions given in the assignment
document. The output of the cleaning process is saved in three CSV files and one text file:
- `out/csv/issues_train_000001_170000.csv`, including all issues that belong to the complete training set;
- `out/csv/issues_train_recent_150000_170000.csv`, including all issues that belong to the training set made up of "recent" issues;
- `out/csv/issues_test_170001_180000.csv`, including all issues that belong to the test set;
- `out/csv/issues_removed_count.txt`, including the count of issues (excluding PRs) that were discarded by the cleaning process in the entire dataset.
The script **will overwrite** these files if they exist. To run the cleaner script run:
```shell
python3 src/clean.py
```
# Training script
The script used to train the model is located in `src/trainmodel.py`. The script takes the following arguments:
```
usage: trainmodel.py [-h] [-r LEARNING_RATE] [-c] [-f] {all,recent} epochs
Training and evaluation script. The script will train and save the obtained model and then perform test set evaluation.
If the given parameters match with a model that was already saved, the script only runs the evaluation procedure.
positional arguments:
{all,recent} The dataset to train with
epochs Number of epochs of the training process
options:
-h, --help show this help message and exit
-r LEARNING_RATE, --learning-rate LEARNING_RATE
The learning rate fed in the Adam optimizer
-c, --force-cpu disables CUDA support. Useful when debugging
-f, --force-retraining forces training of a new model even if a matching model is already found within the saved
models
```
The script loads the generated CSV datasets in `out/csv` and outputs three files in `out/model`:
- `out/model/bug_triaging_{all,recent}_{epochs}e_{LEARNING_RATE}lr_final.pt`, the pytorch "pickled" model;
- `out/model/bug_triaging_{all,recent}_{epochs}e_{LEARNING_RATE}lr_final.label_range.txt`, a text file containing two lines which determine the numeric range of the classification labels output by the model (used by the ROC and model execution scripts);
- `out/model/bug_triaging_{all,recent}_{epochs}e_{LEARNING_RATE}lr_final.labels.csv`, a CSV file matching each assignee username with the numeric encoding used to train and execute the model (used by the ROC and model execution scripts).
(`{all,recent}`, `{epochs}` and `{LEARNING_RATE}` are placeholders whose values match the parameters given to the script.)
To train the configurations that were chosen for the report execute:
```shell
python3 src/trainmodel.py all 4 -r '5e-6'
python3 src/trainmodel.py recent 4 -r '5e-6'
```
**NOTE:** The pickled pytorch model files have not been committed to this repo due to file size restrictions. They are
however saved in `gym.si.usi.ch:/home/SA23-G2/bug-triaging/out/model`.
# ROC curve generation script
The script used to generate the ROC curves is located in `src/auc.py`. The script takes the following arguments:
```
usage: auc.py [-h] [-c] modelfile
ROC curve and AUC computation script. The script evaluates the given model against the test set and generates an OvR ROC
curve plot with one curve per class, a micro-averaged OvR ROC plot and the corresponding AUC value.
positional arguments:
modelfile Path to the pickled pytorch model to classify the issue with
options:
-h, --help show this help message and exit
-c, --force-cpu disables CUDA support. Useful when debugging
```
`modelfile` must contain a path to one of the `.pt` files generated with the training script. The label range text file
and the labels CSV file are assumed to be in the same directory as the pickled model.
The script outputs two PNG plots and a text file:
- `out/model/{model}.ovr_curves.png` contains a plot of the One-vs-Rest ROC curves for each class (assignee) appearing in both the train and test set;
- `out/model/{model}.ovr_avg.png` contains a plot of the micro-averaged One-vs-Rest ROC curve;
- `out/model/{model}.auc.txt` contains the AUC for the micro-averaged ROC curve.
(`{model}` is a placeholder for the filename, without extension, of the pickled pytorch model given as argument - the output of the shell command `basename {modelfile} .pt`.)
To generate the curves for the two trained models run:
```shell
python3 src/auc.py out/model/bug_triaging_all_4e_5e-06lr_final.pt
python3 src/auc.py out/model/bug_triaging_recent_4e_5e-06lr_final.pt
```
# Execution script
The script used to execute the model is located in `src/runmodel.py`. The script takes the following arguments:
```
usage: runmodel.py [-h] [-t TOP] [-c] modelfile issue_id
Model execution script. Downloads a given issue id from the microsoft/vscode repository, performs the cleaning process
and recommends an assignee using the given model. The script may fail if the issue title and body do not contain any
latin characters.
positional arguments:
modelfile Path to the pickled pytorch model to classify the issue with
issue_id The microsoft/vscode GitHub issue id to classify
options:
-h, --help show this help message and exit
-t TOP, --top TOP Number of recommendations to output
-c, --force-cpu disables CUDA support. Useful when debugging
```
The script outputs the top-5 assignee recommendations for the given issue, and the actual assignee if the issue has
already been assigned.
Alongside each assignee, the script outputs the corresponding numerical embedding. A numerical
embedding equal to `-1` in the truth label denotes that the assignee does not appear in the training set
(after the train/validation split).
The script also outputs the number of commits each assignee authored in the repository.
This is an example of the script output for issue `192213`:
```
1: 'roblourens' (44) (confidence: 16.37%) (3932 commits authored)
2: 'lramos15' (36) (confidence: 12.62%) (829 commits authored)
3: 'bpasero' (16) (confidence: 7.29%) (11589 commits authored)
4: 'jrieken' (32) (confidence: 4.53%) (9726 commits authored)
5: 'hediet' (28) (confidence: 3.84%) (1231 commits authored)
Truth: 'alexdima' (9) (6564 commits authored)
```
To execute the model trained on the `all` dataset for issue 192213 run:
```shell
python3 src/runmodel.py out/model/bug_triaging_all_4e_5e-06lr_final.pt 192213
```
To execute the model trained on the `recent` dataset for issue 192213 run:
```shell
python3 src/runmodel.py out/model/bug_triaging_recent_4e_5e-06lr_final.pt 192213
```
# Report
To compile the report run:
```shell
cd docs
pdflatex -interaction=nonstopmode -output-directory=. main.tex
pdflatex -interaction=nonstopmode -output-directory=. main.tex
```

8
coveragerc Normal file

@ -0,0 +1,8 @@
[run]
omit = tests/*
[paths]
source = scripts/*
[xml]
output = coverage/junit-report.xml

72
docs/main.tex Normal file

@ -0,0 +1,72 @@
\documentclass{scrartcl}
\setlength\paperwidth{20.999cm}
\setlength\paperheight{29.699cm}
\setlength\voffset{-1in}
\setlength\hoffset{-1in}
\setlength\topmargin{1.499cm}
\setlength\headheight{12pt}
\setlength\headsep{.7cm}
\setlength\footskip{1.131cm}
\setlength\textheight{25cm}
\setlength\oddsidemargin{2.499cm}
\setlength\textwidth{15.999cm}
\setlength\parindent{0cm}
\setlength\parskip{0.3em}
\usepackage{amsmath}
\usepackage{listings}
\usepackage{xcolor}
\usepackage{fancyvrb}
\usepackage{newverbs}
\usepackage{fancyhdr}
\usepackage{extramarks}
\usepackage{graphicx}
\usepackage{mathtools}
\usepackage{multicol}
\usepackage{hyperref}
\usepackage{booktabs}
\usepackage{float}
\usepackage{subcaption}
\pagestyle{fancy}
\lhead{Aksakal, Maggioni, Riggio - Bug Triaging}
\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
\DeclarePairedDelimiter\floor{\lfloor}{\rfloor}
\newcommand\vartextvisiblespace[1][.6em]{%
\makebox[#1]{%
\kern.07em
\vrule height.4ex
\hrulefill
\vrule height.4ex
\kern.07em
}%
}
\begin{document}
\thispagestyle{plain}
\begin{center}
\hrule
\vspace{.4cm}
{\textbf {\Huge Bug Triaging}} \\
\vspace{.2cm}
\textbf{Software Analytics}
\vspace{.2cm}
\end{center}
\textbf{Baris Aksakal} (baris.aksakal@usi.ch) \hspace{\fill} \\
\textbf{Claudio Maggioni} (claudio.maggioni@usi.ch) \hspace{\fill} \\
\textbf{Edoardo Riggio} (edoardo.riggio@usi.ch) \hspace{\fill} \today \\
\hrule
\vspace{.2cm}
\input{./sections/introduction}
\input{./sections/scraping}
\input{./sections/cleaning}
\input{./sections/statistics}
\input{./sections/prototype_model}
\input{./sections/model}
\input{./sections/references}
\end{document}

23
docs/sections/cleaning.tex Normal file

@ -0,0 +1,23 @@
\section*{Data Cleaning}
Regarding data cleaning, we employed a series of procedures to eliminate, as much as possible, noisy data that could potentially hinder the learning process of the DL model.
The first thing we do is transform the body from Markdown to HTML\@.
Thanks to this conversion, we are able to work directly on HTML tags and use Python's Beautiful Soup to remove some HTML blocks.
In particular, we remove everything that is contained inside the \verb|<details> </details>| tag.
This is done because everything contained in this tag relates to the system details of the user and is not useful for the classification task our model needs to perform.
Moreover, we remove HTML comments, since they are part of the boilerplate that gets generated when submitting an issue.
Once all the unhelpful sections have been removed, we convert the HTML back to plain text.
From here, we use a Python library to remove all emojis contained in the body and in the title (since they would not help the training).
We also remove all URLs and newlines.
Finally, we check whether the remaining body and title are written in a language that uses Latin characters.
If we encounter an issue written in another language (such as Russian or Chinese), the whole issue is discarded.
This is done because our DL model has been pre-trained on English documents, so it would not make sense to train it on Chinese or Russian data.
We also tried two techniques typically used in this setting: stemming and stopword removal.
By applying stemming, we noticed that the body lost the fluidity of natural language.
Since our model has been trained to recognize natural language, we felt that removing stopwords would not make sense either.
In addition, we had planned to use stopword removal to decrease the number of tokens fed to BERT (the limit for our base model is 512).
However, a statistical analysis of the data showed that 101468 out of 102065 issues ($99.4\%$) are composed of fewer than 512 tokens (the next section describes our statistical analysis in greater detail).
In this case we also observed results similar to stemming, meaning that the text lost its fluidity.
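The core of this procedure can be sketched in a few lines (a condensed sketch of the full implementation in \texttt{src/cleaner/clean.py}):
\begin{lstlisting}[language=Python]
import re
from bs4 import BeautifulSoup, Comment
from demoji import replace
from markdown import markdown

def clean_body(body):
    soup = BeautifulSoup(markdown(body), features='html.parser')
    for s in soup.findAll('details'):  # drop user system details
        s.extract()
    for s in soup(text=lambda c: isinstance(c, Comment)):
        s.extract()                    # drop issue template comments
    body = replace(soup.get_text('\n')).strip()   # strip emojis
    body = re.sub(r'\s+', ' ', body)              # collapse newlines
    body = re.sub(r'http(s)?://\S+', ' ', body)   # strip URLs
    return body if body.isascii() else None       # non-Latin: discard
\end{lstlisting}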

8
docs/sections/introduction.tex Normal file

@ -0,0 +1,8 @@
\section*{Introduction}
The goal of this assignment was to create a machine learning model able to assign a user to a GitHub issue.
The very first step towards this goal was to scrape the past issues from the VSCode GitHub repository.
These issues are used to train the machine learning model (a deep neural network called BERT).
The next logical step was to clean the raw scraped data.
We noticed that some parts of the issue bodies or titles introduced noise that could negatively affect the training process.
For this reason, the data was cleaned before being fed to BERT\@.
Finally, a base model of BERT, pre-trained on English documents, was trained using our cleaned data; it returns a ranking of the top 5 most probable users to be assigned to the queried issue.

162
docs/sections/model.tex Normal file

@ -0,0 +1,162 @@
\section*{Model implementation}
The BERT model was implemented by loosely following a Medium article named
``Text Classification with BERT in PyTorch - Towards Data Science'' by Ruben Winastwan%
\footnote{\url{https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f}}.
Our implementation uses the \texttt{BertForSequenceClassification} model from the HuggingFace \texttt{transformers}
library. The model architecture simply joins the pre-trained deep learning weights from BERT-medium with a feed-forward
output layer consisting of one neuron per class to predict.
We train this model over two datasets, where one contains all issues in the range $[1, 170000]$ and another contains
a more ``recent'' set of issues, namely in the range $[150000, 170000]$. In the training and evaluation scripts, these
datasets are named \texttt{all} and \texttt{recent} respectively. The test set is made of issues in the range
$[170001, 180000]$, and it is used to evaluate the models trained on both datasets. Each of the \texttt{all} and
\texttt{recent} datasets is split chronologically into a train set and a validation set with
90\% / 10\% proportions.
In order not to bias the model implementation with knowledge from ``future'' data, the classifier has as many output
neurons as distinct assignees appearing in the training set. Additionally, instances in the validation set where the
assignee does not match one of the assignees in the training set are excluded. However, in order not to bias the model
evaluation, those instances are not excluded from the test set.
The training script encodes assignees with a numerical embedding between 0 and the number of assignees minus 1. The order
of the values in this embedding reflects the chronological order of the first issue assigned to each assignee. The only
predictor variables that are considered by the model are the cleaned issue title and body, which are concatenated without
adding any additional tokens or markers, tokenized, and mapped into a 768-wide vector.
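As an illustrative sketch of this encoding (the actual implementation lives in the \texttt{modelimpl} module; the column names are those produced by the cleaner):
\begin{lstlisting}[language=Python]
import pandas as pd
from transformers import AutoTokenizer

def encode(df: pd.DataFrame, max_len: int = 512):
    # Assignees are numbered chronologically by first assigned issue
    order = df.sort_values('created_at')['assignee'].drop_duplicates()
    label_of = {a: i for i, a in enumerate(order)}
    # Title and body are concatenated with no extra markers
    texts = (df['title'].fillna('') + ' ' + df['body'].fillna('')).tolist()
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
    tokens = tokenizer(texts, padding='max_length', truncation=True,
                       max_length=max_len, return_tensors='pt')
    return tokens, df['assignee'].map(label_of).to_numpy()
\end{lstlisting}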
The size of the train, validation and test split for each dataset is illustrated in table~\ref{tab:set_size}.
\begin{table}[H]
\centering
\begin{tabular}{lrr}
\toprule
Split & \texttt{recent} & \texttt{all} \\
\midrule
Training & 8303 & 91858 \\
Validation & 921 & 10167 \\
Test & 4787 & 4787 \\
\bottomrule
\end{tabular}
\caption{Number of instances in the training, validation and test set for model training on the \texttt{recent}
and \texttt{all} datasets.}
\label{tab:set_size}
\end{table}
Our training procedure runs over the data for 4 epochs for both datasets. In each epoch, the model is trained on a
shuffled copy of the training set while average loss and accuracy are tracked. After backward propagation,
the \textit{Adam} optimizer is applied to the weights of the model with a learning
rate of $5 \cdot 10^{-6}$ and \textit{beta} values equal to $(0.9, 0.9999)$.
After each epoch, validation loss and accuracy are computed.
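A minimal sketch of this training procedure, assuming a \texttt{train\_loader} that yields tokenized batches from the data preparation step:
\begin{lstlisting}[language=Python]
import torch
from torch.optim import Adam
from transformers import BertForSequenceClassification

def train(train_loader, n_classes, epochs=4):
    # train_loader yields (input_ids, attention_mask, labels) batches
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=n_classes)
    optimizer = Adam(model.parameters(), lr=5e-6, betas=(0.9, 0.9999))
    criterion = torch.nn.CrossEntropyLoss()
    for _ in range(epochs):
        for input_ids, attention_mask, labels in train_loader:
            logits = model(input_ids, attention_mask=attention_mask).logits
            loss = criterion(logits, labels)
            optimizer.zero_grad()
            loss.backward()   # backward propagation...
            optimizer.step()  # ...followed by the Adam update
    return model
\end{lstlisting}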
Due to lack of time,
no automatic early stopping procedure has been implemented in the model training script. Therefore, the validation output
has been used manually for hyperparameter tuning. For example, the number of epochs has been chosen so that for both
models the validation loss and accuracy improve (allowing for some tolerance) between epochs and so that the metrics do
not diverge too much from the values observed during training.
Another instance where the validation set has been useful is in the choice of the embedding process for the issue title
and body. We chose to use \texttt{distilbert-base-uncased}, a non-cased tokenizer, after empirically determining that it
provides better performance than a cased counterpart (namely \texttt{bert-base-cased}) over the validation set. However,
we do not claim that our hyperparameter tuning procedure has been completely exhaustive. For instance, due to lack of
time and computing power, both tokenizers have been tested only with a token length of 512 and truncation enabled.
In table~\ref{tab:metrics-recent} we report loss and accuracy for the train and validation set during training of the
model over the \texttt{recent} dataset, while in table~\ref{tab:metrics-all} we report the same values for the model
trained over the \texttt{all} dataset. Comparing the validation accuracies of the two models already suggests that the
\texttt{recent} model will perform better on the test set.
\begin{table}[H]
\centering
\begin{tabular}{lrrrr}
\toprule
Epoch & Train loss & Validation loss & Train accuracy & Validation accuracy \\
\midrule
1 & 0.204 & 0.174 & 0.171 & 0.343 \\
2 & 0.156 & 0.140 & 0.386 & 0.467 \\
3 & 0.124 & 0.125 & 0.542 & 0.545 \\
4 & 0.100 & 0.120 & 0.642 & 0.557 \\
\bottomrule
\end{tabular}
\caption{Train set and validation set loss and accuracy during model training over the \texttt{recent} dataset.}
\label{tab:metrics-recent}
\end{table}
\begin{table}[H]
\centering
\begin{tabular}{lrrrr}
\toprule
Epoch & Train loss & Validation loss & Train accuracy & Validation accuracy \\
\midrule
1 & 0.137 & 0.164 & 0.453 & 0.357 \\
2 & 0.095 & 0.154 & 0.601 & 0.405 \\
3 & 0.077 & 0.157 & 0.676 & 0.427 \\
4 & 0.060 & 0.160 & 0.751 & 0.435 \\
\bottomrule
\end{tabular}
\caption{Train set and validation set loss and accuracy during model training over the \texttt{all} dataset.}
\label{tab:metrics-all}
\end{table}
The performance of the models trained on the \texttt{all} and \texttt{recent} datasets is reported in
table~\ref{tab:test-results}. We notice that both models are significantly better at outputting the correct assignee
within the top 2 or top 3 results rather than picking only the most confident output. For all accuracies observed, the
\texttt{recent} model still performs better than the \texttt{all} model.
\begin{table}[H]
\centering
\begin{tabular}{lrr}
\toprule
Truth label found & \texttt{recent} & \texttt{all} \\
\midrule
In top recommendation & 0.4980 & 0.4034 \\
Within top 2 recommendations & 0.6179 & 0.5408 \\
Within top 3 recommendations & 0.6651 & 0.5916 \\
Within top 4 recommendations & 0.6940 & 0.6359 \\
Within top 5 recommendations & 0.7174 & 0.6658 \\
\bottomrule
\end{tabular}
\caption{Model accuracy on the test set for training with the \texttt{all} and \texttt{recent} datasets. Accuracy
is reported for the recommendations given by the model output ordered by confidence.}
\label{tab:test-results}
\end{table}
The receiver operating characteristic (ROC) curve is reported according to the One-vs-Rest method by computing
one curve for each class (i.e.\ assignee) in the training set. The curve for the \texttt{recent} model is reported in
figure~\ref{fig:roc-recent}, while the curve for the \texttt{all} model is reported in figure~\ref{fig:roc-all}. As the
numeric label for each assignee is given in chronological order of first issue assignment, we can observe a difference
between long-standing and more recent contributors. Long-standing contributors have lower AUC than recent contributors
for both models. This may indicate that the models are more effective at predicting recent contributors as they are the
most active on issues in the test set, which is by construction made of recent issues. This may be caused by
long-standing authors eventually leaving the project.
\begin{figure}
\includegraphics[width=\linewidth]{../out/model/bug_triaging_recent_4e_5e-06lr_final.ovr_curves}
\caption{One-vs-Rest ROC curves for each class in the \texttt{recent} dataset for the model trained on the same dataset.}
\label{fig:roc-recent}
\end{figure}
\begin{figure}
\includegraphics[width=\linewidth]{../out/model/bug_triaging_all_4e_5e-06lr_final.ovr_curves}
\caption{One-vs-Rest ROC curves for each class in the \texttt{all} dataset for the model trained on the same dataset.}
\label{fig:roc-all}
\end{figure}
Additionally, we report a micro-averaged ROC curve to understand each model's overall performance, and we report the
corresponding area under curve (AUC) value. These curves can be found in figure~\ref{fig:roc-avg}. The \texttt{recent} model
is the one with the higher overall AUC.
\begin{figure}
\centering
\begin{subfigure}[t]{\linewidth}
\centering\includegraphics[width=.7\linewidth]{../out/model/bug_triaging_recent_4e_5e-06lr_final.ovr_avg}
\caption{ROC curve for the model trained on the \texttt{recent} dataset. The AUC score is $0.9228$.}
\end{subfigure}
\begin{subfigure}[t]{\linewidth}
\centering\includegraphics[width=.7\linewidth]{../out/model/bug_triaging_all_4e_5e-06lr_final.ovr_avg}
\caption{ROC curve for the model trained on the \texttt{all} dataset. The AUC score is $0.9121$.}
\end{subfigure}
\caption{Micro-averaged One-vs-Rest ROC curves for the trained models over the test set.}
\label{fig:roc-avg}
\end{figure}
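For reference, a minimal sketch of how the micro-averaged OvR curve and its AUC can be computed with scikit-learn, assuming \texttt{y\_true} holds the integer truth labels and \texttt{y\_score} the per-class model confidences on the test set:
\begin{lstlisting}[language=Python]
import numpy as np
from sklearn.metrics import auc, roc_curve
from sklearn.preprocessing import label_binarize

def micro_ovr_auc(y_true, y_score):
    classes = np.arange(y_score.shape[1])
    y_onehot = label_binarize(y_true, classes=classes)
    # Micro-averaging: pool all (instance, class) decisions together
    fpr, tpr, _ = roc_curve(y_onehot.ravel(), y_score.ravel())
    return fpr, tpr, auc(fpr, tpr)
\end{lstlisting}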

31
docs/sections/prototype_model.tex Normal file

@ -0,0 +1,31 @@
\section*{Initial Model Testing with Minimally Pre-Processed Data}
{\textsc{Note:} the code for the models discussed in this section can be found in the Jupyter notebook saved in
the repository path \texttt{src/model-dl/model.ipynb}.}
\subsection*{Model Choice and Hyperparameters Used}
In addition to the final model run on the USI infrastructure, which is explained later, we initially prototyped on a rawer version of our dataset using some pre-trained models, following their established architectures. We made two runs on the recent training samples, with a minor difference in how the data is fed into the Transformer.
\newline
\newline
The hyperparameters that were used in both training runs were as follows:
\begin{verbatim}
BATCH_SIZE = 32
LEARNING_RATE = 1e-5
EPOCHS = 3
MAX_LEN = 512
\end{verbatim}
Similarly to our final model, in both prototype runs the data is a concatenated string of the title and the body of the item (with a minor change in the formation of the string between the runs); at the time of running this prototype, our pre-processing was not as refined.
\newline
\newline
Our model of choice was “BERT-base-uncased”, a pre-trained model with 110M parameters trained on the English language using a masked language modeling (MLM) objective, published by Hugging Face [1]. We chose the fairly standard AdamW optimizer, a stochastic optimization method that differs from the Adam algorithm only in adjusting the weight decay term to appear in the gradient update [2].
\newline
\subsection*{String Creation for the Transformer}
The previously mentioned minor difference between the runs was in the creation of the single string to be fed into the model. In the first run, we trained on the simple concatenation of the title and body strings, whereas in the second run we added the "Title:" and "Body:" prefixes before the start of the respective parts of the string. We reasoned that adding these strings, almost like meta-words, would help the model contextualize the string better, at the cost of slightly reducing the maximum length of the string (the two meta-words take up 11 characters). Despite this adjustment, we found no significant difference between the two runs in any of our evaluation metrics, namely accuracy, precision, recall, and F1-score.
\newline
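In code, the two runs differ only in how this string is formed, roughly as follows (an illustrative sketch; the exact formatting in the notebook may differ):
\begin{lstlisting}[language=Python]
def issue_to_text(title: str, body: str, meta_words: bool) -> str:
    if meta_words:                        # second run
        return f'Title: {title} Body: {body}'
    return f'{title} {body}'              # first run: plain concatenation
\end{lstlisting}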
\subsection*{Preliminary Results and Subsequent Modifications}
In our prototype runs, we utilized Nvidia A100 GPUs with extra memory. We were able to achieve nearly 50\% accuracy in both runs after three epochs. We tried using the MobileBERT model on the USI infrastructure, a version of BERT mainly aimed at mobile use with the benefit of less demanding memory requirements; however, it yielded poor results. Further modifications to our encoding and tokenizing processes enabled us to use the “BERT-base-uncased” model effectively with the computing resources provided to us by USI.

9
docs/sections/references.tex Normal file

@ -0,0 +1,9 @@
\begin{thebibliography}{9}
\bibitem{bert}
BERT-base-uncased Hugging Face. (n.d.). \url{https://huggingface.co/bert-base-uncased}
\bibitem{loshchilov2017decoupled}
I. Loshchilov and F. Hutter, ``Decoupled Weight Decay Regularization,'' University of Freiburg, Freiburg, Germany, 2017. [Online]. Available: \url{https://arxiv.org/pdf/1711.05101v3.pdf}
\end{thebibliography}

7
docs/sections/scraping.tex Normal file

@ -0,0 +1,7 @@
\section*{Issue Scraping}
To scrape the data from GitHub, we used the API that GitHub exposes to its users.
Using our GitHub token, we made the appropriate requests to retrieve the issues.
The raw issues were saved as individual JSON files (one per issue) and compressed into a \verb|.tar.gz| archive.
Some downloaded issues, however, were blank JSON files.
We suspect that these issues were available at the time of listing but have since been deleted and are no longer available through the GitHub API; we therefore chose to ignore them.
The internal issue IDs for these issues were: \verb|111293876|, \verb|116791101|, \verb|116805010|, \verb|116805553|, \verb|116805977|, \verb|116901067|, \verb|117010737|, \verb|117065474|, \verb|117067419|, \verb|117068152|, \verb|117069931|, \verb|116803071|, \verb|116923175|, \verb|116989517|, \verb|117063475|, and \verb|117067644|.
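A minimal sketch of the scraping logic (illustrative only; the actual implementation lives in \texttt{src/scraper}, and the endpoint and pagination follow the standard GitHub REST API):
\begin{lstlisting}[language=Python]
import os
import requests
from dotenv import load_dotenv

load_dotenv()  # reads GITHUB_TOKEN from the .env file
headers = {'Authorization': f"Bearer {os.environ['GITHUB_TOKEN']}"}
url = 'https://api.github.com/repos/microsoft/vscode/issues'
params = {'state': 'all', 'per_page': 100, 'page': 1}
issues = requests.get(url, params=params, headers=headers).json()
# ... save each issue as its own JSON file, then pack them in a .tar.gz
\end{lstlisting}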

34
docs/sections/statistics.tex Normal file

@ -0,0 +1,34 @@
\section*{Data Analysis}
Given the CSV exported by the cleaning pipeline, we managed to extract some interesting statistics of the training set (from issue 1 to 170000).
In particular, we analyzed the issue word count distribution, the author distribution, and the distribution of opened issues during the week.
For the word count distribution, we tried to understand how many issues had fewer than 512 words (as we said before, 512 is the maximum number of tokens that we can pass to BERT).
From our analysis, we saw that out of 102065 cleaned and valid issues, $99.4\%$ (101468 issues) have a length of fewer than 512 words.
On the other hand, only $0.6\%$ of the issues (597 issues) have a length greater than 512 words.
This result makes the use of stopword removal useless (for our goal of reducing the number of tokens).
The plot below represents the distribution of all issues with a word count below 512 words.
\begin{center}
\includegraphics[width=10cm]{../out/plots/length_dist}
\end{center}
From this distribution, we can see that the most frequent length is 42 words, which is the case for $1.1\%$ of the issues (1115 issues).
Regarding the author distribution -- meaning the number of issues per author -- we found that out of a total of 102 authors, $39.3\%$ (40 authors) were assigned fewer than 10 issues.
On the other hand, $60.7\%$ (62 authors) were assigned 10 or more issues.
The issues per author can be seen in the graph below.
\begin{center}
\includegraphics[width=10cm]{../out/plots/author_dist}
\end{center}
From this graph, we can extract the top 5 authors based on issue assignment.
The result is the following:
\begin{enumerate}
\item mjbvz: $11.6\%$ (11882 issues)
\item bpasero: $8.11\%$ (8280 issues)
\item Tyriar: $7.91\%$ (8075 issues)
\item joaomoreno: $7.61\%$ (7775 issues)
\item isidorn: $6.77\%$ (6914 issues)
\end{enumerate}
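These statistics can be reproduced from the cleaned CSV with a few lines of pandas (a minimal sketch; \texttt{word\_count} and \texttt{assignee} are columns produced by the cleaner):
\begin{lstlisting}[language=Python]
import pandas as pd

df = pd.read_csv('out/csv/issues_train_000001_170000.csv')

# Share of issues under BERT's 512-token budget (word count as proxy)
print(f"{(df['word_count'] < 512).mean():.1%} of issues are < 512 words")

# Issues per assignee, i.e. the author distribution
print(df['assignee'].value_counts().head(5))
\end{lstlisting}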

8
environment-dev.yml Normal file

@ -0,0 +1,8 @@
name: bug-triaging-env-dev
channels:
- defaults
- conda-forge
dependencies:
- pip=23.1.2=py311hecd8cb5_0
- pip:
- -r ./requirements.txt

263
environment-server.yml Normal file

@ -0,0 +1,263 @@
name: bug-triaging-env
channels:
- pytorch
- nvidia
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=5.1=1_gnu
- aiofiles=22.1.0=py311h06a4308_0
- aiosqlite=0.18.0=py311h06a4308_0
- anyio=3.5.0=py311h06a4308_0
- argon2-cffi=21.3.0=pyhd3eb1b0_0
- argon2-cffi-bindings=21.2.0=py311h5eee18b_0
- asttokens=2.0.5=pyhd3eb1b0_0
- babel=2.11.0=py311h06a4308_0
- backcall=0.2.0=pyhd3eb1b0_0
- beautifulsoup4=4.12.2=py311h06a4308_0
- blas=1.0=mkl
- bleach=4.1.0=pyhd3eb1b0_0
- brotlipy=0.7.0=py311h5eee18b_1002
- bzip2=1.0.8=h7b6447c_0
- ca-certificates=2023.08.22=h06a4308_0
- cffi=1.15.1=py311h5eee18b_3
- comm=0.1.2=py311h06a4308_0
- cryptography=41.0.3=py311hdda0065_0
- cuda=11.6.1=0
- cuda-cccl=11.6.55=hf6102b2_0
- cuda-command-line-tools=11.6.2=0
- cuda-compiler=11.6.2=0
- cuda-cudart=11.8.89=0
- cuda-cudart-dev=11.6.55=h42ad0f4_0
- cuda-cuobjdump=11.6.124=h2eeebcb_0
- cuda-cupti=11.8.87=0
- cuda-cuxxfilt=11.6.124=hecbf4f6_0
- cuda-driver-dev=11.6.55=0
- cuda-gdb=12.2.140=0
- cuda-libraries=11.8.0=0
- cuda-libraries-dev=11.6.1=0
- cuda-memcheck=11.8.86=0
- cuda-nsight=12.2.144=0
- cuda-nsight-compute=12.2.2=0
- cuda-nvcc=11.6.124=hbba6d2d_0
- cuda-nvdisasm=12.2.140=0
- cuda-nvml-dev=11.6.55=haa9ef22_0
- cuda-nvprof=12.2.142=0
- cuda-nvprune=11.6.124=he22ec0a_0
- cuda-nvrtc=11.8.89=0
- cuda-nvrtc-dev=11.6.124=h249d397_0
- cuda-nvtx=11.8.86=0
- cuda-nvvp=12.2.142=0
- cuda-runtime=11.8.0=0
- cuda-samples=11.6.101=h8efea70_0
- cuda-sanitizer-api=12.2.140=0
- cuda-toolkit=11.6.1=0
- cuda-tools=11.6.1=0
- cuda-visual-tools=11.6.1=0
- debugpy=1.6.7=py311h6a678d5_0
- decorator=5.1.1=pyhd3eb1b0_0
- defusedxml=0.7.1=pyhd3eb1b0_0
- entrypoints=0.4=py311h06a4308_0
- executing=0.8.3=pyhd3eb1b0_0
- ffmpeg=4.3=hf484d3e_0
- filelock=3.9.0=py311h06a4308_0
- freetype=2.12.1=h4a9f257_0
- future=0.18.3=py311h06a4308_0
- gds-tools=1.7.2.10=0
- giflib=5.2.1=h5eee18b_3
- gmp=6.2.1=h295c915_3
- gmpy2=2.1.2=py311hc9b5ff0_0
- gnutls=3.6.15=he1e5248_0
- icu=73.1=h6a678d5_0
- idna=3.4=py311h06a4308_0
- intel-openmp=2023.1.0=hdb19cb5_46305
- ipykernel=6.25.0=py311h92b7b1e_0
- ipython=8.15.0=py311h06a4308_0
- ipython_genutils=0.2.0=pyhd3eb1b0_1
- jedi=0.18.1=py311h06a4308_1
- jinja2=3.1.2=py311h06a4308_0
- jpeg=9e=h5eee18b_1
- json5=0.9.6=pyhd3eb1b0_0
- jsonschema=4.17.3=py311h06a4308_0
- jupyter_client=7.4.9=py311h06a4308_0
- jupyter_core=5.3.0=py311h06a4308_0
- jupyter_events=0.6.3=py311h06a4308_0
- jupyter_server=1.23.4=py311h06a4308_0
- jupyter_server_fileid=0.9.0=py311h06a4308_0
- jupyter_server_ydoc=0.8.0=py311h06a4308_1
- jupyter_ydoc=0.2.4=py311h06a4308_0
- jupyterlab=3.6.3=py311h06a4308_0
- jupyterlab_pygments=0.1.2=py_0
- jupyterlab_server=2.22.0=py311h06a4308_0
- lame=3.100=h7b6447c_0
- lcms2=2.12=h3be6417_0
- ld_impl_linux-64=2.38=h1181459_1
- lerc=3.0=h295c915_0
- libcublas=11.11.3.6=0
- libcublas-dev=11.9.2.110=h5c901ab_0
- libcufft=10.9.0.58=0
- libcufft-dev=10.7.1.112=ha5ce4c0_0
- libcufile=1.7.2.10=0
- libcufile-dev=1.7.2.10=0
- libcurand=10.3.3.141=0
- libcurand-dev=10.3.3.141=0
- libcusolver=11.4.1.48=0
- libcusparse=11.7.5.86=0
- libcusparse-dev=11.7.2.124=hbbe9722_0
- libdeflate=1.17=h5eee18b_1
- libffi=3.4.4=h6a678d5_0
- libgcc-ng=11.2.0=h1234567_1
- libgomp=11.2.0=h1234567_1
- libiconv=1.16=h7f8727e_2
- libidn2=2.3.4=h5eee18b_0
- libjpeg-turbo=2.0.0=h9bf148f_0
- libnpp=11.8.0.86=0
- libnpp-dev=11.6.3.124=h3c42840_0
- libnvjpeg=11.9.0.86=0
- libnvjpeg-dev=11.6.2.124=hb5906b9_0
- libpng=1.6.39=h5eee18b_0
- libprotobuf=3.20.3=he621ea3_0
- libsodium=1.0.18=h7b6447c_0
- libstdcxx-ng=11.2.0=h1234567_1
- libtasn1=4.19.0=h5eee18b_0
- libtiff=4.5.1=h6a678d5_0
- libunistring=0.9.10=h27cfd23_0
- libuuid=1.41.5=h5eee18b_0
- libwebp=1.3.2=h11a3e52_0
- libwebp-base=1.3.2=h5eee18b_0
- libxml2=2.10.4=hf1b16e4_1
- libxslt=1.1.37=h5eee18b_1
- llvm-openmp=14.0.6=h9e868ea_0
- lxml=4.9.3=py311hdbbb534_0
- lz4-c=1.9.4=h6a678d5_0
- markupsafe=2.1.1=py311h5eee18b_0
- matplotlib-inline=0.1.6=py311h06a4308_0
- mistune=0.8.4=py311h5eee18b_1000
- mkl=2023.1.0=h213fc3f_46343
- mkl-service=2.4.0=py311h5eee18b_1
- mkl_fft=1.3.8=py311h5eee18b_0
- mkl_random=1.2.4=py311hdb19cb5_0
- mpc=1.1.0=h10f8cd9_1
- mpfr=4.0.2=hb69a4c5_1
- mpmath=1.3.0=py311h06a4308_0
- nbclassic=0.5.5=py311h06a4308_0
- nbclient=0.5.13=py311h06a4308_0
- nbconvert=6.5.4=py311h06a4308_0
- nbformat=5.9.2=py311h06a4308_0
- ncurses=6.4=h6a678d5_0
- nest-asyncio=1.5.6=py311h06a4308_0
- nettle=3.7.3=hbbd107a_1
- networkx=3.1=py311h06a4308_0
- ninja=1.10.2=h06a4308_5
- ninja-base=1.10.2=hd09550d_5
- notebook=6.5.4=py311h06a4308_1
- notebook-shim=0.2.2=py311h06a4308_0
- nsight-compute=2023.2.2.3=0
- numpy=1.26.0=py311h08b1b3b_0
- numpy-base=1.26.0=py311hf175353_0
- openh264=2.1.1=h4ff587b_0
- openjpeg=2.4.0=h3ad879b_0
- openssl=3.0.11=h7f8727e_2
- pandocfilters=1.5.0=pyhd3eb1b0_0
- parso=0.8.3=pyhd3eb1b0_0
- pexpect=4.8.0=pyhd3eb1b0_3
- pickleshare=0.7.5=pyhd3eb1b0_1003
- pillow=10.0.1=py311ha6cbd5a_0
- pip=23.2.1=py311h06a4308_0
- platformdirs=3.10.0=py311h06a4308_0
- prometheus_client=0.14.1=py311h06a4308_0
- prompt-toolkit=3.0.36=py311h06a4308_0
- psutil=5.9.0=py311h5eee18b_0
- ptyprocess=0.7.0=pyhd3eb1b0_2
- pure_eval=0.2.2=pyhd3eb1b0_0
- pycparser=2.21=pyhd3eb1b0_0
- pygments=2.15.1=py311h06a4308_1
- pyopenssl=23.2.0=py311h06a4308_0
- pyrsistent=0.18.0=py311h5eee18b_0
- pysocks=1.7.1=py311h06a4308_0
- python=3.11.5=h955ad1f_0
- python-dateutil=2.8.2=pyhd3eb1b0_0
- python-fastjsonschema=2.16.2=py311h06a4308_0
- python-json-logger=2.0.7=py311h06a4308_0
- pytorch=2.1.0=py3.11_cuda11.8_cudnn8.7.0_0
- pytorch-cuda=11.8=h7e8668a_5
- pytorch-mutex=1.0=cuda
- pytz=2023.3.post1=py311h06a4308_0
- pyyaml=6.0=py311h5eee18b_1
- pyzmq=23.2.0=py311h6a678d5_0
- readline=8.2=h5eee18b_0
- rfc3339-validator=0.1.4=py311h06a4308_0
- rfc3986-validator=0.1.1=py311h06a4308_0
- send2trash=1.8.0=pyhd3eb1b0_1
- setuptools=68.0.0=py311h06a4308_0
- six=1.16.0=pyhd3eb1b0_1
- sniffio=1.2.0=py311h06a4308_1
- soupsieve=2.5=py311h06a4308_0
- sqlite=3.41.2=h5eee18b_0
- stack_data=0.2.0=pyhd3eb1b0_0
- sympy=1.11.1=py311h06a4308_0
- tbb=2021.8.0=hdb19cb5_0
- terminado=0.17.1=py311h06a4308_0
- tinycss2=1.2.1=py311h06a4308_0
- tk=8.6.12=h1ccaba5_0
- torchaudio=2.1.0=py311_cu118
- torchtriton=2.1.0=py311
- torchvision=0.16.0=py311_cu118
- tornado=6.3.2=py311h5eee18b_0
- traitlets=5.7.1=py311h06a4308_0
- typing-extensions=4.7.1=py311h06a4308_0
- typing_extensions=4.7.1=py311h06a4308_0
- wcwidth=0.2.5=pyhd3eb1b0_0
- webencodings=0.5.1=py311h06a4308_1
- websocket-client=0.58.0=py311h06a4308_4
- wheel=0.41.2=py311h06a4308_0
- xz=5.4.2=h5eee18b_0
- y-py=0.5.9=py311h52d8a92_0
- yaml=0.2.5=h7b6447c_0
- ypy-websocket=0.8.2=py311h06a4308_0
- zeromq=4.3.4=h2531618_0
- zlib=1.2.13=h5eee18b_0
- zstd=1.5.5=hc292b87_0
- pip:
- accelerate==0.23.0
- attrs==22.1.0
- certifi==2022.9.24
- charset-normalizer==2.1.1
- click==8.1.3
- contourpy==1.1.1
- coverage==6.4.4
- cycler==0.12.1
- demoji==1.1.0
- fonttools==4.43.1
- fsspec==2023.9.2
- huggingface-hub==0.17.3
- iniconfig==1.1.1
- joblib==1.2.0
- kiwisolver==1.4.5
- markdown==3.4.4
- matplotlib==3.8.0
- nltk==3.7
- packaging==21.3
- pandas==2.1.1
- pluggy==1.0.0
- py==1.11.0
- pyarrow==13.0.0
- pyparsing==3.0.9
- pytest==7.1.3
- pytest-cov==3.0.0
- python-dotenv==1.0.0
- regex==2022.9.13
- requests==2.28.1
- safetensors==0.4.0
- scikit-learn==1.3.1
- scipy==1.11.3
- seaborn==0.13.0
- sentence-transformers==2.2.2
- sentencepiece==0.1.99
- threadpoolctl==3.2.0
- tokenizers==0.14.1
- tomli==2.0.1
- tqdm==4.64.1
- transformers==4.34.0
- tzdata==2023.3
- urllib3==1.26.12

1
out/csv/issues_removed_count.txt Normal file

@ -0,0 +1 @@
43363

2
out/model/bug_triaging_all_4e_5e-06lr_final.auc.txt Normal file

@ -0,0 +1,2 @@
Micro-averaged One-vs-Rest ROC AUC score:
0.9121

2
out/model/bug_triaging_all_4e_5e-06lr_final.label_range.txt Normal file

@ -0,0 +1,2 @@
0
93

104
out/model/bug_triaging_all_4e_5e-06lr_final.labels.csv Normal file

@ -0,0 +1,104 @@
token,label
9at8,0
DonJayamanne,1
IanMatthewHuff,2
ItalyPaleAle,3
JacksonKearl,4
Lixire,5
RMacfarlane,6
Steam-Rabbit,7
TylerLeonhardt,8
Tyriar,9
aefernandes,10
aeschli,11
aiday-mar,12
alexdima,13
alexr00,14
amunger,15
andreamah,16
auchenberg,17
awvalenti,18
bamurtaugh,19
benibenj,20
bgashler1,21
bhavyaus,22
bowdenk7,23
bpasero,24
btholt,25
chrisdias,26
chrmarti,27
claudiaregio,28
cleidigh,29
connor4312,30
danyeh,31
daviddossett,32
daviwil,33
dbaeumer,34
deepak1556,35
delmyers,36
digitarald,37
dynamicwebpaige,38
eamodio,39
egamma,40
esonnino,41
fiveisprime,42
foucdeg,43
gregvanl,44
gushuro,45
hediet,46
isidorn,47
janbaltus,48
joaomoreno,49
johnliu369,50
joyceerhl,51
jrieken,52
karthiknadig,53
kieferrm,54
kimadeline,55
lramos15,56
lszomoru,57
lukaschal,58
lychung7,59
meganrogge,60
michelkaporin,61
miguelsolorio,62
minsa110,63
mjbvz,64
mousetraps,65
nexue2020,66
octref,67
ornelladotcom,68
orta,69
paulacamargo25,70
pjmeyer,71
ramya-rao-a,72
rchiodo,73
rebornix,74
roblourens,75
rzhao271,76
sana-ajani,77
sandy081,78
sanket856,79
sbatten,80
seanmcbreen,81
shawndon,82
sofianhn,83
stevencl,84
tanhakabir,85
tsalinger,86
ulugbekna,87
v-pavanp,88
vsccarl,89
waderyan,90
weeteckt,91
weinand,92
DanielRosenwasser,93
Yoyokrazy,94
brettcannon,95
devinvalenciano,96
eleanorjboyd,97
greazer,98
justschen,99
karrtikr,100
sadasant,101
hbons,102

BIN
out/model/bug_triaging_all_4e_5e-06lr_final.ovr_avg.png Normal file

Binary file not shown. (PNG, 30 KiB)

BIN
out/model/bug_triaging_all_4e_5e-06lr_final.ovr_curves.png Normal file

Binary file not shown. (PNG, 289 KiB)

2
out/model/bug_triaging_recent_4e_5e-06lr_final.auc.txt Normal file

@ -0,0 +1,2 @@
Micro-averaged One-vs-Rest ROC AUC score:
0.9228

2
out/model/bug_triaging_recent_4e_5e-06lr_final.label_range.txt Normal file

@ -0,0 +1,2 @@
0
51

57
out/model/bug_triaging_recent_4e_5e-06lr_final.labels.csv Normal file

@ -0,0 +1,57 @@
token,label
DanielRosenwasser,0
DonJayamanne,1
IanMatthewHuff,2
JacksonKearl,3
TylerLeonhardt,4
Tyriar,5
Yoyokrazy,6
aeschli,7
aiday-mar,8
alexdima,9
alexr00,10
amunger,11
andreamah,12
bamurtaugh,13
benibenj,14
bhavyaus,15
bpasero,16
chrisdias,17
chrmarti,18
connor4312,19
daviddossett,20
dbaeumer,21
deepak1556,22
devinvalenciano,23
egamma,24
eleanorjboyd,25
greazer,26
gregvanl,27
hediet,28
isidorn,29
joaomoreno,30
joyceerhl,31
jrieken,32
justschen,33
karrtikr,34
karthiknadig,35
lramos15,36
lszomoru,37
meganrogge,38
miguelsolorio,39
minsa110,40
mjbvz,41
rchiodo,42
rebornix,43
roblourens,44
rzhao271,45
sadasant,46
sandy081,47
sbatten,48
tanhakabir,49
weinand,50
brettcannon,51
digitarald,52
esonnino,53
hbons,54
ulugbekna,55


@ -0,0 +1,15 @@
/home/SA23-G2/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.
return bound(*args, **kwds)
Training for dataset kind: recent
Train set instance size: 8303
Validation set instance size: 921
Test set instance size: 4787
Using device # 0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training model then saving in /home/SA23-G2/bug-triaging/src/modelimpl/../../out/model/bug_triaging_recent_4e_5e-06lr_final.pt
Epochs: 1 | Train Loss: 0.204 | Train Accuracy: 0.171 | Val Loss: 0.174 | Val Accuracy: 0.343
Epochs: 2 | Train Loss: 0.156 | Train Accuracy: 0.386 | Val Loss: 0.140 | Val Accuracy: 0.467
Epochs: 3 | Train Loss: 0.124 | Train Accuracy: 0.542 | Val Loss: 0.125 | Val Accuracy: 0.545
Epochs: 4 | Train Loss: 0.100 | Train Accuracy: 0.642 | Val Loss: 0.120 | Val Accuracy: 0.557
Test Accuracy: 0.498

BIN
out/model/bug_triaging_recent_4e_5e-06lr_final.ovr_avg.png Normal file

Binary file not shown. (PNG, 30 KiB)

BIN
out/model/bug_triaging_recent_4e_5e-06lr_final.ovr_curves.png Normal file

Binary file not shown. (PNG, 279 KiB)

BIN
out/plots/author_dist.png Normal file

Binary file not shown. (PNG, 121 KiB)

BIN
out/plots/length_dist.png Normal file

Binary file not shown. (PNG, 26 KiB)

BIN
out/plots/weekday_dist.png Normal file

Binary file not shown. (PNG, 22 KiB)

31
requirements.txt Normal file

@ -0,0 +1,31 @@
Markdown==3.5
attrs==22.1.0
beautifulsoup4==4.12.2
certifi==2022.9.24
charset-normalizer==2.1.1
click==8.1.3
coverage==6.4.4
demoji==1.1.0
idna==3.4
iniconfig==1.1.1
joblib==1.2.0
matplotlib==3.8.0
nltk==3.7
numpy==1.26.1
packaging==21.3
pandas==2.1.2
pluggy==1.0.0
py==1.11.0
pyarrow==13.0.0
pyparsing==3.0.9
pytest-cov==3.0.0
pytest==7.1.3
python-dotenv==1.0.0
regex==2022.9.13
requests==2.28.1
scikit_learn==1.3.2
tomli==2.0.1
torch==2.1.0
tqdm==4.66.1
transformers==4.34.1
urllib3==1.26.12

17
scripts/consts.py Normal file

@ -0,0 +1,17 @@
# Define regular expressions to remove unwanted content from issue text
extra_stopwords = [
    r'http(s)?://\S+',    # Any http(s) url
    r'[^a-zA-Z0-9_ \n]',  # Any non-word character
]

# Define labels to remove unwanted issues
labels_blacklist = [
    r'^\*english-please$',
    r'^caused-by-extension$',
    r'^info-needed$',
    r'^invalid$',
    r'^\*off-topic$',
    r'^translation-required(-\w+)+$',
]

6
scripts/utils.py Normal file

@ -0,0 +1,6 @@
from datetime import datetime


# Prints a line on the stdout prepended with the time
def log(msg):
    print(f'[{datetime.now()}] {msg}')

24
sonar-project.properties Normal file

@ -0,0 +1,24 @@
# --- required for ci template ---
# must be unique in a given SonarQube instance
sonar.projectKey=sa-2023-g2-${env.CI_PROJECT_ID}
sonar.qualitygate.wait=true
# defaults to project key
sonar.projectName=${env.CI_PROJECT_PATH}
# defaults to 'not provided'
# sonar.projectVersion=${env.CI_COMMIT_TAG}
# --- additional properties ---
sonar.sources=.
sonar.tests=tests
sonar.exclusions=tests/**/*
sonar.python.version=3.10.7
# test coverage
#sonar.python.coverage.reportPaths=coverage/cobertura-coverage.xml
sonar.python.coverage.reportPaths=coverage.xml
# Encoding of the source code. Default is default system encoding
sonar.sourceEncoding=UTF-8

File diff suppressed because one or more lines are too long

25
src/auc.py Executable file

@ -0,0 +1,25 @@
#!/usr/bin/env python
import argparse
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import torch

from modelimpl.auc import build_curve

if __name__ == '__main__':
    assert torch is not None  # make sure pytorch is imported and loaded with correct CUDA env variable

    parser = argparse.ArgumentParser(prog='auc.py',
                                     description='ROC curve and AUC computation script. The script evaluates the given '
                                                 'model against the test set and generates an OvR ROC curve '
                                                 'plot with one curve per class, a micro-averaged OvR ROC plot '
                                                 'and the corresponding AUC value.')
    parser.add_argument('modelfile', type=str, help="Path to the pickled pytorch model to classify the issue with")
    parser.add_argument('-c', '--force-cpu', action='store_true',
                        help="disables CUDA support. Useful when debugging")

    args = parser.parse_args()
    build_curve(args.modelfile, args.force_cpu)

28
src/clean.py Executable file

@ -0,0 +1,28 @@
import os
from cleaner.clean import clean_all, save_set
from cleaner.dataframe import build_df
ROOT = os.path.join(os.path.dirname(__file__), '', '..')
IN_FILE = os.path.join(ROOT, 'out/json/issues.tar.gz')
OUT_FILE_PREFIX = os.path.join(ROOT, 'out/csv/issues')
def main():
objs = []
counter = clean_all(objs, IN_FILE)
print(f'Removed Issues: {counter}')
with open(OUT_FILE_PREFIX + '_removed_count.txt', 'w') as f:
f.write(str(counter) + "\n")
df = build_df(objs)
save_set(df, 1, 170_000, '_train', OUT_FILE_PREFIX)
save_set(df, 150_000, 170_000, '_train_recent', OUT_FILE_PREFIX)
save_set(df, 170_001, 180_000, '_test', OUT_FILE_PREFIX)
if __name__ == '__main__':
main()
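
Note that df.loc slices by index label and is inclusive on both ends, so save_set(df, 170_001, 180_000, ...) keeps issues with id 170001 through 180000 even when some ids in between are missing. A small standalone illustration (synthetic data):

import pandas as pd

df = pd.DataFrame({'v': [1, 2, 3]}, index=[10, 20, 30])
print(df.loc[10:20])  # rows labelled 10 and 20: .loc slicing is inclusive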

0
src/cleaner/__init__.py Normal file
View file

141
src/cleaner/clean.py Normal file
View file

@ -0,0 +1,141 @@
import json
import os
import re
import tarfile
import tqdm
from bs4 import BeautifulSoup, Comment
from demoji import replace
from markdown import markdown
counter = 0
def clean_body(body):
# Check if body is present
if body is None:
return None
html = markdown(body)
soup = BeautifulSoup(html, features='html.parser')
# Remove everything in the <details> tag (noisy data)
for s in soup.findAll('details'):
s.extract()
# Remove HTML comments
for s in soup(string=lambda comment: isinstance(comment, Comment)):
s.extract()
# Remove emojis
body = replace(soup.get_text('\n')).strip()
# Collapse runs of whitespace (including newlines) into a single space
body = re.sub(r'\s+', ' ', body, flags=re.MULTILINE)
# Remove all links
body = re.sub(r'http(s)?://\S+', ' ', body, flags=re.MULTILINE)
# Check if body only contains ASCII characters
if not body.isascii():
return None
return body
def clean_title(title):
if title is None:
return None
# Remove emojis
title = replace(title).strip()
# Check if title only contains ASCII characters
if not title.isascii():
return None
return title
def read_issue_obj(obj, enable_filter=True):
global counter
if enable_filter:
if 'PR' in obj['node_id'] or ('pull_request' in obj and obj['pull_request'] is not None):
return None # skip since it is a pull request
if 'assignees' not in obj or obj['assignees'] is None:
counter += 1
return None # skip since it has not been assigned
if len(obj['assignees']) != 1:
counter += 1
return None # skip since it has multiple assignees
body = clean_body(obj['body'])
dirty_body = obj['body']
title = clean_title(obj['title'])
title_count = 0 if title is None else len(title.split())
issue = {
'id': obj['number'],
'title': title,
'body': body,
'dirty_body': dirty_body,
'word_count': 0 if body is None else len(body.split()) + title_count,
'word_count_dirty': 0 if body is None else len(dirty_body.split()) + title_count,
'state': obj['state'],
'assignee': None if len(obj['assignees']) == 0 else obj['assignees'][0]['login'],
'created_at': obj['created_at']
}
if issue['title'] is None and issue['body'] is None:
counter += 1
return None
return issue
def read_issue(file):
global counter
try:
obj = json.load(file)
issue = read_issue_obj(obj)
except json.decoder.JSONDecodeError:
# Some downloaded issues are a blank JSON file. We suspect that these issues were available at the time of
# listing, but they have been since deleted and are not available anymore through the GitHub API, therefore
# we choose to ignore them. The internal issue IDs for these issues were:
# 111293876, 116791101, 116805010, 116805553, 116805977, 116901067, 117010737, 117065474, 117067419, 117068152,
# 117069931, 116803071, 116923175, 116989517, 117063475, 117067644
counter += 1
return None
return issue
def save_set(df, id_from, id_to, name, file_prefix: str):
dir_name = os.path.dirname(file_prefix)
if not os.path.isdir(dir_name):
os.makedirs(dir_name)
df.loc[id_from:id_to].to_csv(file_prefix + f'{name}_{id_from:06d}_{id_to:06d}.csv')
def clean_all(objs, in_file: str):
global counter
counter = 0
tar = tarfile.open(in_file, 'r:gz')
for member in tqdm.tqdm(tar.getmembers()):
if member.name.endswith('.json'):
f = tar.extractfile(member)
if f is not None:
issue = read_issue(f)
if issue is not None:
objs.append(issue)
return counter
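
A sketch of what clean_body does to a typical Markdown issue body, assuming the function above is in scope (input invented for illustration; exact spacing of the output may differ):

sample = ("Steps to reproduce 🐞:\n"
          "<details><summary>Logs</summary>long dump</details>\n"
          "See https://example.com/trace for details.")
print(clean_body(sample))
# roughly: 'Steps to reproduce : See for details.'
# (the <details> block, the emoji and the URL are stripped)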

8
src/cleaner/dataframe.py Normal file
View file

@ -0,0 +1,8 @@
from typing import Any
import pandas as pd
def build_df(objs: list[dict[str, Any]]) -> pd.DataFrame:
df = pd.DataFrame.from_records(objs)
df.set_index('id', drop=True, inplace=True)
df.sort_index(inplace=True)
return df

View file

@ -0,0 +1,629 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "6fed133d-61b7-4ce6-8a44-fe98acf0eed2",
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"!pip install transformers"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "2f3a8fd1-25a9-426d-a6be-c93b750cbcb8",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/SA23-G2/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CUDA is available, Training on GPU ...\n"
]
}
],
"source": [
"import pandas as pd\n",
"import torch\n",
"import numpy as np\n",
"from transformers import BertTokenizer, BertModel\n",
"from torch import nn\n",
"from torch.optim import Adam\n",
"from tqdm import tqdm\n",
"import os\n",
"from collections import defaultdict\n",
"\n",
"force_cpu = False\n",
"\n",
"if not force_cpu: \n",
" # Use GPU #2\n",
" os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"2\"\n",
" \n",
"train_on_gpu = torch.cuda.is_available()\n",
"\n",
"if train_on_gpu:\n",
" print('CUDA is available, Training on GPU ...')\n",
"else:\n",
" print('CUDA is not available! Training on CPU ...')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "47a53036-31ab-4374-bf15-a4dca17a7cbf",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title_body</th>\n",
" <th>assignee</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>HTML: Not auto-closing quotes when typing attr...</td>\n",
" <td>alexdima</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Python: Block comment is three single quotes a...</td>\n",
" <td>joaomoreno</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>PHP: extension console.logs when completing fu...</td>\n",
" <td>jrieken</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>[ruby] the mode failed to tokenize the input\\n...</td>\n",
" <td>aeschli</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[vb] Block comment is not colored\\nWhile line ...</td>\n",
" <td>bpasero</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" title_body assignee\n",
"0 HTML: Not auto-closing quotes when typing attr... alexdima\n",
"1 Python: Block comment is three single quotes a... joaomoreno\n",
"2 PHP: extension console.logs when completing fu... jrieken\n",
"3 [ruby] the mode failed to tokenize the input\\n... aeschli\n",
"4 [vb] Block comment is not colored\\nWhile line ... bpasero"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"OUT_COLUMN = 'assignee'\n",
"IN_COLUMN = 'title_body'\n",
"\n",
"def load_df(csv_path):\n",
" df = pd.read_csv(csv_path)\n",
" df['title_body'] = df[['title', 'body']].apply(lambda row: '\\n'.join(row.values.astype(str)), axis=1)\n",
" return df.loc[:, ['title_body', 'assignee']]\n",
"\n",
"\n",
"df_train_all = load_df(f'../../out/csv/issues_train_000001_170000.csv')\n",
"df_train_recent = load_df(f'../../out/csv/issues_train_recent_150000_170000.csv')\n",
"df_test = load_df(f'../../out/csv/issues_test_170001_180000.csv')\n",
"\n",
"df_train_all.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ab965eff-e1eb-416f-b80c-850554d8026c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(int,\n",
" {'alexdima': 1,\n",
" 'joaomoreno': 2,\n",
" 'jrieken': 3,\n",
" 'aeschli': 4,\n",
" 'bpasero': 5,\n",
" 'isidorn': 6,\n",
" 'seanmcbreen': 7,\n",
" 'weinand': 8,\n",
" 'dbaeumer': 9,\n",
" 'sofianhn': 10,\n",
" 'chrmarti': 11,\n",
" 'chrisdias': 12,\n",
" 'Tyriar': 13,\n",
" 'roblourens': 14,\n",
" 'gregvanl': 15,\n",
" 'kieferrm': 16,\n",
" 'egamma': 17,\n",
" 'bgashler1': 18,\n",
" 'mjbvz': 19,\n",
" 'alexr00': 20,\n",
" 'stevencl': 21,\n",
" 'sbatten': 22,\n",
" 'rebornix': 23,\n",
" 'ramya-rao-a': 24,\n",
" 'waderyan': 25,\n",
" 'RMacfarlane': 26,\n",
" 'sandy081': 27,\n",
" 'pjmeyer': 28,\n",
" 'DonJayamanne': 29,\n",
" 'miguelsolorio': 30,\n",
" 'octref': 31,\n",
" 'daviwil': 32,\n",
" 'hediet': 33,\n",
" 'mousetraps': 34,\n",
" 'v-pavanp': 35,\n",
" 'johnliu369': 36,\n",
" 'vsccarl': 37,\n",
" 'delmyers': 38,\n",
" 'lukaschal': 39,\n",
" 'lszomoru': 40,\n",
" 'JacksonKearl': 41,\n",
" 'ulugbekna': 42,\n",
" 'esonnino': 43,\n",
" 'connor4312': 44,\n",
" 'michelkaporin': 45,\n",
" 'aiday-mar': 46,\n",
" 'Lixire': 47,\n",
" 'lramos15': 48,\n",
" 'andreamah': 49,\n",
" 'meganrogge': 50,\n",
" 'danyeh': 51,\n",
" 'cleidigh': 52,\n",
" 'deepak1556': 53,\n",
" 'janbaltus': 54,\n",
" 'gushuro': 55,\n",
" 'aefernandes': 56,\n",
" 'auchenberg': 57,\n",
" 'TylerLeonhardt': 58,\n",
" 'benibenj': 59,\n",
" 'tsalinger': 60,\n",
" 'rzhao271': 61,\n",
" 'shawndon': 62,\n",
" 'eamodio': 63,\n",
" 'fiveisprime': 64,\n",
" 'Steam-Rabbit': 65,\n",
" 'foucdeg': 66,\n",
" 'awvalenti': 67,\n",
" 'weeteckt': 68,\n",
" 'daviddossett': 69,\n",
" 'bowdenk7': 70,\n",
" 'sana-ajani': 71,\n",
" '9at8': 72,\n",
" 'btholt': 73,\n",
" 'bamurtaugh': 74,\n",
" 'ornelladotcom': 75,\n",
" 'digitarald': 76,\n",
" 'nexue2020': 77,\n",
" 'bhavyaus': 78,\n",
" 'joyceerhl': 79,\n",
" 'amunger': 80,\n",
" 'IanMatthewHuff': 81,\n",
" 'claudiaregio': 82,\n",
" 'rchiodo': 83,\n",
" 'ItalyPaleAle': 84,\n",
" 'kimadeline': 85,\n",
" 'tanhakabir': 86,\n",
" 'karthiknadig': 87,\n",
" 'dynamicwebpaige': 88,\n",
" 'minsa110': 89,\n",
" 'sanket856': 90,\n",
" 'orta': 91,\n",
" 'paulacamargo25': 92,\n",
" 'lychung7': 93,\n",
" 'greazer': 94,\n",
" 'justschen': 95,\n",
" 'karrtikr': 96,\n",
" 'eleanorjboyd': 97,\n",
" 'sadasant': 98,\n",
" 'Yoyokrazy': 99,\n",
" 'devinvalenciano': 100,\n",
" 'DanielRosenwasser': 101,\n",
" 'brettcannon': 102,\n",
" 'hbons': 103})"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"labels_dict = pd.concat([df_train_all[OUT_COLUMN], df_train_recent[OUT_COLUMN], df_test[OUT_COLUMN]]) \\\n",
" .drop_duplicates(keep='first') \\\n",
" .reset_index(drop=True) \\\n",
" .to_dict()\n",
"labels = defaultdict(int)\n",
" \n",
"for k,v in labels_dict.items():\n",
" labels[v] = k + 1\n",
"\n",
"labels"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5074c270-ed3e-4e1a-863d-71737c743cb8",
"metadata": {},
"outputs": [],
"source": [
"tokenizer = BertTokenizer.from_pretrained('bert-base-cased')\n",
"\n",
"class Dataset(torch.utils.data.Dataset):\n",
"\n",
" def __init__(self, df):\n",
"\n",
" self.labels = [labels[label] for label in df[OUT_COLUMN]]\n",
" self.texts = [tokenizer(text, padding='max_length', max_length = 512, truncation=True,\n",
" return_tensors=\"pt\") for text in df[IN_COLUMN]]\n",
"\n",
" def classes(self):\n",
" return self.labels\n",
"\n",
" def __len__(self):\n",
" return len(self.labels)\n",
"\n",
" def get_batch_labels(self, idx):\n",
" # Fetch a batch of labels\n",
" return np.array(self.labels[idx])\n",
"\n",
" def get_batch_texts(self, idx):\n",
" # Fetch a batch of inputs\n",
" return self.texts[idx]\n",
"\n",
" def __getitem__(self, idx):\n",
"\n",
" batch_texts = self.get_batch_texts(idx)\n",
" batch_y = self.get_batch_labels(idx)\n",
"\n",
" return batch_texts, batch_y"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0c8a5d0f-80c3-42b3-9f06-ecfc3a21f395",
"metadata": {},
"outputs": [],
"source": [
"class BertClassifier(nn.Module):\n",
"\n",
" def __init__(self, dropout=0.5):\n",
"\n",
" super(BertClassifier, self).__init__()\n",
"\n",
" self.bert = BertModel.from_pretrained('bert-base-cased')\n",
" self.dropout = nn.Dropout(dropout)\n",
" self.linear = nn.Linear(768, len(labels))\n",
" self.relu = nn.ReLU()\n",
"\n",
" def forward(self, input_id, mask):\n",
"\n",
" _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)\n",
" dropout_output = self.dropout(pooled_output)\n",
" linear_output = self.linear(dropout_output)\n",
" final_layer = self.relu(linear_output)\n",
"\n",
" return final_layer"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "fa1f1cf7-65db-4966-9a55-ba26bd22ed6c",
"metadata": {},
"outputs": [],
"source": [
"def train(model, train_data, val_data, learning_rate, epochs):\n",
"\n",
" train, val = Dataset(train_data), Dataset(val_data)\n",
"\n",
" train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)\n",
" val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)\n",
"\n",
" use_cuda = torch.cuda.is_available() and not force_cpu\n",
" device = torch.device(\"cuda\" if use_cuda and not force_cpu else \"cpu\")\n",
"\n",
" criterion = nn.CrossEntropyLoss()\n",
" optimizer = Adam(model.parameters(), lr= learning_rate)\n",
"\n",
" if use_cuda:\n",
"\n",
" model = model.cuda()\n",
" criterion = criterion.cuda()\n",
"\n",
" for epoch_num in range(epochs):\n",
"\n",
" total_acc_train = 0\n",
" total_loss_train = 0\n",
"\n",
" for train_input, train_label in tqdm(train_dataloader):\n",
"\n",
" train_label = train_label.to(device)\n",
" mask = train_input['attention_mask'].to(device)\n",
" input_id = train_input['input_ids'].squeeze(1).to(device)\n",
"\n",
" output = model(input_id, mask)\n",
" \n",
" batch_loss = criterion(output, train_label.long())\n",
" total_loss_train += batch_loss.item()\n",
" \n",
" acc = (output.argmax(dim=1) == train_label).sum().item()\n",
" total_acc_train += acc\n",
"\n",
" model.zero_grad()\n",
" batch_loss.backward()\n",
" optimizer.step()\n",
" \n",
" total_acc_val = 0\n",
" total_loss_val = 0\n",
"\n",
" with torch.no_grad():\n",
"\n",
" for val_input, val_label in val_dataloader:\n",
"\n",
" val_label = val_label.to(device)\n",
" mask = val_input['attention_mask'].to(device)\n",
" input_id = val_input['input_ids'].squeeze(1).to(device)\n",
"\n",
" output = model(input_id, mask)\n",
"\n",
" batch_loss = criterion(output, val_label.long())\n",
" total_loss_val += batch_loss.item()\n",
" \n",
" acc = (output.argmax(dim=1) == val_label).sum().item()\n",
" total_acc_val += acc\n",
" \n",
" print(\n",
" f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} | Train Accuracy: {total_acc_train / len(train_data): .3f} | Val Loss: {total_loss_val / len(val_data): .3f} | Val Accuracy: {total_acc_val / len(val_data): .3f}')\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "cd8a670d-c449-45fe-8f4c-9a5fb27855c1",
"metadata": {},
"outputs": [],
"source": [
"def evaluate(model, test_data):\n",
"\n",
" test = Dataset(test_data)\n",
"\n",
" test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)\n",
"\n",
" use_cuda = torch.cuda.is_available() and not force_cpu\n",
" device = torch.device(\"cuda\" if use_cuda and not force_cpu else \"cpu\")\n",
" \n",
" if use_cuda:\n",
"\n",
" model = model.cuda()\n",
"\n",
" total_acc_test = 0\n",
" with torch.no_grad():\n",
"\n",
" for test_input, test_label in test_dataloader:\n",
"\n",
" test_label = test_label.to(device)\n",
" mask = test_input['attention_mask'].to(device)\n",
" input_id = test_input['input_ids'].squeeze(1).to(device)\n",
"\n",
" output = model(input_id, mask)\n",
"\n",
" acc = (output.argmax(dim=1) == test_label).sum().item()\n",
" total_acc_test += acc\n",
" \n",
" print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "25d2231d-fef1-42cf-a73e-188cac932727",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"8303 923 4787\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/SA23-G2/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.\n",
" return bound(*args, **kwds)\n"
]
}
],
"source": [
"np.random.seed(112)\n",
"\n",
"df = df_train_recent\n",
"num_samples = len(df.index)\n",
"val_split_idx = \n",
"\n",
"df_train, df_val = np.split(df.sample(frac=1, random_state=42), [int(.9*len(df))])\n",
"\n",
"print(len(df_train),len(df_val), len(df_test))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "30242239-de70-4c03-8f56-9f5ade43518d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4152/4152 [05:47<00:00, 11.96it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epochs: 1 | Train Loss: 2.022 | Train Accuracy: 0.106 | Val Loss: 1.854 | Val Accuracy: 0.115\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4152/4152 [05:47<00:00, 11.93it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epochs: 2 | Train Loss: 1.764 | Train Accuracy: 0.161 | Val Loss: 1.664 | Val Accuracy: 0.232\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4152/4152 [05:48<00:00, 11.93it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epochs: 3 | Train Loss: 1.553 | Train Accuracy: 0.291 | Val Loss: 1.477 | Val Accuracy: 0.315\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4152/4152 [05:48<00:00, 11.93it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epochs: 4 | Train Loss: 1.371 | Train Accuracy: 0.377 | Val Loss: 1.358 | Val Accuracy: 0.358\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4152/4152 [05:48<00:00, 11.93it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epochs: 5 | Train Loss: 1.232 | Train Accuracy: 0.450 | Val Loss: 1.258 | Val Accuracy: 0.426\n"
]
}
],
"source": [
"EPOCHS = 5\n",
"model = BertClassifier()\n",
"LR = 1e-6\n",
" \n",
"train(model, df_train, df_val, LR, EPOCHS)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "ccc00f0a-9a15-4942-9c9b-2f9789c8dd22",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Test Accuracy: 0.413\n"
]
}
],
"source": [
"evaluate(model, df_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "68be0b13-968c-437e-817a-6c12e0823091",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

365
src/model-dl/model.ipynb Normal file
View file

@ -0,0 +1,365 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "WGlZj4UC74VR",
"outputId": "0522302e-cd6c-44ff-c7c0-3cf3829a8943"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/SA23-G2/bug-triaging/src/model-dl\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n",
"The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. \n",
"The class this function is called from is 'BertTokenizer'.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"cuda\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Tokenizing Training Data: 100%|████████| 1000/1000 [00:00<00:00, 2641249.37it/s]\n",
"Tokenizing Test Data: 100%|████████████| 1000/1000 [00:00<00:00, 5548021.16it/s]\n",
"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
},
{
"ename": "OutOfMemoryError",
"evalue": "CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacty of 10.76 GiB of which 17.69 MiB is free. Process 1977743 has 1.07 GiB memory in use. Process 1980219 has 1.07 GiB memory in use. Process 1922558 has 3.07 GiB memory in use. Including non-PyTorch memory, this process has 5.53 GiB memory in use. Of the allocated memory 4.67 GiB is allocated by PyTorch, and 55.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mOutOfMemoryError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[2], line 118\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;66;03m# Load model\u001b[39;00m\n\u001b[1;32m 117\u001b[0m model \u001b[38;5;241m=\u001b[39m BertForSequenceClassification\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbert-base-uncased\u001b[39m\u001b[38;5;124m'\u001b[39m, num_labels\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(assignee_mapping))\n\u001b[0;32m--> 118\u001b[0m model\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[1;32m 120\u001b[0m \u001b[38;5;66;03m# Initialize optimizer\u001b[39;00m\n\u001b[1;32m 121\u001b[0m optimizer \u001b[38;5;241m=\u001b[39m AdamW(model\u001b[38;5;241m.\u001b[39mparameters(), lr\u001b[38;5;241m=\u001b[39mLEARNING_RATE)\n",
"File \u001b[0;32m~/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/transformers/modeling_utils.py:2179\u001b[0m, in \u001b[0;36mPreTrainedModel.to\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 2174\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 2175\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m`.to` is not supported for `4-bit` or `8-bit` bitsandbytes models. Please use the model as it is, since the\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2176\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m model has already been set to the correct devices and casted to the correct `dtype`.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2177\u001b[0m )\n\u001b[1;32m 2178\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 2179\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
"File \u001b[0;32m~/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/torch/nn/modules/module.py:1160\u001b[0m, in \u001b[0;36mModule.to\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1156\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m t\u001b[38;5;241m.\u001b[39mto(device, dtype \u001b[38;5;28;01mif\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_floating_point() \u001b[38;5;129;01mor\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_complex() \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1157\u001b[0m non_blocking, memory_format\u001b[38;5;241m=\u001b[39mconvert_to_format)\n\u001b[1;32m 1158\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m t\u001b[38;5;241m.\u001b[39mto(device, dtype \u001b[38;5;28;01mif\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_floating_point() \u001b[38;5;129;01mor\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_complex() \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m, non_blocking)\n\u001b[0;32m-> 1160\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_apply(convert)\n",
"File \u001b[0;32m~/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/torch/nn/modules/module.py:810\u001b[0m, in \u001b[0;36mModule._apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m 808\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[1;32m 809\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchildren():\n\u001b[0;32m--> 810\u001b[0m module\u001b[38;5;241m.\u001b[39m_apply(fn)\n\u001b[1;32m 812\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied):\n\u001b[1;32m 813\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39m_has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[1;32m 814\u001b[0m \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[1;32m 815\u001b[0m \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 820\u001b[0m \u001b[38;5;66;03m# global flag to let the user control whether they want the future\u001b[39;00m\n\u001b[1;32m 821\u001b[0m \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n",
"File \u001b[0;32m~/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/torch/nn/modules/module.py:810\u001b[0m, in \u001b[0;36mModule._apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m 808\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[1;32m 809\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchildren():\n\u001b[0;32m--> 810\u001b[0m module\u001b[38;5;241m.\u001b[39m_apply(fn)\n\u001b[1;32m 812\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied):\n\u001b[1;32m 813\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39m_has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[1;32m 814\u001b[0m \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[1;32m 815\u001b[0m \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 820\u001b[0m \u001b[38;5;66;03m# global flag to let the user control whether they want the future\u001b[39;00m\n\u001b[1;32m 821\u001b[0m \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n",
" \u001b[0;31m[... skipping similar frames: Module._apply at line 810 (4 times)]\u001b[0m\n",
"File \u001b[0;32m~/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/torch/nn/modules/module.py:810\u001b[0m, in \u001b[0;36mModule._apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m 808\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[1;32m 809\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchildren():\n\u001b[0;32m--> 810\u001b[0m module\u001b[38;5;241m.\u001b[39m_apply(fn)\n\u001b[1;32m 812\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied):\n\u001b[1;32m 813\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39m_has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[1;32m 814\u001b[0m \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[1;32m 815\u001b[0m \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 820\u001b[0m \u001b[38;5;66;03m# global flag to let the user control whether they want the future\u001b[39;00m\n\u001b[1;32m 821\u001b[0m \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n",
"File \u001b[0;32m~/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/torch/nn/modules/module.py:833\u001b[0m, in \u001b[0;36mModule._apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m 829\u001b[0m \u001b[38;5;66;03m# Tensors stored in modules are graph leaves, and we don't want to\u001b[39;00m\n\u001b[1;32m 830\u001b[0m \u001b[38;5;66;03m# track autograd history of `param_applied`, so we have to use\u001b[39;00m\n\u001b[1;32m 831\u001b[0m \u001b[38;5;66;03m# `with torch.no_grad():`\u001b[39;00m\n\u001b[1;32m 832\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m--> 833\u001b[0m param_applied \u001b[38;5;241m=\u001b[39m fn(param)\n\u001b[1;32m 834\u001b[0m should_use_set_data \u001b[38;5;241m=\u001b[39m compute_should_use_set_data(param, param_applied)\n\u001b[1;32m 835\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m should_use_set_data:\n",
"File \u001b[0;32m~/anaconda3/envs/bug-triaging-env/lib/python3.11/site-packages/torch/nn/modules/module.py:1158\u001b[0m, in \u001b[0;36mModule.to.<locals>.convert\u001b[0;34m(t)\u001b[0m\n\u001b[1;32m 1155\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m convert_to_format \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m t\u001b[38;5;241m.\u001b[39mdim() \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;241m4\u001b[39m, \u001b[38;5;241m5\u001b[39m):\n\u001b[1;32m 1156\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m t\u001b[38;5;241m.\u001b[39mto(device, dtype \u001b[38;5;28;01mif\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_floating_point() \u001b[38;5;129;01mor\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_complex() \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1157\u001b[0m non_blocking, memory_format\u001b[38;5;241m=\u001b[39mconvert_to_format)\n\u001b[0;32m-> 1158\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m t\u001b[38;5;241m.\u001b[39mto(device, dtype \u001b[38;5;28;01mif\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_floating_point() \u001b[38;5;129;01mor\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_complex() \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m, non_blocking)\n",
"\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacty of 10.76 GiB of which 17.69 MiB is free. Process 1977743 has 1.07 GiB memory in use. Process 1980219 has 1.07 GiB memory in use. Process 1922558 has 3.07 GiB memory in use. Including non-PyTorch memory, this process has 5.53 GiB memory in use. Of the allocated memory 4.67 GiB is allocated by PyTorch, and 55.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF"
]
}
],
"source": [
"import torch\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from transformers import BertTokenizer, BertForSequenceClassification, AdamW\n",
"import pandas as pd\n",
"from sklearn.metrics import accuracy_score\n",
"from tqdm import tqdm\n",
"import pandas as pd\n",
"from sklearn.metrics import accuracy_score\n",
"from tqdm import tqdm\n",
"import os\n",
"\n",
"# Hyperparameters\n",
"BATCH_SIZE = 32\n",
"LEARNING_RATE = 1e-5\n",
"EPOCHS = 3\n",
"MAX_LEN = 512\n",
"\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"2\"\n",
"path = os.getcwd()\n",
"\n",
"print(path)\n",
"\n",
"# Load dataset\n",
"train_df = pd.read_csv('/home/SA23-G2/bug-triaging/out/csv/issues_train_000001_170000.csv')\n",
"test_df = pd.read_csv('/home/SA23-G2/bug-triaging/out/csv/issues_test_170001_180000.csv')\n",
"\n",
"train_df = train_df.head(1000)\n",
"test_df = test_df.head(1000)\n",
"\n",
"# Combine train and test datasets\n",
"combined_df = pd.concat([train_df, test_df], ignore_index=True)\n",
"\n",
"# Create a dictionary to map unique assignee names to integer labels\n",
"assignee_mapping = {assignee: label for label, assignee in enumerate(combined_df['assignee'].unique())}\n",
"\n",
"# Update assignee labels in both train and test datasets\n",
"train_df['assignee'] = train_df['assignee'].map(assignee_mapping)\n",
"test_df['assignee'] = test_df['assignee'].map(assignee_mapping)\n",
"train_df = train_df.dropna(subset=['assignee']).copy()\n",
"test_df = test_df.dropna(subset=['assignee']).copy()\n",
"train_df.reset_index(drop=True, inplace=True)\n",
"test_df.reset_index(drop=True, inplace=True)\n",
"train_df['assignee'] = train_df['assignee'].astype(int)\n",
"test_df['assignee'] = test_df['assignee'].astype(int)\n",
"train_df.drop(columns=['dirty_body'], inplace=True)\n",
"test_df.drop(columns=['dirty_body'], inplace=True)\n",
"\n",
"# Initialize tokenizer\n",
"tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased')\n",
"\n",
"def calculate_max_seq_length(dataframe, tokenizer):\n",
" max_len = 0\n",
" for index, row in dataframe.iterrows():\n",
" title = str(row['title'])\n",
" body = str(row['body'])\n",
" tokens = tokenizer.encode(title, body, add_special_tokens=True)\n",
" max_len = max(max_len, len(tokens))\n",
" return max_len\n",
"\n",
"# Calculate max sequence length for both training and test datasets\n",
"#max_len_train = calculate_max_seq_length(train_df, tokenizer)\n",
"#max_len_test = calculate_max_seq_length(test_df, tokenizer)\n",
"#print(f\"Max sequence length in training dataset: {max_len_train}\")\n",
"#print(f\"Max sequence length in test dataset: {max_len_test}\")\n",
"\n",
"# Custom dataset class\n",
"class CustomDataset(Dataset):\n",
" def __init__(self, dataframe, tokenizer, max_len):\n",
" self.tokenizer = tokenizer\n",
" self.data = dataframe\n",
" self.title = dataframe.title\n",
" self.body = dataframe.body\n",
" self.targets = dataframe.assignee\n",
" self.max_len = max_len\n",
"\n",
" def __len__(self):\n",
" return len(self.title)\n",
"\n",
" def __getitem__(self, index):\n",
" title = str(self.title[index])\n",
" body = str(self.body[index])\n",
" inputs = self.tokenizer.encode_plus(\n",
" \"TITLE_START\" + title + \"BODY_START\" + body,\n",
" add_special_tokens=True,\n",
" max_length=self.max_len,\n",
" padding='max_length',\n",
" return_token_type_ids=True,\n",
" truncation=True\n",
" )\n",
" ids = inputs['input_ids']\n",
" mask = inputs['attention_mask']\n",
"\n",
" return {\n",
" 'ids': torch.tensor(ids, dtype=torch.long),\n",
" 'mask': torch.tensor(mask, dtype=torch.long),\n",
" 'targets': torch.tensor(self.targets[index], dtype=torch.long)\n",
" }\n",
"\n",
"# Check if CUDA is available\n",
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
"print(device)\n",
"\n",
"# Create datasets with tqdm progress bar\n",
"with tqdm(total=len(train_df), desc=\"Tokenizing Training Data\") as pbar:\n",
" train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)\n",
" pbar.update(len(train_df))\n",
"\n",
"with tqdm(total=len(test_df), desc=\"Tokenizing Test Data\") as pbar:\n",
" test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)\n",
" pbar.update(len(test_df))\n",
"\n",
"# Create data loaders\n",
"train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)\n",
"test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)\n",
"\n",
"# Load model\n",
"model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(assignee_mapping))\n",
"model.to(device)\n",
"\n",
"# Initialize optimizer\n",
"optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)\n",
"\n",
"# Training loop with tqdm progress bar for epochs\n",
"for epoch in range(EPOCHS):\n",
" model.train()\n",
" progress_bar = tqdm(train_loader, desc=f\"Epoch {epoch + 1}/{EPOCHS}\")\n",
"\n",
" for batch in progress_bar:\n",
" ids = batch['ids'].to(device)\n",
" mask = batch['mask'].to(device)\n",
" targets = batch['targets'].to(device)\n",
"\n",
" outputs = model(ids, attention_mask=mask, labels=targets)\n",
" loss = outputs.loss\n",
" loss.backward()\n",
" optimizer.step()\n",
" optimizer.zero_grad()\n",
"\n",
" progress_bar.set_postfix({'Loss': f'{loss.item():.4f}'})\n",
"\n",
" # Evaluation\n",
" model.eval()\n",
" predictions = []\n",
" true_labels = []\n",
"\n",
" with torch.no_grad():\n",
" for batch in tqdm(test_loader, desc=\"Evaluating\"):\n",
" ids = batch['ids'].to(device)\n",
" mask = batch['mask'].to(device)\n",
" targets = batch['targets'].to(device)\n",
"\n",
" outputs = model(ids, attention_mask=mask)\n",
" predictions.extend(torch.argmax(outputs.logits, 1).cpu().numpy())\n",
" true_labels.extend(targets.cpu().numpy())\n",
"\n",
" accuracy = accuracy_score(true_labels, predictions)\n",
" print(f'\\nEpoch: {epoch + 1}, Accuracy: {accuracy:.4f}')\n",
"\n",
"# Save model\n",
"# torch.save(model.state_dict(), 'model.pth')\n",
"# print('Model saved to model.pth')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8Inp7tF27nXX",
"outputId": "190ab7cd-3e21-44da-d97f-4b71abbc3cec"
},
"outputs": [],
"source": [
"# Reversing the assignee_mapping dictionary\n",
"reverse_assignee_mapping = {v: k for k, v in assignee_mapping.items()}\n",
"\n",
"# Get unique labels from true_labels and predictions\n",
"unique_labels = np.unique(true_labels + predictions)\n",
"\n",
"# Convert numerical labels to names\n",
"target_names = [reverse_assignee_mapping[label] for label in unique_labels]\n",
"\n",
"print(classification_report(true_labels, predictions, target_names=target_names, labels=unique_labels))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "vOBhgOm29II_",
"outputId": "883d43a3-c248-4855-9644-4a10a3ca5234",
"tags": []
},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc\n",
"from sklearn.preprocessing import label_binarize\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from itertools import cycle\n",
"\n",
"# Assuming predictions and true_labels lists are already filled\n",
"unique_labels = np.unique(true_labels + predictions)\n",
"\n",
"# 1. Accuracy\n",
"accuracy = accuracy_score(true_labels, predictions)\n",
"print(f\"Accuracy: {accuracy:.4f}\")\n",
"\n",
"# 2. Precision\n",
"precision = precision_score(true_labels, predictions, average='weighted')\n",
"print(f\"Precision: {precision:.4f}\")\n",
"\n",
"# 3. Recall\n",
"recall = recall_score(true_labels, predictions, average='weighted')\n",
"print(f\"Recall: {recall:.4f}\")\n",
"\n",
"# 4. F1-Score\n",
"f1 = f1_score(true_labels, predictions, average='weighted')\n",
"print(f\"F1-Score: {f1:.4f}\")\n",
"\n",
"# 5. Confusion Matrix\n",
"cm = confusion_matrix(true_labels, predictions)\n",
"plt.figure(figsize=(10, 7))\n",
"sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', xticklabels=unique_labels, yticklabels=unique_labels)\n",
"plt.xlabel('Predicted')\n",
"plt.ylabel('True')\n",
"plt.title('Confusion Matrix')\n",
"plt.show()\n",
"\n",
"# 6. ROC Curve and AUC\n",
"# Binarize the labels\n",
"binarized_true_labels = label_binarize(true_labels, classes=unique_labels)\n",
"binarized_predictions = label_binarize(predictions, classes=unique_labels)\n",
"\n",
"n_classes = binarized_true_labels.shape[1]\n",
"\n",
"# Compute ROC curve and ROC area for each class\n",
"fpr = dict()\n",
"tpr = dict()\n",
"roc_auc = dict()\n",
"for i in range(n_classes):\n",
" fpr[i], tpr[i], _ = roc_curve(binarized_true_labels[:, i], binarized_predictions[:, i])\n",
" roc_auc[i] = auc(fpr[i], tpr[i])\n",
"\n",
"# Plot all ROC curves\n",
"plt.figure(figsize=(10, 7))\n",
"for i, color in zip(range(n_classes), cycle(['aqua', 'darkorange', 'cornflowerblue'])):\n",
" plt.plot(fpr[i], tpr[i], color=color, lw=2,\n",
" label='ROC curve of class {0} (area = {1:0.2f})'\n",
" ''.format(i, roc_auc[i]))\n",
"\n",
"plt.plot([0, 1], [0, 1], 'k--', lw=2)\n",
"plt.xlim([0.0, 1.0])\n",
"plt.ylim([0.0, 1.05])\n",
"plt.xlabel('False Positive Rate')\n",
"plt.ylabel('True Positive Rate')\n",
"plt.title('Receiver Operating Characteristic to Multi-Class')\n",
"plt.legend(loc=\"lower right\")\n",
"plt.show()\n"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "A100",
"machine_shape": "hm",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View file

106
src/modelimpl/auc.py Normal file
View file

@ -0,0 +1,106 @@
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import tqdm
from sklearn.metrics import RocCurveDisplay, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
from .classifier import Classifier
from .dataset import Labelling, SplitData, load_df
from .evaluate import predict
from .load import load_model
from .torch_dataset import Dataset
def predict_proba(model: Classifier, test: Dataset, n_classes: int, force_cpu: bool) -> np.ndarray:
test_output = predict(model, test, -1, force_cpu)
out = np.zeros((len(test.texts), n_classes))  # zero-initialised: classes missing from the predictions keep probability 0
for i, res in enumerate(tqdm.tqdm(test_output, desc="Predicting outputs for ROC/AUC")):
for c, pred_class in enumerate(res.top_indices):
if pred_class >= 0:
pred_prob = res.top_values[c]
out[i, pred_class] = pred_prob
return out
def compute_auc_roc(model: Classifier, test_data: SplitData, n_classes, labelling: Labelling, force_cpu: bool,
out_prefix: str):
# Encode classes in one-hot encoding for one-vs-rest AUC
lb = LabelBinarizer()
lb.fit(range(n_classes))
y_onehot = lb.transform(test_data.labels)
y_score = predict_proba(model, Dataset(test_data), n_classes, force_cpu)
# Color map for classes
colormap = mpl.colormaps['Spectral'].resampled(n_classes)
fig, ax = plt.subplots(figsize=(14, 14))
for assignee, class_n in tqdm.tqdm(labelling.labels.items(), desc="Computing ROC curves"):
if 0 <= class_n < n_classes:
lb_class = np.flatnonzero(lb.classes_ == class_n)[0]
y_true = y_onehot[:, lb_class]
if len(np.flatnonzero(y_true)) > 0: # if this class is in the test set
RocCurveDisplay.from_predictions(
y_true,
y_score[:, lb_class],
ax=ax,
name=f"{assignee} ({class_n})",
color=colormap(class_n)
)
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("One-vs-Rest ROC curves")
plt.legend()
plt.savefig(out_prefix + ".ovr_curves.png")
fig, ax = plt.subplots(figsize=(7, 7))
RocCurveDisplay.from_predictions(
y_onehot.ravel(),
y_score.ravel(),
ax=ax,
name="micro-average OvR",
color="darkorange",
)
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Micro-averaged One-vs-Rest\nReceiver Operating Characteristic")
ax.get_legend().remove()
plt.savefig(out_prefix + ".ovr_avg.png")
micro_roc_auc_ovr = roc_auc_score(
y_onehot,
y_score,
multi_class="ovr",
average="micro",
)
message = f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.4f}"
with open(out_prefix + ".auc.txt", "w") as f:
f.write(message)
print(message)
def build_curve(path: str, force_cpu: bool):
if not path.endswith('.pt'):
raise ValueError("path should point to a pytorch model file")
pretrained, model, classes = load_model(path, None, force_cpu, False)
if not pretrained:
raise FileNotFoundError("Trained model is needed to run predict script")
out_prefix = path[:-3]
csv_path = out_prefix + '.labels.csv'
labelling = Labelling.load(csv_path)
test_data = SplitData.from_df(load_df('issues_test_170001_180000.csv'), labelling, classes)
compute_auc_roc(model, test_data, classes, labelling, force_cpu, out_prefix)

8
src/modelimpl/classifier.py Normal file
View file

@ -0,0 +1,8 @@
from transformers import BertForSequenceClassification
Classifier = BertForSequenceClassification
def bert_classifier(n_classes: int) -> Classifier:
return BertForSequenceClassification \
.from_pretrained('bert-base-uncased', num_labels=n_classes)

148
src/modelimpl/dataset.py Normal file
View file

@ -0,0 +1,148 @@
import os.path
from dataclasses import dataclass
from typing import Optional
import numpy as np
import pandas as pd
from transformers import DistilBertTokenizer
from transformers.tokenization_utils_base import BatchEncoding
DIR: str = os.path.join(os.path.dirname(__file__), '..', '..', 'out', 'csv')
OUT_COLUMN: str = 'assignee'
IN_COLUMNS: list[str] = ['title', 'body']
IN_JOINED_COLUMN: str = 'title_body'
VALIDATION_PERC = 0.1
def prepare_input(df: pd.DataFrame) -> pd.DataFrame:
df[IN_JOINED_COLUMN] = df[IN_COLUMNS].apply(lambda row: '\n'.join(row.values.astype(str)), axis=1)
return df.loc[:, [IN_JOINED_COLUMN, OUT_COLUMN]]
def load_df(csv_filename: str) -> pd.DataFrame:
df: pd.DataFrame = pd.read_csv(os.path.join(DIR, csv_filename))
df = df.set_index('id', drop=True)
return prepare_input(df)
def compute_labels(frames: list[pd.DataFrame]) -> tuple[dict[str, int], list[int]]:
n: int = 0
labels_dict: dict[str, int] = {}
num_bounds: list[int] = [0]
for frame in frames:
labels: list[str] = frame[OUT_COLUMN] \
.drop_duplicates(keep='first') \
.sort_values() \
.to_list()
for label in labels:
if label not in labels_dict:
labels_dict[label] = n
n += 1
num_bounds.append(n)
return labels_dict, num_bounds
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', return_token_type_ids=False)
@dataclass
class Labelling:
labels: dict[str, int]
labels_rev: dict[int, str]
def __init__(self, labels: dict[str, int]):
self.labels = labels
self.labels_rev = {v: k for k, v in self.labels.items()}
self.labels_rev[-2] = 'Unassigned'
def save(self, filename: str):
df = pd.DataFrame(list(self.labels.items()), columns=['token', 'label'])
df.to_csv(filename, index=False)
@staticmethod
def load(filename: str) -> 'Labelling':
df = pd.read_csv(filename)
labels: dict[str, int] = {}
for _, row in df.iterrows():
labels[str(row['token'])] = int(row['label'])
return Labelling(labels)
@dataclass
class SplitData:
index: dict[int, int]
labels: list[int]
texts: list[BatchEncoding]
@staticmethod
def from_df(df: pd.DataFrame, labels: Labelling, label_threshold: int) -> 'SplitData':
index = {e: i for i, e in enumerate(df.index.tolist())}
# -2 marks an unassigned issue; labels at or above the threshold become -1 (not predictable by the classifier)
label_nums = [-2 if label is None else labels.labels[label] for label in df[OUT_COLUMN]]
label_nums = [-1 if num >= label_threshold else num for num in label_nums]
texts = [tokenizer(text, padding='max_length', max_length=512, truncation=True,
return_tensors='pt') for text in df[IN_JOINED_COLUMN]]
return SplitData(index, label_nums, texts)
def __init__(self, index: dict[int, int], labels: list[int], texts: list[BatchEncoding]):
self.index = index
self.labels = labels
self.texts = texts
def __len__(self) -> int:
return len(self.texts)
def only_issue(self, issue_id: int) -> Optional['SplitData']:
if issue_id not in self.index:
return None
i = self.index[issue_id]
label = self.labels[i]
text = self.texts[i]
return SplitData({issue_id: 0}, [label], [text])
def df_validation_split(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
df_train, df_val = np.split(df, [int((1 - VALIDATION_PERC) * len(df))])
return df_train, df_val
@dataclass
class Datasets:
train: SplitData
validation: SplitData
test: SplitData
classifier_label_range: tuple[int, int]
labelling: Labelling
def __init__(self, kind: str):
if kind == 'all':
df = load_df('issues_train_000001_170000.csv')
elif kind == 'recent':
df = load_df('issues_train_recent_150000_170000.csv')
else:
raise ValueError('kind must be one of \'all\' or \'recent\'')
df_test: pd.DataFrame = load_df('issues_test_170001_180000.csv')
df_train, df_val = df_validation_split(df)
labels, splits = compute_labels([df_train, df_val, df_test])
self.labelling = Labelling(labels)
self.classifier_label_range = (splits[0], splits[1])
self.train = SplitData.from_df(df_train, self.labelling, splits[1])
# Remove unknown labels from validation set
df_val['label_num'] = df_val[OUT_COLUMN].apply(lambda ass: self.labelling.labels[ass])
df_val = df_val[df_val.label_num < self.classifier_label_range[1]]
self.validation = SplitData.from_df(df_val, self.labelling, splits[1])
self.test = SplitData.from_df(df_test, self.labelling, splits[1])
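
Labelling persists the assignee-to-integer mapping next to the model so that later prediction runs can reproduce it. A minimal round-trip sketch, assuming src/ is on the import path (file path invented):

from modelimpl.dataset import Labelling

labelling = Labelling({'alexdima': 0, 'jrieken': 1})
labelling.save('/tmp/example.labels.csv')
restored = Labelling.load('/tmp/example.labels.csv')
assert restored.labels == labelling.labels
assert restored.labels_rev[1] == 'jrieken'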

90
src/modelimpl/evaluate.py Normal file
View file

@ -0,0 +1,90 @@
from dataclasses import dataclass
from typing import Optional
import torch
import tqdm
from torch.utils import data
from .classifier import Classifier
from .dataset import SplitData
from .torch_dataset import Dataset
@dataclass
class PredictionResult:
top_values: list[float]
top_indices: list[int]
truth_idx: Optional[int]
def __init__(self, top_values: list[float], top_indices: list[int], truth_idx: Optional[int]):
self.top_values = top_values
self.top_indices = top_indices
self.truth_idx = truth_idx
def __len__(self) -> int:
return len(self.top_values)
def predict(model: Classifier, test: Dataset, top_n: int, force_cpu: bool) -> list[PredictionResult]:
batch_size = 16
test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size)
use_cuda = torch.cuda.is_available() and not force_cpu
device = torch.device("cuda" if use_cuda and not force_cpu else "cpu")
if use_cuda:
model = model.cuda()
res = []
with torch.no_grad():
for test_x, test_y in tqdm.tqdm(test_dataloader, desc="Test"):
test_y = test_y.to(device)
mask = test_x['attention_mask'].to(device)
input_id = test_x['input_ids'].squeeze(1).to(device)
output = model(input_id, mask)
norm_output = torch.softmax(output.logits, dim=1)
dim_n = norm_output.size(dim=1)
top_n = min(dim_n if top_n == -1 else top_n, dim_n)
top = torch.topk(norm_output, top_n, dim=1)
for i in range(top.values.size(dim=0)):
res.append(PredictionResult(top.values[i, :].tolist(), top.indices[i, :].tolist(), test_y[i].item()))
return res
def evaluate(model, test: Dataset, force_cpu: bool, top_k: int = 5):
test_output = predict(model, test, top_k, force_cpu)
if len(test_output) > 0:
top_k = min(top_k, len(test_output[0].top_indices))
accuracies = [0] * top_k
for res in test_output:
for i, index in enumerate(res.top_indices):
if index == res.truth_idx:
for j in range(i, len(accuracies)):
accuracies[j] += 1
for i, acc in enumerate(accuracies):
print(f'Test Accuracy for {i + 1} recommendations: {acc / len(test.texts): .4f}')
def predict_top_k(model: Classifier, test_data: SplitData, issue_id: int, top_n: int,
force_cpu: bool) -> PredictionResult:
issue_data = test_data.only_issue(issue_id)
if issue_data is None:
raise ValueError("Issue id {0} is not present as an issue in the test set".format(issue_id))
issue_dataset = Dataset(issue_data)
result = predict(model, issue_dataset, top_n, force_cpu)
assert len(result) == 1
return result[0]
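
evaluate computes cumulative top-k accuracy: a truth found at rank i counts as a hit for every k >= i + 1, so the printed accuracies never decrease with the number of recommendations. A standalone illustration of the accumulation (invented data):

accuracies = [0] * 3
top_indices, truth = [7, 4, 9], 4  # truth is the second recommendation
for i, index in enumerate(top_indices):
    if index == truth:
        for j in range(i, len(accuracies)):
            accuracies[j] += 1
print(accuracies)  # [0, 1, 1] -> a hit for k >= 2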

58
src/modelimpl/load.py Normal file
View file

@ -0,0 +1,58 @@
import os
from typing import Optional
import numpy as np
import torch
from .classifier import bert_classifier, Classifier
OUT_DIR = os.path.join(os.path.dirname(__file__), '..', '..', 'out', 'model')
def get_model_path(dataset_kind: str, epochs: int, learning_rate: float, suffix_ext: str) -> str:
filename = 'bug_triaging_{0}_{1}e_{2}lr_final.{3}'.format(
dataset_kind,
str(epochs),
str(learning_rate).replace('.', '_'),
suffix_ext
)
return os.path.join(OUT_DIR, filename)
def load_model(path: str, label_range: Optional[tuple[int, int]], force_cpu: bool,
force_retrain: bool) -> tuple[bool, Classifier, int]:
if not path.endswith('.pt'):
raise ValueError("path should point to a pytorch model file")
label_range_path = path[:-3] + '.label_range.txt'
np.random.seed(0)
use_gpu = torch.cuda.is_available() and not force_cpu
if use_gpu:
print('Using device #', torch.cuda.current_device())
else:
print('CUDA is not available! Working on CPU...')
if label_range is None:
with open(label_range_path, "r") as f:
start_range = int(f.readline())
end_range = int(f.readline())
else:
start_range = label_range[0]
end_range = label_range[1]
classes = end_range - start_range
model = bert_classifier(classes)
if os.path.isfile(path) and not force_retrain:
print('Using already trained model')
if use_gpu:
model.load_state_dict(torch.load(path))
else:
model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))
model.eval()
return True, model, classes
else:
return False, model, classes
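
For reference, get_model_path only rewrites decimal points, and str(1e-06) already contains no dot, so (assuming src/ is on the import path):

from modelimpl.load import get_model_path

print(get_model_path('recent', 10, 1e-06, 'pt'))
# .../out/model/bug_triaging_recent_10e_1e-06lr_final.pt
print(get_model_path('all', 10, 0.5, 'pt'))
# .../out/model/bug_triaging_all_10e_0_5lr_final.pt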

27
src/modelimpl/torch_dataset.py Normal file
View file

@ -0,0 +1,27 @@
import numpy as np
from torch.utils import data
from .dataset import SplitData
class Dataset(data.Dataset):
def __init__(self, split_data: SplitData):
self.labels = split_data.labels
self.texts = split_data.texts
def classes(self):
return self.labels
def __len__(self):
return len(self.labels)
def get_batch_labels(self, idx):
return np.array(self.labels[idx])
def get_batch_texts(self, idx):
return self.texts[idx]
def __getitem__(self, idx):
batch_texts = self.get_batch_texts(idx)
batch_y = self.get_batch_labels(idx)
return batch_texts, batch_y

93
src/modelimpl/train.py Normal file
View file

@ -0,0 +1,93 @@
import torch
from torch import nn
from torch.optim import Adam
from torch.utils import data
from tqdm import tqdm
from .torch_dataset import Dataset
def print_message(epoch_num: int, train_loss: float, train_acc: float, train_ds: Dataset, val_loss: float,
val_acc: float, val_ds: Dataset):
messages = [
f'Epochs: {epoch_num + 1}',
f'Train Loss: {train_loss / len(train_ds.texts): .3f}',
f'Train Accuracy: {train_acc / len(train_ds.texts): .3f}',
f'Val Loss: {val_loss / len(val_ds.texts): .3f}',
f'Val Accuracy: {val_acc / len(val_ds.texts): .3f}'
]
print(' | '.join(messages))
def compute_loss_and_acc(label, input_data, device, model) -> tuple[float, float, any]:
label = label.to(device)
mask = input_data['attention_mask'].to(device)
input_id = input_data['input_ids'].squeeze(1).to(device)
output = model(input_id, attention_mask=mask, labels=label)
batch_loss = output.loss
acc = (torch.argmax(output.logits, 1) == label).sum().item()
return batch_loss.item(), acc, batch_loss
def train(model, train_ds: Dataset, val_ds: Dataset, learning_rate: float, epochs: int, force_cpu: bool):
batch_size = 16
train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_ds, batch_size=batch_size)
use_cuda = torch.cuda.is_available() and not force_cpu
device = torch.device("cuda" if use_cuda and not force_cpu else "cpu")
if use_cuda:
model = model.cuda()
optimizer = Adam(model.parameters(), lr=learning_rate)
for epoch_num in range(epochs):
total_acc_train = 0
total_loss_train = 0
train_dl = tqdm(train_dataloader, desc=f"Train E{epoch_num + 1}")
i = 0
for train_input, train_label in train_dl:
delta_loss, delta_acc, batch_loss = compute_loss_and_acc(train_label, train_input, device, model)
total_loss_train += delta_loss
total_acc_train += delta_acc
batch_loss.backward()
optimizer.step()
model.zero_grad()
i += train_label.size(dim=0)
loss_avg = total_loss_train / i
acc_avg = total_acc_train / i
train_dl.set_description(f"Train E{epoch_num + 1} loss={loss_avg:.6f} acc={acc_avg:.4f}")
total_acc_val = 0
total_loss_val = 0
val_dl = tqdm(val_dataloader, desc="Val E" + str(epoch_num + 1))
with torch.no_grad():
i = 0
for val_input, val_label in val_dl:
delta_loss, delta_acc, batch_loss = compute_loss_and_acc(val_label, val_input, device, model)
total_loss_val += delta_loss
total_acc_val += delta_acc
i += val_label.size(dim=0)
loss_avg = total_loss_val / i
acc_avg = total_acc_val / i
val_dl.set_description(f"Val E{epoch_num + 1} loss={loss_avg:.6f} acc={acc_avg:.4f}")
print_message(epoch_num, total_loss_train, total_acc_train, train_ds, total_loss_val, total_acc_val, val_ds)
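
A sketch of how these modules compose into a training run (hyperparameters are illustrative; the actual training entry point is not shown in this diff):

from modelimpl.classifier import bert_classifier
from modelimpl.dataset import Datasets
from modelimpl.torch_dataset import Dataset
from modelimpl.train import train

datasets = Datasets('recent')
n_classes = datasets.classifier_label_range[1] - datasets.classifier_label_range[0]
model = bert_classifier(n_classes)
train(model, Dataset(datasets.train), Dataset(datasets.validation),
      learning_rate=1e-6, epochs=10, force_cpu=False)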

77
src/runmodel.py Executable file
View file

@ -0,0 +1,77 @@
#!/usr/bin/env python
import argparse
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import torch
from dotenv import load_dotenv
from cleaner.clean import read_issue_obj
from cleaner.dataframe import build_df
from modelimpl.dataset import SplitData, prepare_input, Labelling
from modelimpl.evaluate import predict_top_k
from modelimpl.load import load_model
from scraper.download import download_issue, download_commit_activity
load_dotenv()
TOKEN = os.getenv('GITHUB_TOKEN')
def main(path: str, issue_id: int, force_cpu: bool, top: int):
stats = download_commit_activity(TOKEN)
if not path.endswith('.pt'):
raise ValueError("path should point to a pytorch model file")
pretrained, model, classes = load_model(path, None, force_cpu, False)
csv_path = path[:-3] + '.labels.csv'
labelling = Labelling.load(csv_path)
if not pretrained:
raise FileNotFoundError("Trained model is needed to run predict script")
issue_json = download_issue(issue_id, TOKEN)
issue_clean = read_issue_obj(issue_json, enable_filter=False)
if issue_clean is None:
raise ValueError("Issue does not contain latin characters in title or body, cannot classify")
issue_df = prepare_input(build_df([issue_clean]))
issue_dataset = SplitData.from_df(issue_df, labelling, classes)
res = predict_top_k(model, issue_dataset, issue_id, top, force_cpu)
for i in range(len(res)):
value = res.top_values[i]
idx = res.top_indices[i]
assignee = labelling.labels_rev[idx]
print("{0}: '{1}' ({3}) (confidence: {2:.2f}%) ({4} commits authored)"
.format(i + 1, assignee, value * 100, idx, stats[assignee]))
if res.truth_idx != -2:
truth = labelling.labels_rev[res.truth_idx]
print("Truth: '{0}' ({1}) ({2} commits authored)".format(truth, res.truth_idx, stats[truth]))
else:
print("Issue is unassigned on GitHub")
if __name__ == '__main__':
assert torch is not None # make sure pytorch is imported and loaded with correct CUDA env variable
parser = argparse.ArgumentParser(prog='runmodel.py',
description='Model execution script. Downloads a given issue id from the '
'microsoft/vscode repository, performs the cleaning process and '
'recommends an assignee using the given model. The script may fail if '
'the issue title and body do not contain any latin characters.')
parser.add_argument('modelfile', type=str, help="Path to the pickled pytorch model to classify the issue with")
parser.add_argument('issue_id', type=int, help="The microsoft/vscode GitHub issue id to classify")
parser.add_argument('-t', '--top', type=int, default=5, help="Number of recommendations to output")
parser.add_argument('-c', '--force-cpu', action='store_true', help="disables CUDA support. Useful when debugging")
args = parser.parse_args()
main(args.modelfile, args.issue_id, args.force_cpu, args.top)
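
For reference, a hypothetical invocation (the model path is illustrative; the issue id is the one exercised in the test suite):

# equivalent to: python src/runmodel.py out/model/some_model.pt 192213 --top 5
main("out/model/some_model.pt", 192213, force_cpu=True, top=5)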

35
src/scrape.py Executable file
View file

@ -0,0 +1,35 @@
#!/usr/bin/env python3
import os
import tarfile
import time
from dotenv import load_dotenv
from scraper.download import download_page
load_dotenv()
TOKEN = os.getenv('GITHUB_TOKEN')
RATE_LIMIT_HR = 5000
OUT_DIR = os.path.join(os.path.dirname(__file__), '..', 'out', 'json')
OUT_ARCHIVE = os.path.join(OUT_DIR, 'issues.tar.gz')
def main():
if not os.path.isdir(OUT_DIR):
os.makedirs(OUT_DIR)
elif os.path.isfile(OUT_ARCHIVE):
os.remove(OUT_ARCHIVE)
with tarfile.open(OUT_ARCHIVE, "w:gz") as tar:
page = 1
more = True
while more:
more = download_page(page, TOKEN, tar)
page += 1
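            # pace requests: 3600 s / 5000 = 0.72 s between calls stays under the hourly rate limit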
time.sleep(3600 / RATE_LIMIT_HR)
if __name__ == '__main__':
main()

0
src/scraper/__init__.py Normal file
View file

86
src/scraper/download.py Normal file
View file

@ -0,0 +1,86 @@
import http.client
import io
import json
import os
from collections import defaultdict
from tarfile import TarFile, TarInfo
from urllib.parse import urlencode
import tqdm
from dotenv import load_dotenv
load_dotenv()
GITHUB_JSON_MIME: str = 'application/vnd.github+json'
TOKEN_PREFIX: str = 'Bearer '
GITHUB_API: str = 'api.github.com'
REPO_ENDPOINT: str = '/repos/microsoft/vscode'
ISSUE_ENDPOINT: str = REPO_ENDPOINT + '/issues'
JSON_OUT_DIR: str = os.path.join(os.path.dirname(__file__), '..', 'out')
def get_res_body_or_fail(connection: http.client.HTTPSConnection) -> any:
res = connection.getresponse()
if res.status // 100 != 2:
        raise IOError("Response status from GitHub is " + str(res.status) + "\n" + res.read().decode())
res_text = res.read().decode()
return json.loads(res_text)
def download_page(page: int, token: str, tar_file: TarFile) -> bool:
if page < 1:
raise ValueError("page must be >= 1")
per_page = 100
total_estimate = 200000 # rough estimate of the number of issues to download
connection = http.client.HTTPSConnection(GITHUB_API)
headers = {'Accept': GITHUB_JSON_MIME,
'User-Agent': 'usi-msde-2023-soft-analytics-bug-triaging-g2',
'Authorization': TOKEN_PREFIX + token,
'X-GitHub-Api-Version': '2022-11-28'}
query = {'state': 'closed', 'page': str(page), 'per_page': str(per_page)}
connection.request('GET', ISSUE_ENDPOINT + '?' + urlencode(query),
headers=headers)
res_json = get_res_body_or_fail(connection)
for issue in tqdm.tqdm(res_json, desc="Downloading page " + str(page), initial=(page - 1) * per_page,
total=total_estimate):
issue_id: int = issue['id']
filename: str = os.path.join(JSON_OUT_DIR, str(issue_id) + '.json')
        contents: bytes = json.dumps(issue).encode()
        tar_info = TarInfo(name=filename)
        tar_info.size = len(contents)
        file_object = io.BytesIO(contents)
tar_file.addfile(tar_info, fileobj=file_object)
return len(res_json) > 0
def download_issue(issue_id: int, token: str) -> dict[str, any]:
connection = http.client.HTTPSConnection(GITHUB_API)
headers = {'Accept': GITHUB_JSON_MIME,
'User-Agent': 'usi-msde-2023-soft-analytics-bug-triaging-g2',
'Authorization': TOKEN_PREFIX + token,
'X-GitHub-Api-Version': '2022-11-28'}
connection.request('GET', ISSUE_ENDPOINT + '/' + str(issue_id), headers=headers)
return get_res_body_or_fail(connection)
def download_commit_activity(token: str) -> defaultdict[str, int]:
connection = http.client.HTTPSConnection(GITHUB_API)
headers = {'Accept': GITHUB_JSON_MIME,
'User-Agent': 'usi-msde-2023-soft-analytics-bug-triaging-g2',
'Authorization': TOKEN_PREFIX + token,
'X-GitHub-Api-Version': '2022-11-28'}
connection.request('GET', REPO_ENDPOINT + '/stats/contributors', headers=headers)
res_obj = get_res_body_or_fail(connection)
ret: defaultdict[str, int] = defaultdict(int)
for obj in res_obj:
ret[str(obj["author"]["login"])] = int(obj["total"])
return ret
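
A small sketch of reading the archive written by scrape.py back into memory (assuming the default out/json location):

import json
import tarfile

with tarfile.open("out/json/issues.tar.gz", "r:gz") as tar:
    for member in tar.getmembers():
        issue = json.load(tar.extractfile(member))  # one issue per member, as written by download_page
        print(issue["id"], issue.get("title"))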

66
src/trainmodel.py Executable file
View file

@ -0,0 +1,66 @@
#!/usr/bin/env python
import argparse
import os
from modelimpl.dataset import Datasets
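# select GPU 2 before torch is imported: CUDA_VISIBLE_DEVICES is read when CUDA initializes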
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import torch
from modelimpl.evaluate import evaluate
from modelimpl.load import load_model, get_model_path
from modelimpl.torch_dataset import Dataset
from modelimpl.torch_train import train
OUT_DIR = os.path.join(os.path.dirname(__file__), '..', 'out', 'model')
def main(dataset_kind: str, epochs: int, learning_rate: float, force_cpu: bool, force_retrain: bool):
datasets = Datasets(dataset_kind)
print('Training for dataset kind:', dataset_kind)
print('Train set instance size: ', len(datasets.train))
print('Validation set instance size: ', len(datasets.validation))
print('Test set instance size: ', len(datasets.test))
path = get_model_path(dataset_kind, epochs, learning_rate, 'pt')
pretrained, model, _ = load_model(path, datasets.classifier_label_range, force_cpu, force_retrain)
if pretrained:
print('Using already trained model')
else:
if not os.path.isdir(OUT_DIR):
os.makedirs(OUT_DIR)
print('Training model then saving in ' + path)
train(model, Dataset(datasets.train), Dataset(datasets.validation), learning_rate, epochs, force_cpu)
torch.save(model.state_dict(), path)
datasets.labelling.save(get_model_path(dataset_kind, epochs, learning_rate, 'labels.csv'))
with open(get_model_path(dataset_kind, epochs, learning_rate, 'label_range.txt'), "w") as f:
f.writelines([str(x) + "\n" for x in datasets.classifier_label_range])
evaluate(model, Dataset(datasets.test), force_cpu)
if __name__ == '__main__':
assert torch is not None # make sure pytorch is imported and loaded with correct CUDA env variable
parser = argparse.ArgumentParser(prog='trainmodel.py',
description='Training and evaluation script. The script will train and save the '
'obtained model and then perform test set evaluation. If the given '
'parameters match with a model that was already saved, the script '
'only runs the evaluation procedure.')
parser.add_argument('dataset', choices=['all', 'recent'], type=str, help="The dataset to train with")
parser.add_argument('epochs', type=int, help="Number of epochs of the training process")
parser.add_argument('-r', '--learning-rate', type=float, default=1e-6,
help="The learning rate fed in the Adam optimizer")
parser.add_argument('-c', '--force-cpu', action='store_true',
help="disables CUDA support. Useful when debugging")
parser.add_argument('-f', '--force-retraining', action='store_true',
help="forces training of a new model even if a matching model is already found within the "
"saved models")
args = parser.parse_args()
main(args.dataset, args.epochs, args.learning_rate, args.force_cpu, args.force_retraining)
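
For reference, a hypothetical run matching the defaults above:

# equivalent to: python src/trainmodel.py recent 10 --learning-rate 1e-6
main('recent', epochs=10, learning_rate=1e-6, force_cpu=False, force_retrain=False)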

6
tests/.gitignore vendored Normal file
View file

@ -0,0 +1,6 @@
*
**/*
!.gitignore
!__main__.py
!test_*.py

4
tests/__main__.py Normal file
View file

@ -0,0 +1,4 @@
import pytest
if __name__ == "__main__":
pytest.main()
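
With this entry point the suite can be launched as a module (a usage note; assumes pytest is installed and the repository root is the working directory):

# python -m tests     # runs pytest via this __main__
# pytest tests/       # equivalent direct invocation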

View file

@ -0,0 +1,83 @@
import io
import json
import os
import tarfile
import pandas as pd
import pytest
from src.cleaner.clean import clean_all, save_set
@pytest.fixture
def sample_tar_file(tmp_path):
# Create a sample tar file for testing
tar_file_path = os.path.join(tmp_path, "sample_issues.tar.gz")
with tarfile.open(tar_file_path, 'w:gz') as tar:
# Add a sample JSON file to the tar archive
sample_issue = {
"id": 10001,
"node_id": "giovanni",
"number": 1,
"title": "Sample Issue",
"user": {
"login": "test_user",
"id": 2,
},
"labels": [],
"state": "open",
"assignee": {
"login": "sample_user",
"id": 3,
},
"assignees": [
{
"login": "sample_user",
"id": 3,
}
],
"created_at": "2022-01-01T00:00:00Z",
"body": "This is a sample issue body.",
}
tarinfo = tarfile.TarInfo('sample_issue.json')
contents: bytes = json.dumps(sample_issue).encode()
tarinfo.size = len(contents)
file_object = io.BytesIO(contents)
tar.addfile(tarinfo, fileobj=file_object)
return tar_file_path
def test_clean_all(sample_tar_file):
objs = []
counter = clean_all(objs, sample_tar_file)
assert counter == 0 # No issues should be skipped
    # Check the contents of objs against the sample issue defined in the fixture
assert len(objs) == 1
assert objs[0]['id'] == 1
assert objs[0]['title'] == 'Sample Issue'
assert objs[0]['body'] == 'This is a sample issue body.'
assert objs[0]['state'] == 'open'
assert objs[0]['assignee'] == 'sample_user'
assert objs[0]['created_at'] == '2022-01-01T00:00:00Z'
def test_save_set(tmp_path):
    # Build a DataFrame with some sample data
df = pd.DataFrame({
'title': ['Issue 1', 'Issue 2', 'Issue 3'],
'body': ['Body 1', 'Body 2', 'Body 3'],
'state': ['open', 'closed', 'open'],
'assignee': ['user1', 'user2', 'user3'],
'created_at': ['2022-01-01T00:00:00Z', '2022-01-02T00:00:00Z', '2022-01-03T00:00:00Z']
}, index=[1, 2, 3])
# Save the DataFrame to a CSV file using save_set
save_set(df, 1, 3, 'test', os.path.join(tmp_path, 'test_file_'))
# Load the saved CSV file and assert its content
loaded_df = pd.read_csv(os.path.join(tmp_path, 'test_file_test_000001_000003.csv'), index_col=0)
assert loaded_df.equals(df)

View file

@ -0,0 +1,25 @@
import pandas as pd
import pytest
from src.cleaner.dataframe import build_df
@pytest.fixture
def sample_objs():
return [
{'id': 1, 'name': 'Alice', 'age': 25},
{'id': 2, 'name': 'Bob', 'age': 30},
{'id': 3, 'name': 'Charlie', 'age': 22}
]
def test_build_df(sample_objs):
result_df = build_df(sample_objs)
assert isinstance(result_df, pd.DataFrame)
assert set(result_df.columns) == {'name', 'age'}
def test_build_df_missing_id_column():
objs_missing_id = [{'name': 'Alice', 'age': 25}, {'name': 'Bob', 'age': 30}]
with pytest.raises(KeyError, match="'id'"):
build_df(objs_missing_id)

View file

@ -0,0 +1,39 @@
from unittest.mock import MagicMock, patch
import pytest
from src.modelimpl.auc import build_curve, compute_auc_roc
from test_modelimpl_torch_train import mocked_model, mocked_split_data, mocked_labelling
@pytest.fixture
def mock_classifier():
return MagicMock()
def test_build_curve_invalid_path():
with pytest.raises(ValueError, match="path should point to a pytorch model file"):
build_curve("invalid_path", force_cpu=True)
@patch('src.modelimpl.auc.load_model', return_value=(True, MagicMock(), 3))
@patch('src.modelimpl.auc.Labelling.load', return_value=MagicMock())
@patch('src.modelimpl.auc.SplitData.from_df', return_value=MagicMock())
@patch('src.modelimpl.auc.compute_auc_roc')
def test_build_curve_valid_path(mock_compute_auc_roc, mock_from_df, mock_labelling, mock_load_model):
build_curve("valid_path.pt", force_cpu=True)
mock_load_model.assert_called_once_with("valid_path.pt", None, True, False)
mock_compute_auc_roc.assert_called_once()
def test_compute_auc_roc(mocked_model, mocked_split_data, mocked_labelling, tmp_path):
compute_auc_roc(mocked_model, mocked_split_data[0], 3, mocked_labelling, True,
f"{tmp_path}/test_file")
assert (tmp_path / "test_file.ovr_curves.png").exists()
assert (tmp_path / "test_file.ovr_avg.png").exists()
assert (tmp_path / "test_file.auc.txt").exists()
(tmp_path / "test_file.ovr_curves.png").unlink()
(tmp_path / "test_file.ovr_avg.png").unlink()
(tmp_path / "test_file.auc.txt").unlink()

View file

@ -0,0 +1,13 @@
from transformers import BertForSequenceClassification
from src.modelimpl.classifier import bert_classifier
def test_bert_classifier():
# Test that the function returns an instance of BertForSequenceClassification
n_classes = 5
model = bert_classifier(n_classes)
assert isinstance(model, BertForSequenceClassification)
# Test that the model has the correct number of labels
assert model.config.num_labels == n_classes

View file

@ -0,0 +1,66 @@
import os
import pandas as pd
import pytest
from src.modelimpl.dataset import prepare_input, load_df, compute_labels, Labelling, SplitData, df_validation_split, \
Datasets
@pytest.fixture
def sample_dataframe():
return pd.DataFrame({
'id': [1, 2],
'title': ['Title1', 'Title2'],
'body': ['Body1', 'Body2'],
'title_body': ['Title1\nBody1', 'Title2\nBody2'],
'assignee': ['A', 'B']
})
def test_prepare_input(sample_dataframe):
result_df = prepare_input(sample_dataframe)
assert list(result_df.columns) == ['title_body', 'assignee']
expected_title_body = ['Title1\nBody1', 'Title2\nBody2']
assert result_df['title_body'].tolist() == expected_title_body
assert result_df['assignee'].tolist() == ['A', 'B']
def test_load_df(sample_dataframe, tmpdir):
# Save sample DataFrame to a CSV file
csv_filename = os.path.join(tmpdir, 'sample_issues.csv')
sample_dataframe.to_csv(csv_filename, index=False)
result_df = load_df(csv_filename)
assert list(result_df.columns) == ['title_body', 'assignee']
assert len(result_df) == len(sample_dataframe)
def test_compute_labels():
sample_frames = [pd.DataFrame({'assignee': ['A', 'B', 'C']}), pd.DataFrame({'assignee': ['B', 'C', 'D']})]
labels_dict, num_bounds = compute_labels(sample_frames)
assert labels_dict == {'A': 0, 'B': 1, 'C': 2, 'D': 3}
assert num_bounds == [0, 3, 4]
def test_labelling_methods(tmpdir):
labels = {'A': 0, 'B': 1, 'C': 2}
labelling = Labelling(labels)
filename = os.path.join(tmpdir, 'test_labels.csv')
labelling.save(filename)
loaded_labelling = Labelling.load(filename)
assert labelling.labels == loaded_labelling.labels
def test_split_data_methods(sample_dataframe):
labels = Labelling({'A': 0, 'B': 1})
split_data = SplitData.from_df(sample_dataframe, labels, 1)
assert len(split_data) == len(sample_dataframe)
def test_df_validation_split(sample_dataframe):
df_train, df_val = df_validation_split(sample_dataframe)
assert len(df_train) > 0
assert len(df_val) > 0
assert len(df_train) + len(df_val) == len(sample_dataframe)

View file

@ -0,0 +1,107 @@
import pandas as pd
import pytest
from src.modelimpl.classifier import bert_classifier
from src.modelimpl.dataset import tokenizer, SplitData, Labelling
from src.modelimpl.evaluate import predict, evaluate, predict_top_k, PredictionResult
from src.modelimpl.torch_dataset import Dataset
class MockSplitData:
def __init__(self, labels, texts):
self.labels = labels
self.texts = texts
def test_predict():
# Create a sample model and dataset
model = bert_classifier(n_classes=2)
labels = [0, 1, 1, 0]
texts = [
"cats chase playful fuzzy mice",
"big red ball bounces high",
"happy sun warms cool breeze",
"jumping kids laugh on playground",
]
texts = [tokenizer(text, padding='max_length', max_length=512, truncation=True,
return_tensors='pt') for text in texts]
split_data = MockSplitData(labels, texts)
dataset = Dataset(split_data)
# Test predict function
predictions = predict(model, dataset, top_n=2, force_cpu=True)
# Check the length of predictions
assert len(predictions) == len(labels)
# Check the format of PredictionResult instances
for result in predictions:
assert isinstance(result, PredictionResult)
assert len(result.top_values) == 2
assert len(result.top_indices) == 2
assert isinstance(result.truth_idx, int)
# Test case for evaluate function
def test_evaluate(capsys):
# Create a sample model and dataset
model = bert_classifier(n_classes=2)
labels = [0, 1, 1, 0]
texts = [
"cats chase playful fuzzy mice",
"big red ball bounces high",
"happy sun warms cool breeze",
"jumping kids laugh on playground",
]
texts = [tokenizer(text, padding='max_length', max_length=512, truncation=True,
return_tensors='pt') for text in texts]
split_data = MockSplitData(labels, texts)
dataset = Dataset(split_data)
# Test evaluate function
evaluate(model, dataset, force_cpu=True)
# Capture the printed output and check the format
captured = capsys.readouterr()
assert "recommendations:" in captured.out
# Test case for predict_top_k function
def test_predict_top_k():
# Create a sample model and dataset
model = bert_classifier(n_classes=2)
df = pd.DataFrame({
"assignee": ["author_0", "author_1", "author_1", "author_0"],
"title_body": [
"cats chase playful fuzzy mice",
"big red ball bounces high",
"happy sun warms cool breeze",
"jumping kids laugh on playground",
],
}, index=[1, 2, 3, 4])
labels = Labelling({
"author_0": 0,
"author_1": 1
})
split_data = SplitData.from_df(df, labels, 2)
issue_id = 1
# Test predict_top_k function
result = predict_top_k(model, split_data, issue_id, top_n=2, force_cpu=True)
# Check the format of PredictionResult instance
assert isinstance(result, PredictionResult)
assert len(result.top_values) == 2
assert len(result.top_indices) == 2
assert isinstance(result.truth_idx, int)
# Check the correctness of assert statement in the function
with pytest.raises(ValueError):
predict_top_k(model, split_data, issue_id=99, top_n=2, force_cpu=True)

View file

@ -0,0 +1,50 @@
import os
import pytest
import torch
from src.modelimpl.classifier import bert_classifier, Classifier
from src.modelimpl.load import load_model
@pytest.fixture
def model_instance():
return bert_classifier(n_classes=4)
@pytest.fixture
def model_path(tmpdir):
temp_model_path = os.path.join(tmpdir, "test_model.pt")
return temp_model_path
def test_load_model_with_valid_path(model_path):
result, model, classes = load_model(model_path, label_range=(1, 5), force_cpu=False, force_retrain=False)
assert result is False # The model should not be already trained
assert isinstance(model, Classifier)
assert classes == 4 # The range (1, 5) implies 4 classes
def test_load_model_with_invalid_path():
with pytest.raises(ValueError, match="path should point to a pytorch model file"):
load_model("invalid_path.txt", label_range=(1, 5), force_cpu=False, force_retrain=False)
def test_load_model_with_force_retrain(model_path):
result, model, classes = load_model(model_path, label_range=(1, 5), force_cpu=False, force_retrain=True)
assert result is False # The model should not be already trained, but force_retrain is True
def test_load_model_with_force_cpu(model_path):
result, model, classes = load_model(model_path, label_range=(1, 5), force_cpu=True, force_retrain=False)
assert result is False # The model should not be already trained
assert isinstance(model, Classifier)
    assert next(model.parameters()).device.type == "cpu"  # the loaded model must live on the CPU
def test_load_model_with_already_trained_model(model_path, model_instance):
torch.save(model_instance.state_dict(), model_path)
result, model, classes = load_model(model_path, label_range=(1, 5), force_cpu=False, force_retrain=False)
assert result is True # The model should be already trained
assert isinstance(model, Classifier)
assert classes == 4 # The range (1, 5) implies 4 classes

View file

@ -0,0 +1,43 @@
import pytest
import torch
from src.modelimpl.torch_dataset import Dataset
class MockSplitData:
def __init__(self, labels, texts):
self.labels = labels
self.texts = texts
def test_dataset():
# Create a sample SplitData instance
labels = [0, 1, 1, 0]
texts = [torch.rand((5,)), torch.rand((5,)), torch.rand((5,)), torch.rand((5,))]
split_data = MockSplitData(labels, texts)
# Initialize the Dataset instance
dataset = Dataset(split_data)
assert len(dataset) == len(labels)
assert dataset.classes() == labels
idx = 2
assert torch.all(torch.eq(torch.from_numpy(dataset.get_batch_labels(idx)), torch.tensor(labels[idx])))
assert torch.equal(dataset.get_batch_texts(idx), texts[idx])
batch_texts, batch_y = dataset[idx]
assert torch.equal(batch_texts, texts[idx])
assert torch.tensor(batch_y) == torch.tensor(labels[idx])
def test_dataset_empty_split_data():
empty_split_data = MockSplitData([], [])
dataset = Dataset(empty_split_data)
assert len(dataset) == 0
assert dataset.classes() == []
with pytest.raises(IndexError):
_ = dataset[0]

View file

@ -0,0 +1,83 @@
import pandas as pd
import pytest
import torch
from torch.utils.data import DataLoader
from src.modelimpl.classifier import bert_classifier
from src.modelimpl.dataset import Labelling, SplitData
from src.modelimpl.torch_dataset import Dataset
from src.modelimpl.torch_train import train, print_message, compute_loss_and_acc
@pytest.fixture
def mocked_labelling():
return Labelling({"author_0": 0, "author_1": 1, "author_2": 2})
@pytest.fixture
def mocked_split_data(mocked_labelling) -> tuple[SplitData, SplitData]:
df = pd.DataFrame({
"assignee": ["author_0", "author_1", "author_2", "author_1", "author_0"],
"title_body": [
"cats chase playful fuzzy mice",
"big red ball bounces high",
"happy sun warms cool breeze",
"jumping kids laugh on playground",
"test sentence number 5",
],
}, index=[1, 2, 3, 4, 5])
return (SplitData.from_df(df.loc[[1, 2, 3]], mocked_labelling, 3),
SplitData.from_df(df.loc[[4, 5]], mocked_labelling, 3))
@pytest.fixture
def mocked_data(mocked_split_data: tuple[SplitData, SplitData]):
train_set, val_set = mocked_split_data
return DataLoader(Dataset(train_set), batch_size=2), DataLoader(Dataset(val_set), batch_size=2)
@pytest.fixture
def mocked_model():
return bert_classifier(n_classes=3)
def test_train_without_errors(capfd, mocked_model, mocked_data):
train(mocked_model, mocked_data[0].dataset, mocked_data[1].dataset, learning_rate=0.001, epochs=2, force_cpu=True)
captured = capfd.readouterr()
assert "Epochs: 1" in captured.out
assert "Epochs: 2" in captured.out
def test_print_message(capsys):
class MockDataset:
texts: list[any]
def __init__(self, length: int):
self.texts = [None] * length
# noinspection PyTypeChecker
print_message(epoch_num=1, train_loss=2.0, train_acc=0.7, train_ds=MockDataset(1), val_loss=1.0, val_acc=0.8,
val_ds=MockDataset(1))
captured = capsys.readouterr()
assert "Epochs: 2" in captured.out
assert "Train Loss: 2.000" in captured.out
assert "Train Accuracy: 0.700" in captured.out
assert "Val Loss: 1.000" in captured.out
assert "Val Accuracy: 0.800" in captured.out
def test_compute_loss_and_acc(mocked_model, mocked_data):
train_data, val_data = mocked_data
device = torch.device("cpu")
    model = mocked_model  # the real (untrained) classifier is used directly; no mocking needed here
    val_input, val_label = next(iter(train_data))
loss, acc, batch_loss = compute_loss_and_acc(val_label, val_input, device, model)
assert isinstance(loss, float)
assert isinstance(acc, int)
assert isinstance(batch_loss, torch.Tensor)

View file

@ -0,0 +1,73 @@
import os
import tarfile
from collections import defaultdict
import pytest
from src.scraper.download import (
download_page,
download_issue,
download_commit_activity,
)
@pytest.fixture
def valid_github_token():
return "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"
@pytest.fixture
def invalid_github_token():
return "ghp_invalid"
@pytest.fixture
def valid_issue_id():
return 192213 # Replace with a valid issue ID
@pytest.fixture
def invalid_issue_id():
return -1
def test_download_page_normal_execution(valid_github_token):
page = 1
tar_file_name = "test_archive.tar"
with tarfile.open(tar_file_name, "w") as tar_file:
result = download_page(page, valid_github_token, tar_file)
assert result is True
def test_download_page_invalid_page(valid_github_token):
page = -1 # Invalid page number
tar_file_name = "test_archive.tar"
with pytest.raises(ValueError):
with tarfile.open(tar_file_name, "w") as tar_file:
download_page(page, valid_github_token, tar_file)
def test_download_issue_valid_issue_id(valid_issue_id, valid_github_token):
result = download_issue(valid_issue_id, valid_github_token)
assert type(result) is dict
def test_download_issue_invalid_issue_id(invalid_issue_id, valid_github_token):
with pytest.raises(IOError):
download_issue(invalid_issue_id, valid_github_token)
def test_download_commit_activity_valid_token(valid_github_token):
result = download_commit_activity(valid_github_token)
assert type(result) is defaultdict
def test_download_commit_activity_invalid_token(invalid_github_token):
with pytest.raises(IOError):
download_commit_activity(invalid_github_token)