Final version of the project

History has been rewritten to delete large files from the repo.

Commit a4ceee8716 — 93 changed files with 215857 additions and 0 deletions

.gitignore (vendored) — new file, 464 lines
@@ -0,0 +1,464 @@
/dataset/download/*.zip
/dataset/functions/*.pq
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
**/latex/
/models/test

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
**/.DS_Store
out/model/*.pt

## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb

## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf

## Generated if empty string is given at "Please type another file name for output:"
**/*.pdf

## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml

## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync

## Build tool directories for auxiliary files
# latexrun
latex.out/

## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa

# achemso
acs-*.bib

# amsthm
*.thm

# beamer
*.nav
*.pre
*.snm
*.vrb

# changes
*.soc

# comment
*.cut

# cprotect
*.cpt

# elsarticle (documentclass of Elsevier journals)
*.spl

# endnotes
*.ent

*.lox

# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm

#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R

# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs
*.slg
*.slo
*.sls

# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist

# gnuplot
*.gnuplot
*.table

# gnuplottex
*-gnuplottex-*

# gregoriotex
*.gaux
*.glog
*.gtex

# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref

# hyperref
*.brf

# knitr
*-concordance.tex
# *.tikz
*-tikzDictionary

# listings
*.lol

# luatexja-ruby
*.ltjruby

# makeidx
*.idx
*.ilg
*.ind

# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*

# minted
_minted*
*.pyg

# morewrites
*.mw

# newpax
*.newpax

# nomencl
*.nlg
*.nlo
*.nls

# pax
*.pax

# pdfpcnotes
*.pdfpc

# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd

# scrwfile
*.wrt

# svg
svg-inkscape/

# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/

# pdfcomment
*.upa
*.upb

# pythontex
*.pytxcode
pythontex-files-*/

# tcolorbox
*.listing

# thmtools
*.loe

# TikZ & PGF
*.dpth
*.md5
*.auxlock

# titletoc
*.ptc

# todonotes
*.tdo

# vhistory
*.hst
*.ver

*.lod

# xcolor
*.xcp

# xmpincl
*.xmpi

# xindy
*.xdy

# xypic precompiled matrices and outlines
*.xyc
*.xyd

# endfloat
*.ttt
*.fff

# Latexian
TSWLatexianTemp*

## Editors:
# WinEdt
*.bak
*.sav

# Texpad
.texpadtmp

# LyX
*.lyx~

# Kile
*.backup

# gummi
.*.swp

# KBibTeX
*~[0-9]*

# TeXnicCenter
*.tps

# auto folder when using emacs and auctex
./auto/*
*.el

# expex forward references with \gathertags
*-tags.tex

# standalone packages
*.sta

# Makeindex log files
*.lpz

# xwatermark package
*.xwm

# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are the stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib

README.md — new file, 88 lines
@@ -0,0 +1,88 @@
# Assignment 2: If statements

**Group 2: Baris Aksakal, Edoardo Riggio, Claudio Maggioni**

## Repository Structure

- `/dataset`: code and data related to scraping repositories from GitHub;
- `/models`:
  - `/baris`: code and persisted model of the original architecture built by
    Baris. `model_0.1.ipynb` and `test_model.ipynb` are respectively an
    earlier and a later iteration of the code used to train this model;
  - `/final`: persisted model for the final architecture with training and
    test evaluation statistics;
    - `/test_outputs.csv`: CSV deliverable for the evaluation on the test
      set we extracted;
    - `/test_usi_outputs.csv`: CSV deliverable for the evaluation on the
      provided test set;
- `/test`: unit tests for the model training scripts;
- `/train`: dependencies of the main model training script;
- `/train_model.py`: main model training script;
- `/plot_acc.py`: accuracy statistics plotting script.

## Environment Setup

In order to execute both the scraping and training scripts, Python 3.10 or
greater is required. Dependencies can be installed through a virtual env by
running:

```shell
python3 -m venv .env
source .env/bin/activate
pip install -r requirements.txt
```

## Dataset Extraction

Please refer to [the README.md file in `/dataset`](dataset/README.md) for
documentation on the dataset extraction process.

## Model Training

Model training can be performed by running the script:

```shell
python3 train_model.py
```

The script is able to resume fine-tuning if the pretraining phase was completed
by a previous execution, and it skips directly to model evaluation on the two
test sets if fine-tuning was already completed.

The persisted pretrained model is located in `/models/final/pretrain`. Each
epoch of the fine-tuning process is persisted at path `/models/final/<N>`,
where `<N>` is the epoch number starting from 0. The number of the epoch
selected by the early stopping process is stored in `/models/final/best.txt`.
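A minimal sketch of how the persisted checkpoints can be loaded back
(illustrative only; the actual loading logic lives in `train_model.py`, and
the plain-integer format of `best.txt` is an assumption):

```python
import os
from transformers import T5ForConditionalGeneration  # model class named in the persisted configs

MODELS_DIR = "models/final"

def load_best_model() -> T5ForConditionalGeneration:
    # best.txt stores the number of the epoch chosen by early stopping
    with open(os.path.join(MODELS_DIR, "best.txt")) as f:
        best_epoch = int(f.read().strip())
    # each fine-tuning epoch is persisted under models/final/<N>
    return T5ForConditionalGeneration.from_pretrained(os.path.join(MODELS_DIR, str(best_epoch)))
```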
`/models/final/stats.csv` stores the training and validation loss and accuracy
statistics collected during the training process. `/models/final/test_outputs.csv`
is the CSV deliverable for the evaluation on the test set we extracted, while
`/models/final/test_usi_outputs.csv` is the CSV deliverable for the evaluation
on the provided test set.

The stdout of the training script can be found in the file
`/models/final/train_log.txt`.

### Plots

The train and validation loss and accuracy plots can be generated from
`/models/final/stats.csv` with the following command:

```shell
python3 plot_acc.py
```

The output is stored in `/models/final/training_metrics.png`.

## Report

To compile the report, run:

```shell
cd report
pdflatex -interaction=nonstopmode -output-directory=. main.tex
pdflatex -interaction=nonstopmode -output-directory=. main.tex
```

The report is then located in `report/main.pdf`.

dataset/README.md — new file, 78 lines
@@ -0,0 +1,78 @@
# Dataset Download Instructions

## Project .zip Export

We scraped GitHub repositories using the download tool https://seart-ghs.si.usi.ch/ to generate the `results.csv` file
under this directory. Other than the default constraints applied by the `seart-ghs` crawler, we used the following
criteria:

- lines of code: >=10000
- language: `Python`

We found 21269 results. We then downloaded a `.zip` archive of the main branch of each repository using the following
command. We started the download process on 2023-11-13 at 12:00.

```shell
mkdir download || true
cat results.csv | \
    awk -F, 'NR>1 { print "wget -O " $2 ".zip https://github.com/" $2 "/archive/refs/heads/" $6 ".zip" }' | \
    sed 's#\/#-#;s#\"##g' > download/to_download.sh
cd download
bash to_download.sh
```

### Manually Excluded Repos

We manually excluded the following repositories from our scraped dataset ("404" means that the repository was
inaccessible and could not be downloaded):

- `thorn-lab/coronavirus_structural_task_force` (too large, more than 6GiB)
- `feeicn/security-ppt` (too large, more than 9GiB)
- `salesforce/ai-economist` (404)
- `agiliumtrade/ai-metaapi-python-sdk` (404)
- `pokemonchw/dieloli` (harmful content)
- `thesnowguru/pytrader-python-mt4-mt5-trading-api-connector-drag-n-drop` (DMCA takedown)
- `objectiv/objectiv-analytics` (404)
- `aws/solutions-aws-security-hub-automated-response-and-remediation` (404)
- `openunited/product-factory-backend` (404)
- `ibm-epbl/ibm-project-43602-1660718377` (404)
- `ibm-epbl/ibm-project-1392-1658386621` (404)
- `potatolondon/django-gcloud-connectors` (404)
- `fortwoone/oracle-project` (404)
- `iperov/deepxtools` (404)
- `frequenz/floss-frequenz-sdk-python` (404)

### Check Archive Health

The following script was used to check the integrity of each downloaded `.zip` file:

```shell
cd download
find . -name '*.zip' \
    -exec bash -c 'echo $0 $(unzip -l "$0" 2>/dev/null 1>/dev/null && echo "1" || echo "0")' \{\} \; \
    > archive_health.txt
```
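An equivalent check can be written with Python's standard library; a hedged
alternative sketch (we ran the `find`/`unzip` pipeline above, not this):

```python
import os
import zipfile

def archive_ok(path: str) -> bool:
    """Return True if the archive opens and every member's CRC checks out."""
    try:
        with zipfile.ZipFile(path) as zf:
            return zf.testzip() is None  # testzip() returns the first corrupt member, or None
    except zipfile.BadZipFile:
        return False

for name in sorted(os.listdir("download")):
    if name.endswith(".zip"):
        print(name, int(archive_ok(os.path.join("download", name))))
```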

## Function Extraction

The following command builds a dataset from the archives saved in the `/download` subdirectory:

```shell
python3 ./extract.py
```

Functions are extracted with the Python `ast` module, which discards comments (but not docstrings). The script
generates one Parquet archive of functions per project in the directory `/functions`.

As the dataset was large, this script was terminated early; at termination, 70 million functions had been extracted.
Due to computing power limitations for model training, we further sampled only 500000 of the extracted functions to
build the training set. The sampling process reads the archives in `/functions` and stores the selected functions in
the Parquet file `extracted/functions.pq`. The sampling script can be invoked with the command:

```shell
python3 sample.py
```

The sampling process guarantees that the selected functions have valid syntax for Python 3.10+ and that the code of
each function contains only ASCII characters.
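Condensed, that per-function filter amounts to the following sketch
(illustrative; the real logic is split across `extract.py` and `sample.py`
below):

```python
import ast

def keep_function(source: str) -> bool:
    """True if a candidate function passes the dataset filters."""
    if not source.isascii():   # ASCII-only guarantee
        return False
    try:
        ast.parse(source)      # valid syntax for the running interpreter (3.10+ here)
    except SyntaxError:
        return False
    return True
```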

dataset/extract.py — new file, 90 lines
@@ -0,0 +1,90 @@
import ast
import os.path
import typing
import zipfile
from typing import Optional

import pandas as pd
from tqdm import tqdm
from fastparquet import write
import multiprocessing

PWD = os.path.dirname(__file__)
IN_DIR = os.path.join(PWD, "download")
OUT_DIR = os.path.join(PWD, "functions")


def read_functions(content, filename: str, zip_name: str) -> Optional[pd.DataFrame]:
    records = []

    try:
        tree = ast.parse(content.decode('utf-8'), filename=filename)

        for node in ast.walk(tree):
            if isinstance(node, ast.FunctionDef):
                f_source: str = ast.unparse(typing.cast(typing.Any, node))
                records.append({
                    "zip_filename": zip_name,
                    "py_filename": filename,
                    "source": f_source,
                    "success": True,
                    "error": None,
                })
    except Exception as e:
        print(f"project '{zip_name}': error parsing '{filename}': {e}")
        records.append({
            "zip_filename": zip_name,
            "py_filename": filename,
            "source": "",
            "success": False,
            "error": str(e)
        })

    return pd.DataFrame.from_records(records)


def read_zip_file(zip_file: str):
    out_path = os.path.join(OUT_DIR, os.path.basename(zip_file) + ".pq")
    df = pd.DataFrame(columns=["zip_filename", "py_filename", "source"])

    try:
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            info_list = [info for info in zip_ref.infolist() if info.filename.endswith('.py')]

            for info in tqdm(info_list, desc=os.path.basename(zip_file), ncols=0, position=None, leave=True):
                content = zip_ref.read(info.filename)

                df_file = read_functions(content, info.filename, zip_file)
                if df_file is not None:
                    df = pd.concat([df, df_file], ignore_index=True)
            write(out_path, df, compression='GZIP')

        return zip_file
    except Exception as e:
        print(e)


def read_clones(zip_dir: str):
    # collect archives that do not have an output Parquet file yet, so reruns resume cleanly
    zip_files = []
    for a_file in tqdm(os.listdir(zip_dir), desc="Scan dir"):
        path = os.path.join(zip_dir, a_file)
        out_path = os.path.join(OUT_DIR, os.path.basename(path) + ".pq")
        if zipfile.is_zipfile(path) and not os.path.isfile(out_path):
            zip_files.append(path)

    num_processes = 192
    with multiprocessing.Manager():
        with multiprocessing.Pool(processes=num_processes) as pool:
            for _ in tqdm(pool.imap_unordered(read_zip_file, zip_files), desc="Read ZIPs",
                          unit="item", total=len(zip_files), position=None, leave=True):
                pass  # dummy iteration to consume the multiprocessing iterator; needed to launch processes


def main():
    if not os.path.isdir(OUT_DIR):
        os.makedirs(OUT_DIR)

    read_clones(IN_DIR)


if __name__ == "__main__":
    main()

dataset/extracted/functions.pq — new file, 3 lines (Git LFS pointer)
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:923ad850a4eed1d026b26fedbd5ecd64cf2e4e0f4648108d4732ac0e8fe70eb8
size 72966215

dataset/extracted/test_set_usi.csv — new file, 110455 lines (diff suppressed because it is too large)

dataset/results.csv — new file, 21270 lines (diff suppressed because one or more lines are too long)

dataset/sample.py — new file, 68 lines
@@ -0,0 +1,68 @@
import os
import random
import pandas as pd
from fastparquet import write
from tqdm import tqdm

PWD = os.path.dirname(__file__)
IN_DIR = os.path.join(PWD, "functions")

OUT_FILE = os.path.join(PWD, "extracted", "functions.pq")
OUT_SIZE = 500_000


def main():
    out_dir = os.path.dirname(OUT_FILE)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    files = [f for f in os.listdir(IN_DIR) if f.endswith('.pq')]
    chosen: set[tuple[str, int]] = set()

    df = None

    with tqdm(desc="Sampling", total=OUT_SIZE) as p:
        while df is None or len(df.index) < OUT_SIZE:
            filename = random.choice(files)
            path = os.path.join(IN_DIR, filename)

            df_file = pd.read_parquet(path, engine='fastparquet')
            df_len = len(df_file.index)

            if df_len == 0:
                continue

            for _ in range(min(1000, df_len)):
                index = random.randrange(0, df_len)
                error_message = df_file.iloc[index]["error"]

                if error_message is not None and len(error_message) > 0:
                    continue

                source = df_file.iloc[index]["source"]

                if not source.isascii():
                    continue

                choice = (filename, index)
                if choice not in chosen:
                    chosen.add(choice)

                    if df is None:
                        # seed the output frame with just the chosen row
                        df = df_file.iloc[[index], :].copy()
                    else:
                        df = pd.concat([df, df_file.iloc[[index], :]], axis=0, ignore_index=True)

                    if len(df.index) % 1000 == 0:
                        write(OUT_FILE, df, compression='GZIP')

                    p.update(1)

    # Fix the success column, which was computed incorrectly by a previous iteration of extract.py
    df["success"] = df["error"].apply(lambda e: e is None or len(e) == 0)

    write(OUT_FILE, df, compression='GZIP')


if __name__ == "__main__":
    main()
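The sampled dataset can be read back the same way the scripts write it; a minimal sketch:

```python
import pandas as pd

# Columns written by extract.py/sample.py: zip_filename, py_filename, source, success, error
df = pd.read_parquet("dataset/extracted/functions.pq", engine="fastparquet")
print(len(df.index), list(df.columns))
```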

environment.yml — new file, 111 lines
@@ -0,0 +1,111 @@
name: SA
channels:
  - defaults
dependencies:
  - abseil-cpp=20230802.0
  - aiohttp=3.8.5
  - aiosignal=1.2.0
  - arrow-cpp=11.0.0
  - async-timeout=4.0.2
  - attrs=23.1.0
  - aws-c-common=0.6.8
  - aws-c-event-stream=0.1.6
  - aws-checksums=0.1.11
  - aws-sdk-cpp=1.8.185
  - blas=1.0
  - boost-cpp=1.82.0
  - bottleneck=1.3.5
  - brotli=1.0.9
  - brotli-bin=1.0.9
  - brotli-python=1.0.9
  - bzip2=1.0.8
  - c-ares=1.19.1
  - ca-certificates=2023.08.22
  - certifi=2023.11.17
  - cffi=1.16.0
  - charset-normalizer=2.0.4
  - cramjam=2.6.2
  - cryptography=41.0.3
  - datasets=2.12.0
  - dill=0.3.6
  - fastparquet=2023.8.0
  - filelock=3.13.1
  - frozenlist=1.4.0
  - fsspec=2023.9.2
  - gflags=2.2.2
  - glog=0.5.0
  - grpc-cpp=1.48.2
  - gtest=1.14.0
  - huggingface_hub=0.17.3
  - icu=73.1
  - idna=3.4
  - importlib-metadata=6.0.0
  - krb5=1.20.1
  - libboost=1.82.0
  - libbrotlicommon=1.0.9
  - libbrotlidec=1.0.9
  - libbrotlienc=1.0.9
  - libcurl=8.4.0
  - libcxx=14.0.6
  - libedit=3.1.20221030
  - libev=4.33
  - libevent=2.1.12
  - libffi=3.4.4
  - libgfortran=5.0.0
  - libgfortran5=11.3.0
  - libiconv=1.16
  - libnghttp2=1.57.0
  - libopenblas=0.3.21
  - libprotobuf=3.20.3
  - libssh2=1.10.0
  - libthrift=0.15.0
  - llvm-openmp=14.0.6
  - lz4-c=1.9.4
  - multidict=6.0.2
  - multiprocess=0.70.14
  - ncurses=6.4
  - numexpr=2.8.7
  - numpy=1.26.0
  - numpy-base=1.26.0
  - openssl=3.0.12
  - orc=1.7.4
  - packaging=23.1
  - pandas=2.1.1
  - pip=23.3.1
  - pyarrow=11.0.0
  - pycparser=2.21
  - pyopenssl=23.2.0
  - pysocks=1.7.1
  - python=3.11.5
  - python-dateutil=2.8.2
  - python-tzdata=2023.3
  - python-xxhash=2.0.2
  - pytz=2023.3.post1
  - pyyaml=6.0.1
  - re2=2022.04.01
  - readline=8.2
  - regex=2023.10.3
  - requests=2.31.0
  - responses=0.13.3
  - safetensors=0.4.0
  - setuptools=68.0.0
  - six=1.16.0
  - snappy=1.1.9
  - sqlite=3.41.2
  - tk=8.6.12
  - tokenizers=0.13.2
  - tqdm=4.65.0
  - transformers=4.32.1
  - typing-extensions=4.7.1
  - typing_extensions=4.7.1
  - tzdata=2023c
  - urllib3=1.26.18
  - utf8proc=2.6.1
  - wheel=0.41.2
  - xxhash=0.8.0
  - xz=5.4.2
  - yaml=0.2.5
  - yarl=1.8.1
  - zipp=3.11.0
  - zlib=1.2.13
  - zstd=1.5.5

models/.gitkeep — new empty file

models/baris/0/config.json — new file, 69 lines
@@ -0,0 +1,69 @@
{
  "_name_or_path": "Salesforce/codet5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 1,
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 2,
  "feed_forward_proj": "relu",
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_en_to_fr": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to French: "
    },
    "translation_en_to_ro": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to Romanian: "
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.34.0",
  "use_cache": true,
  "vocab_size": 32100
}

models/baris/0/generation_config.json — new file, 8 lines
@@ -0,0 +1,8 @@
{
  "_from_model_config": true,
  "bos_token_id": 1,
  "decoder_start_token_id": 0,
  "eos_token_id": 2,
  "pad_token_id": 0,
  "transformers_version": "4.34.0"
}

models/baris/1/config.json — new file, 69 lines (content identical to models/baris/0/config.json)

models/baris/1/generation_config.json — new file, 8 lines (content identical to models/baris/0/generation_config.json)

models/baris/2/config.json — new file, 69 lines (content identical to models/baris/0/config.json)

models/baris/2/generation_config.json — new file, 8 lines (content identical to models/baris/0/generation_config.json)

models/baris/3/config.json — new file, 69 lines (content identical to models/baris/0/config.json)

models/baris/3/generation_config.json — new file, 8 lines (content identical to models/baris/0/generation_config.json)

models/baris/4/config.json — new file, 69 lines (content identical to models/baris/0/config.json)

models/baris/4/generation_config.json — new file, 8 lines (content identical to models/baris/0/generation_config.json)

models/baris/model_0.1.ipynb — new file, 3250 lines (diff suppressed because it is too large)

models/baris/test_model.ipynb — new file, 449 lines
@@ -0,0 +1,449 @@
Jupyter notebook, nbformat 4.5, kernel "Python 3 (ipykernel)", language Python 3.11.5.
Four code cells; each cell's source and captured output follow.

Cell 1 — code, execution_count 3, id 5f7ad96a-6b01-4b63-93b6-4008597a0e9e:

import pandas as pd
import numpy as np
import re
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, T5ForConditionalGeneration, DataCollatorForLanguageModeling
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os
import matplotlib.pyplot as plt


# Dataset class for pre-training
class PythonCodeDataset(Dataset):
    def __init__(self, tokenizer, dataframe, max_len=512):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        code = self.data.iloc[index]['source']
        inputs = self.tokenizer.encode_plus(code, None, add_special_tokens=True, max_length=self.max_len, padding='max_length', truncation=True)
        return {'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long), 'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long)}

# Function to mask if conditions
def mask_if_condition(code_snippet):
    if_conditions = re.findall(r'(if\s+.*?:)', code_snippet)
    masked_snippet = code_snippet.replace(if_conditions[0], '<mask>', 1) if if_conditions else code_snippet
    return masked_snippet, if_conditions[0] if if_conditions else None

# Fine-tuning and evaluation dataset classes
class MaskedIfDataset(PythonCodeDataset):
    def __getitem__(self, index):
        masked_code = self.data.iloc[index]['masked_code']
        ground_truth = self.data.iloc[index]['ground_truth']
        inputs = self.tokenizer(masked_code, max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt")
        labels = self.tokenizer(ground_truth, max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt").input_ids
        labels[labels == self.tokenizer.pad_token_id] = -100
        return {'input_ids': inputs.input_ids.squeeze(), 'attention_mask': inputs.attention_mask.squeeze(), 'labels': labels.squeeze()}

# Define the pre-training loop
def pretrain(model, dataloader, epochs, print_every=10):
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    global_step = 0  # Initialize a counter for the global training step

    for epoch in range(epochs):
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']}
            outputs = model(**inputs, labels=batch['input_ids'])
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            if global_step % print_every == 0:  # Print every print_every steps
                print(f"Step {global_step}, Loss: {loss.item()}")

            global_step += 1  # Increment the step counter

        print(f"Epoch {epoch+1}/{epochs} completed.")


def fine_tune_with_eval(model, train_loader, eval_loader, epochs, save_path, print_every=10, early_stopping_patience=3):
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    best_epoch = 0
    best_eval_accuracy = 0
    patience_counter = 0
    train_losses, eval_accuracies = [], []

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        # Training loop with tqdm for progress tracking
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{epochs}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask'], 'labels': batch['labels']}
            outputs = model(**inputs)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        average_loss = total_loss / len(train_loader)
        train_losses.append(average_loss)

        # Evaluate on the evaluation set
        eval_accuracy = evaluate_accuracy(model, eval_loader, tokenizer, device)
        eval_accuracies.append(eval_accuracy)
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {average_loss:.4f}, Eval Accuracy: {eval_accuracy:.4f}")

        # Early stopping and checkpointing
        if eval_accuracy > best_eval_accuracy:
            best_eval_accuracy = eval_accuracy
            best_epoch = epoch
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= early_stopping_patience:
                print("Early stopping triggered.")
                break

        save_directory = f"{save_path}/{epoch}"
        model.save_pretrained(save_directory)

    # Plotting the training loss and evaluation accuracy
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Training Loss')
    plt.title('Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(eval_accuracies, label='Evaluation Accuracy')
    plt.title('Evaluation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.savefig(f"{save_path}/training_metrics.png")

    return best_epoch


def evaluate_accuracy(model, dataloader, tokenizer, device):
    model.eval()
    correct_predictions, total_predictions = 0, 0

    for batch in tqdm(dataloader, desc="Evaluating"):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model.generate(batch['input_ids'], attention_mask=batch['attention_mask'], max_length=512)
        decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

        # Decode labels with added check for None values
        decoded_labels = []
        for label in batch['labels']:
            label_trimmed = [l for l in label.tolist() if l != tokenizer.pad_token_id and l != -100]
            if label_trimmed:
                decoded_label = tokenizer.decode(label_trimmed, skip_special_tokens=True)
                decoded_labels.append(decoded_label)
            else:
                decoded_labels.append(None)  # Append None for invalid/empty labels

        # Calculate accuracy
        for output, label in zip(decoded_outputs, decoded_labels):
            if label is not None and output.strip() == label.strip():
                correct_predictions += 1
            if label is not None:
                total_predictions += 1

    return correct_predictions / total_predictions if total_predictions > 0 else 0


# Read the dataset
df = pd.read_parquet('../if-statements/dataset/extracted/functions.pq')
#df = df.head(50)

# Split the dataset into pre-training, fine-tuning, evaluation, and test sets
pretrain_df, fine_tune_df = train_test_split(df, test_size=0.5, random_state=42)
eval_df = fine_tune_df.sample(frac=0.1, random_state=42)
test_df = fine_tune_df.drop(eval_df.index).sample(frac=0.1111, random_state=42)
fine_tune_df = fine_tune_df.drop(eval_df.index).drop(test_df.index)

assert len(set(eval_df.index).intersection(set(test_df.index))) == 0


# Initialize tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
model.to(device)

# Instantiate the dataset for pre-training
pretrain_dataset = PythonCodeDataset(tokenizer, pretrain_df)

# Set up the data collator for MLM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# Create a DataLoader for pre-training
pretrain_loader = DataLoader(pretrain_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)

# Pre-train the model
#pretrain(model, pretrain_loader, epochs=1)


# Prepare data for fine-tuning and evaluation
fine_tune_df['masked_code'], fine_tune_df['ground_truth'] = zip(*fine_tune_df['source'].apply(mask_if_condition))
eval_df['masked_code'], eval_df['ground_truth'] = zip(*eval_df['source'].apply(mask_if_condition))
fine_tune_df.dropna(subset=['ground_truth'], inplace=True)
eval_df.dropna(subset=['ground_truth'], inplace=True)


fine_tune_dataset = MaskedIfDataset(tokenizer, fine_tune_df)
eval_dataset = MaskedIfDataset(tokenizer, eval_df)


# Dataloaders for fine-tuning and evaluation
fine_tune_loader = DataLoader(fine_tune_dataset, batch_size=8, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=False)


# Instantiate the datasets for fine-tuning and evaluation
fine_tune_dataset = MaskedIfDataset(tokenizer, fine_tune_df)
eval_dataset = MaskedIfDataset(tokenizer, eval_df)


best_epoch = 4

# Example of calling the modified function
save_path = '../if-statements/dataset/extracted/final'
#best_epoch = fine_tune_with_eval(model, fine_tune_loader, eval_loader, epochs=5, save_path=save_path)

# Define the directory of the best model
best_model_directory = os.path.join(save_path, str(best_epoch))

# Load the best model and its config
best_model = T5ForConditionalGeneration.from_pretrained(best_model_directory)

# Optionally, load the model's config
model_config = best_model.config  # This will load the config file associated with the model

best_model.to(device)

# Prepare and evaluate on the test set
test_df['masked_code'], test_df['ground_truth'] = zip(*test_df['source'].apply(mask_if_condition))
test_df.dropna(subset=['ground_truth'], inplace=True)
test_dataset = MaskedIfDataset(tokenizer, test_df)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Evaluate the model on the test set
test_accuracy = evaluate_accuracy(best_model, test_loader, tokenizer, device)
print(f"Test Accuracy: {test_accuracy:.4f}")

Output:
Using device: cuda
Evaluating: 100%|███████████████████████████| 1092/1092 [04:02<00:00, 4.50it/s]
Test Accuracy: 0.3642

Cell 2 — code, execution_count 15, id 3eb56941-cd5b-405b-ae37-f15d97a2b22e:

# Load the new dataset
new_df = pd.read_csv('../if-statements/dataset/extracted/test_set_usi.csv')

new_df.drop("input_method", axis=1, inplace=True)
new_df.drop("tokens_in_method", axis=1, inplace=True)

print(new_df.head())

Output:
   Unnamed: 0                                    original_method  \
0        5126  def stream_edit(request, stream_id, response_f...
1       10859  def _read_and_parse_includes(self):\n    # Map...
2       10615  def _get_list_key(self, spaces, lines):\n    k...
3       17853  def search_host(self, search_string):\n    res...
4        3922  def pop(self, key: Union[str, Enum], default: ...

                                target_block
0        if "cancel" not in request . POST :
1   if isinstance ( node , ast . Include ) :
2        if len ( line . strip ( ) ) == 0 :
3            if isinstance ( value , int ) :
4        if self . _get_flag ( "struct" ) :

Cell 3 — code, execution_count 16, id 08a9c76f-32da-4871-b0af-d5afafa50ae0:

# Function to preprocess the new dataframe
def preprocess_new_df(df):
    # Apply the masking function
    df['masked_code'], df['ground_truth'] = zip(*df['original_method'].apply(mask_if_condition))
    # Drop rows where ground truth (if statement) is None
    df.dropna(subset=['ground_truth'], inplace=True)

# Preprocess the new dataframe
preprocess_new_df(new_df)

# Check the first few rows
print(new_df.head())

Output:
   Unnamed: 0                                    original_method  \
0        5126  def stream_edit(request, stream_id, response_f...
1       10859  def _read_and_parse_includes(self):\n    # Map...
2       10615  def _get_list_key(self, spaces, lines):\n    k...
3       17853  def search_host(self, search_string):\n    res...
4        3922  def pop(self, key: Union[str, Enum], default: ...

                                target_block  \
0        if "cancel" not in request . POST :
1   if isinstance ( node , ast . Include ) :
2        if len ( line . strip ( ) ) == 0 :
3            if isinstance ( value , int ) :
4        if self . _get_flag ( "struct" ) :

                                         masked_code  \
0  def stream_edit(request, stream_id, response_f...
1  def _read_and_parse_includes(self):\n    # Map...
2  def _get_list_key(self, spaces, lines):\n    k...
3  def search_host(self, search_string):\n    res...
4  def pop(self, key: Union[str, Enum], default: ...

                                        ground_truth
0  if not request.user.profile.has_permission(str...
1                  if isinstance(node, ast.Include):
2                         if len(line.strip()) == 0:
3              if host_entry.get("type") != "entry":
4                     if self._get_flag("readonly"):

Cell 4 — code, execution_count 18, id c36c9144-64b2-46dd-b597-5528ff57b10a:

# Create dataset for the new dataframe
new_dataset = MaskedIfDataset(tokenizer, new_df)

# Create DataLoader for the new dataset
new_loader = DataLoader(new_dataset, batch_size=8, shuffle=False)

# Evaluate the model on the new dataset
new_accuracy = evaluate_accuracy(best_model, new_loader, tokenizer, device)
print(f"New Dataset Accuracy: {new_accuracy:.4f}")

Output:
Evaluating: 100%|█████████████████████████████| 624/624 [02:29<00:00, 4.17it/s]
New Dataset Accuracy: 0.2841
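To make the notebook's masking step concrete, a small usage example of
`mask_if_condition` (illustrative; not part of the notebook):

```python
snippet = "def f(x):\n    if x > 0:\n        return x\n    return 0"
masked, truth = mask_if_condition(snippet)
# masked == "def f(x):\n    <mask>\n        return x\n    return 0"
# truth  == "if x > 0:"
```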

models/baris/training_metrics.png — new binary file, 45 KiB (not shown)

models/final/0/config.json — new file, 69 lines
(content identical to models/baris/0/config.json, except "transformers_version": "4.35.2")
8  models/final/0/generation_config.json  Normal file
@@ -0,0 +1,8 @@
{
  "_from_model_config": true,
  "bos_token_id": 1,
  "decoder_start_token_id": 0,
  "eos_token_id": 2,
  "pad_token_id": 0,
  "transformers_version": "4.35.2"
}
69  models/final/1/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/1/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/10/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/10/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/11/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/11/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/12/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/12/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/13/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/13/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/14/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/14/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/15/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/15/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/16/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/16/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/17/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/17/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/18/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/18/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/19/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/19/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/2/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/2/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/3/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/3/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/4/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/4/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/5/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/5/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/6/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/6/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/7/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/7/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/8/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/8/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
69  models/final/9/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/9/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
1  models/final/best.txt  Normal file
@@ -0,0 +1 @@
19
69  models/final/pretrain/config.json  Normal file
(contents identical to models/final/0/config.json above)
8  models/final/pretrain/generation_config.json  Normal file
(contents identical to models/final/0/generation_config.json above)
21  models/final/stats.csv  Normal file
@@ -0,0 +1,21 @@
,train_loss,train_acc,val_loss,val_acc
1,1.1066815287971177,0.2564848070235448,0.8621971841733244,0.35313884221780967
2,0.8582622519923347,0.334283488208578,0.8207894280070714,0.36795478845272644
3,0.7284335572143417,0.38036599965794426,0.7892089254894864,0.38903314495188634
4,0.633081175458829,0.4195693898105391,0.7816493976815753,0.4145410111501451
5,0.5550723814587575,0.45366094673431767,0.787525433691057,0.41805407056667176
6,0.489626774831408,0.485301104079965,0.7974283164605881,0.4288987322437758
7,0.43271683281407614,0.5162001406229215,0.8192677173626487,0.4383687184970215
8,0.3834469902450057,0.540695133306729,0.8201495354764291,0.44661677104017106
9,0.34214707001886285,0.5680975999087852,0.8280998212575312,0.45242095616312816
10,0.30546249586785273,0.5925735894950876,0.8484094327830816,0.4547120818695586
11,0.2720891370430394,0.6147502042832982,0.860619057973981,0.46250190927142204
12,0.24163438230665166,0.6392261938696008,0.8756259442051197,0.46234916755766
13,0.2165358039272913,0.6588753966896604,0.8958285228441377,0.46754238582556895
14,0.19597039912527445,0.679246717214906,0.9252617640826756,0.46983351153199937
15,0.17652969488658307,0.6953613438990556,0.9368522311691582,0.468000610966855
16,0.16078871088729313,0.7160937232768941,0.9492269373215708,0.46906980296318923
17,0.14277899259151752,0.7370541398247914,0.9709357907776681,0.4727356040934779
18,0.13330485094920555,0.7507553731258195,0.9966374322321716,0.4719718955246678
19,0.12318261171442361,0.7625372935788534,0.9880664938842191,0.4768596303650527
20,0.11463805472369394,0.7749843224445585,1.0141825987084658,0.4780815640751489
71299  models/final/test_outputs.csv  Normal file
File diff suppressed because it is too large
5001  models/final/test_usi_outputs.csv  Normal file
File diff suppressed because it is too large
39  models/final/train_log.txt  Normal file
@@ -0,0 +1,39 @@
Excluding token lengths < 33.0 (25.00%ile): 123184 instances
Excluding token lengths > 129.0 (75.00%ile): 124447 instances
pretrain dataset: 126184 functions loaded
finetune train dataset: 100948 functions loaded
finetune val dataset: 12618 functions loaded
finetune test dataset: 12619 functions loaded
Using device: cuda
Pretraining for 1 epochs
train dataset: 100948 functions found
train dataset: 52623 conditions found
val dataset: 12618 functions found
val dataset: 6547 conditions found
Tuning for 20 epochs
E1/20, Train Loss: 1.1067, Train Accuracy: 0.2565, Val Loss: 0.8622, Val Accuracy: 0.3531
E2/20, Train Loss: 0.8583, Train Accuracy: 0.3343, Val Loss: 0.8208, Val Accuracy: 0.3680
E3/20, Train Loss: 0.7284, Train Accuracy: 0.3804, Val Loss: 0.7892, Val Accuracy: 0.3890
E4/20, Train Loss: 0.6331, Train Accuracy: 0.4196, Val Loss: 0.7816, Val Accuracy: 0.4145
E5/20, Train Loss: 0.5551, Train Accuracy: 0.4537, Val Loss: 0.7875, Val Accuracy: 0.4181
E6/20, Train Loss: 0.4896, Train Accuracy: 0.4853, Val Loss: 0.7974, Val Accuracy: 0.4289
E7/20, Train Loss: 0.4327, Train Accuracy: 0.5162, Val Loss: 0.8193, Val Accuracy: 0.4384
E8/20, Train Loss: 0.3834, Train Accuracy: 0.5407, Val Loss: 0.8201, Val Accuracy: 0.4466
E9/20, Train Loss: 0.3421, Train Accuracy: 0.5681, Val Loss: 0.8281, Val Accuracy: 0.4524
E10/20, Train Loss: 0.3055, Train Accuracy: 0.5926, Val Loss: 0.8484, Val Accuracy: 0.4547
E11/20, Train Loss: 0.2721, Train Accuracy: 0.6148, Val Loss: 0.8606, Val Accuracy: 0.4625
E12/20, Train Loss: 0.2416, Train Accuracy: 0.6392, Val Loss: 0.8756, Val Accuracy: 0.4623
E13/20, Train Loss: 0.2165, Train Accuracy: 0.6589, Val Loss: 0.8958, Val Accuracy: 0.4675
E14/20, Train Loss: 0.1960, Train Accuracy: 0.6792, Val Loss: 0.9253, Val Accuracy: 0.4698
E15/20, Train Loss: 0.1765, Train Accuracy: 0.6954, Val Loss: 0.9369, Val Accuracy: 0.4680
E16/20, Train Loss: 0.1608, Train Accuracy: 0.7161, Val Loss: 0.9492, Val Accuracy: 0.4691
E17/20, Train Loss: 0.1428, Train Accuracy: 0.7371, Val Loss: 0.9709, Val Accuracy: 0.4727
E18/20, Train Loss: 0.1333, Train Accuracy: 0.7508, Val Loss: 0.9966, Val Accuracy: 0.4720
E19/20, Train Loss: 0.1232, Train Accuracy: 0.7625, Val Loss: 0.9881, Val Accuracy: 0.4769
E20/20, Train Loss: 0.1146, Train Accuracy: 0.7750, Val Loss: 1.0142, Val Accuracy: 0.4781
test dataset: 12619 functions found
test dataset: 6637 conditions found
test_usi dataset: 5000 functions found
test_usi dataset: 5000 conditions found
Test Accuracy: 48.26%
USI Test Accuracy: 19.54%
BIN  models/final/training_metrics.png  Normal file
Binary file not shown.
After Width: | Height: | Size: 72 KiB
16  plot_acc.py  Normal file
@@ -0,0 +1,16 @@
import os.path

import pandas as pd
from train.finetune import plot_loss_acc

ROOT = os.path.dirname(__file__)


def main():
    df = pd.read_csv(os.path.join(ROOT, 'models', 'final', 'stats.csv'))
    plot_loss_acc(df['train_loss'].tolist(), df['val_loss'].tolist(), df['train_acc'].tolist(), df['val_acc'].tolist(),
                  os.path.join(ROOT, 'models', 'final'))


if __name__ == "__main__":
    main()
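plot_acc.py imports plot_loss_acc from train.finetune, which is not part of this excerpt. Here is a plausible sketch of that function under stated assumptions: the name and argument order come from the call site above, while the figure layout and the output file name (chosen to match the committed training_metrics.png) are guesses.

# Hypothetical sketch (not in this commit excerpt) of train.finetune.plot_loss_acc.
import os.path
import matplotlib.pyplot as plt

def plot_loss_acc(train_loss, val_loss, train_acc, val_acc, out_dir):
    """Plot loss and accuracy curves side by side and save them as a PNG."""
    epochs = range(1, len(train_loss) + 1)
    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 4))
    ax_loss.plot(epochs, train_loss, label="train")
    ax_loss.plot(epochs, val_loss, label="validation")
    ax_loss.set_xlabel("epoch")
    ax_loss.set_ylabel("loss")
    ax_loss.legend()
    ax_acc.plot(epochs, train_acc, label="train")
    ax_acc.plot(epochs, val_acc, label="validation")
    ax_acc.set_xlabel("epoch")
    ax_acc.set_ylabel("accuracy")
    ax_acc.legend()
    fig.tight_layout()
    # File name assumed from the training_metrics.png committed above
    fig.savefig(os.path.join(out_dir, "training_metrics.png"))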
73  report/main.tex  Normal file
@@ -0,0 +1,73 @@
\documentclass{scrartcl}
\setlength\paperwidth{20.999cm}
\setlength\paperheight{29.699cm}
\setlength\voffset{-1in}
\setlength\hoffset{-1in}
\setlength\topmargin{1.499cm}
\setlength\headheight{12pt}
\setlength\headsep{.7cm}
\setlength\footskip{1.131cm}
\setlength\textheight{25cm}
\setlength\oddsidemargin{2.499cm}
\setlength\textwidth{15.999cm}
\setlength\parindent{0cm}
\setlength\parskip{0.3em}

\usepackage{amsmath}
\usepackage{listings}
\usepackage{xcolor}
\usepackage{fancyvrb}
\usepackage{newverbs}
\usepackage{fancyhdr}
\usepackage{extramarks}
\usepackage{graphicx}
\usepackage{mathtools}
\usepackage{multicol}
\usepackage{hyperref}
\usepackage{booktabs}
\usepackage{float}
\usepackage{subcaption}

\pagestyle{fancy}
\lhead{Aksakal, Maggioni, Riggio - If-Conditions}
\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
\DeclarePairedDelimiter\floor{\lfloor}{\rfloor}

\newcommand\vartextvisiblespace[1][.6em]{%
  \makebox[#1]{%
    \kern.07em
    \vrule height.4ex
    \hrulefill
    \vrule height.4ex
    \kern.07em
  }%
}

\begin{document}
\thispagestyle{plain}

\begin{center}
  \hrule

  \vspace{.4cm}
  {\textbf {\Huge If-Conditions}} \\
  \vspace{.2cm}
  \textbf{Software Analytics}
  \vspace{.2cm}
\end{center}
\textbf{Baris Aksakal} (baris.aksakal@usi.ch) \hspace{\fill} \\
\textbf{Claudio Maggioni} (claudio.maggioni@usi.ch) \hspace{\fill} \\
\textbf{Edoardo Riggio} (edoardo.riggio@usi.ch) \hspace{\fill} \today \\
\hrule
\vspace{.2cm}

\input{sections/introduction}
\input{sections/scraping}
\input{sections/datasets}
\input{sections/baris}
\input{sections/pretraining-finetuning}
\input{sections/model}
\input{sections/evaluation}

\end{document}
report/sections/baris.tex (Normal file, 32 lines)
@@ -0,0 +1,32 @@
\section*{Model Architecture Design (Baris)}

Our model consists of three main segments. The first is the already pre-trained
transformer CodeT5 (base-sized model) from Hugging Face, which also came with its
own tokenizer. In essence, the model is a unified framework that supports both
code understanding and generation tasks and allows for multi-task learning. We
quickly observed that the model works as expected, in the sense that it
continued to generate code from the bottom-most line. While expected, this base
model requires some modifications to be applicable to an MLM task.
Therefore, our first step was to teach CodeT5 the task of MLM with
considerably less data than what it was initially trained on. As we were
already using a pre-trained model, we can consider this step almost as a
"pre-fine-tuning", meaning that we are actually "fine-tuning" a pre-trained
model as our own pre-training. At this point in the training, our dataset
consisted of all sorts of Python functions where 15\% of the tokens in a
function were randomly masked for the model to predict. After our
"pre-fine-tuning" step was complete, we observed the model's new ability to
adapt to and complete MLM tasks. We then proceeded with the actual fine-tuning.
We masked if conditions as instructed, setting aside 10\% of the instances of
the fine-tuning dataset as an evaluation set and 10\% as a test set. For
this, we implemented a straightforward scheme where we randomly sampled
functions with if conditions and randomly selected and masked a single if
condition per function. Therefore, a function was used only once to train the
model, no matter how many if conditions it contained. We opted for this scheme
as we already had more than enough samples for our computational resources and
had no need to extract multiple samples from a single function.
This last step of fine-tuning created our final model which, with some success,
is able to automatically recommend appropriate conditions for if statements in
Python functions. Lastly, we tested our final CodeT5 (already pre-trained)
model, after our own pre-training and fine-tuning steps, on the small test set
on which the performance of the models trained by the two groups can be
compared.
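For illustration, a minimal sketch of the sampling scheme described above: one randomly chosen if condition masked per function. The helper name mask_single_condition is hypothetical; the repository's actual implementation lives in train/mask.py, shown later in this diff.

import ast
import random

MASK_TOKEN = "<mask>"

def mask_single_condition(source: str, rng: random.Random) -> tuple[str, str] | None:
    """Mask one randomly chosen if condition; return (masked_source, condition)."""
    tree = ast.parse(source)
    if_nodes = [n for n in ast.walk(tree) if isinstance(n, ast.If)]
    if not if_nodes:
        return None  # the function contains no if conditions
    node = rng.choice(if_nodes)
    original = node.test
    node.test = ast.Name(id=MASK_TOKEN, ctx=ast.Load())  # stand-in rvalue printed verbatim by unparse
    masked = ast.unparse(tree)
    node.test = original  # restore the tree before returning
    return masked, ast.unparse(original)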
report/sections/datasets.tex (Normal file, 34 lines)
@@ -0,0 +1,34 @@
\section*{Datasets}
After scraping all the data, we had to split it up into several datasets.
Firstly, we divided it into pretrain and finetune datasets, and then into training, test, and validation sets.
The number of functions in the pretrain and finetune datasets is summarized in Table~\ref{tab:table-pre}, while the functions and conditions found in the remaining datasets are summarized in Table~\ref{tab:table}. \\ \\
Before performing either the pretraining or the finetuning, we transformed the functions into arrays of tokens by using the \verb|Salesforce/codet5-small| tokenizer.
\begin{table}[h]
    \centering
    \begin{tabular}{| l | c |}
        \hline
        Dataset & \# of Functions \\
        \hline \hline
        Pretrain & 126184 \\ \hline
        Finetune Train & 100948 \\ \hline
        Finetune Validation & 12618 \\ \hline
        Finetune Test & 12619 \\
        \hline
    \end{tabular}
    \caption{Number of functions for each dataset}
    \label{tab:table-pre}
\end{table}
\begin{table}[h]
    \centering
    \begin{tabular}{| l | c | c |}
        \hline
        Dataset & \# of Functions & \# of Conditions \\
        \hline \hline
        Training & 100948 & 21269 \\ \hline
        Validation & 12618 & 6547 \\ \hline
        Test & 12619 & 6637 \\
        \hline
    \end{tabular}
    \caption{Number of functions and conditions for each dataset}
    \label{tab:table}
\end{table}
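As an illustration of the tokenization step described above, a minimal sketch using the named tokenizer (the sample function is made up):

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')

source = "def add(a, b):\n    return a + b"
encoded = tokenizer(source, max_length=512, padding='max_length', truncation=True)

print(encoded['input_ids'][:8])                              # token ids, padded/truncated to 512
print(tokenizer.convert_ids_to_tokens(encoded['input_ids'][:8]))  # the corresponding subword tokens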
report/sections/evaluation.tex (Normal file, 7 lines)
@@ -0,0 +1,7 @@
\section*{Model Evaluation on the Test Sets}
The model was statistically evaluated on two test sets, containing respectively data from our scraping effort and external data provided with the assignment document.\\ \\
Our test dataset contains 12619 functions and a total of 6637 conditions on which to perform the evaluation.
The model accuracy on this test set is 48.26\%, which is very close to the final validation set accuracy (47.69\%), thus indicating that the model is likely not overfitted. \\ \\
The test set provided externally has 5000 conditions in it.
The model accuracy on this dataset was 19.54\%, which is significantly lower than the value we have for our test set.
We suspect this is due to our dataset filtering efforts based on token length, and that this test set covers cases with token lengths we explicitly excluded.
report/sections/introduction.tex (Normal file, 3 lines)
@@ -0,0 +1,3 @@
\section*{Introduction}
The goal of this assignment was to train a model that recommends the appropriate condition given an if-statement in Python.
This assignment was divided into several steps, which included the scraping of Python files, pre-training and fine-tuning of the model, and finally training the final model to make recommendations.
report/sections/model.tex (Normal file, 50 lines)
@@ -0,0 +1,50 @@
\section*{Model Architecture}
Our implementation uses the \texttt{T5ForConditionalGeneration} model from the
HuggingFace \textit{transformers} library.
The model architecture combines the standard T5 model architecture with a
language modeling output head to allow performing generative tasks.
We used the \textsc{CodeT5} pretrained instance of the model from
Salesforce\footnote{\url{https://github.com/salesforce/CodeT5}} to perform
pretraining on Python-specific code and to further fine tune the model to
generate conditions for if statements. \\ \\
The pretrain phase runs for one epoch and uses instances in the pretrain set to
train the model to recognize the structure of Python code.
This is achieved by tokenizing and then masking 15\% of the tokens within
functions by random sampling.
The output labels that the model should learn to predict are then
these masked tokens. \\ \\
The fine tune phase runs for at most 20 epochs and uses the fine-tune train
dataset to train the model to predict if conditions.
Each function in the training set is analyzed with the Python \texttt{ast}
module to search for conditions, and for each condition found a training
instance is created where that specific condition is masked.
This means that one function may yield multiple differently masked instances,
or none at all if it does not contain if conditions. \\ \\
We implemented an early stopping procedure to avoid overfitting the model
during training.
The procedure analyzes model accuracy on the validation set and has a
patience of 3 epochs. \\ \\
Both the pretrain and fine tune training loops are custom, and use the
\textsc{AdamW} optimizer with a learning rate of $5 \cdot 10^{-5}$. \\ \\
Figure~\ref{fig:metrics} shows the loss and accuracy metrics for the fine tune
training and validation sets.
It is noteworthy that even though validation accuracy increases, validation
loss also increases after the first few epochs.
According to our early stopping policy this is not overfitting, but it may
indicate that a different early stopping policy would have chosen an earlier
epoch.
\begin{figure}[H]
    \centering
    \includegraphics[width=0.8\linewidth]{../models/final/training_metrics}
    \caption{Training and Validation Metrics Over Epochs. The plot illustrates the
    trends in training and validation loss as well as training and validation
    accuracy across different epochs during the training
    process.}\label{fig:metrics}
\end{figure}
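For illustration, a minimal sketch of the accuracy-based early stopping policy described above (variable names are hypothetical; the repository's actual loop is in train/finetune.py, shown later in this diff):

PATIENCE = 3  # epochs without validation-accuracy improvement before stopping

best_accuracy, best_epoch, patience_counter = 0.0, 0, 0
for epoch, val_accuracy in enumerate(val_accuracies):  # val_accuracies: one entry per epoch
    if val_accuracy > best_accuracy:
        best_accuracy, best_epoch = val_accuracy, epoch
        patience_counter = 0  # any improvement resets the counter
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            break  # stop training; keep the checkpoint saved at best_epoch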
report/sections/pretraining-finetuning.tex (Normal file, 18 lines)
@@ -0,0 +1,18 @@
\section*{Pre-Training}
After the tokenization of the functions, we removed the outliers.
These are the functions whose total number of tokens falls below the 25\textsuperscript{th} percentile (fewer than 33 tokens) or above the 75\textsuperscript{th} percentile (more than 129 tokens).
In total, 247631 functions were removed (123184 below the 25\textsuperscript{th} percentile and 124447 above the 75\textsuperscript{th} percentile). \\ \\
After this step was performed, we proceeded with the pretraining of the model.
In the pretraining step, we first had to mask 15\% of all the tokens.
This was done by taking the vectors of tokens and substituting 15\% of them with the token that represents the \verb|<mask>| special tag. \\ \\
Finally, this dataset was used to pretrain our model.
This part of the training helped the model to better understand the structure of the Python programming language.
Since the model was already pretrained on code, it already had a general understanding of the structure of some programming languages.
Thanks to our pretraining, it is now more specialized in recognizing the structure of a Python piece of code.

\section*{Fine-tuning}
In this part of the training of the model, we had to teach the model to perform a specific task, in our case recommending suitable conditions for Python if-statements. \\ \\
To do so, we masked some of the if-conditions of the functions in our finetune dataset, and performed a training operation on this new dataset.
The masking was performed by taking the function and converting it into an AST\@.
After doing that, we iterated over the AST nodes and identified those that were if statements.
When an if-statement was found, its condition was replaced by the special \verb|<mask>| token.
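For illustration, a minimal sketch of the random 15% token masking described above, using the same HuggingFace collator the training code relies on (the toy function is made up):

from transformers import RobertaTokenizer, DataCollatorForLanguageModeling

tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

encoded = tokenizer("def is_even(n):\n    return n % 2 == 0", return_tensors='pt')
batch = collator([{'input_ids': encoded['input_ids'][0]}])

# ~15% of positions are selected for masking; 'labels' keeps the original ids
# at those positions and -100 (ignored by the loss) everywhere else.
print(batch['input_ids'])
print(batch['labels'])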
report/sections/scraping.tex (Normal file, 10 lines)
@@ -0,0 +1,10 @@
\section*{Scraping}
To scrape the Python files, we used the SEART-GHS crawler.
The tool exported a CSV file containing a list of all the repositories that matched our constraints, namely \verb|lines of code: >=| \verb|10000| and \verb|language: Python|.
This CSV file was then used to download the main branch of each repository and save it in a ZIP archive. \\ \\
From this ZIP archive, we wanted to extract functions from the Python files.
To do this, we used the Python AST library to extract functions while discarding comments (docstrings were kept). \\ \\
As the extracted dataset was extremely large, the extractor script was terminated early.
When the script terminated, it had generated 70 million functions.
Due to limited computing power for model training, we decided to cut down the number of functions to 500,000 to build the training set.
After extracting the functions, we saved them in a Parquet file in the \verb|dataset/extracted| directory.
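For illustration, a minimal sketch of the AST-based extraction described above (the function name extract_functions is hypothetical; the repository's actual extractor script is not part of this excerpt). Parsing with ast drops comments, while docstrings survive because they are string constants in the tree:

import ast

def extract_functions(source: str) -> list[str]:
    """Return the source of every function definition in a module."""
    tree = ast.parse(source)
    return [ast.unparse(node)
            for node in ast.walk(tree)
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))]

module = '''
def greet(name):
    """Say hello."""  # the docstring is kept
    # this comment is dropped by ast
    return f"hello {name}"
'''
print(extract_functions(module))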
requirements.txt (Normal file, 10 lines)
@@ -0,0 +1,10 @@
pandas==2.1.3
fastparquet==2023.10.1
tqdm==4.66.1
transformers==4.35.2
torch==2.1.1
matplotlib==3.8.2
deap~=1.4.1
frozendict~=2.3.8
nltk~=3.8.1
swifter~=1.4.0
test/test_dataset.py (Normal file, 94 lines)
@@ -0,0 +1,94 @@
import pandas as pd
import pytest
import swifter
import torch
from torch.utils.data import DataLoader

from train.dataset import (TOKENIZER, MAX_TOKEN_LENGTH, PythonCodeDataset, MaskedIfDataset, decode_tokenized,
                           PRETRAIN_MLM_PROB, BATCH_SIZE, build_pretrain_dataloader, build_fine_tune_dataloader)


@pytest.fixture
def mock_pretrain_data():
    data = {'source': ['if a > 2: pass', 'if b <= 4: pass'],
            'other_column': [1, 2]}

    return pd.DataFrame(data)


@pytest.fixture
def mock_fine_tune_dataloader():
    data = {'source': ['if a > 2: pass', 'if b <= 4: pass'],
            'other_column': [1, 2]}

    return pd.DataFrame(data)


@pytest.fixture
def mock_fine_tune_data():
    data = {'masked_code': ['if a > 2: pass', 'if b <= 4: pass'],
            'ground_truth': ['if a > 2: pass', 'if b <= 4: pass']}

    return pd.DataFrame(data)


@pytest.fixture
def mock_tokenized_output():
    return [1234, 5678]


def test_decode_tokenized(mock_tokenized_output):
    decoded_output = decode_tokenized(mock_tokenized_output)
    expected_output = " msg comments"
    assert decoded_output == expected_output

    mock_tokenized_output_with_padding = [-100]
    decoded_output_with_padding = decode_tokenized(mock_tokenized_output_with_padding)
    expected_output_with_padding = None
    assert decoded_output_with_padding == expected_output_with_padding


def test_build_pretrain_dataloader(mock_pretrain_data):
    dataloader = build_pretrain_dataloader(mock_pretrain_data)

    assert isinstance(dataloader, DataLoader)
    assert dataloader.batch_size == BATCH_SIZE
    assert isinstance(dataloader.dataset, PythonCodeDataset)
    assert dataloader.dataset.tokenizer == TOKENIZER
    assert dataloader.dataset.data.equals(mock_pretrain_data)
    assert dataloader.collate_fn.tokenizer == TOKENIZER
    assert dataloader.collate_fn.mlm_probability == PRETRAIN_MLM_PROB
    assert dataloader.collate_fn.mlm is True


def test_build_fine_tune_dataloader(mock_fine_tune_dataloader):
    train_dataloader = build_fine_tune_dataloader(mock_fine_tune_dataloader, 'train')

    assert isinstance(train_dataloader, DataLoader)
    assert train_dataloader.batch_size == BATCH_SIZE
    assert isinstance(train_dataloader.dataset, PythonCodeDataset)
    assert train_dataloader.dataset.tokenizer == TOKENIZER


def test_python_code_dataset(mock_pretrain_data):
    dataset = PythonCodeDataset(TOKENIZER, mock_pretrain_data, MAX_TOKEN_LENGTH)
    sample = dataset[0]

    assert len(dataset) == len(mock_pretrain_data)
    assert 'input_ids' in sample
    assert 'attention_mask' in sample
    assert sample['input_ids'].shape == torch.Size([MAX_TOKEN_LENGTH])
    assert sample['attention_mask'].shape == torch.Size([MAX_TOKEN_LENGTH])


def test_masked_if_dataset(mock_fine_tune_data):
    dataset = MaskedIfDataset(TOKENIZER, mock_fine_tune_data, MAX_TOKEN_LENGTH)
    sample = dataset[0]

    assert len(dataset) == len(mock_fine_tune_data)
    assert 'input_ids' in sample
    assert 'attention_mask' in sample
    assert 'labels' in sample
    assert sample['input_ids'].shape == torch.Size([MAX_TOKEN_LENGTH])
    assert sample['attention_mask'].shape == torch.Size([MAX_TOKEN_LENGTH])
    assert sample['labels'].shape == torch.Size([MAX_TOKEN_LENGTH])
test/test_evaluate.py (Normal file, 34 lines)
@@ -0,0 +1,34 @@
from train.dataset import TOKENIZER
from train.evaluate import compute_accuracy


def test_compute_accuracy():
    batch = {'labels': [TOKENIZER.encode("label 1"), TOKENIZER.encode("label 2")], 'input_ids': [[1, 2], [3, 4]]}
    outputs = [TOKENIZER.encode("label 1"), TOKENIZER.encode("label 2")]

    result = compute_accuracy(outputs, batch)
    correct_predictions, total_predictions, _ = result

    print(result)

    assert isinstance(result, tuple)
    assert isinstance(correct_predictions, int)
    assert isinstance(total_predictions, int)
    assert correct_predictions == 2
    assert total_predictions == 2


def test_compute_accuracy_none():
    batch = {'labels': [[-100], TOKENIZER.encode("label 2")], 'input_ids': [[5, 6], [7, 8]]}
    outputs = [TOKENIZER.encode("label 1"), TOKENIZER.encode("label 2")]

    result = compute_accuracy(outputs, batch)
    correct_predictions, total_predictions, _ = result

    print(result)

    assert isinstance(result, tuple)
    assert isinstance(correct_predictions, int)
    assert isinstance(total_predictions, int)
    assert correct_predictions == 1
    assert total_predictions == 1
test/test_pretrain.py (Normal file, 8 lines)
@@ -0,0 +1,8 @@
from train.pretrain import label


def test_label():
    label_test = label(20, 0.01)

    assert label_test != ''
    assert label_test == 'Epoch=20 Loss=0.01'
test/test_train_load.py (Normal file, 31 lines)
@@ -0,0 +1,31 @@
import os

import pandas as pd

from train.load import DataSet, filter_outliers

IN_PATH: str = os.path.join(os.path.dirname(__file__), '..', 'dataset', 'extracted', 'functions.pq')
IN_PATH_USI: str = os.path.join(os.path.dirname(__file__), '..', 'dataset', 'extracted', 'test_set_usi.csv')
RANDOM_STATE: int = 42


def test_dataset_load():
    ds = DataSet.load(IN_PATH, IN_PATH_USI, RANDOM_STATE)
    assert len(set(ds.fine_tune_val_df.index).intersection(set(ds.fine_tune_test_df.index))) == 0


def test_filter_outliers():
    df = pd.DataFrame({'source': ['abc def', 'ab cd ef', 'a b c d e f g h i j k l']})
    filtered_df = filter_outliers(df)

    assert 'token_length' in filtered_df.columns
    assert len(filtered_df) < len(df)
    assert len(filtered_df) > 0


def test_filter_outliers_non_utf_8():
    df = pd.DataFrame({'source': [b'\xff']})
    filtered_df = filter_outliers(df)

    assert 'token_length' in filtered_df.columns
    assert filtered_df.iloc[0]['token_length'] == 0
test/test_train_mask.py (Normal file, 122 lines)
@@ -0,0 +1,122 @@
import pytest
import swifter
import pandas as pd

from train.mask import FineTrainInstance, strip_parentheses, mask_conditions


@pytest.fixture
def sample_function():
    return """
def stream_edit(request, stream_id, response_format="html"):
    "Stream edit page"
    user = request.user.profile
    stream = get_object_or_404(MessageStream, pk=stream_id)
    if not request.user.profile.has_permission(stream, mode="w"):
        return user_denied(
            request,
            message="You don't have access to this Stream",
            response_format=response_format,
        )
    if request.POST:
        if "cancel" not in request.POST:
            form = MessageStreamForm(user, request.POST, instance=stream)
            if form.is_valid():
                stream = form.save()
                return HttpResponseRedirect(
                    reverse("messaging_stream_view", args=[stream.id])
                )
        else:
            return HttpResponseRedirect(
                reverse("messaging_stream_view", args=[stream.id])
            )
    else:
        form = MessageStreamForm(user, instance=stream)
    context = _get_default_context(request)
    context.update({"form": form, "stream": stream})
    return render_to_response(
        "messaging/stream_edit",
        context,
        context_instance=RequestContext(request),
        response_format=response_format,
    )
"""


@pytest.fixture
def sample_function_with_error():
    return """
def ciao_mamma():
    if 1 > 2:
        print("ciao")
    else if 1 < 2:
        print("ok")
    else:
        return
"""


@pytest.fixture
def sample_dataframe():
    data = {'source': ['if x > 0: pass', 'if (a and b) or c: pass']}
    df = pd.DataFrame(data)

    return df


@pytest.fixture
def sample_dataframe_usi():
    data = {'input_method': ['<fill-in> pass', '<fill-in> pass'], 'target_block': ['if x > 0 :', 'if (a and b) or c :']}
    df = pd.DataFrame(data)

    return df


def test_mask_does_not_crash(sample_function):
    instances = FineTrainInstance.from_function(sample_function)
    assert len(instances) == 4


def test_mask_with_syntax_error(sample_function_with_error):
    instances = FineTrainInstance.from_function(sample_function_with_error)
    assert instances == []


def test_strip_parentheses_balanced():
    balanced = '("ok")'
    stripped = strip_parentheses(balanced)

    assert "(" not in stripped and ")" not in stripped
    assert stripped == '"ok"'


def test_strip_parentheses_unbalanced():
    unbalanced = '("ok"))'
    stripped = strip_parentheses(unbalanced)

    assert unbalanced == stripped


def test_mask_conditions(sample_dataframe):
    result_df = mask_conditions(sample_dataframe, kind='test')

    assert len(result_df) == 2
    assert 'masked_code' in result_df.columns
    assert 'ground_truth' in result_df.columns
    assert '<mask>' in result_df['masked_code'].iloc[0]
    assert '<mask>' in result_df['masked_code'].iloc[1]
    assert result_df['ground_truth'].iloc[0] == 'x > 0'
    assert result_df['ground_truth'].iloc[1] == 'a and b or c'


def test_mask_conditions_usi(sample_dataframe_usi):
    result_df = mask_conditions(sample_dataframe_usi, kind='test_usi')

    print(result_df)
    assert len(result_df) == 2
    assert 'masked_code' in result_df.columns
    assert 'ground_truth' in result_df.columns
    assert '<mask>' in result_df['masked_code'].iloc[0]
    assert '<mask>' in result_df['masked_code'].iloc[1]
    assert result_df['ground_truth'].iloc[0] == 'x > 0'
    assert result_df['ground_truth'].iloc[1] == 'a and b or c'
train/dataset.py (Normal file, 78 lines)
@@ -0,0 +1,78 @@
from typing import Literal, Optional

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, PreTrainedTokenizer, DataCollatorForLanguageModeling

from train.mask import mask_conditions

TOKENIZER: PreTrainedTokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')

BATCH_SIZE: int = 8
MAX_TOKEN_LENGTH: int = 512

# Probability of masking a token during pretraining
PRETRAIN_MLM_PROB: float = 0.15


class PythonCodeDataset(Dataset):
    """Dataset class for pre-training"""

    def __init__(self, tokenizer, dataframe, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        code = self.data.iloc[index]['source']
        inputs = self.tokenizer.encode_plus(code, None, add_special_tokens=True, max_length=self.max_len,
                                            padding='max_length', truncation=True)
        return {'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
                'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long)}


def build_pretrain_dataloader(pretrain_df: pd.DataFrame) -> DataLoader:
    pretrain_dataset = PythonCodeDataset(TOKENIZER, pretrain_df, MAX_TOKEN_LENGTH)
    data_collator = DataCollatorForLanguageModeling(tokenizer=TOKENIZER, mlm=True, mlm_probability=PRETRAIN_MLM_PROB)
    return DataLoader(pretrain_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=data_collator)


class MaskedIfDataset(PythonCodeDataset):
    """Dataset class for fine-tuning and evaluation"""

    def __getitem__(self, index):
        masked_code = self.data.iloc[index]['masked_code']
        ground_truth = self.data.iloc[index]['ground_truth']
        inputs = self.tokenizer(masked_code, max_length=self.max_len, padding='max_length', truncation=True,
                                return_tensors="pt")
        labels = self.tokenizer(ground_truth, max_length=self.max_len, padding='max_length', truncation=True,
                                return_tensors="pt").input_ids
        labels[labels == self.tokenizer.pad_token_id] = -100
        return {'input_ids': inputs.input_ids.squeeze(),
                'attention_mask': inputs.attention_mask.squeeze(),
                'labels': labels.squeeze()}


Kind = Literal['train', 'val', 'test', 'test_usi']


def build_fine_tune_dataloader(df: pd.DataFrame, kind: Kind) -> DataLoader:
    print(f"{kind} dataset: {len(df.index)} functions found")
    df = mask_conditions(df, kind)
    print(f"{kind} dataset: {len(df.index)} conditions found")

    dataset = MaskedIfDataset(TOKENIZER, df, MAX_TOKEN_LENGTH)
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=(kind == 'train'))


def decode_tokenized(output) -> Optional[str]:
    label_trimmed = [token for token in output if token != TOKENIZER.pad_token_id and token != -100]
    if label_trimmed:
        return TOKENIZER.decode(label_trimmed, skip_special_tokens=True)
    else:
        return None
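A brief usage sketch of the two builders above on toy DataFrames (illustration only; the column names follow what the code above expects):

import pandas as pd
from train.dataset import build_pretrain_dataloader, build_fine_tune_dataloader

# Pretraining expects a 'source' column holding raw function bodies.
pretrain_df = pd.DataFrame({'source': ['def f(a):\n    return a + 1']})
for batch in build_pretrain_dataloader(pretrain_df):
    print(batch['input_ids'].shape)   # (batch, 512), with ~15% of tokens masked by the collator
    break

# Fine-tuning masks the if conditions itself, starting from raw sources.
fine_df = pd.DataFrame({'source': ['def g(x):\n    if x > 0:\n        return x']})
for batch in build_fine_tune_dataloader(fine_df, 'train'):
    print(batch['labels'].shape)      # (batch, 512); pad positions replaced with -100
    break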
train/evaluate.py (Normal file, 92 lines)
@@ -0,0 +1,92 @@
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from train.dataset import decode_tokenized, TOKENIZER
from train.mask import normalize_condition


def compute_accuracy(outputs, batch, track_predictions=False, confidences=None) -> tuple[int, int, list[dict]]:
    correct_predictions, total_predictions = 0, 0

    decoded_outputs = [TOKENIZER.decode(output, skip_special_tokens=True) for output in outputs]

    if track_predictions:
        confidences = confidences.tolist()
    else:
        confidences = [None] * len(decoded_outputs)

    tracking = []

    # Decode labels with added check for None values
    decoded_labels = []
    for label in batch['labels']:
        decoded_labels.append(decode_tokenized(label))

    # Calculate accuracy
    for output, label, confidence, in_ids in zip(decoded_outputs, decoded_labels, confidences, batch['input_ids']):
        is_correct = None
        if label is not None:
            total_predictions += 1
            is_correct = False

            output = normalize_condition(output)
            label = normalize_condition(label)

            if output == label:
                correct_predictions += 1
                is_correct = True

        if track_predictions:
            tracking.append({
                'input': TOKENIZER.decode(in_ids)
                    .replace("<pad>", "")
                    .replace("<s>", "")
                    .replace("</s>", ""),
                'is_correct': is_correct,
                'expected_cond': label,
                'predicted_cond': output,
                'score': confidence
            })

    return correct_predictions, total_predictions, tracking


def evaluate_accuracy(model, dataloader: DataLoader, device, track_predictions=False) -> tuple[float, float, list[dict]]:
    """Returns the accuracy and loss on the given validation set"""

    model.eval()
    total_loss = 0

    correct_predictions, total_predictions = 0, 0

    tracking = []

    for batch in tqdm(dataloader, desc="Evaluating"):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask'],
                      'labels': batch['labels']}
            outputs = model(**inputs)
            generation = model.generate(batch['input_ids'], attention_mask=batch['attention_mask'], max_length=512)

        # Compute prediction score as inverse entropy of the T5 model logits. If their softmax has values
        # close to 0 or 1, confidence is high
        loss, logits = outputs[:2]

        logits_flattened = logits.flatten(1, 2)
        probabilities = torch.nn.functional.softmax(logits_flattened, dim=1)
        entropy = -torch.sum(probabilities * torch.log(probabilities), dim=1)
        confidence = 1.0 - entropy / torch.log(torch.tensor(probabilities.size(dim=1)))

        total_loss += loss.item()

        c_pred, total_pred, t = compute_accuracy(generation, batch, track_predictions, confidence)
        correct_predictions += c_pred
        total_predictions += total_pred
        tracking.extend(t)

    return (correct_predictions / total_predictions if total_predictions > 0 else 0,
            total_loss / len(dataloader),
            tracking)
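To make the inverse-entropy score concrete, a small self-contained sketch of the same normalization on toy probability vectors (illustration only, not part of the repository):

import torch

def confidence(probabilities: torch.Tensor) -> torch.Tensor:
    """1 minus normalized entropy: near 1 for a peaked distribution, 0 for uniform."""
    entropy = -torch.sum(probabilities * torch.log(probabilities), dim=-1)
    return 1.0 - entropy / torch.log(torch.tensor(float(probabilities.size(-1))))

peaked = torch.tensor([0.97, 0.01, 0.01, 0.01])
uniform = torch.tensor([0.25, 0.25, 0.25, 0.25])
print(confidence(peaked))   # ~0.88: the model is fairly "sure"
print(confidence(uniform))  # 0.0: maximum uncertainty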
train/finetune.py (Normal file, 142 lines)
@@ -0,0 +1,142 @@
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from train.evaluate import evaluate_accuracy, compute_accuracy

ADAM_LR: float = 5e-5
EARLY_STOPPING_PATIENCE: int = 3
PLOT_FILENAME: str = "training_metrics.png"


def fine_tune_with_eval(model, device, train_loader: DataLoader, val_loader: DataLoader, epochs: int,
                        save_dir: str) -> int:
    optimizer = torch.optim.AdamW(model.parameters(), lr=ADAM_LR)
    best_epoch = 0
    best_eval_accuracy = 0
    patience_counter = 0
    train_accuracies, train_losses, val_accuracies, val_losses = [], [], [], []

    print(f"Tuning for {epochs} epochs")

    max_epoch = 0

    for epoch in range(epochs):
        max_epoch = epoch

        model.train()
        total_loss = 0

        correct_predictions, total_predictions = 0, 0
        train_accuracy = 0

        train_dl = tqdm(train_loader, desc=f"Train E{epoch + 1}/{epochs}")
        i = 0

        # Training loop with tqdm for progress tracking
        for batch in train_dl:
            i += 1
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask'],
                      'labels': batch['labels']}
            outputs = model(**inputs)
            generation = model.generate(batch['input_ids'], attention_mask=batch['attention_mask'], max_length=512)

            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

            # Update train accuracy
            c_pred, total_pred, _ = compute_accuracy(generation, batch)
            correct_predictions += c_pred
            total_predictions += total_pred
            train_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0

            train_dl.set_description(f"Train E{epoch + 1}/{epochs} loss={total_loss / i:.6f} acc={train_accuracy:.4f}")

        average_loss = total_loss / len(train_loader)
        train_losses.append(average_loss)
        train_accuracies.append(train_accuracy)

        # Evaluate on the validation set
        val_accuracy, val_loss, _ = evaluate_accuracy(model, val_loader, device)
        val_accuracies.append(val_accuracy)
        val_losses.append(val_loss)
        print(f"E{epoch + 1}/{epochs}, Train Loss: {average_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

        # Early stopping and checkpointing
        if val_accuracy > best_eval_accuracy:
            best_eval_accuracy = val_accuracy
            best_epoch = epoch
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= EARLY_STOPPING_PATIENCE:
                print("Early stopping triggered.")
                break

        save_directory = os.path.join(save_dir, str(epoch))
        model.save_pretrained(save_directory)

    df = pd.DataFrame({'train_loss': train_losses,
                       'train_acc': train_accuracies,
                       'val_loss': val_losses,
                       'val_acc': val_accuracies},
                      index=list(range(1, max_epoch + 2)))

    df.to_csv(os.path.join(save_dir, "stats.csv"))

    return best_epoch


def plot_loss_acc(train_losses: list[float], val_losses: list[float], train_accuracies: list[float],
                  val_accuracies: list[float], save_path: str):
    plt.figure(figsize=(12, 10))

    plt.subplot(2, 2, 1)
    plt.plot(train_losses, label='Training Loss')
    plt.title('Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.xticks(np.arange(0, 20, step=1))
    plt.yticks(np.arange(0, 1.4, step=0.2))
    plt.legend()

    plt.subplot(2, 2, 2)
    plt.plot(val_losses, label='Validation Loss', color='orange')
    plt.title('Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.xticks(np.arange(0, 20, step=1))
    plt.yticks(np.arange(0, 1.4, step=0.2))
    plt.legend()

    plt.subplot(2, 2, 3)
    plt.plot(train_accuracies, label='Training Accuracy', color='green')
    plt.title('Training Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.xticks(np.arange(0, 20, step=1))
    plt.yticks(np.arange(0, 1.1, step=0.1))
    plt.legend()

    plt.subplot(2, 2, 4)
    plt.plot(val_accuracies, label='Validation Accuracy', color='red')
    plt.title('Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.xticks(np.arange(0, 20, step=1))
    plt.yticks(np.arange(0, 1.1, step=0.1))
    plt.legend()

    plt.tight_layout()

    plt.savefig(os.path.join(save_path, PLOT_FILENAME))
train/load.py (Normal file, 82 lines)
@@ -0,0 +1,82 @@
from dataclasses import dataclass
from io import BytesIO
from tokenize import tokenize

import pandas as pd
import swifter  # registers the pandas .swifter accessor used below
from sklearn.model_selection import train_test_split

# PreTrain/Train = 1-FINE_TUNE_FRAC/FINE_TUNE_FRAC split
FINE_TUNE_FRAC: float = 0.5

# Splits for the fine tune dataset
# TRAIN_FRAC = 1-VAL_FRAC-TEST_FRAC
TEST_FRAC: float = 0.1
VAL_FRAC: float = 0.1

LOW_PERCENTILE: float = 0.25
HIGH_PERCENTILE: float = 0.75


@dataclass
class DataSet:
    pretrain_df: pd.DataFrame
    fine_tune_train_df: pd.DataFrame
    fine_tune_val_df: pd.DataFrame
    fine_tune_test_df: pd.DataFrame
    usi_test_df: pd.DataFrame

    @staticmethod
    def load(dataset_path: str, usi_test_dataset_path: str, random_state: int):
        df = pd.read_parquet(dataset_path)
        # df = df.iloc[0:100, :]  # debug
        df = filter_outliers(df)

        pretrain_df, fine_tune_df = train_test_split(df, test_size=FINE_TUNE_FRAC, random_state=random_state)
        val_df = fine_tune_df.sample(frac=VAL_FRAC, random_state=random_state)
        test_df = fine_tune_df.drop(val_df.index).sample(frac=TEST_FRAC / (1 - VAL_FRAC), random_state=random_state)
        fine_tune_df = fine_tune_df.drop(val_df.index).drop(test_df.index)
        usi_test_df = pd.read_csv(usi_test_dataset_path, index_col=0)
        # usi_test_df = usi_test_df.iloc[0:10, :]  # debug

        print(f"pretrain dataset: {len(pretrain_df.index)} functions loaded")
        print(f"finetune train dataset: {len(fine_tune_df.index)} functions loaded")
        print(f"finetune val dataset: {len(val_df.index)} functions loaded")
        print(f"finetune test dataset: {len(test_df.index)} functions loaded")

        return DataSet(pretrain_df, fine_tune_df, val_df, test_df, usi_test_df)


def filter_outliers(df: pd.DataFrame) -> pd.DataFrame:
    assert swifter is not None

    def count_tokens(s: str) -> int:
        try:
            count = 0
            for _ in tokenize(BytesIO(s.encode('utf-8')).readline):
                count += 1
            return count
        except Exception:
            return 0

    df["token_length"] = df["source"].swifter.apply(count_tokens)

    low_qty = df["token_length"].quantile(LOW_PERCENTILE)
    mask_low = df["token_length"] < low_qty
    print(f"Excluding token lengths < {low_qty} ({LOW_PERCENTILE * 100:02.02f}%ile): {sum(mask_low)} instances")

    high_qty = df["token_length"].quantile(HIGH_PERCENTILE)
    mask_high = df["token_length"] > high_qty
    print(f"Excluding token lengths > {high_qty} ({HIGH_PERCENTILE * 100:02.02f}%ile): {sum(mask_high)} instances")

    return df[~mask_high & ~mask_low]
train/mask.py (Normal file, 125 lines)
@@ -0,0 +1,125 @@
import ast
from dataclasses import dataclass

import pandas as pd
import swifter  # registers the pandas .swifter accessor used below
from tqdm import tqdm

MASK_TOKEN: str = "<mask>"


def strip_parentheses(input_string):
    input_string = input_string.strip()
    if input_string.startswith('(') and input_string.endswith(')'):
        # Dyck Language algorithm: strip the outer pair only if it is balanced
        # and actually encloses the whole string
        paren_count: int = 0
        for i, char in enumerate(input_string):
            if char == '(':
                paren_count += 1
            elif char == ')':
                paren_count -= 1
                if paren_count < 0:
                    return input_string  # closing a non-matching open paren
                if paren_count == 0 and i != len(input_string) - 1:
                    return input_string  # the outer '(' closes before the end, e.g. '(a) and (b)'

        if paren_count == 0:
            return input_string[1:-1]  # strip if parens are balanced

    return input_string


@dataclass
class FineTrainInstance:
    masked_function: str
    condition: str

    @staticmethod
    def from_function(function: str) -> list['FineTrainInstance']:
        try:
            tree = ast.parse(function)
        except SyntaxError:
            return []

        instances: list['FineTrainInstance'] = []

        for t in ast.walk(tree):
            if isinstance(t, ast.If):
                # swap in place the condition with the mask node
                cond = t.test

                # Replace the condition with a mask node and build the masked function source.
                # "<mask>" is treated as a variable rvalue, which can never appear in real source
                # code since it is not a valid identifier; however, this makes ast.unparse happily
                # print "<mask>" as if it were one
                t.test = ast.Name(id=MASK_TOKEN, ctx=ast.Load())
                masked_fun = ast.unparse(tree)

                instances.append(FineTrainInstance(masked_fun, strip_parentheses(ast.unparse(cond))))

                # restore the condition
                t.test = cond

        return instances


def normalize_condition(c: str) -> str:
    c = c.strip()

    try:
        # reformat if syntax is parsable, otherwise return as-is
        return strip_parentheses(ast.unparse(ast.parse(c)))
    except SyntaxError:
        return c


def mask_conditions(df_source: pd.DataFrame, kind: str) -> pd.DataFrame:
    if kind != 'test_usi':
        df = pd.DataFrame(columns=['masked_code', 'ground_truth'])
        instances = df_source["source"].swifter.apply(lambda s: FineTrainInstance.from_function(s))

        i = 0
        for row in tqdm(instances, desc=f"Building {kind}", total=len(df_source.index)):
            for instance in row:
                df.loc[i, 'masked_code'] = instance.masked_function
                df.loc[i, 'ground_truth'] = instance.condition
                i += 1
    else:
        df = pd.DataFrame(columns=['masked_code', 'ground_truth'], index=df_source.index)

        def canonicalize(c: str) -> pd.Series:
            prefixes = ["if ", "elif "]
            found_prefix = ""
            postfix = ":"

            c = c.strip()

            for prefix in prefixes:
                if c.startswith(prefix):
                    c = c[len(prefix):]
                    found_prefix = prefix
                    break

            if c.endswith(postfix):
                c = c[:len(c) - len(postfix)]

            c = normalize_condition(c)

            return pd.Series([found_prefix, c], index=['found_prefix', 'c'])

        # Canonicalize condition string
        df[['prefix', 'ground_truth']] = df_source['target_block'].swifter.apply(canonicalize)
        df['masked_code'] = df_source['input_method'].copy()

        # Our model is only able to predict the if condition itself, so we re-inject the "if"/"elif" and ":" token
        # back in the input
        df['masked_code'] = df[['prefix', 'masked_code']] \
            .apply(lambda s: s['masked_code'].replace("<fill-in>", s['prefix'] + " " + MASK_TOKEN + " :"), axis=1)

    return df
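A brief usage sketch of the masking helper above (the sample function is made up; expected output shown in comments):

from train.mask import FineTrainInstance

source = "def check(x):\n    if x > 0:\n        return x"
for instance in FineTrainInstance.from_function(source):
    print(instance.condition)        # x > 0
    print(instance.masked_function)  # def check(x):
                                     #     if <mask>:
                                     #         return x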
train/pretrain.py (Normal file, 31 lines)
@@ -0,0 +1,31 @@
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

ADAM_LR: float = 5e-5


def label(epoch: int, loss: float) -> str:
    return f"Epoch={epoch} Loss={loss}"


def pretrain(model, dataloader: DataLoader, device, epochs: int, save_dir: str):
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=ADAM_LR)

    print(f"Pretraining for {epochs} epochs")

    for epoch in range(epochs):
        with tqdm(dataloader, desc=f"Epoch {epoch + 1}") as pbar:
            for batch in pbar:
                batch = {k: v.to(device) for k, v in batch.items()}
                optimizer.zero_grad()
                inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']}
                # Use the MLM collator's labels (original tokens at masked positions,
                # -100 elsewhere) as the prediction target, per the masking objective
                outputs = model(**inputs, labels=batch['labels'])
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                pbar.set_description(label(epoch + 1, loss.item()))

    model.save_pretrained(save_dir)
train_model.py (Normal file, 76 lines)
@@ -0,0 +1,76 @@
import os

import pandas as pd

# Select the GPU before torch is imported
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import torch
from transformers import T5ForConditionalGeneration

from train.evaluate import evaluate_accuracy
from train.finetune import fine_tune_with_eval
from train.dataset import build_pretrain_dataloader, build_fine_tune_dataloader
from train.pretrain import pretrain
from train.load import DataSet

IN_PATH: str = os.path.join(os.path.dirname(__file__), 'dataset', 'extracted', 'functions.pq')
IN_PATH_USI: str = os.path.join(os.path.dirname(__file__), 'dataset', 'extracted', 'test_set_usi.csv')
OUT_PATH: str = os.path.join(os.path.dirname(__file__), 'models', 'final')

RANDOM_STATE: int = 42


def train():
    dataset = DataSet.load(IN_PATH, IN_PATH_USI, RANDOM_STATE)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Using device: {device}')

    pretrain_dir = os.path.join(OUT_PATH, "pretrain")

    if os.path.isfile(os.path.join(pretrain_dir, "config.json")):
        # load the pretrained model if it exists
        model = T5ForConditionalGeneration.from_pretrained(pretrain_dir)
        model.to(device)
    else:
        # Pre-train the model
        model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')
        model.to(device)
        pretrain_loader = build_pretrain_dataloader(dataset.pretrain_df)
        pretrain(model, pretrain_loader, device, 1, pretrain_dir)

    # Dataloaders for fine-tuning and validation
    best_epoch_file = os.path.join(OUT_PATH, "best.txt")

    if not os.path.isfile(best_epoch_file):
        fine_tune_loader = build_fine_tune_dataloader(dataset.fine_tune_train_df, 'train')
        eval_loader = build_fine_tune_dataloader(dataset.fine_tune_val_df, 'val')

        best_epoch = fine_tune_with_eval(model, device, fine_tune_loader, eval_loader, 20, OUT_PATH)

        with open(best_epoch_file, "w") as f:
            f.write(str(best_epoch) + "\n")

    # Load model for best epoch
    with open(best_epoch_file, "r") as f:
        best_epoch = int(f.read().strip())
    best_model_directory = os.path.join(OUT_PATH, str(best_epoch))
    best_model = T5ForConditionalGeneration.from_pretrained(best_model_directory)
    best_model.to(device)

    test_loader = build_fine_tune_dataloader(dataset.fine_tune_test_df, 'test')
    test_usi_loader = build_fine_tune_dataloader(dataset.usi_test_df, 'test_usi')

    # Evaluate the model on the test set
    test_accuracy, _, test_outs = evaluate_accuracy(best_model, test_loader, device, track_predictions=True)
    pd.DataFrame.from_records(test_outs).to_csv(os.path.join(OUT_PATH, 'test_outputs.csv'))
    print(f"Test Accuracy: {test_accuracy * 100:02.02f}%")

    # Evaluate the model on the usi test set
    test_accuracy, _, test_usi_outs = evaluate_accuracy(best_model, test_usi_loader, device, track_predictions=True)
    pd.DataFrame.from_records(test_usi_outs).to_csv(os.path.join(OUT_PATH, 'test_usi_outputs.csv'))
    print(f"USI Test Accuracy: {test_accuracy * 100:02.02f}%")


if __name__ == "__main__":
    train()