Final version of the project

History has been rewritten to delete large files from the repository
Claudio Maggioni 2024-01-03 15:25:41 +01:00
commit a4ceee8716
93 changed files with 215857 additions and 0 deletions

464
.gitignore vendored Normal file

@ -0,0 +1,464 @@
/dataset/download/*.zip
/dataset/functions/*.pq
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
**/latex/
/models/test
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
**/.DS_Store
out/model/*.pt
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb
## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf
## Generated if empty string is given at "Please type another file name for output:"
**/*.pdf
## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml
## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync
## Build tool directories for auxiliary files
# latexrun
latex.out/
## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa
# achemso
acs-*.bib
# amsthm
*.thm
# beamer
*.nav
*.pre
*.snm
*.vrb
# changes
*.soc
# comment
*.cut
# cprotect
*.cpt
# elsarticle (documentclass of Elsevier journals)
*.spl
# endnotes
*.ent
*.lox
# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm
#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R
# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs
*.slg
*.slo
*.sls
# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist
# gnuplot
*.gnuplot
*.table
# gnuplottex
*-gnuplottex-*
# gregoriotex
*.gaux
*.glog
*.gtex
# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref
# hyperref
*.brf
# knitr
*-concordance.tex
# *.tikz
*-tikzDictionary
# listings
*.lol
# luatexja-ruby
*.ltjruby
# makeidx
*.idx
*.ilg
*.ind
# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*
# minted
_minted*
*.pyg
# morewrites
*.mw
# newpax
*.newpax
# nomencl
*.nlg
*.nlo
*.nls
# pax
*.pax
# pdfpcnotes
*.pdfpc
# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd
# scrwfile
*.wrt
# svg
svg-inkscape/
# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/
# pdfcomment
*.upa
*.upb
# pythontex
*.pytxcode
pythontex-files-*/
# tcolorbox
*.listing
# thmtools
*.loe
# TikZ & PGF
*.dpth
*.md5
*.auxlock
# titletoc
*.ptc
# todonotes
*.tdo
# vhistory
*.hst
*.ver
*.lod
# xcolor
*.xcp
# xmpincl
*.xmpi
# xindy
*.xdy
# xypic precompiled matrices and outlines
*.xyc
*.xyd
# endfloat
*.ttt
*.fff
# Latexian
TSWLatexianTemp*
## Editors:
# WinEdt
*.bak
*.sav
# Texpad
.texpadtmp
# LyX
*.lyx~
# Kile
*.backup
# gummi
.*.swp
# KBibTeX
*~[0-9]*
# TeXnicCenter
*.tps
# auto folder when using emacs and auctex
./auto/*
*.el
# expex forward references with \gathertags
*-tags.tex
# standalone packages
*.sta
# Makeindex log files
*.lpz
# xwatermark package
*.xwm
# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are then stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib

88
README.md Normal file

@ -0,0 +1,88 @@
# Assignment 2: If statements
**Group 2: Baris Aksakal, Edoardo Riggio, Claudio Maggioni**
## Repository Structure
- `/dataset`: code and data related to scraping repositories from GitHub;
- `/models`:
  - `/baris`: code and persisted model of the original architecture built by
    Baris. `model_0.1.ipynb` and `test_model.ipynb` are respectively an
    earlier and a later iteration of the code used to train this model;
  - `/final`: persisted model for the final architecture with training and
    test evaluation statistics;
    - `/test_outputs.csv`: CSV deliverable for the evaluation on the test
      set we extracted;
    - `/test_usi_outputs.csv`: CSV deliverable for the evaluation on the
      provided test set.
- `/test`: unit tests for the model training scripts;
- `/train`: dependencies of the main model training script;
- `/train_model.py`: main model training script;
- `/plot_acc.py`: accuracy statistics plotting script.
## Environment Setup
Running the scraping and training scripts requires Python 3.10 or greater.
Dependencies can be installed in a virtual environment by running:
```shell
python3 -m venv .env
source .env/bin/activate
pip install -r requirements.txt
```
## Dataset Extraction
Please refer to [the README.md file in `/dataset`](dataset/README.md) for
documentation on the dataset extraction process.
## Model Training
Model training can be performed by running the script:
```shell
python3 train_model.py
```
The script is able to resume fine-tuning if the pretraining phase was
completed by a previous execution, and to skip directly to model evaluation
on the two test sets if fine-tuning was already completed.
The persisted pretrained model is located in `/models/final/pretrain`. Each
epoch of the fine-tuning process is persisted at `/models/final/<N>`, where
`<N>` is the epoch number starting from 0. The number of the epoch selected
by the early stopping process is stored in `/models/final/best.txt`.
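The skipping logic essentially probes for these persisted artifacts. A
minimal sketch of the idea, with a hypothetical `next_stage` helper (the
actual checks live in `train_model.py`):
```python
import os

MODELS_DIR = "models/final"


def next_stage(models_dir: str = MODELS_DIR) -> str:
    """Pick the stage to run based on which artifacts already exist on disk."""
    if not os.path.isdir(os.path.join(models_dir, "pretrain")):
        return "pretrain"   # no persisted pretrained model yet
    if not os.path.isfile(os.path.join(models_dir, "best.txt")):
        return "fine-tune"  # pretraining done, early stopping epoch not recorded
    return "evaluate"       # fine-tuning complete: evaluate on the two test sets


print(next_stage())
```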
`/models/final/stats.csv` stores the training and validation loss and
accuracy statistics collected during training. `/models/final/test_outputs.csv`
is the CSV deliverable for the evaluation on the test set we extracted, while
`/models/final/test_usi_outputs.csv` is the CSV deliverable for the evaluation
on the provided test set.
The stdout of the training script is captured in
`/models/final/train_log.txt`.
### Plots
The train and validation loss and accuracy plots can be generated from
`/models/final/stats.csv` with the following command:
```shell
python3 plot_acc.py
```
The output is stored in `/models/final/training_metrics.png`.
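For reference, the plotting step amounts to something like the sketch below,
assuming `stats.csv` has one row per epoch; the column names used here are
hypothetical and must match the header actually written by `train_model.py`:
```python
import pandas as pd
import matplotlib.pyplot as plt

stats = pd.read_csv("models/final/stats.csv")

fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 5))
# Hypothetical column names: adjust them to the header of stats.csv.
stats.plot(y=["train_loss", "val_loss"], ax=ax_loss, title="Loss", xlabel="Epoch")
stats.plot(y=["train_acc", "val_acc"], ax=ax_acc, title="Accuracy", xlabel="Epoch")
fig.savefig("models/final/training_metrics.png")
```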
## Report
To compile the report, run:
```shell
cd report
pdflatex -interaction=nonstopmode -output-directory=. main.tex
pdflatex -interaction=nonstopmode -output-directory=. main.tex
```
The report is then located in `report/main.pdf`.

78
dataset/README.md Normal file

@ -0,0 +1,78 @@
# Dataset Download Instructions
## Project .zip Export
We scraped GitHub repositories using the download tool https://seart-ghs.si.usi.ch/ to generate the `results.csv` file
under this directory. Other than the default constraints applied by the `seart-ghs` crawler, we used the following
criteria:
- lines of code: >=10000
- language: `Python`
We found 21269 results. We then downloaded a `.zip` archive of the main branch of each repository using the following
command. We started the download process on 2023-11-13 at 12:00.
```shell
mkdir download || true
cat results.csv | \
awk -F, 'NR>1 { print "wget -O " $2 ".zip https://github.com/" $2 "/archive/refs/heads/" $6 ".zip" }' | \
sed 's#\/#-#;s#\"##g' > download/to_download.sh
cd download
bash to_download.sh
```
### Manually Excluded Repos
We manually excluded the following repositories from our scraped dataset ("404" means that the repository was
inaccessible and could not be downloaded):
- `thorn-lab/coronavirus_structural_task_force` (too large, more than 6GiB)
- `feeicn/security-ppt` (too large, more than 9GiB)
- `salesforce/ai-economist` (404)
- `agiliumtrade/ai-metaapi-python-sdk` (404)
- `pokemonchw/dieloli` (harmful content)
- `thesnowguru/pytrader-python-mt4-mt5-trading-api-connector-drag-n-drop` (DMCA takedown)
- `objectiv/objectiv-analytics` (404)
- `aws/solutions-aws-security-hub-automated-response-and-remediation` (404)
- `openunited/product-factory-backend` (404)
- `ibm-epbl/ibm-project-43602-1660718377` (404)
- `ibm-epbl/ibm-project-1392-1658386621` (404)
- `potatolondon/django-gcloud-connectors` (404)
- `fortwoone/oracle-project` (404)
- `iperov/deepxtools` (404)
- `frequenz/floss-frequenz-sdk-python` (404)
### Check Archive Health
The following script was used to check the integrity of each downloaded `.zip` file.
```shell
cd download
find . -name '*.zip' \
-exec bash -c 'echo $0 $(unzip -l "$0" 2>/dev/null 1>/dev/null && echo "1" || echo "0")' \{\} \; \
> archive_health.txt
```
## Function Extraction
The following command builds a dataset from the archives saved in the `/download` subdirectory:
```shell
python3 ./extract.py
```
Functions are extracted with the Python `ast` module, which discards comments (but not docstrings). The script
generates one Parquet archive per project in the `/functions` directory.
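For instance, round-tripping a snippet through `ast.parse` and `ast.unparse` (which is what `extract.py` does for every
function definition it finds) keeps the docstring but drops the comments:
```python
import ast

snippet = '''
def greet(name):
    """Say hello."""
    # this comment will be dropped
    return f"Hello, {name}!"
'''

tree = ast.parse(snippet)
print(ast.unparse(tree))  # the comment is gone, the docstring survives
```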
As the dataset was large, the extraction script was terminated early; at termination, 70 million functions had been
extracted. Due to computing power limitations for model training, we then sampled only 500000 of the extracted
functions to build the training set. The sampling process reads the per-project archives in `/functions` and stores the
selected functions in the Parquet file `extracted/functions.pq`. The sampling script can be invoked with the command:
```shell
python3 sample.py
```
Together, the extraction and sampling steps guarantee that the selected functions have valid syntax for Python 3.10+
and that the code of each function contains only ASCII characters.
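A minimal sketch of these two checks, combining the parse filter from `extract.py` with the ASCII filter from
`sample.py` into one hypothetical helper:
```python
import ast


def is_acceptable(source: str) -> bool:
    """Accept a function source only if it parses and contains ASCII characters only."""
    if not source.isascii():
        return False
    try:
        ast.parse(source)
    except SyntaxError:
        return False
    return True


print(is_acceptable("def f(x):\n    return x + 1\n"))  # True
print(is_acceptable("def f():\n    return 'héllo'\n"))  # False: non-ASCII literal
```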

90
dataset/extract.py Normal file

@ -0,0 +1,90 @@
import ast
import os.path
import typing
import zipfile
from typing import Optional

import pandas as pd
from tqdm import tqdm
from fastparquet import write
import multiprocessing

PWD = os.path.dirname(__file__)
IN_DIR = os.path.join(PWD, "download")
OUT_DIR = os.path.join(PWD, "functions")


def read_functions(content, filename: str, zip_name: str) -> Optional[pd.DataFrame]:
    # Parse one .py file and collect the source of every function definition it contains.
    records = []
    try:
        tree = ast.parse(content.decode('utf-8'), filename=filename)
        for node in ast.walk(tree):
            if isinstance(node, ast.FunctionDef):
                f_source: str = ast.unparse(typing.cast(typing.Any, node))
                records.append({
                    "zip_filename": zip_name,
                    "py_filename": filename,
                    "source": f_source,
                    "success": True,
                    "error": None,
                })
    except Exception as e:
        # Record parse failures so they can be filtered out later.
        print(f"project '{zip_name}': error parsing '{filename}': {e}")
        records.append({
            "zip_filename": zip_name,
            "py_filename": filename,
            "source": "",
            "success": False,
            "error": str(e)
        })
    return pd.DataFrame.from_records(records)


def read_zip_file(zip_file: str):
    # Extract functions from every .py file in one repository archive and
    # persist them to a per-project Parquet file.
    out_path = os.path.join(OUT_DIR, os.path.basename(zip_file) + ".pq")
    df = pd.DataFrame(columns=["zip_filename", "py_filename", "source"])
    try:
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            info_list = [info for info in zip_ref.infolist() if info.filename.endswith('.py')]
            for info in tqdm(info_list, desc=os.path.basename(zip_file), ncols=0, position=None, leave=True):
                content = zip_ref.read(info.filename)
                df_file = read_functions(content, info.filename, zip_file)
                if df_file is not None:
                    df = pd.concat([df, df_file], ignore_index=True)
        write(out_path, df, compression='GZIP')
        return zip_file
    except Exception as e:
        print(e)


def read_clones(zip_dir: str):
    # Collect the archives that have not been processed yet and extract them in parallel.
    zip_files = []
    for a_file in tqdm(os.listdir(zip_dir), desc="Scan dir"):
        path = os.path.join(zip_dir, a_file)
        out_path = os.path.join(OUT_DIR, os.path.basename(path) + ".pq")
        if zipfile.is_zipfile(path) and not os.path.isfile(out_path):
            zip_files.append(path)

    num_processes = 192
    with multiprocessing.Manager():
        with multiprocessing.Pool(processes=num_processes) as pool:
            for _ in tqdm(pool.imap_unordered(read_zip_file, zip_files), desc="Read ZIPs",
                          unit="item", total=len(zip_files), position=None, leave=True):
                pass  # dummy iteration to consume multiprocessing iterator, needed to launch processes


def main():
    if not os.path.isdir(OUT_DIR):
        os.makedirs(OUT_DIR)
    read_clones(IN_DIR)


if __name__ == "__main__":
    main()


@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:923ad850a4eed1d026b26fedbd5ecd64cf2e4e0f4648108d4732ac0e8fe70eb8
size 72966215

File diff suppressed because it is too large

21270
dataset/results.csv Normal file

File diff suppressed because one or more lines are too long

68
dataset/sample.py Normal file

@ -0,0 +1,68 @@
import os
import random

import pandas as pd
from fastparquet import write
from tqdm import tqdm

PWD = os.path.dirname(__file__)
IN_DIR = os.path.join(PWD, "functions")
OUT_FILE = os.path.join(PWD, "extracted", "functions.pq")
OUT_SIZE = 500_000


def main():
    # Randomly sample OUT_SIZE functions from the per-project Parquet archives
    # produced by extract.py.
    out_dir = os.path.dirname(OUT_FILE)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    files = [f for f in os.listdir(IN_DIR) if f.endswith('.pq')]
    chosen: set[tuple[str, int]] = set()
    df = None
    with tqdm(desc="Sampling", total=OUT_SIZE) as p:
        while df is None or len(df.index) < OUT_SIZE:
            # Pick a random project archive and draw up to 1000 candidate rows from it.
            filename = random.choice(files)
            path = os.path.join(IN_DIR, filename)
            df_file = pd.read_parquet(path, engine='fastparquet')
            df_len = len(df_file.index)
            if df_len == 0:
                continue
            for _ in range(min(1000, df_len)):
                index = random.randrange(0, df_len)
                # Skip functions whose extraction failed.
                error_message = df_file.iloc[index]["error"]
                if error_message is not None and len(error_message) > 0:
                    continue
                # Skip functions containing non-ASCII characters.
                source = df_file.iloc[index]["source"]
                if not source.isascii():
                    continue
                choice = (filename, index)
                if choice not in chosen:
                    chosen.add(choice)
                    if df is None:
                        df = df_file.iloc[[index], :]
                    else:
                        df = pd.concat([df, df_file.iloc[[index], :]], axis=0, ignore_index=True)
                    # Periodically checkpoint the sample to disk.
                    if len(df.index) % 1000 == 0:
                        write(OUT_FILE, df, compression='GZIP')
                    p.update(1)

    # Fix success column, which is computed wrong in a previous iteration of extract.py
    df["success"] = df["error"].apply(lambda e: e is None or len(e) == 0)
    write(OUT_FILE, df, compression='GZIP')


if __name__ == "__main__":
    main()

111
environment.yml Normal file

@ -0,0 +1,111 @@
name: SA
channels:
- defaults
dependencies:
- abseil-cpp=20230802.0
- aiohttp=3.8.5
- aiosignal=1.2.0
- arrow-cpp=11.0.0
- async-timeout=4.0.2
- attrs=23.1.0
- aws-c-common=0.6.8
- aws-c-event-stream=0.1.6
- aws-checksums=0.1.11
- aws-sdk-cpp=1.8.185
- blas=1.0
- boost-cpp=1.82.0
- bottleneck=1.3.5
- brotli=1.0.9
- brotli-bin=1.0.9
- brotli-python=1.0.9
- bzip2=1.0.8
- c-ares=1.19.1
- ca-certificates=2023.08.22
- certifi=2023.11.17
- cffi=1.16.0
- charset-normalizer=2.0.4
- cramjam=2.6.2
- cryptography=41.0.3
- datasets=2.12.0
- dill=0.3.6
- fastparquet=2023.8.0
- filelock=3.13.1
- frozenlist=1.4.0
- fsspec=2023.9.2
- gflags=2.2.2
- glog=0.5.0
- grpc-cpp=1.48.2
- gtest=1.14.0
- huggingface_hub=0.17.3
- icu=73.1
- idna=3.4
- importlib-metadata=6.0.0
- krb5=1.20.1
- libboost=1.82.0
- libbrotlicommon=1.0.9
- libbrotlidec=1.0.9
- libbrotlienc=1.0.9
- libcurl=8.4.0
- libcxx=14.0.6
- libedit=3.1.20221030
- libev=4.33
- libevent=2.1.12
- libffi=3.4.4
- libgfortran=5.0.0
- libgfortran5=11.3.0
- libiconv=1.16
- libnghttp2=1.57.0
- libopenblas=0.3.21
- libprotobuf=3.20.3
- libssh2=1.10.0
- libthrift=0.15.0
- llvm-openmp=14.0.6
- lz4-c=1.9.4
- multidict=6.0.2
- multiprocess=0.70.14
- ncurses=6.4
- numexpr=2.8.7
- numpy=1.26.0
- numpy-base=1.26.0
- openssl=3.0.12
- orc=1.7.4
- packaging=23.1
- pandas=2.1.1
- pip=23.3.1
- pyarrow=11.0.0
- pycparser=2.21
- pyopenssl=23.2.0
- pysocks=1.7.1
- python=3.11.5
- python-dateutil=2.8.2
- python-tzdata=2023.3
- python-xxhash=2.0.2
- pytz=2023.3.post1
- pyyaml=6.0.1
- re2=2022.04.01
- readline=8.2
- regex=2023.10.3
- requests=2.31.0
- responses=0.13.3
- safetensors=0.4.0
- setuptools=68.0.0
- six=1.16.0
- snappy=1.1.9
- sqlite=3.41.2
- tk=8.6.12
- tokenizers=0.13.2
- tqdm=4.65.0
- transformers=4.32.1
- typing-extensions=4.7.1
- typing_extensions=4.7.1
- tzdata=2023c
- urllib3=1.26.18
- utf8proc=2.6.1
- wheel=0.41.2
- xxhash=0.8.0
- xz=5.4.2
- yaml=0.2.5
- yarl=1.8.1
- zipp=3.11.0
- zlib=1.2.13
- zstd=1.5.5

0
models/.gitkeep Normal file


@ -0,0 +1,69 @@
{
"_name_or_path": "Salesforce/codet5-small",
"architectures": [
"T5ForConditionalGeneration"
],
"bos_token_id": 1,
"classifier_dropout": 0.0,
"d_ff": 2048,
"d_kv": 64,
"d_model": 512,
"decoder_start_token_id": 0,
"dense_act_fn": "relu",
"dropout_rate": 0.1,
"eos_token_id": 2,
"feed_forward_proj": "relu",
"gradient_checkpointing": false,
"id2label": {
"0": "LABEL_0"
},
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": false,
"label2id": {
"LABEL_0": 0
},
"layer_norm_epsilon": 1e-06,
"model_type": "t5",
"n_positions": 512,
"num_decoder_layers": 6,
"num_heads": 8,
"num_layers": 6,
"output_past": true,
"pad_token_id": 0,
"relative_attention_max_distance": 128,
"relative_attention_num_buckets": 32,
"task_specific_params": {
"summarization": {
"early_stopping": true,
"length_penalty": 2.0,
"max_length": 200,
"min_length": 30,
"no_repeat_ngram_size": 3,
"num_beams": 4,
"prefix": "summarize: "
},
"translation_en_to_de": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to German: "
},
"translation_en_to_fr": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to French: "
},
"translation_en_to_ro": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to Romanian: "
}
},
"torch_dtype": "float32",
"transformers_version": "4.34.0",
"use_cache": true,
"vocab_size": 32100
}


@ -0,0 +1,8 @@
{
"_from_model_config": true,
"bos_token_id": 1,
"decoder_start_token_id": 0,
"eos_token_id": 2,
"pad_token_id": 0,
"transformers_version": "4.34.0"
}

3250
models/baris/model_0.1.ipynb Normal file

File diff suppressed because it is too large


@ -0,0 +1,449 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "5f7ad96a-6b01-4b63-93b6-4008597a0e9e",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using device: cuda\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating: 100%|███████████████████████████| 1092/1092 [04:02<00:00, 4.50it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Test Accuracy: 0.3642\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import re\n",
"import random\n",
"import torch\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from transformers import RobertaTokenizer, T5ForConditionalGeneration, DataCollatorForLanguageModeling\n",
"from sklearn.model_selection import train_test_split\n",
"from tqdm import tqdm\n",
"import os\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"# Dataset class for pre-training\n",
"class PythonCodeDataset(Dataset):\n",
" def __init__(self, tokenizer, dataframe, max_len=512):\n",
" self.tokenizer = tokenizer\n",
" self.data = dataframe\n",
" self.max_len = max_len\n",
"\n",
" def __len__(self):\n",
" return len(self.data)\n",
"\n",
" def __getitem__(self, index):\n",
" code = self.data.iloc[index]['source']\n",
" inputs = self.tokenizer.encode_plus(code, None, add_special_tokens=True, max_length=self.max_len, padding='max_length', truncation=True)\n",
" return {'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long), 'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long)}\n",
"\n",
"# Function to mask if conditions\n",
"def mask_if_condition(code_snippet):\n",
" if_conditions = re.findall(r'(if\\s+.*?:)', code_snippet)\n",
" masked_snippet = code_snippet.replace(if_conditions[0], '<mask>', 1) if if_conditions else code_snippet\n",
" return masked_snippet, if_conditions[0] if if_conditions else None\n",
"\n",
"# Fine-tuning and evaluation dataset classes\n",
"class MaskedIfDataset(PythonCodeDataset):\n",
" def __getitem__(self, index):\n",
" masked_code = self.data.iloc[index]['masked_code']\n",
" ground_truth = self.data.iloc[index]['ground_truth']\n",
" inputs = self.tokenizer(masked_code, max_length=self.max_len, padding='max_length', truncation=True, return_tensors=\"pt\")\n",
" labels = self.tokenizer(ground_truth, max_length=self.max_len, padding='max_length', truncation=True, return_tensors=\"pt\").input_ids\n",
" labels[labels == self.tokenizer.pad_token_id] = -100\n",
" return {'input_ids': inputs.input_ids.squeeze(), 'attention_mask': inputs.attention_mask.squeeze(), 'labels': labels.squeeze()}\n",
"\n",
"# Define the pre-training loop\n",
"def pretrain(model, dataloader, epochs, print_every=10):\n",
" model.train()\n",
" optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)\n",
" global_step = 0 # Initialize a counter for the global training step\n",
"\n",
" for epoch in range(epochs):\n",
" for batch in dataloader:\n",
" batch = {k: v.to(device) for k, v in batch.items()}\n",
" optimizer.zero_grad()\n",
" inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']}\n",
" outputs = model(**inputs, labels=batch['input_ids'])\n",
" loss = outputs.loss\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" if global_step % print_every == 0: # Print every steps\n",
" print(f\"Step {global_step}, Loss: {loss.item()}\")\n",
"\n",
" global_step += 1 # Increment the step counter\n",
"\n",
" print(f\"Epoch {epoch+1}/{epochs} completed.\")\n",
" \n",
"\n",
"def fine_tune_with_eval(model, train_loader, eval_loader, epochs, save_path, print_every=10, early_stopping_patience=3):\n",
" optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)\n",
" best_epoch = 0\n",
" best_eval_accuracy = 0\n",
" patience_counter = 0\n",
" train_losses, eval_accuracies = [], []\n",
"\n",
" for epoch in range(epochs):\n",
" model.train()\n",
" total_loss = 0\n",
"\n",
" # Training loop with tqdm for progress tracking\n",
" for batch in tqdm(train_loader, desc=f\"Training Epoch {epoch+1}/{epochs}\"):\n",
" batch = {k: v.to(device) for k, v in batch.items()}\n",
" optimizer.zero_grad()\n",
" inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask'], 'labels': batch['labels']}\n",
" outputs = model(**inputs)\n",
" loss = outputs.loss\n",
" total_loss += loss.item()\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" average_loss = total_loss / len(train_loader)\n",
" train_losses.append(average_loss)\n",
"\n",
" # Evaluate on the evaluation set\n",
" eval_accuracy = evaluate_accuracy(model, eval_loader, tokenizer, device)\n",
" eval_accuracies.append(eval_accuracy)\n",
" print(f\"Epoch {epoch+1}/{epochs}, Train Loss: {average_loss:.4f}, Eval Accuracy: {eval_accuracy:.4f}\")\n",
"\n",
" # Early stopping and checkpointing\n",
" if eval_accuracy > best_eval_accuracy:\n",
" best_eval_accuracy = eval_accuracy\n",
" best_epoch = epoch\n",
" patience_counter = 0\n",
" else:\n",
" patience_counter += 1\n",
" if patience_counter >= early_stopping_patience:\n",
" print(\"Early stopping triggered.\")\n",
" break\n",
" \n",
" save_directory = f\"{save_path}/{epoch}\"\n",
" model.save_pretrained(save_directory)\n",
" \n",
" # Plotting the training loss and evaluation accuracy\n",
" plt.figure(figsize=(12, 5))\n",
" plt.subplot(1, 2, 1)\n",
" plt.plot(train_losses, label='Training Loss')\n",
" plt.title('Training Loss')\n",
" plt.xlabel('Epoch')\n",
" plt.ylabel('Loss')\n",
" plt.legend()\n",
"\n",
" plt.subplot(1, 2, 2)\n",
" plt.plot(eval_accuracies, label='Evaluation Accuracy')\n",
" plt.title('Evaluation Accuracy')\n",
" plt.xlabel('Epoch')\n",
" plt.ylabel('Accuracy')\n",
" plt.legend()\n",
"\n",
" plt.savefig(f\"{save_path}/training_metrics.png\")\n",
" \n",
" return best_epoch\n",
"\n",
"\n",
"def evaluate_accuracy(model, dataloader, tokenizer, device):\n",
" model.eval()\n",
" correct_predictions, total_predictions = 0, 0\n",
"\n",
" for batch in tqdm(dataloader, desc=\"Evaluating\"):\n",
" batch = {k: v.to(device) for k, v in batch.items()}\n",
" with torch.no_grad():\n",
" outputs = model.generate(batch['input_ids'], attention_mask=batch['attention_mask'], max_length=512)\n",
" decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]\n",
"\n",
" # Decode labels with added check for None values\n",
" decoded_labels = []\n",
" for label in batch['labels']:\n",
" label_trimmed = [l for l in label.tolist() if l != tokenizer.pad_token_id and l != -100]\n",
" if label_trimmed:\n",
" decoded_label = tokenizer.decode(label_trimmed, skip_special_tokens=True)\n",
" decoded_labels.append(decoded_label)\n",
" else:\n",
" decoded_labels.append(None) # Append None for invalid/empty labels\n",
"\n",
" # Calculate accuracy\n",
" for output, label in zip(decoded_outputs, decoded_labels):\n",
" if label is not None and output.strip() == label.strip():\n",
" correct_predictions += 1\n",
" if label is not None:\n",
" total_predictions += 1\n",
"\n",
" return correct_predictions / total_predictions if total_predictions > 0 else 0\n",
" \n",
" \n",
"# Read the dataset\n",
"df = pd.read_parquet('../if-statements/dataset/extracted/functions.pq')\n",
"#df = df.head(50)\n",
"\n",
"# Split the dataset into pre-training, fine-tuning, evaluation, and test sets\n",
"pretrain_df, fine_tune_df = train_test_split(df, test_size=0.5, random_state=42)\n",
"eval_df = fine_tune_df.sample(frac=0.1, random_state=42)\n",
"test_df = fine_tune_df.drop(eval_df.index).sample(frac=0.1111, random_state=42)\n",
"fine_tune_df = fine_tune_df.drop(eval_df.index).drop(test_df.index)\n",
"\n",
"assert len(set(eval_df.index).intersection(set(test_df.index))) == 0\n",
"\n",
"\n",
"# Initialize tokenizer and model\n",
"tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')\n",
"model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"print(f'Using device: {device}')\n",
"model.to(device)\n",
" \n",
"# Instantiate the dataset for pre-training\n",
"pretrain_dataset = PythonCodeDataset(tokenizer, pretrain_df)\n",
"\n",
"# Set up the data collator for MLM\n",
"data_collator = DataCollatorForLanguageModeling(\n",
" tokenizer=tokenizer,\n",
" mlm=True,\n",
" mlm_probability=0.15\n",
")\n",
"\n",
"# Create a DataLoader for pre-training\n",
"pretrain_loader = DataLoader(pretrain_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)\n",
"\n",
"# Pre-train the model\n",
"#pretrain(model, pretrain_loader, epochs=1)\n",
"\n",
"\n",
"# Prepare data for fine-tuning and evaluation\n",
"fine_tune_df['masked_code'], fine_tune_df['ground_truth'] = zip(*fine_tune_df['source'].apply(mask_if_condition))\n",
"eval_df['masked_code'], eval_df['ground_truth'] = zip(*eval_df['source'].apply(mask_if_condition))\n",
"fine_tune_df.dropna(subset=['ground_truth'], inplace=True)\n",
"eval_df.dropna(subset=['ground_truth'], inplace=True)\n",
"\n",
"\n",
"fine_tune_dataset = MaskedIfDataset(tokenizer, fine_tune_df)\n",
"eval_dataset = MaskedIfDataset(tokenizer, eval_df)\n",
"\n",
"\n",
"# Dataloaders for fine-tuning and evaluation\n",
"fine_tune_loader = DataLoader(fine_tune_dataset, batch_size=8, shuffle=True)\n",
"eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=False)\n",
"\n",
"\n",
"# Instantiate the datasets for fine-tuning and evaluation\n",
"fine_tune_dataset = MaskedIfDataset(tokenizer, fine_tune_df)\n",
"eval_dataset = MaskedIfDataset(tokenizer, eval_df)\n",
"\n",
"\n",
"best_epoch = 4\n",
"\n",
"# Example of calling the modified function\n",
"save_path = '../if-statements/dataset/extracted/final'\n",
"#best_epoch = fine_tune_with_eval(model, fine_tune_loader, eval_loader, epochs=5, save_path=save_path)\n",
"\n",
"# Define the directory of the best model\n",
"best_model_directory = os.path.join(save_path, str(best_epoch))\n",
"\n",
"# Load the best model and its config\n",
"best_model = T5ForConditionalGeneration.from_pretrained(best_model_directory)\n",
"\n",
"# Optionally, load the model's config\n",
"model_config = best_model.config # This will load the config file associated with the model\n",
"\n",
"best_model.to(device)\n",
"\n",
"# Prepare and evaluate on the test set\n",
"test_df['masked_code'], test_df['ground_truth'] = zip(*test_df['source'].apply(mask_if_condition))\n",
"test_df.dropna(subset=['ground_truth'], inplace=True)\n",
"test_dataset = MaskedIfDataset(tokenizer, test_df)\n",
"test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)\n",
"\n",
"# Evaluate the model on the test set\n",
"test_accuracy = evaluate_accuracy(best_model, test_loader, tokenizer, device)\n",
"print(f\"Test Accuracy: {test_accuracy:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "3eb56941-cd5b-405b-ae37-f15d97a2b22e",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Unnamed: 0 original_method \\\n",
"0 5126 def stream_edit(request, stream_id, response_f... \n",
"1 10859 def _read_and_parse_includes(self):\\n # Map... \n",
"2 10615 def _get_list_key(self, spaces, lines):\\n k... \n",
"3 17853 def search_host(self, search_string):\\n res... \n",
"4 3922 def pop(self, key: Union[str, Enum], default: ... \n",
"\n",
" target_block \n",
"0 if \"cancel\" not in request . POST : \n",
"1 if isinstance ( node , ast . Include ) : \n",
"2 if len ( line . strip ( ) ) == 0 : \n",
"3 if isinstance ( value , int ) : \n",
"4 if self . _get_flag ( \"struct\" ) : \n"
]
}
],
"source": [
"# Load the new dataset\n",
"new_df = pd.read_csv('../if-statements/dataset/extracted/test_set_usi.csv')\n",
"\n",
"new_df.drop(\"input_method\", axis=1, inplace=True)\n",
"new_df.drop(\"tokens_in_method\", axis=1, inplace=True)\n",
"\n",
"print(new_df.head())\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "08a9c76f-32da-4871-b0af-d5afafa50ae0",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Unnamed: 0 original_method \\\n",
"0 5126 def stream_edit(request, stream_id, response_f... \n",
"1 10859 def _read_and_parse_includes(self):\\n # Map... \n",
"2 10615 def _get_list_key(self, spaces, lines):\\n k... \n",
"3 17853 def search_host(self, search_string):\\n res... \n",
"4 3922 def pop(self, key: Union[str, Enum], default: ... \n",
"\n",
" target_block \\\n",
"0 if \"cancel\" not in request . POST : \n",
"1 if isinstance ( node , ast . Include ) : \n",
"2 if len ( line . strip ( ) ) == 0 : \n",
"3 if isinstance ( value , int ) : \n",
"4 if self . _get_flag ( \"struct\" ) : \n",
"\n",
" masked_code \\\n",
"0 def stream_edit(request, stream_id, response_f... \n",
"1 def _read_and_parse_includes(self):\\n # Map... \n",
"2 def _get_list_key(self, spaces, lines):\\n k... \n",
"3 def search_host(self, search_string):\\n res... \n",
"4 def pop(self, key: Union[str, Enum], default: ... \n",
"\n",
" ground_truth \n",
"0 if not request.user.profile.has_permission(str... \n",
"1 if isinstance(node, ast.Include): \n",
"2 if len(line.strip()) == 0: \n",
"3 if host_entry.get(\"type\") != \"entry\": \n",
"4 if self._get_flag(\"readonly\"): \n"
]
}
],
"source": [
"# Function to preprocess the new dataframe\n",
"def preprocess_new_df(df):\n",
" # Apply the masking function\n",
" df['masked_code'], df['ground_truth'] = zip(*df['original_method'].apply(mask_if_condition))\n",
" # Drop rows where ground truth (if statement) is None\n",
" df.dropna(subset=['ground_truth'], inplace=True)\n",
"\n",
"# Preprocess the new dataframe\n",
"preprocess_new_df(new_df)\n",
"\n",
"# Check the first few rows\n",
"print(new_df.head())\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "c36c9144-64b2-46dd-b597-5528ff57b10a",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating: 100%|█████████████████████████████| 624/624 [02:29<00:00, 4.17it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"New Dataset Accuracy: 0.2841\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# Create dataset for the new dataframe\n",
"new_dataset = MaskedIfDataset(tokenizer, new_df)\n",
"\n",
"# Create DataLoader for the new dataset\n",
"new_loader = DataLoader(new_dataset, batch_size=8, shuffle=False)\n",
"\n",
"# Evaluate the model on the new dataset\n",
"new_accuracy = evaluate_accuracy(best_model, new_loader, tokenizer, device)\n",
"print(f\"New Dataset Accuracy: {new_accuracy:.4f}\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

Binary file not shown (image, 45 KiB).


@ -0,0 +1,69 @@
{
"_name_or_path": "Salesforce/codet5-small",
"architectures": [
"T5ForConditionalGeneration"
],
"bos_token_id": 1,
"classifier_dropout": 0.0,
"d_ff": 2048,
"d_kv": 64,
"d_model": 512,
"decoder_start_token_id": 0,
"dense_act_fn": "relu",
"dropout_rate": 0.1,
"eos_token_id": 2,
"feed_forward_proj": "relu",
"gradient_checkpointing": false,
"id2label": {
"0": "LABEL_0"
},
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": false,
"label2id": {
"LABEL_0": 0
},
"layer_norm_epsilon": 1e-06,
"model_type": "t5",
"n_positions": 512,
"num_decoder_layers": 6,
"num_heads": 8,
"num_layers": 6,
"output_past": true,
"pad_token_id": 0,
"relative_attention_max_distance": 128,
"relative_attention_num_buckets": 32,
"task_specific_params": {
"summarization": {
"early_stopping": true,
"length_penalty": 2.0,
"max_length": 200,
"min_length": 30,
"no_repeat_ngram_size": 3,
"num_beams": 4,
"prefix": "summarize: "
},
"translation_en_to_de": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to German: "
},
"translation_en_to_fr": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to French: "
},
"translation_en_to_ro": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to Romanian: "
}
},
"torch_dtype": "float32",
"transformers_version": "4.35.2",
"use_cache": true,
"vocab_size": 32100
}


@ -0,0 +1,8 @@
{
"_from_model_config": true,
"bos_token_id": 1,
"decoder_start_token_id": 0,
"eos_token_id": 2,
"pad_token_id": 0,
"transformers_version": "4.35.2"
}

View file

@ -0,0 +1,69 @@
{
"_name_or_path": "Salesforce/codet5-small",
"architectures": [
"T5ForConditionalGeneration"
],
"bos_token_id": 1,
"classifier_dropout": 0.0,
"d_ff": 2048,
"d_kv": 64,
"d_model": 512,
"decoder_start_token_id": 0,
"dense_act_fn": "relu",
"dropout_rate": 0.1,
"eos_token_id": 2,
"feed_forward_proj": "relu",
"gradient_checkpointing": false,
"id2label": {
"0": "LABEL_0"
},
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": false,
"label2id": {
"LABEL_0": 0
},
"layer_norm_epsilon": 1e-06,
"model_type": "t5",
"n_positions": 512,
"num_decoder_layers": 6,
"num_heads": 8,
"num_layers": 6,
"output_past": true,
"pad_token_id": 0,
"relative_attention_max_distance": 128,
"relative_attention_num_buckets": 32,
"task_specific_params": {
"summarization": {
"early_stopping": true,
"length_penalty": 2.0,
"max_length": 200,
"min_length": 30,
"no_repeat_ngram_size": 3,
"num_beams": 4,
"prefix": "summarize: "
},
"translation_en_to_de": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to German: "
},
"translation_en_to_fr": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to French: "
},
"translation_en_to_ro": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to Romanian: "
}
},
"torch_dtype": "float32",
"transformers_version": "4.35.2",
"use_cache": true,
"vocab_size": 32100
}

View file

@ -0,0 +1,8 @@
{
"_from_model_config": true,
"bos_token_id": 1,
"decoder_start_token_id": 0,
"eos_token_id": 2,
"pad_token_id": 0,
"transformers_version": "4.35.2"
}

View file

@ -0,0 +1,69 @@
{
"_name_or_path": "Salesforce/codet5-small",
"architectures": [
"T5ForConditionalGeneration"
],
"bos_token_id": 1,
"classifier_dropout": 0.0,
"d_ff": 2048,
"d_kv": 64,
"d_model": 512,
"decoder_start_token_id": 0,
"dense_act_fn": "relu",
"dropout_rate": 0.1,
"eos_token_id": 2,
"feed_forward_proj": "relu",
"gradient_checkpointing": false,
"id2label": {
"0": "LABEL_0"
},
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": false,
"label2id": {
"LABEL_0": 0
},
"layer_norm_epsilon": 1e-06,
"model_type": "t5",
"n_positions": 512,
"num_decoder_layers": 6,
"num_heads": 8,
"num_layers": 6,
"output_past": true,
"pad_token_id": 0,
"relative_attention_max_distance": 128,
"relative_attention_num_buckets": 32,
"task_specific_params": {
"summarization": {
"early_stopping": true,
"length_penalty": 2.0,
"max_length": 200,
"min_length": 30,
"no_repeat_ngram_size": 3,
"num_beams": 4,
"prefix": "summarize: "
},
"translation_en_to_de": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to German: "
},
"translation_en_to_fr": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to French: "
},
"translation_en_to_ro": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to Romanian: "
}
},
"torch_dtype": "float32",
"transformers_version": "4.35.2",
"use_cache": true,
"vocab_size": 32100
}

View file

@ -0,0 +1,8 @@
{
"_from_model_config": true,
"bos_token_id": 1,
"decoder_start_token_id": 0,
"eos_token_id": 2,
"pad_token_id": 0,
"transformers_version": "4.35.2"
}

View file

@ -0,0 +1,69 @@
{
"_name_or_path": "Salesforce/codet5-small",
"architectures": [
"T5ForConditionalGeneration"
],
"bos_token_id": 1,
"classifier_dropout": 0.0,
"d_ff": 2048,
"d_kv": 64,
"d_model": 512,
"decoder_start_token_id": 0,
"dense_act_fn": "relu",
"dropout_rate": 0.1,
"eos_token_id": 2,
"feed_forward_proj": "relu",
"gradient_checkpointing": false,
"id2label": {
"0": "LABEL_0"
},
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": false,
"label2id": {
"LABEL_0": 0
},
"layer_norm_epsilon": 1e-06,
"model_type": "t5",
"n_positions": 512,
"num_decoder_layers": 6,
"num_heads": 8,
"num_layers": 6,
"output_past": true,
"pad_token_id": 0,
"relative_attention_max_distance": 128,
"relative_attention_num_buckets": 32,
"task_specific_params": {
"summarization": {
"early_stopping": true,
"length_penalty": 2.0,
"max_length": 200,
"min_length": 30,
"no_repeat_ngram_size": 3,
"num_beams": 4,
"prefix": "summarize: "
},
"translation_en_to_de": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to German: "
},
"translation_en_to_fr": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to French: "
},
"translation_en_to_ro": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to Romanian: "
}
},
"torch_dtype": "float32",
"transformers_version": "4.35.2",
"use_cache": true,
"vocab_size": 32100
}

View file

@ -0,0 +1,8 @@
{
"_from_model_config": true,
"bos_token_id": 1,
"decoder_start_token_id": 0,
"eos_token_id": 2,
"pad_token_id": 0,
"transformers_version": "4.35.2"
}

View file

@ -0,0 +1,69 @@
{
"_name_or_path": "Salesforce/codet5-small",
"architectures": [
"T5ForConditionalGeneration"
],
"bos_token_id": 1,
"classifier_dropout": 0.0,
"d_ff": 2048,
"d_kv": 64,
"d_model": 512,
"decoder_start_token_id": 0,
"dense_act_fn": "relu",
"dropout_rate": 0.1,
"eos_token_id": 2,
"feed_forward_proj": "relu",
"gradient_checkpointing": false,
"id2label": {
"0": "LABEL_0"
},
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": false,
"label2id": {
"LABEL_0": 0
},
"layer_norm_epsilon": 1e-06,
"model_type": "t5",
"n_positions": 512,
"num_decoder_layers": 6,
"num_heads": 8,
"num_layers": 6,
"output_past": true,
"pad_token_id": 0,
"relative_attention_max_distance": 128,
"relative_attention_num_buckets": 32,
"task_specific_params": {
"summarization": {
"early_stopping": true,
"length_penalty": 2.0,
"max_length": 200,
"min_length": 30,
"no_repeat_ngram_size": 3,
"num_beams": 4,
"prefix": "summarize: "
},
"translation_en_to_de": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to German: "
},
"translation_en_to_fr": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to French: "
},
"translation_en_to_ro": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to Romanian: "
}
},
"torch_dtype": "float32",
"transformers_version": "4.35.2",
"use_cache": true,
"vocab_size": 32100
}

View file

@ -0,0 +1,8 @@
{
"_from_model_config": true,
"bos_token_id": 1,
"decoder_start_token_id": 0,
"eos_token_id": 2,
"pad_token_id": 0,
"transformers_version": "4.35.2"
}

View file

@ -0,0 +1,69 @@
{
"_name_or_path": "Salesforce/codet5-small",
"architectures": [
"T5ForConditionalGeneration"
],
"bos_token_id": 1,
"classifier_dropout": 0.0,
"d_ff": 2048,
"d_kv": 64,
"d_model": 512,
"decoder_start_token_id": 0,
"dense_act_fn": "relu",
"dropout_rate": 0.1,
"eos_token_id": 2,
"feed_forward_proj": "relu",
"gradient_checkpointing": false,
"id2label": {
"0": "LABEL_0"
},
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": false,
"label2id": {
"LABEL_0": 0
},
"layer_norm_epsilon": 1e-06,
"model_type": "t5",
"n_positions": 512,
"num_decoder_layers": 6,
"num_heads": 8,
"num_layers": 6,
"output_past": true,
"pad_token_id": 0,
"relative_attention_max_distance": 128,
"relative_attention_num_buckets": 32,
"task_specific_params": {
"summarization": {
"early_stopping": true,
"length_penalty": 2.0,
"max_length": 200,
"min_length": 30,
"no_repeat_ngram_size": 3,
"num_beams": 4,
"prefix": "summarize: "
},
"translation_en_to_de": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to German: "
},
"translation_en_to_fr": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to French: "
},
"translation_en_to_ro": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to Romanian: "
}
},
"torch_dtype": "float32",
"transformers_version": "4.35.2",
"use_cache": true,
"vocab_size": 32100
}

View file

@ -0,0 +1,8 @@
{
"_from_model_config": true,
"bos_token_id": 1,
"decoder_start_token_id": 0,
"eos_token_id": 2,
"pad_token_id": 0,
"transformers_version": "4.35.2"
}

View file

@ -0,0 +1,69 @@
{
"_name_or_path": "Salesforce/codet5-small",
"architectures": [
"T5ForConditionalGeneration"
],
"bos_token_id": 1,
"classifier_dropout": 0.0,
"d_ff": 2048,
"d_kv": 64,
"d_model": 512,
"decoder_start_token_id": 0,
"dense_act_fn": "relu",
"dropout_rate": 0.1,
"eos_token_id": 2,
"feed_forward_proj": "relu",
"gradient_checkpointing": false,
"id2label": {
"0": "LABEL_0"
},
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": false,
"label2id": {
"LABEL_0": 0
},
"layer_norm_epsilon": 1e-06,
"model_type": "t5",
"n_positions": 512,
"num_decoder_layers": 6,
"num_heads": 8,
"num_layers": 6,
"output_past": true,
"pad_token_id": 0,
"relative_attention_max_distance": 128,
"relative_attention_num_buckets": 32,
"task_specific_params": {
"summarization": {
"early_stopping": true,
"length_penalty": 2.0,
"max_length": 200,
"min_length": 30,
"no_repeat_ngram_size": 3,
"num_beams": 4,
"prefix": "summarize: "
},
"translation_en_to_de": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to German: "
},
"translation_en_to_fr": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to French: "
},
"translation_en_to_ro": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to Romanian: "
}
},
"torch_dtype": "float32",
"transformers_version": "4.35.2",
"use_cache": true,
"vocab_size": 32100
}

View file

@ -0,0 +1,8 @@
{
"_from_model_config": true,
"bos_token_id": 1,
"decoder_start_token_id": 0,
"eos_token_id": 2,
"pad_token_id": 0,
"transformers_version": "4.35.2"
}

View file

@ -0,0 +1,69 @@
{
"_name_or_path": "Salesforce/codet5-small",
"architectures": [
"T5ForConditionalGeneration"
],
"bos_token_id": 1,
"classifier_dropout": 0.0,
"d_ff": 2048,
"d_kv": 64,
"d_model": 512,
"decoder_start_token_id": 0,
"dense_act_fn": "relu",
"dropout_rate": 0.1,
"eos_token_id": 2,
"feed_forward_proj": "relu",
"gradient_checkpointing": false,
"id2label": {
"0": "LABEL_0"
},
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": false,
"label2id": {
"LABEL_0": 0
},
"layer_norm_epsilon": 1e-06,
"model_type": "t5",
"n_positions": 512,
"num_decoder_layers": 6,
"num_heads": 8,
"num_layers": 6,
"output_past": true,
"pad_token_id": 0,
"relative_attention_max_distance": 128,
"relative_attention_num_buckets": 32,
"task_specific_params": {
"summarization": {
"early_stopping": true,
"length_penalty": 2.0,
"max_length": 200,
"min_length": 30,
"no_repeat_ngram_size": 3,
"num_beams": 4,
"prefix": "summarize: "
},
"translation_en_to_de": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to German: "
},
"translation_en_to_fr": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to French: "
},
"translation_en_to_ro": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to Romanian: "
}
},
"torch_dtype": "float32",
"transformers_version": "4.35.2",
"use_cache": true,
"vocab_size": 32100
}

View file

@ -0,0 +1,8 @@
{
"_from_model_config": true,
"bos_token_id": 1,
"decoder_start_token_id": 0,
"eos_token_id": 2,
"pad_token_id": 0,
"transformers_version": "4.35.2"
}


1
models/final/best.txt Normal file
View file

@ -0,0 +1 @@
19


21
models/final/stats.csv Normal file
View file

@ -0,0 +1,21 @@
,train_loss,train_acc,val_loss,val_acc
1,1.1066815287971177,0.2564848070235448,0.8621971841733244,0.35313884221780967
2,0.8582622519923347,0.334283488208578,0.8207894280070714,0.36795478845272644
3,0.7284335572143417,0.38036599965794426,0.7892089254894864,0.38903314495188634
4,0.633081175458829,0.4195693898105391,0.7816493976815753,0.4145410111501451
5,0.5550723814587575,0.45366094673431767,0.787525433691057,0.41805407056667176
6,0.489626774831408,0.485301104079965,0.7974283164605881,0.4288987322437758
7,0.43271683281407614,0.5162001406229215,0.8192677173626487,0.4383687184970215
8,0.3834469902450057,0.540695133306729,0.8201495354764291,0.44661677104017106
9,0.34214707001886285,0.5680975999087852,0.8280998212575312,0.45242095616312816
10,0.30546249586785273,0.5925735894950876,0.8484094327830816,0.4547120818695586
11,0.2720891370430394,0.6147502042832982,0.860619057973981,0.46250190927142204
12,0.24163438230665166,0.6392261938696008,0.8756259442051197,0.46234916755766
13,0.2165358039272913,0.6588753966896604,0.8958285228441377,0.46754238582556895
14,0.19597039912527445,0.679246717214906,0.9252617640826756,0.46983351153199937
15,0.17652969488658307,0.6953613438990556,0.9368522311691582,0.468000610966855
16,0.16078871088729313,0.7160937232768941,0.9492269373215708,0.46906980296318923
17,0.14277899259151752,0.7370541398247914,0.9709357907776681,0.4727356040934779
18,0.13330485094920555,0.7507553731258195,0.9966374322321716,0.4719718955246678
19,0.12318261171442361,0.7625372935788534,0.9880664938842191,0.4768596303650527
20,0.11463805472369394,0.7749843224445585,1.0141825987084658,0.4780815640751489

71299
models/final/test_outputs.csv Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,39 @@
Excluding token lengths < 33.0 (25.00%ile): 123184 instances
Excluding token lengths > 129.0 (75.00%ile): 124447 instances
pretrain dataset: 126184 functions loaded
finetune train dataset: 100948 functions loaded
finetune val dataset: 12618 functions loaded
finetune test dataset: 12619 functions loaded
Using device: cuda
Pretraining for 1 epochs
train dataset: 100948 functions found
train dataset: 52623 conditions found
val dataset: 12618 functions found
val dataset: 6547 conditions found
Tuning for 20 epochs
E1/20, Train Loss: 1.1067, Train Accuracy: 0.2565, Val Loss: 0.8622, Val Accuracy: 0.3531
E2/20, Train Loss: 0.8583, Train Accuracy: 0.3343, Val Loss: 0.8208, Val Accuracy: 0.3680
E3/20, Train Loss: 0.7284, Train Accuracy: 0.3804, Val Loss: 0.7892, Val Accuracy: 0.3890
E4/20, Train Loss: 0.6331, Train Accuracy: 0.4196, Val Loss: 0.7816, Val Accuracy: 0.4145
E5/20, Train Loss: 0.5551, Train Accuracy: 0.4537, Val Loss: 0.7875, Val Accuracy: 0.4181
E6/20, Train Loss: 0.4896, Train Accuracy: 0.4853, Val Loss: 0.7974, Val Accuracy: 0.4289
E7/20, Train Loss: 0.4327, Train Accuracy: 0.5162, Val Loss: 0.8193, Val Accuracy: 0.4384
E8/20, Train Loss: 0.3834, Train Accuracy: 0.5407, Val Loss: 0.8201, Val Accuracy: 0.4466
E9/20, Train Loss: 0.3421, Train Accuracy: 0.5681, Val Loss: 0.8281, Val Accuracy: 0.4524
E10/20, Train Loss: 0.3055, Train Accuracy: 0.5926, Val Loss: 0.8484, Val Accuracy: 0.4547
E11/20, Train Loss: 0.2721, Train Accuracy: 0.6148, Val Loss: 0.8606, Val Accuracy: 0.4625
E12/20, Train Loss: 0.2416, Train Accuracy: 0.6392, Val Loss: 0.8756, Val Accuracy: 0.4623
E13/20, Train Loss: 0.2165, Train Accuracy: 0.6589, Val Loss: 0.8958, Val Accuracy: 0.4675
E14/20, Train Loss: 0.1960, Train Accuracy: 0.6792, Val Loss: 0.9253, Val Accuracy: 0.4698
E15/20, Train Loss: 0.1765, Train Accuracy: 0.6954, Val Loss: 0.9369, Val Accuracy: 0.4680
E16/20, Train Loss: 0.1608, Train Accuracy: 0.7161, Val Loss: 0.9492, Val Accuracy: 0.4691
E17/20, Train Loss: 0.1428, Train Accuracy: 0.7371, Val Loss: 0.9709, Val Accuracy: 0.4727
E18/20, Train Loss: 0.1333, Train Accuracy: 0.7508, Val Loss: 0.9966, Val Accuracy: 0.4720
E19/20, Train Loss: 0.1232, Train Accuracy: 0.7625, Val Loss: 0.9881, Val Accuracy: 0.4769
E20/20, Train Loss: 0.1146, Train Accuracy: 0.7750, Val Loss: 1.0142, Val Accuracy: 0.4781
test dataset: 12619 functions found
test dataset: 6637 conditions found
test_usi dataset: 5000 functions found
test_usi dataset: 5000 conditions found
Test Accuracy: 48.26%
USI Test Accuracy: 19.54%

Binary file not shown.


16
plot_acc.py Normal file
View file

@ -0,0 +1,16 @@
import os.path
import pandas as pd
from train.finetune import plot_loss_acc
ROOT = os.path.dirname(__file__)
def main():
df = pd.read_csv(os.path.join(ROOT, 'models', 'final', 'stats.csv'))
plot_loss_acc(df['train_loss'].tolist(), df['val_loss'].tolist(), df['train_acc'].tolist(), df['val_acc'].tolist(),
os.path.join(ROOT, 'models', 'final'))
if __name__ == "__main__":
main()

73
report/main.tex Normal file
View file

@ -0,0 +1,73 @@
\documentclass{scrartcl}
\setlength\paperwidth{20.999cm}
\setlength\paperheight{29.699cm}
\setlength\voffset{-1in}
\setlength\hoffset{-1in}
\setlength\topmargin{1.499cm}
\setlength\headheight{12pt}
\setlength\headsep{.7cm}
\setlength\footskip{1.131cm}
\setlength\textheight{25cm}
\setlength\oddsidemargin{2.499cm}
\setlength\textwidth{15.999cm}
\setlength\parindent{0cm}
\setlength\parskip{0.3em}
\usepackage{amsmath}
\usepackage{listings}
\usepackage{xcolor}
\usepackage{fancyvrb}
\usepackage{newverbs}
\usepackage{fancyhdr}
\usepackage{extramarks}
\usepackage{graphicx}
\usepackage{mathtools}
\usepackage{multicol}
\usepackage{hyperref}
\usepackage{booktabs}
\usepackage{float}
\usepackage{subcaption}
\pagestyle{fancy}
\lhead{Aksakal, Maggioni, Riggio - If-Conditions}
\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
\DeclarePairedDelimiter\floor{\lfloor}{\rfloor}
\newcommand\vartextvisiblespace[1][.6em]{%
\makebox[#1]{%
\kern.07em
\vrule height.4ex
\hrulefill
\vrule height.4ex
\kern.07em
}%
}
\begin{document}
\thispagestyle{plain}
\begin{center}
\hrule
\vspace{.4cm}
{\textbf {\Huge If-Conditions}} \\
\vspace{.2cm}
{\textbf{Software Analytics}}
\vspace{.2cm}
\end{center}
{\textbf {Baris Aksakal} } (baris.aksakal@usi.ch) \hspace{\fill} \\
{\textbf {Claudio Maggioni} } (claudio.maggioni@usi.ch) \hspace{\fill} \\
{\textbf {Edoardo Riggio} } (edoardo.riggio@usi.ch) \hspace{\fill} \today \\
\hrule
\vspace{.2cm}
\input{sections/introduction}
\input{sections/scraping}
\input{sections/datasets}
\input{sections/baris}
\input{sections/pretraining-finetuning}
\input{sections/model}
\input{sections/evaluation}
\end{document}

32
report/sections/baris.tex Normal file
View file

@ -0,0 +1,32 @@
\section*{Model Architecture Design (Baris)}
Our model consists of three main segments. The first is the already pre-trained
transformer CodeT5 (base-sized model) from Hugging Face that also came with its
own tokenizer. In essence, the model is a unified framework that supports both
code understanding and generation tasks and allows for multi-task learning. We
quickly observed that the model behaves as expected, in the sense that it
continues to generate code from the last line of the prompt. Even so, the base
model requires further modifications to be applicable to an MLM task.
Therefore, our first step was to teach CodeT5 the MLM task with
considerably less data than what it was initially trained on. As we were
already using a pre-trained model, we can consider this step almost a
"pre-fine-tuning": we are effectively fine-tuning an already pre-trained
model as our own pre-training stage. At this point in the training, our dataset
consisted of all sorts of Python functions where 15\% of the tokens in a
function were randomly masked for the model to predict. After our
"pre-fine-tuning" step was complete, we observed the model's new ability to
adapt and complete MLM tasks. We then proceeded with the actual fine-tuning. We
masked if conditions as instructed, setting aside 10\% of the fine-tuning
instances as an evaluation set and 10\% as a test set. For
this, we implemented a straightforward scheme where we randomly sampled
functions with if conditions and randomly selected and masked a single if
condition per function. Therefore, a function was used only once to train the
model no matter how many "if conditions" it contains. We opted for this scheme
as we already had more than enough samples for our computational resources and
had no extra necessity for getting multiple samples out of a single function.
This last step of fine-tuning created our final model which, with some success,
is able to automatically recommend appropriate conditions for if statements in
Python functions. Lastly, we have tested our final CodeT5 (already pre-trained)
model after our own pre-training and fine-tuning steps, on the small test set
on which we can compare the performance of the models trained by the two
groups.

View file

@ -0,0 +1,34 @@
\section*{Datasets}
After scraping all the data, we split the dataset into several different subsets.
Firstly, we divided it into pretrain and finetune datasets, and then into training, test, and validation sets.
The number of functions in the pretrain and finetune datasets is summarized in Table~\ref{tab:table-pre}, while the functions and conditions found in the remaining datasets are summarized in Table~\ref{tab:table}. \\ \\
Before performing either the pretraining or the finetuning, we transformed the functions into arrays of tokens by using the \verb|Salesforce/codet5-small| tokenizer.
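For illustration, the following minimal sketch (simplified with respect to our actual pipeline, and using an invented toy function) shows how a single function is turned into token ids with this tokenizer.
\begin{lstlisting}[language=Python]
# Minimal sketch: tokenize one Python function with the CodeT5 tokenizer.
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")
source = "def add(a, b):\n    return a + b\n"

encoded = tokenizer(source, max_length=512, padding="max_length",
                    truncation=True, return_tensors="pt")
print(encoded["input_ids"].shape)  # torch.Size([1, 512])
\end{lstlisting}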
\begin{table}[h]
\centering
\begin{tabular}{| l | c |}
\hline
Dataset & \# of Functions \\
\hline \hline
Pretrain & 126184 \\ \hline
Finetune Train & 100948 \\ \hline
Finetune Validation & 12618 \\ \hline
Finetune Test & 12619 \\
\hline
\end{tabular}
\caption{Number of functions for each dataset}
\label{tab:table-pre}
\end{table}
\begin{table}[h]
\centering
\begin{tabular}{| l | c | c |}
\hline
Dataset & \# of Functions & \# of Conditions \\
\hline \hline
Training & 100948 & 21269 \\ \hline
Validation & 12618 & 6547 \\ \hline
Test & 12619 & 6637 \\
\hline
\end{tabular}
\caption{Number of functions and conditions for each dataset}
\label{tab:table}
\end{table}

View file

@ -0,0 +1,7 @@
\section*{Model Evaluation on the Test Sets}
The model was evaluated on two test sets: one built from the data collected in our scraping effort and one provided externally with the assignment document.\\ \\
Our test dataset contains 12619 functions and a total of 6637 conditions on which to perform the evaluation.
The model accuracy on this test set is 48.26\%, which is very close to the final validation set accuracy (47.69\%) thus indicating that the model is likely not overfitted. \\ \\
The test set provided externally has 5000 conditions in it.
The model accuracy on this dataset was 19.54\%, which is significantly lower than the value we have for our test set.
We suspect this is due to our dataset filtering efforts based on token length, and that this test set covers cases with token lengths we explicitly excluded.
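The reported accuracy is a plain exact-match rate between the generated condition and the ground truth after normalization; the sketch below illustrates the computation (normalization is simplified here to whitespace stripping, whereas \verb|train/evaluate.py| uses a dedicated normalization step).
\begin{lstlisting}[language=Python]
# Sketch: exact-match accuracy over predicted vs. expected conditions.
def accuracy(predicted: list[str], expected: list[str]) -> float:
    correct = sum(1 for p, e in zip(predicted, expected)
                  if p.strip() == e.strip())
    return correct / len(expected) if expected else 0.0

print(accuracy(["x > 0", "a and b"], ["x > 0", "a or b"]))  # 0.5
\end{lstlisting}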

View file

@ -0,0 +1,3 @@
\section*{Introduction}
The goal of this assignment was to train a model that recommends the appropriate condition given an if-statement in Python.
This assignment was divided into several steps, which included the scraping of Python files, the pre-training and fine-tuning of the model, and finally the training of the final model used to make recommendations.

50
report/sections/model.tex Normal file
View file

@ -0,0 +1,50 @@
\section*{Model Architecture}
Our implementation uses the \texttt{T5ForConditionalGeneration} model from the
HuggingFace \textit{transformers} library.
The model architecture combines the
standard T5 model architecture with a language modeling output layer head to
allow performing generative tasks.
We used the \textsc{CodeT5} pretrained
instance of the model from
Salesforce\footnote{\url{https://github.com/salesforce/CodeT5}} to perform
pretraining on Python-specific code and to further fine-tune the model to
generate conditions for if statements. \\ \\
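As a minimal sketch (the toy input below is invented, and the untuned checkpoint will not yet produce meaningful conditions), the pretrained checkpoint can be loaded and queried as follows.
\begin{lstlisting}[language=Python]
# Sketch: load the pretrained CodeT5 checkpoint and run generation once.
import torch
from transformers import RobertaTokenizer, T5ForConditionalGeneration

tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-small")

masked = "def is_adult(age):\n    if <mask>:\n        return True\n    return False"
inputs = tokenizer(masked, return_tensors="pt")
with torch.no_grad():
    out = model.generate(inputs.input_ids,
                         attention_mask=inputs.attention_mask, max_length=32)
print(tokenizer.decode(out[0], skip_special_tokens=True))
\end{lstlisting}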
The pretrain phase runs for one epoch and uses instances in the pretrain set to
train the model to recognize the structure of Python code.
This is achieved by
tokenizing and then masking 15\% of the tokens within functions by random
sampling.
The output labels that the model should learn to predict are then
these masked tokens. \\ \\
The fine tune phase runs for at most 20 epochs and uses the fine-tune train
dataset to train the model to predict if conditions.
Each function in the
training set is analyzed with the Python \texttt{ast} module to search for
conditions, and for each condition found a training instance is created where
that specific condition is masked.
This means that one function may be considered as multiple differently masked instances, or not considered at all if
it does not contain if conditions. \\ \\
We implemented an early stopping procedure to avoid overfitting the model during
training.
The procedure analyzes model accuracy on the validation set and has
a patience of 3 epochs. \\ \\
Both the pretrain and fine tune training loops are custom, and use the
\textsc{AdamW} optimizer with a learning rate of $5 \cdot 10^{-5}$. \\ \\
Figure~\ref{fig:metrics} shows the loss and accuracy metrics for the fine tune
training and validation set.
It is noteworthy that even though validation
accuracy keeps increasing, validation loss also increases after the first few
epochs.
According to our early stopping policy this does not count as overfitting, but
it might indicate that a different early stopping criterion would have chosen
an earlier epoch.
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{../models/final/training_metrics}
\caption{Training and Validation Metrics Over Epochs. The plot illustrates the
trends in training and validation loss as well as training and validation
accuracy across different epochs during the training
process.}\label{fig:metrics}
\end{figure}

View file

@ -0,0 +1,18 @@
\section*{Pre-Training}
After the tokenization of the functions, we removed the outliers.
These are the functions whose total number of tokens falls below the 25\textsuperscript{th} percentile (fewer than 33 tokens) or above the 75\textsuperscript{th} percentile (more than 129 tokens).
In total, 247.631 functions were removed (123184 below the 25\textsuperscript{th} percentile and 124447 above the 75\textsuperscript{th} percentile). \\ \\
After this step was performed, we proceeded with the pretraining of the model.
In the pretraining step, we first had to mask 15\% of all the tokens.
This was done by taking the vectors of tokens and substituting 15\% of them with the token that represented the \verb|<mask>| special tag. \\ \\
Finally, this dataset was used to pretrain our model.
This part of the training helped the model to better understand the structure of the Python programming language.
Since the model was already pretrained on code, it already had a general understanding of the structure of some programming languages.
But, thanks to our pretraining, it is now more specialized in recognizing the structure of Python code.
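The snippet below is a minimal sketch of the masking step described above (the toy function is invented; in the real pipeline the masking is delegated to HuggingFace's \verb|DataCollatorForLanguageModeling| with a 15\% masking probability).
\begin{lstlisting}[language=Python]
# Sketch: random token masking for the MLM pretraining objective.
from transformers import RobertaTokenizer, DataCollatorForLanguageModeling

tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True,
                                           mlm_probability=0.15)

encoded = tokenizer("def square(x):\n    return x * x\n", truncation=True)
batch = collator([encoded])
# batch["input_ids"] now contains mask tokens at random positions, while
# batch["labels"] holds the original ids there and -100 everywhere else.
print(batch["input_ids"][0])
print(batch["labels"][0])
\end{lstlisting}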
\section*{Fine-tuning}
In this part of the training of the model, we had to teach the model to perform a specific task, in our case recommending suitable conditions for Python if-statements. \\ \\
To do so, we masked some of the if-conditions of the functions in our finetune dataset, and performed a training operation on this new dataset.
The masking was performed by taking the function and converting it into an AST\@.
After doing that, we iterated over the AST nodes and identified those that were if statements.
When an if-statement was found, its condition was replaced by the special \verb|<mask>| token.
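The following sketch shows the idea on a single invented function, simplified with respect to \verb|train/mask.py|: parse the source, find an if-statement, record its condition as the ground truth, and unparse the function with the condition replaced by the mask token.
\begin{lstlisting}[language=Python]
# Sketch: mask the condition of the first if-statement in a function.
import ast

SOURCE = "def f(x):\n    if x > 0:\n        return x\n    return -x\n"

tree = ast.parse(SOURCE)
for node in ast.walk(tree):
    if isinstance(node, ast.If):
        ground_truth = ast.unparse(node.test)            # "x > 0"
        node.test = ast.Name(id="<mask>", ctx=ast.Load())
        break

masked_code = ast.unparse(tree)
print(ground_truth)  # label the model has to predict
print(masked_code)   # input given to the model
\end{lstlisting}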

View file

@ -0,0 +1,10 @@
\section*{Scraping}
To scrape the Python files, we used the SEART-GHS crawler.
The tool exported a CSV file containing a list of all the repositories that matched our constraints, namely \verb|lines of code: >=| \verb|10000| and \verb|language: Python|.
This CSV file was then used to download the main branch of each repository and save it in a ZIP archive. \\ \\
From this ZIP archive, we wanted to extract functions from the Python files.
To do this, we used the Python AST library to extract functions while discarding comments (docstrings were kept). \\ \\
As the extracted dataset was extremely large, the extractor script was terminated early.
When the script terminated, it had generated 70 million functions.
Due to limited computing power for model training, we decided to cut down the number of functions to 500.000 to build the training set.
After extracting the functions, we saved them in a Parquet file in the \verb|dataset/extracted| directory.
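As a minimal sketch of the extraction idea (the file name below is illustrative, and the real extractor also walks the downloaded archives, handles decoding errors, and writes Parquet output), the AST-based function extraction can be expressed as follows.
\begin{lstlisting}[language=Python]
# Sketch: extract function sources from a Python file with the ast module.
# ast.unparse drops comments, while docstrings survive as string expressions.
import ast

def extract_functions(source: str) -> list[str]:
    try:
        tree = ast.parse(source)
    except SyntaxError:
        return []
    return [ast.unparse(node) for node in ast.walk(tree)
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))]

with open("example.py", encoding="utf-8") as fh:
    for function_source in extract_functions(fh.read()):
        print(function_source, end="\n\n")
\end{lstlisting}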

10
requirements.txt Normal file
View file

@ -0,0 +1,10 @@
pandas==2.1.3
fastparquet==2023.10.1
tqdm==4.66.1
transformers==4.35.2
torch==2.1.1
matplotlib==3.8.2
deap~=1.4.1
frozendict~=2.3.8
nltk~=3.8.1
swifter~=1.4.0

94
test/test_dataset.py Normal file
View file

@ -0,0 +1,94 @@
import pandas as pd
import pytest
import swifter
import torch
from torch.utils.data import DataLoader
from train.dataset import (TOKENIZER, MAX_TOKEN_LENGTH, PythonCodeDataset, MaskedIfDataset, decode_tokenized,
PRETRAIN_MLM_PROB, BATCH_SIZE, build_pretrain_dataloader, build_fine_tune_dataloader)
@pytest.fixture
def mock_pretrain_data():
data = {'source': ['if a > 2: pass', 'if b <= 4: pass'],
'other_column': [1, 2]}
return pd.DataFrame(data)
@pytest.fixture
def mock_fine_tune_dataloader():
data = {'source': ['if a > 2: pass', 'if b <= 4: pass'],
'other_column': [1, 2]}
return pd.DataFrame(data)
@pytest.fixture
def mock_fine_tune_data():
data = {'masked_code': ['if a > 2: pass', 'if b <= 4: pass'],
'ground_truth': ['if a > 2: pass', 'if b <= 4: pass']}
return pd.DataFrame(data)
@pytest.fixture
def mock_tokenized_output():
return [1234, 5678]
def test_decode_tokenized(mock_tokenized_output):
decoded_output = decode_tokenized(mock_tokenized_output)
expected_output = " msg comments"
assert decoded_output == expected_output
mock_tokenized_output_with_padding = [-100]
decoded_output_with_padding = decode_tokenized(mock_tokenized_output_with_padding)
expected_output_with_padding = None
assert decoded_output_with_padding == expected_output_with_padding
def test_build_pretrain_dataloader(mock_pretrain_data):
dataloader = build_pretrain_dataloader(mock_pretrain_data)
assert isinstance(dataloader, DataLoader)
assert dataloader.batch_size == BATCH_SIZE
assert isinstance(dataloader.dataset, PythonCodeDataset)
assert dataloader.dataset.tokenizer == TOKENIZER
assert dataloader.dataset.data.equals(mock_pretrain_data)
assert dataloader.collate_fn.tokenizer == TOKENIZER
assert dataloader.collate_fn.mlm_probability == PRETRAIN_MLM_PROB
assert dataloader.collate_fn.mlm == True
def test_build_fine_tune_dataloader(mock_fine_tune_dataloader):
train_dataloader = build_fine_tune_dataloader(mock_fine_tune_dataloader, 'train')
assert isinstance(train_dataloader, DataLoader)
assert train_dataloader.batch_size == BATCH_SIZE
assert isinstance(train_dataloader.dataset, PythonCodeDataset)
assert train_dataloader.dataset.tokenizer == TOKENIZER
def test_python_code_dataset(mock_pretrain_data):
dataset = PythonCodeDataset(TOKENIZER, mock_pretrain_data, MAX_TOKEN_LENGTH)
sample = dataset[0]
assert len(dataset) == len(mock_pretrain_data)
assert 'input_ids' in sample
assert 'attention_mask' in sample
assert sample['input_ids'].shape == torch.Size([MAX_TOKEN_LENGTH])
assert sample['attention_mask'].shape == torch.Size([MAX_TOKEN_LENGTH])
def test_masked_if_dataset(mock_fine_tune_data):
dataset = MaskedIfDataset(TOKENIZER, mock_fine_tune_data, MAX_TOKEN_LENGTH)
sample = dataset[0]
assert len(dataset) == len(mock_fine_tune_data)
assert 'input_ids' in sample
assert 'attention_mask' in sample
assert 'labels' in sample
assert sample['input_ids'].shape == torch.Size([MAX_TOKEN_LENGTH])
assert sample['attention_mask'].shape == torch.Size([MAX_TOKEN_LENGTH])
assert sample['labels'].shape == torch.Size([MAX_TOKEN_LENGTH])

34
test/test_evaluate.py Normal file
View file

@ -0,0 +1,34 @@
from train.dataset import TOKENIZER
from train.evaluate import compute_accuracy
def test_compute_accuracy():
batch = {'labels': [TOKENIZER.encode("label 1"), TOKENIZER.encode("label 2")], 'input_ids': [[1,2],[3,4]]}
outputs = [TOKENIZER.encode("label 1"), TOKENIZER.encode("label 2")]
result = compute_accuracy(outputs, batch)
correct_predictions, total_predictions, _ = result
print(result)
assert isinstance(result, tuple)
assert isinstance(correct_predictions, int)
assert isinstance(total_predictions, int)
assert correct_predictions == 2
assert total_predictions == 2
def test_compute_accuracy_none():
batch = {'labels': [[-100], TOKENIZER.encode("label 2")], 'input_ids': [[5,6], [7,8]]}
outputs = [TOKENIZER.encode("label 1"), TOKENIZER.encode("label 2")]
result = compute_accuracy(outputs, batch)
correct_predictions, total_predictions, _ = result
print(result)
assert isinstance(result, tuple)
assert isinstance(correct_predictions, int)
assert isinstance(total_predictions, int)
assert correct_predictions == 1
assert total_predictions == 1

8
test/test_pretrain.py Normal file
View file

@ -0,0 +1,8 @@
from train.pretrain import label
def test_label():
label_test = label(20, 0.01)
assert label_test != ''
assert label_test == 'Epoch=20 Loss=0.01'

31
test/test_train_load.py Normal file
View file

@ -0,0 +1,31 @@
import os
import pandas as pd
from train.load import DataSet, filter_outliers
IN_PATH: str = os.path.join(os.path.dirname(__file__), '..', 'dataset', 'extracted', 'functions.pq')
IN_PATH_USI: str = os.path.join(os.path.dirname(__file__), '..', 'dataset', 'extracted', 'test_set_usi.csv')
RANDOM_STATE: int = 42
def test_dataset_load():
ds = DataSet.load(IN_PATH, IN_PATH_USI, RANDOM_STATE)
assert len(set(ds.fine_tune_val_df.index).intersection(set(ds.fine_tune_test_df.index))) == 0
def test_filter_outliers():
df = pd.DataFrame({'source': ['abc def', 'ab cd ef', 'a b c d e f g h i j k l']})
filtered_df = filter_outliers(df)
assert 'token_length' in filtered_df.columns
assert len(filtered_df) < len(df)
assert len(filtered_df) > 0
def test_filter_outliers_non_utf_8():
df = pd.DataFrame({'source': [b'\xff']})
filtered_df = filter_outliers(df)
assert 'token_length' in filtered_df.columns
assert filtered_df.iloc[0]['token_length'] == 0

122
test/test_train_mask.py Normal file
View file

@ -0,0 +1,122 @@
import pytest
import swifter
import pandas as pd
from train.mask import FineTrainInstance, strip_parentheses, mask_conditions
@pytest.fixture
def sample_function():
return """
def stream_edit(request, stream_id, response_format="html"):
"Stream edit page"
user = request.user.profile
stream = get_object_or_404(MessageStream, pk=stream_id)
if not request.user.profile.has_permission(stream, mode="w"):
return user_denied(
request,
message="You don't have access to this Stream",
response_format=response_format,
)
if request.POST:
if "cancel" not in request.POST:
form = MessageStreamForm(user, request.POST, instance=stream)
if form.is_valid():
stream = form.save()
return HttpResponseRedirect(
reverse("messaging_stream_view", args=[stream.id])
)
else:
return HttpResponseRedirect(
reverse("messaging_stream_view", args=[stream.id])
)
else:
form = MessageStreamForm(user, instance=stream)
context = _get_default_context(request)
context.update({"form": form, "stream": stream})
return render_to_response(
"messaging/stream_edit",
context,
context_instance=RequestContext(request),
response_format=response_format,
)
"""
@pytest.fixture
def sample_function_with_error():
return """
def ciao_mamma():
if 1 > 2:
print("ciao")
else if 1 < 2:
print("ok")
else:
return
"""
@pytest.fixture
def sample_dataframe():
data = {'source': ['if x > 0: pass', 'if (a and b) or c: pass']}
df = pd.DataFrame(data)
return df
@pytest.fixture
def sample_dataframe_usi():
data = {'input_method': ['<fill-in> pass', '<fill-in> pass'], 'target_block': ['if x > 0 :', 'if (a and b) or c :']}
df = pd.DataFrame(data)
return df
def test_mask_does_not_crash(sample_function):
instances = FineTrainInstance.from_function(sample_function)
assert len(instances) == 4
def test_mask_with_syntax_error(sample_function_with_error):
instances = FineTrainInstance.from_function(sample_function_with_error)
assert instances == []
def test_strip_parentheses_balanced():
balanced = '("ok")'
stripped = strip_parentheses(balanced)
assert "(" not in stripped and ")" not in stripped
assert stripped == '"ok"'
def test_strip_parentheses_unbalanced():
balanced = '("ok"))'
stripped = strip_parentheses(balanced)
assert balanced == stripped
def test_mask_conditions(sample_dataframe):
result_df = mask_conditions(sample_dataframe, kind='test')
assert len(result_df) == 2
assert 'masked_code' in result_df.columns
assert 'ground_truth' in result_df.columns
assert '<mask>' in result_df['masked_code'].iloc[0]
assert '<mask>' in result_df['masked_code'].iloc[1]
assert result_df['ground_truth'].iloc[0] == 'x > 0'
assert result_df['ground_truth'].iloc[1] == 'a and b or c'
def test_mask_conditions_usi(sample_dataframe_usi):
result_df = mask_conditions(sample_dataframe_usi, kind='test_usi')
print(result_df)
assert len(result_df) == 2
assert 'masked_code' in result_df.columns
assert 'ground_truth' in result_df.columns
assert '<mask>' in result_df['masked_code'].iloc[0]
assert '<mask>' in result_df['masked_code'].iloc[1]
assert result_df['ground_truth'].iloc[0] == 'x > 0'
assert result_df['ground_truth'].iloc[1] == 'a and b or c'

78
train/dataset.py Normal file
View file

@ -0,0 +1,78 @@
import ast
from typing import Literal, Optional
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, PreTrainedTokenizer, DataCollatorForLanguageModeling
from train.mask import mask_conditions
TOKENIZER: PreTrainedTokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')
BATCH_SIZE: int = 8
MAX_TOKEN_LENGTH: int = 512
# Probability of masking a token during pretraining
PRETRAIN_MLM_PROB: float = 0.15
class PythonCodeDataset(Dataset):
"""Dataset class for pre-training"""
def __init__(self, tokenizer, dataframe, max_len):
self.tokenizer = tokenizer
self.data = dataframe
self.max_len = max_len
def __len__(self):
return len(self.data)
def __getitem__(self, index):
code = self.data.iloc[index]['source']
inputs = self.tokenizer.encode_plus(code, None, add_special_tokens=True, max_length=self.max_len,
padding='max_length', truncation=True)
return {'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long)}
def build_pretrain_dataloader(pretrain_df: pd.DataFrame) -> DataLoader:
pretrain_dataset = PythonCodeDataset(TOKENIZER, pretrain_df, MAX_TOKEN_LENGTH)
data_collator = DataCollatorForLanguageModeling(tokenizer=TOKENIZER, mlm=True, mlm_probability=PRETRAIN_MLM_PROB)
return DataLoader(pretrain_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=data_collator)
class MaskedIfDataset(PythonCodeDataset):
"""Fine-tuning and evaluation dataset classes"""
def __getitem__(self, index):
masked_code = self.data.iloc[index]['masked_code']
ground_truth = self.data.iloc[index]['ground_truth']
inputs = self.tokenizer(masked_code, max_length=self.max_len, padding='max_length', truncation=True,
return_tensors="pt")
labels = self.tokenizer(ground_truth, max_length=self.max_len, padding='max_length', truncation=True,
return_tensors="pt").input_ids
labels[labels == self.tokenizer.pad_token_id] = -100
return {'input_ids': inputs.input_ids.squeeze(),
'attention_mask': inputs.attention_mask.squeeze(),
'labels': labels.squeeze()}
Kind = Literal['train'] | Literal['val'] | Literal['test'] | Literal['test_usi']
def build_fine_tune_dataloader(df: pd.DataFrame, kind: Kind) -> DataLoader:
print(f"{kind} dataset: {len(df.index)} functions found")
df = mask_conditions(df, kind)
print(f"{kind} dataset: {len(df.index)} conditions found")
dataset = MaskedIfDataset(TOKENIZER, df, MAX_TOKEN_LENGTH)
return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=kind == 'train')
def decode_tokenized(output) -> Optional[str]:
label_trimmed = [token for token in output if token != TOKENIZER.pad_token_id and token != -100]
if label_trimmed:
return TOKENIZER.decode(label_trimmed, skip_special_tokens=True)
else:
return None

92
train/evaluate.py Normal file
View file

@ -0,0 +1,92 @@
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from train.dataset import decode_tokenized, TOKENIZER
from train.mask import normalize_condition
def compute_accuracy(outputs, batch, track_predictions=False, confidences=None) -> tuple[int, int, list[dict]]:
correct_predictions, total_predictions = 0, 0
decoded_outputs = [TOKENIZER.decode(output, skip_special_tokens=True) for output in outputs]
if track_predictions:
confidences = confidences.tolist()
else:
confidences = [None] * len(decoded_outputs)
tracking = []
# Decode labels with added check for None values
decoded_labels = []
for label in batch['labels']:
decoded_labels.append(decode_tokenized(label))
# Calculate accuracy
for output, label, confidence, in_ids in zip(decoded_outputs, decoded_labels, confidences, batch['input_ids']):
is_correct = None
if label is not None:
total_predictions += 1
is_correct = False
output = normalize_condition(output)
label = normalize_condition(label)
if output == label:
correct_predictions += 1
is_correct = True
if track_predictions:
tracking.append({
'input': TOKENIZER.decode(in_ids) \
.replace("<pad>", "") \
.replace("<s>", "") \
.replace("</s>", ""),
'is_correct': is_correct,
'expected_cond': label,
'predicted_cond': output,
'score': confidence
})
return correct_predictions, total_predictions, tracking
def evaluate_accuracy(model, dataloader: DataLoader, device, track_predictions=False) -> tuple[
float, float, list[dict]]:
"""Returns the accuracy and loss on the given validation set"""
model.eval()
total_loss = 0
correct_predictions, total_predictions = 0, 0
tracking = []
for batch in tqdm(dataloader, desc="Evaluating"):
batch = {k: v.to(device) for k, v in batch.items()}
with torch.no_grad():
inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask'],
'labels': batch['labels']}
outputs = model(**inputs)
generation = model.generate(batch['input_ids'], attention_mask=batch['attention_mask'], max_length=512)
# Compute prediction score as inverse entropy of the T5 model logits. If their softmax has values close to 0 or
# 1, confidence is high
loss, logits = outputs[:2]
logits_flattened = outputs.logits.flatten(1, 2)
probabilities = torch.nn.functional.softmax(logits_flattened, dim=1)
entropy = -torch.sum(probabilities * torch.log(probabilities), dim=1)
confidence = 1.0 - entropy / torch.log(torch.tensor(probabilities.size(dim=1)))
total_loss += loss.item()
c_pred, total_pred, t = compute_accuracy(generation, batch, track_predictions, confidence)
correct_predictions += c_pred
total_predictions += total_pred
tracking.extend(t)
return correct_predictions / total_predictions if total_predictions > 0 else 0, total_loss / len(
dataloader), tracking

142
train/finetune.py Normal file
View file

@ -0,0 +1,142 @@
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from train.evaluate import evaluate_accuracy, compute_accuracy
ADAM_LR: float = 5e-5
EARLY_STOPPING_PATIENCE: int = 3
PLOT_FILENAME: str = "training_metrics.png"
def fine_tune_with_eval(model, device, train_loader: DataLoader, val_loader: DataLoader, epochs: int,
save_dir: str) -> int:
optimizer = torch.optim.AdamW(model.parameters(), lr=ADAM_LR)
best_epoch = 0
best_eval_accuracy = 0
patience_counter = 0
train_accuracies, train_losses, val_accuracies, val_losses = [], [], [], []
print(f"Tuning for {epochs} epochs")
max_epoch = 0
for epoch in range(epochs):
max_epoch = epoch
model.train()
total_loss = 0
correct_predictions, total_predictions = 0, 0
train_accuracy = 0
train_dl = tqdm(train_loader, desc=f"Train E{epoch + 1}/{epochs}")
i = 0
# Training loop with tqdm for progress tracking
for batch in train_dl:
i += 1
batch = {k: v.to(device) for k, v in batch.items()}
optimizer.zero_grad()
inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask'],
'labels': batch['labels']}
outputs = model(**inputs)
generation = model.generate(batch['input_ids'], attention_mask=batch['attention_mask'], max_length=512)
loss = outputs.loss
total_loss += loss.item()
loss.backward()
optimizer.step()
# Update train accuracy
c_pred, total_pred, _ = compute_accuracy(generation, batch)
correct_predictions += c_pred
total_predictions += total_pred
train_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
train_dl.set_description(f"Train E{epoch + 1}/{epochs} loss={total_loss / i:.6f} acc={train_accuracy:.4f}")
average_loss = total_loss / len(train_loader)
train_losses.append(average_loss)
train_accuracies.append(train_accuracy)
# Evaluate on the validation set
val_accuracy, val_loss, _ = evaluate_accuracy(model, val_loader, device)
val_accuracies.append(val_accuracy)
val_losses.append(val_loss)
print(f"E{epoch + 1}/{epochs}, Train Loss: {average_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, "
f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
# Early stopping and checkpointing
if val_accuracy > best_eval_accuracy:
best_eval_accuracy = val_accuracy
best_epoch = epoch
patience_counter = 0
else:
patience_counter += 1
if patience_counter >= EARLY_STOPPING_PATIENCE:
print("Early stopping triggered.")
break
save_directory = os.path.join(save_dir, str(epoch))
model.save_pretrained(save_directory)
df = pd.DataFrame({'train_loss': train_losses,
'train_acc': train_accuracies,
'val_loss': val_losses,
'val_acc': val_accuracies},
index=list(range(1, max_epoch + 2)))
df.to_csv(os.path.join(save_dir, "stats.csv"))
return best_epoch
def plot_loss_acc(train_losses: list[float], val_losses: list[float], train_accuracies: list[float],
val_accuracies: list[float], save_path: str):
plt.figure(figsize=(12, 10))
plt.subplot(2, 2, 1)
plt.plot(train_losses, label='Training Loss')
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.xticks(np.arange(0, 20, step=1))
plt.yticks(np.arange(0, 1.4, step=0.2))
plt.legend()
plt.subplot(2, 2, 2)
plt.plot(val_losses, label='Validation Loss', color='orange')
plt.title('Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.xticks(np.arange(0, 20, step=1))
plt.yticks(np.arange(0, 1.4, step=0.2))
plt.legend()
plt.subplot(2, 2, 3)
plt.plot(train_accuracies, label='Training Accuracy', color='green')
plt.title('Training Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.xticks(np.arange(0, 20, step=1))
plt.yticks(np.arange(0, 1.1, step=0.1))
plt.legend()
plt.subplot(2, 2, 4)
plt.plot(val_accuracies, label='Validation Accuracy', color='red')
plt.title('Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.xticks(np.arange(0, 20, step=1))
plt.yticks(np.arange(0, 1.1, step=0.1))
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(save_path, PLOT_FILENAME))

82
train/load.py Normal file
View file

@ -0,0 +1,82 @@
from dataclasses import dataclass
from io import BytesIO
from tokenize import tokenize
import pandas as pd
import swifter
from sklearn.model_selection import train_test_split
from train.mask import MASK_TOKEN
from tqdm import tqdm
# PreTrain/Train = 1-FINE_TUNE_FRAC/FINE_TUNE_FRAC split
FINE_TUNE_FRAC: float = 0.5
# Splits for the fine tune dataset
# TRAIN_FRAC = 1-VAL_FRAC-TEST_FRAC
TEST_FRAC: float = 0.1
VAL_FRAC: float = 0.1
LOW_PERCENTILE: float = 0.25
HIGH_PERCENTILE: float = 0.75
@dataclass
class DataSet:
pretrain_df: pd.DataFrame
fine_tune_train_df: pd.DataFrame
fine_tune_val_df: pd.DataFrame
fine_tune_test_df: pd.DataFrame
usi_test_df: pd.DataFrame
def __init__(self, pretrain_df: pd.DataFrame, fine_tune_train_df: pd.DataFrame, fine_tune_val_df: pd.DataFrame,
fine_tune_test_df: pd.DataFrame, usi_test_df: pd.DataFrame):
self.pretrain_df = pretrain_df
self.fine_tune_train_df = fine_tune_train_df
self.fine_tune_val_df = fine_tune_val_df
self.fine_tune_test_df = fine_tune_test_df
self.usi_test_df = usi_test_df
@staticmethod
def load(dataset_path: str, usi_test_dataset_path: str, random_state: int):
df = pd.read_parquet(dataset_path)
# df = df.iloc[0:100, :] # debug
df = filter_outliers(df)
pretrain_df, fine_tune_df = train_test_split(df, test_size=FINE_TUNE_FRAC, random_state=random_state)
val_df = fine_tune_df.sample(frac=VAL_FRAC, random_state=random_state)
test_df = fine_tune_df.drop(val_df.index).sample(frac=TEST_FRAC / (1 - VAL_FRAC), random_state=random_state)
fine_tune_df = fine_tune_df.drop(val_df.index).drop(test_df.index)
usi_test_df = pd.read_csv(usi_test_dataset_path, index_col=0)
# usi_test_df = usi_test_df.iloc[0:10, :] # debug
print(f"pretrain dataset: {len(pretrain_df.index)} functions loaded")
print(f"finetune train dataset: {len(fine_tune_df.index)} functions loaded")
print(f"finetune val dataset: {len(val_df.index)} functions loaded")
print(f"finetune test dataset: {len(test_df.index)} functions loaded")
return DataSet(pretrain_df, fine_tune_df, val_df, test_df, usi_test_df)
def filter_outliers(df: pd.DataFrame) -> pd.DataFrame:
    assert swifter is not None  # swifter is imported for its .swifter accessor; this keeps the import from being flagged as unused
def count_tokens(s: str) -> int:
try:
count = 0
for _ in tokenize(BytesIO(s.encode('utf-8')).readline):
count += 1
return count
        except Exception:
            return 0  # unparsable source: report zero tokens so the function falls below the low cutoff and is excluded
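    # The quantile filter below keeps roughly the middle 50% of functions by token length (the interquartile range)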
df["token_length"] = df["source"].swifter.apply(count_tokens)
low_qty = df["token_length"].quantile(LOW_PERCENTILE)
mask_low = df["token_length"] < low_qty
print(f"Excluding token lengths < {low_qty} ({LOW_PERCENTILE * 100:02.02f}%ile): {sum(mask_low)} instances")
high_qty = df["token_length"].quantile(HIGH_PERCENTILE)
mask_high = df["token_length"] > high_qty
print(f"Excluding token lengths > {high_qty} ({HIGH_PERCENTILE * 100:02.02f}%ile): {sum(mask_high)} instances")
return df[~mask_high & ~mask_low]

125
train/mask.py Normal file
View file

@ -0,0 +1,125 @@
import ast
import sys
from _ast import Load
from dataclasses import dataclass
import pandas as pd
from pandas import Series
from tqdm import tqdm
MASK_TOKEN: str = "<mask>"
def strip_parentheses(input_string):
    input_string = input_string.strip()
    if input_string.startswith('(') and input_string.endswith(')'):
        # Dyck language check: only strip if the opening '(' is matched by the final ')'
        paren_count: int = 0
        for i, char in enumerate(input_string):
            if char == '(':
                paren_count += 1
            elif char == ')':
                if paren_count == 0:
                    return input_string  # closing paren with no matching open paren
                paren_count -= 1
                if paren_count == 0 and i < len(input_string) - 1:
                    return input_string  # the outer '(' closes before the end, e.g. "(a) or (b)"
        if paren_count == 0:
            return input_string[1:-1]  # strip the outer pair if it wraps the whole string
    return input_string
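# Illustrative behaviour (hypothetical inputs): "(a and b)" -> "a and b", while "(a) or (b)"
# and "a or b" are returned unchanged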
@dataclass
class FineTrainInstance:
masked_function: str
condition: str
def __init__(self, masked_function: str, condition: str):
self.masked_function = masked_function
self.condition = condition
@staticmethod
def from_function(function: str) -> list['FineTrainInstance']:
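        # Build one training instance per if/elif statement in the function: the masked source is the
        # input and the original condition is the ground truth. Returns an empty list if the function
        # does not parse.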
try:
tree = ast.parse(function)
except SyntaxError:
return []
instances: list['FineTrainInstance'] = []
for t in ast.walk(tree):
if isinstance(t, ast.If):
                # Swap the condition with a mask node in place and unparse to get the masked source.
                # "<mask>" is treated as a variable name; it can never clash with real code since it
                # is not a valid identifier, but ast.unparse happily prints it verbatim.
                cond = t.test
                t.test = ast.Name(id=MASK_TOKEN, ctx=Load())
masked_fun = ast.unparse(tree)
instances.append(FineTrainInstance(masked_fun, strip_parentheses(ast.unparse(cond))))
# restore the condition
t.test = cond
return instances
def normalize_condition(c: str) -> str:
c = c.strip()
try:
# reformat if syntax is parsable, otherwise return as-is
return strip_parentheses(ast.unparse(ast.parse(c)))
except SyntaxError:
return c
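# e.g. (hypothetical input): normalize_condition("( x == 1 )") returns "x == 1"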
def mask_conditions(df_source: pd.DataFrame, kind: str) -> pd.DataFrame:
if kind != 'test_usi':
df = pd.DataFrame(columns=['masked_code', 'ground_truth'])
instances = df_source["source"].swifter.apply(lambda s: FineTrainInstance.from_function(s))
i = 0
for row in tqdm(instances, desc=f"Building {kind}", total=len(df_source.index)):
for instance in row:
df.loc[i, 'masked_code'] = instance.masked_function
df.loc[i, 'ground_truth'] = instance.condition
i += 1
else:
df = pd.DataFrame(columns=['masked_code', 'ground_truth'], index=df_source.index)
def canonicalize(c: str) -> pd.Series:
prefixes = ["if ", "elif "]
found_prefix = ""
postfix = ":"
c = c.strip()
for prefix in prefixes:
if c.startswith(prefix):
c = c[len(prefix):]
found_prefix = prefix
break
if c.endswith(postfix):
c = c[:len(c) - len(postfix)]
c = normalize_condition(c)
return pd.Series([found_prefix, c], index=['found_prefix', 'c'])
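        # e.g. (hypothetical input): canonicalize("if x > 0:") yields prefix "if " and condition "x > 0"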
# Canonicalize condition string
df[['prefix', 'ground_truth']] = df_source['target_block'].swifter.apply(canonicalize)
df['masked_code'] = df_source['input_method'].copy()
        # Our model can only predict the if condition itself, so the "if"/"elif" keyword and the ":"
        # token are re-injected around the mask in the input
df['masked_code'] = df[['prefix', 'masked_code']] \
.apply(lambda s: s['masked_code'].replace("<fill-in>", s['prefix'] + " " + MASK_TOKEN + " :"), axis=1)
return df

31
train/pretrain.py Normal file
View file

@ -0,0 +1,31 @@
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
ADAM_LR: float = 5e-5
def label(epoch: int, loss: float) -> str:
return f"Epoch={epoch} Loss={loss}"
def pretrain(model, dataloader: DataLoader, device, epochs: int, save_dir: str):
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=ADAM_LR)
print(f"Pretraining for {epochs} epochs")
for epoch in range(epochs):
with tqdm(dataloader, desc=f"Epoch {epoch + 1}") as pbar:
for step, batch in enumerate(pbar):
batch = {k: v.to(device) for k, v in batch.items()}
optimizer.zero_grad()
inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']}
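                # Note: labels are the input ids themselves, so this pretraining step trains the model
                # to reconstruct its own input sequence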
outputs = model(**inputs, labels=batch['input_ids'])
loss = outputs.loss
loss.backward()
optimizer.step()
pbar.set_description(label(epoch + 1, loss.item()))
model.save_pretrained(save_dir)

76
train_model.py Normal file
View file

@ -0,0 +1,76 @@
import os
import pandas as pd
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import torch
from transformers import T5ForConditionalGeneration
from train.evaluate import evaluate_accuracy
from train.finetune import fine_tune_with_eval
from train.dataset import build_pretrain_dataloader, build_fine_tune_dataloader
from train.pretrain import pretrain
from train.load import DataSet
IN_PATH: str = os.path.join(os.path.dirname(__file__), 'dataset', 'extracted', 'functions.pq')
IN_PATH_USI: str = os.path.join(os.path.dirname(__file__), 'dataset', 'extracted', 'test_set_usi.csv')
OUT_PATH: str = os.path.join(os.path.dirname(__file__), 'models', 'final')
RANDOM_STATE: int = 42
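# Pipeline: pretrain CodeT5-small on raw function sources, fine-tune with per-epoch checkpoints and
# early stopping, then evaluate the best checkpoint on the held-out test set and on the USI test set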
def train():
dataset = DataSet.load(IN_PATH, IN_PATH_USI, RANDOM_STATE)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
pretrain_dir = os.path.join(OUT_PATH, "pretrain")
if os.path.isfile(os.path.join(pretrain_dir, "config.json")):
# load the pretrained model if it exists
model = T5ForConditionalGeneration.from_pretrained(pretrain_dir)
model.to(device)
else:
# Pre-train the model
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')
model.to(device)
pretrain_loader = build_pretrain_dataloader(dataset.pretrain_df)
pretrain(model, pretrain_loader, device, 1, pretrain_dir)
# Dataloaders for fine-tuning and validation
best_epoch_file = os.path.join(OUT_PATH, "best.txt")
if not os.path.isfile(best_epoch_file):
fine_tune_loader = build_fine_tune_dataloader(dataset.fine_tune_train_df, 'train')
eval_loader = build_fine_tune_dataloader(dataset.fine_tune_val_df, 'val')
best_epoch = fine_tune_with_eval(model, device, fine_tune_loader, eval_loader, 20, OUT_PATH)
with open(best_epoch_file, "w") as f:
f.write(str(best_epoch) + "\n")
# Load model for best epoch
with open(best_epoch_file, "r") as f:
best_epoch = int(f.read().strip())
best_model_directory = os.path.join(OUT_PATH, str(best_epoch))
best_model = T5ForConditionalGeneration.from_pretrained(best_model_directory)
best_model.to(device)
test_loader = build_fine_tune_dataloader(dataset.fine_tune_test_df, 'test')
test_usi_loader = build_fine_tune_dataloader(dataset.usi_test_df, 'test_usi')
# Evaluate the model on the test set
test_accuracy, _, test_outs = evaluate_accuracy(best_model, test_loader, device, track_predictions=True)
pd.DataFrame.from_records(test_outs).to_csv(os.path.join(OUT_PATH, 'test_outputs.csv'))
print(f"Test Accuracy: {test_accuracy * 100:02.02f}%")
# Evaluate the model on the usi test set
test_accuracy, _, test_usi_outs = evaluate_accuracy(best_model, test_usi_loader, device, track_predictions=True)
pd.DataFrame.from_records(test_usi_outs).to_csv(os.path.join(OUT_PATH, 'test_usi_outputs.csv'))
print(f"USI Test Accuracy: {test_accuracy * 100:02.02f}%")
if __name__ == "__main__":
train()