Final version of the project

History has been rewritten to delete large files in the repo
Claudio Maggioni 2024-01-03 15:25:41 +01:00
commit a4ceee8716
93 changed files with 215857 additions and 0 deletions

464
.gitignore vendored Normal file
@@ -0,0 +1,464 @@
/dataset/download/*.zip
/dataset/functions/*.pq
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
**/latex/
/models/test
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
**/.DS_Store
out/model/*.pt
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb
## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf
## Generated if empty string is given at "Please type another file name for output:"
**/*.pdf
## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml
## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync
## Build tool directories for auxiliary files
# latexrun
latex.out/
## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa
# achemso
acs-*.bib
# amsthm
*.thm
# beamer
*.nav
*.pre
*.snm
*.vrb
# changes
*.soc
# comment
*.cut
# cprotect
*.cpt
# elsarticle (documentclass of Elsevier journals)
*.spl
# endnotes
*.ent
*.lox
# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm
#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R
# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs
*.slg
*.slo
*.sls
# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist
# gnuplot
*.gnuplot
*.table
# gnuplottex
*-gnuplottex-*
# gregoriotex
*.gaux
*.glog
*.gtex
# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref
# hyperref
*.brf
# knitr
*-concordance.tex
# *.tikz
*-tikzDictionary
# listings
*.lol
# luatexja-ruby
*.ltjruby
# makeidx
*.idx
*.ilg
*.ind
# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*
# minted
_minted*
*.pyg
# morewrites
*.mw
# newpax
*.newpax
# nomencl
*.nlg
*.nlo
*.nls
# pax
*.pax
# pdfpcnotes
*.pdfpc
# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd
# scrwfile
*.wrt
# svg
svg-inkscape/
# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/
# pdfcomment
*.upa
*.upb
# pythontex
*.pytxcode
pythontex-files-*/
# tcolorbox
*.listing
# thmtools
*.loe
# TikZ & PGF
*.dpth
*.md5
*.auxlock
# titletoc
*.ptc
# todonotes
*.tdo
# vhistory
*.hst
*.ver
*.lod
# xcolor
*.xcp
# xmpincl
*.xmpi
# xindy
*.xdy
# xypic precompiled matrices and outlines
*.xyc
*.xyd
# endfloat
*.ttt
*.fff
# Latexian
TSWLatexianTemp*
## Editors:
# WinEdt
*.bak
*.sav
# Texpad
.texpadtmp
# LyX
*.lyx~
# Kile
*.backup
# gummi
.*.swp
# KBibTeX
*~[0-9]*
# TeXnicCenter
*.tps
# auto folder when using emacs and auctex
./auto/*
*.el
# expex forward references with \gathertags
*-tags.tex
# standalone packages
*.sta
# Makeindex log files
*.lpz
# xwatermark package
*.xwm
# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are then stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib

88
README.md Normal file
@@ -0,0 +1,88 @@
# Assignment 2: If statements
**Group 2: Baris Aksakal, Edoardo Riggio, Claudio Maggioni**

## Repository Structure

- `/dataset`: code and data related to scraping repositories from GitHub;
- `/models`:
  - `/baris`: code and persisted model of the original architecture built by
    Baris. `model_0.1.ipynb` and `test_model.ipynb` are respectively an
    earlier and a later iteration of the code used to train this model;
  - `/final`: persisted model for the final architecture with training and
    test evaluation statistics;
    - `/test_outputs.csv`: CSV deliverable for the evaluation on the test
      set we extracted;
    - `/test_usi_outputs.csv`: CSV deliverable for the evaluation on the
      provided test set.
- `/test`: unit tests for the model training scripts;
- `/train`: dependencies of the main model training script;
- `/train_model.py`: main model training script;
- `/plot_acc.py`: accuracy statistics plotting script.

## Environment Setup

In order to execute both the scraping and training scripts, Python 3.10 or
greater is required. Dependencies can be installed through a virtual env by
running:

```shell
python3 -m venv .env
source .env/bin/activate
pip install -r requirements.txt
```

## Dataset Extraction

Please refer to [the README.md file in `/dataset`](dataset/README.md) for
documentation on the dataset extraction process.

## Model Training

Model training can be performed by running the script:

```shell
python3 train_model.py
```

The script is able to resume fine-tuning if the pretraining phase was completed
by a previous execution, and it is able to skip directly to model evaluation on
the two test sets if fine-tuning was already completed.

The persisted pretrained model is located in `/models/final/pretrain`. Each
epoch of the fine-tuning process is persisted at `/models/final/<N>`, where
`<N>` is the epoch number starting from 0. The epoch number selected by the
early stopping process is stored in `/models/final/best.txt`.
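
As a rough illustration of this resume behaviour, the sketch below checks for
the persisted artifacts described above (the `resume_state` helper is
hypothetical; the actual logic lives in `/train_model.py` and may differ in
detail):

```python
# Hypothetical sketch of the resume behaviour, not the actual implementation.
import os

MODELS_DIR = "models/final"


def resume_state(models_dir: str = MODELS_DIR) -> str:
    """Return the phase the training script would start from."""
    if os.path.isfile(os.path.join(models_dir, "best.txt")):
        # Early stopping already selected an epoch: skip to test set evaluation.
        return "evaluate"
    if os.path.isdir(os.path.join(models_dir, "pretrain")):
        # Pretraining artifacts exist: resume from fine-tuning.
        return "fine-tune"
    return "pretrain"


if __name__ == "__main__":
    print(resume_state())
```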

`/models/final/stats.csv` stores the training and validation loss and accuracy
statistics collected during training. `/models/final/test_outputs.csv` is the
CSV deliverable for the evaluation on the test set we extracted, while
`/models/final/test_usi_outputs.csv` is the CSV deliverable for the evaluation
on the provided test set.

The stdout of the training script can be found in
`/models/final/train_log.txt`.

### Plots

The train and validation loss and accuracy plots can be generated from
`/models/final/stats.csv` with the following command:

```shell
python3 plot_acc.py
```

The output is stored in `/models/final/training_metrics.png`.
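
For reference, a minimal sketch of such a plotting step is shown below; the
column names `train_loss`, `eval_loss` and `eval_acc` are assumptions used
only for illustration, and the actual implementation is in `plot_acc.py`:

```python
# Sketch only: column names are assumed; see plot_acc.py for the real script.
import pandas as pd
import matplotlib.pyplot as plt

stats = pd.read_csv("models/final/stats.csv")

fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 5))
ax_loss.plot(stats["train_loss"], label="training loss")    # assumed column
ax_loss.plot(stats["eval_loss"], label="validation loss")   # assumed column
ax_loss.set_xlabel("Epoch")
ax_loss.legend()
ax_acc.plot(stats["eval_acc"], label="validation accuracy")  # assumed column
ax_acc.set_xlabel("Epoch")
ax_acc.legend()
fig.savefig("models/final/training_metrics.png")
```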

# Report

To compile the report run:

```shell
cd report
pdflatex -interaction=nonstopmode -output-directory=. main.tex
pdflatex -interaction=nonstopmode -output-directory=. main.tex
```

The report is then located in `report/main.pdf`.

78
dataset/README.md Normal file
@@ -0,0 +1,78 @@
# Dataset Download Instructions

## Project .zip Export

We scraped GitHub repositories using the download tool https://seart-ghs.si.usi.ch/ to generate the `results.csv` file
under this directory. In addition to the default constraints applied by the `seart-ghs` crawler, we used the following
criteria:

- lines of code: >=10000
- language: `Python`

We found 21269 results. We then downloaded a `.zip` archive of the main branch of each repository using the following
command. We started the download process on 2023-11-13 at 12:00.

```shell
mkdir download || true
cat results.csv | \
  awk -F, 'NR>1 { print "wget -O " $2 ".zip https://github.com/" $2 "/archive/refs/heads/" $6 ".zip" }' | \
  sed 's#\/#-#;s#\"##g' > download/to_download.sh
cd download
bash to_download.sh
```
### Manually Excluded Repos
We manually excluded the following repositories from our scraped dataset ("404" means that the repository was
inaccessible and could not be downloaded):
- `thorn-lab/coronavirus_structural_task_force` (too large, more than 6GiB)
- `feeicn/security-ppt` (too large, more than 9GiB)
- `salesforce/ai-economist` (404)
- `agiliumtrade/ai-metaapi-python-sdk` (404)
- `pokemonchw/dieloli` (harmful content)
- `thesnowguru/pytrader-python-mt4-mt5-trading-api-connector-drag-n-drop` (DMCA takedown)
- `objectiv/objectiv-analytics` (404)
- `aws/solutions-aws-security-hub-automated-response-and-remediation` (404)
- `openunited/product-factory-backend` (404)
- `ibm-epbl/ibm-project-43602-1660718377` (404)
- `ibm-epbl/ibm-project-1392-1658386621` (404)
- `potatolondon/django-gcloud-connectors` (404)
- `fortwoone/oracle-project` (404)
- `iperov/deepxtools` (404)
- `frequenz/floss-frequenz-sdk-python` (404)
### Check Archive Health
The following script was used to check the integrity of each downloaded `.zip` file.
```shell
cd download
find . -name '*.zip' \
-exec bash -c 'echo $0 $(unzip -l "$0" 2>/dev/null 1>/dev/null && echo "1" || echo "0")' \{\} \; \
> archive_health.txt
```

## Function Extraction

The following command builds a dataset from the archives saved in the `/download` subdirectory:

```shell
python3 ./extract.py
```

Functions are extracted with the Python `ast` module, which discards comments (but not docstrings). The script generates
one Parquet archive per project in the `/functions` directory.

As the dataset was large, this script was terminated early. At termination, 70 million functions had been extracted. Due
to computing power limitations for model training, we sampled only 500,000 of the extracted functions to build the
training set. The sampling process reads the archives in `/functions` and stores the selected functions in the Parquet
file `extracted/functions.pq`. The sampling script can be invoked with the command:

```shell
python3 sample.py
```

The extraction and sampling process guarantees that the functions in the final dataset have valid syntax for Python
3.10+ and that the code of each function contains only ASCII characters.
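
To illustrate these two guarantees, a filter along the lines of the sketch below (the `is_valid_sample` helper is
illustrative only; the actual checks live in `extract.py` and `sample.py`) accepts a function only if it parses under
the running Python interpreter and contains only ASCII characters:

```python
# Sketch of the validity filter described above; the real checks are in
# extract.py (syntax, via ast.parse) and sample.py (ASCII-only source).
import ast


def is_valid_sample(source: str) -> bool:
    """Accept only ASCII-only sources that parse under the running Python (3.10+ here)."""
    if not source.isascii():
        return False
    try:
        ast.parse(source)
    except SyntaxError:
        return False
    return True


print(is_valid_sample("def f(x):\n    return x + 1\n"))   # True
print(is_valid_sample("def f(x):\n    return 'héllo'\n"))  # False: non-ASCII character
```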

90
dataset/extract.py Normal file
@@ -0,0 +1,90 @@
import ast
import os.path
import typing
import zipfile
from typing import Optional

import pandas as pd
from tqdm import tqdm
from fastparquet import write
import multiprocessing

PWD = os.path.dirname(__file__)
IN_DIR = os.path.join(PWD, "download")
OUT_DIR = os.path.join(PWD, "functions")


def read_functions(content, filename: str, zip_name: str) -> Optional[pd.DataFrame]:
    # Parse one Python source file and collect a record per function definition.
    records = []
    try:
        tree = ast.parse(content.decode('utf-8'), filename=filename)
        for node in ast.walk(tree):
            if isinstance(node, ast.FunctionDef):
                f_source: str = ast.unparse(typing.cast(typing.Any, node))
                records.append({
                    "zip_filename": zip_name,
                    "py_filename": filename,
                    "source": f_source,
                    "success": True,
                    "error": None,
                })
    except Exception as e:
        print(f"project '{zip_name}': error parsing '{filename}': {e}")
        records.append({
            "zip_filename": zip_name,
            "py_filename": filename,
            "source": "",
            "success": False,
            "error": str(e)
        })
    return pd.DataFrame.from_records(records)


def read_zip_file(zip_file: str):
    # Extract functions from every .py file in one repository archive and
    # persist them as a per-project Parquet file.
    out_path = os.path.join(OUT_DIR, os.path.basename(zip_file) + ".pq")
    df = pd.DataFrame(columns=["zip_filename", "py_filename", "source"])
    try:
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            info_list = [info for info in zip_ref.infolist() if info.filename.endswith('.py')]
            for info in tqdm(info_list, desc=os.path.basename(zip_file), ncols=0, position=None, leave=True):
                content = zip_ref.read(info.filename)
                df_file = read_functions(content, info.filename, zip_file)
                if df_file is not None:
                    df = pd.concat([df, df_file], ignore_index=True)
        write(out_path, df, compression='GZIP')
        return zip_file
    except Exception as e:
        print(e)


def read_clones(zip_dir: str):
    # Collect the archives that have not been processed yet and extract them in parallel.
    zip_files = []
    for a_file in tqdm(os.listdir(zip_dir), desc="Scan dir"):
        path = os.path.join(zip_dir, a_file)
        out_path = os.path.join(OUT_DIR, os.path.basename(path) + ".pq")
        if zipfile.is_zipfile(path) and not os.path.isfile(out_path):
            zip_files.append(path)

    num_processes = 192
    with multiprocessing.Manager():
        with multiprocessing.Pool(processes=num_processes) as pool:
            for _ in tqdm(pool.imap_unordered(read_zip_file, zip_files), desc="Read ZIPs",
                          unit="item", total=len(zip_files), position=None, leave=True):
                pass  # dummy iteration to consume multiprocessing iterator, needed to launch processes


def main():
    if not os.path.isdir(OUT_DIR):
        os.makedirs(OUT_DIR)
    read_clones(IN_DIR)


if __name__ == "__main__":
    main()

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:923ad850a4eed1d026b26fedbd5ecd64cf2e4e0f4648108d4732ac0e8fe70eb8
size 72966215

File diff suppressed because it is too large

21270
dataset/results.csv Normal file

File diff suppressed because one or more lines are too long

68
dataset/sample.py Normal file
@@ -0,0 +1,68 @@
import os
import random

import pandas as pd
from fastparquet import write
from tqdm import tqdm

PWD = os.path.dirname(__file__)
IN_DIR = os.path.join(PWD, "functions")
OUT_FILE = os.path.join(PWD, "extracted", "functions.pq")
OUT_SIZE = 500_000


def main():
    out_dir = os.path.dirname(OUT_FILE)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    files = [f for f in os.listdir(IN_DIR) if f.endswith('.pq')]
    chosen: set[tuple[str, int]] = set()
    df = None

    with tqdm(desc="Sampling", total=OUT_SIZE) as p:
        while df is None or len(df.index) < OUT_SIZE:
            filename = random.choice(files)
            path = os.path.join(IN_DIR, filename)
            df_file = pd.read_parquet(path, engine='fastparquet')
            df_len = len(df_file.index)
            if df_len == 0:
                continue
            for _ in range(min(1000, df_len)):
                index = random.randrange(0, df_len)
                error_message = df_file.iloc[index]["error"]
                if error_message is not None and len(error_message) > 0:
                    continue
                source = df_file.iloc[index]["source"]
                if not source.isascii():
                    continue
                choice = (filename, index)
                if choice not in chosen:
                    chosen.add(choice)
                    if df is None:
                        df = df_file
                    else:
                        df = pd.concat([df, df_file.iloc[[index], :]], axis=0, ignore_index=True)
                    if len(df.index) % 1000 == 0:
                        write(OUT_FILE, df, compression='GZIP')
                    p.update(1)

    # Fix success column, which is computed wrong in a previous iteration of extract.py
    df["success"] = df["error"].apply(lambda e: e is None or len(e) == 0)
    write(OUT_FILE, df, compression='GZIP')


if __name__ == "__main__":
    main()

111
environment.yml Normal file
@@ -0,0 +1,111 @@
name: SA
channels:
- defaults
dependencies:
- abseil-cpp=20230802.0
- aiohttp=3.8.5
- aiosignal=1.2.0
- arrow-cpp=11.0.0
- async-timeout=4.0.2
- attrs=23.1.0
- aws-c-common=0.6.8
- aws-c-event-stream=0.1.6
- aws-checksums=0.1.11
- aws-sdk-cpp=1.8.185
- blas=1.0
- boost-cpp=1.82.0
- bottleneck=1.3.5
- brotli=1.0.9
- brotli-bin=1.0.9
- brotli-python=1.0.9
- bzip2=1.0.8
- c-ares=1.19.1
- ca-certificates=2023.08.22
- certifi=2023.11.17
- cffi=1.16.0
- charset-normalizer=2.0.4
- cramjam=2.6.2
- cryptography=41.0.3
- datasets=2.12.0
- dill=0.3.6
- fastparquet=2023.8.0
- filelock=3.13.1
- frozenlist=1.4.0
- fsspec=2023.9.2
- gflags=2.2.2
- glog=0.5.0
- grpc-cpp=1.48.2
- gtest=1.14.0
- huggingface_hub=0.17.3
- icu=73.1
- idna=3.4
- importlib-metadata=6.0.0
- krb5=1.20.1
- libboost=1.82.0
- libbrotlicommon=1.0.9
- libbrotlidec=1.0.9
- libbrotlienc=1.0.9
- libcurl=8.4.0
- libcxx=14.0.6
- libedit=3.1.20221030
- libev=4.33
- libevent=2.1.12
- libffi=3.4.4
- libgfortran=5.0.0
- libgfortran5=11.3.0
- libiconv=1.16
- libnghttp2=1.57.0
- libopenblas=0.3.21
- libprotobuf=3.20.3
- libssh2=1.10.0
- libthrift=0.15.0
- llvm-openmp=14.0.6
- lz4-c=1.9.4
- multidict=6.0.2
- multiprocess=0.70.14
- ncurses=6.4
- numexpr=2.8.7
- numpy=1.26.0
- numpy-base=1.26.0
- openssl=3.0.12
- orc=1.7.4
- packaging=23.1
- pandas=2.1.1
- pip=23.3.1
- pyarrow=11.0.0
- pycparser=2.21
- pyopenssl=23.2.0
- pysocks=1.7.1
- python=3.11.5
- python-dateutil=2.8.2
- python-tzdata=2023.3
- python-xxhash=2.0.2
- pytz=2023.3.post1
- pyyaml=6.0.1
- re2=2022.04.01
- readline=8.2
- regex=2023.10.3
- requests=2.31.0
- responses=0.13.3
- safetensors=0.4.0
- setuptools=68.0.0
- six=1.16.0
- snappy=1.1.9
- sqlite=3.41.2
- tk=8.6.12
- tokenizers=0.13.2
- tqdm=4.65.0
- transformers=4.32.1
- typing-extensions=4.7.1
- typing_extensions=4.7.1
- tzdata=2023c
- urllib3=1.26.18
- utf8proc=2.6.1
- wheel=0.41.2
- xxhash=0.8.0
- xz=5.4.2
- yaml=0.2.5
- yarl=1.8.1
- zipp=3.11.0
- zlib=1.2.13
- zstd=1.5.5

0
models/.gitkeep Normal file

@@ -0,0 +1,69 @@
{
"_name_or_path": "Salesforce/codet5-small",
"architectures": [
"T5ForConditionalGeneration"
],
"bos_token_id": 1,
"classifier_dropout": 0.0,
"d_ff": 2048,
"d_kv": 64,
"d_model": 512,
"decoder_start_token_id": 0,
"dense_act_fn": "relu",
"dropout_rate": 0.1,
"eos_token_id": 2,
"feed_forward_proj": "relu",
"gradient_checkpointing": false,
"id2label": {
"0": "LABEL_0"
},
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": false,
"label2id": {
"LABEL_0": 0
},
"layer_norm_epsilon": 1e-06,
"model_type": "t5",
"n_positions": 512,
"num_decoder_layers": 6,
"num_heads": 8,
"num_layers": 6,
"output_past": true,
"pad_token_id": 0,
"relative_attention_max_distance": 128,
"relative_attention_num_buckets": 32,
"task_specific_params": {
"summarization": {
"early_stopping": true,
"length_penalty": 2.0,
"max_length": 200,
"min_length": 30,
"no_repeat_ngram_size": 3,
"num_beams": 4,
"prefix": "summarize: "
},
"translation_en_to_de": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to German: "
},
"translation_en_to_fr": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to French: "
},
"translation_en_to_ro": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to Romanian: "
}
},
"torch_dtype": "float32",
"transformers_version": "4.34.0",
"use_cache": true,
"vocab_size": 32100
}

@@ -0,0 +1,8 @@
{
"_from_model_config": true,
"bos_token_id": 1,
"decoder_start_token_id": 0,
"eos_token_id": 2,
"pad_token_id": 0,
"transformers_version": "4.34.0"
}

3250
models/baris/model_0.1.ipynb Normal file

File diff suppressed because it is too large

@@ -0,0 +1,449 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "5f7ad96a-6b01-4b63-93b6-4008597a0e9e",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using device: cuda\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating: 100%|███████████████████████████| 1092/1092 [04:02<00:00, 4.50it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Test Accuracy: 0.3642\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import re\n",
"import random\n",
"import torch\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from transformers import RobertaTokenizer, T5ForConditionalGeneration, DataCollatorForLanguageModeling\n",
"from sklearn.model_selection import train_test_split\n",
"from tqdm import tqdm\n",
"import os\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"# Dataset class for pre-training\n",
"class PythonCodeDataset(Dataset):\n",
" def __init__(self, tokenizer, dataframe, max_len=512):\n",
" self.tokenizer = tokenizer\n",
" self.data = dataframe\n",
" self.max_len = max_len\n",
"\n",
" def __len__(self):\n",
" return len(self.data)\n",
"\n",
" def __getitem__(self, index):\n",
" code = self.data.iloc[index]['source']\n",
" inputs = self.tokenizer.encode_plus(code, None, add_special_tokens=True, max_length=self.max_len, padding='max_length', truncation=True)\n",
" return {'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long), 'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long)}\n",
"\n",
"# Function to mask if conditions\n",
"def mask_if_condition(code_snippet):\n",
" if_conditions = re.findall(r'(if\\s+.*?:)', code_snippet)\n",
" masked_snippet = code_snippet.replace(if_conditions[0], '<mask>', 1) if if_conditions else code_snippet\n",
" return masked_snippet, if_conditions[0] if if_conditions else None\n",
"\n",
"# Fine-tuning and evaluation dataset classes\n",
"class MaskedIfDataset(PythonCodeDataset):\n",
" def __getitem__(self, index):\n",
" masked_code = self.data.iloc[index]['masked_code']\n",
" ground_truth = self.data.iloc[index]['ground_truth']\n",
" inputs = self.tokenizer(masked_code, max_length=self.max_len, padding='max_length', truncation=True, return_tensors=\"pt\")\n",
" labels = self.tokenizer(ground_truth, max_length=self.max_len, padding='max_length', truncation=True, return_tensors=\"pt\").input_ids\n",
" labels[labels == self.tokenizer.pad_token_id] = -100\n",
" return {'input_ids': inputs.input_ids.squeeze(), 'attention_mask': inputs.attention_mask.squeeze(), 'labels': labels.squeeze()}\n",
"\n",
"# Define the pre-training loop\n",
"def pretrain(model, dataloader, epochs, print_every=10):\n",
" model.train()\n",
" optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)\n",
" global_step = 0 # Initialize a counter for the global training step\n",
"\n",
" for epoch in range(epochs):\n",
" for batch in dataloader:\n",
" batch = {k: v.to(device) for k, v in batch.items()}\n",
" optimizer.zero_grad()\n",
" inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']}\n",
" outputs = model(**inputs, labels=batch['input_ids'])\n",
" loss = outputs.loss\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" if global_step % print_every == 0: # Print every steps\n",
" print(f\"Step {global_step}, Loss: {loss.item()}\")\n",
"\n",
" global_step += 1 # Increment the step counter\n",
"\n",
" print(f\"Epoch {epoch+1}/{epochs} completed.\")\n",
" \n",
"\n",
"def fine_tune_with_eval(model, train_loader, eval_loader, epochs, save_path, print_every=10, early_stopping_patience=3):\n",
" optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)\n",
" best_epoch = 0\n",
" best_eval_accuracy = 0\n",
" patience_counter = 0\n",
" train_losses, eval_accuracies = [], []\n",
"\n",
" for epoch in range(epochs):\n",
" model.train()\n",
" total_loss = 0\n",
"\n",
" # Training loop with tqdm for progress tracking\n",
" for batch in tqdm(train_loader, desc=f\"Training Epoch {epoch+1}/{epochs}\"):\n",
" batch = {k: v.to(device) for k, v in batch.items()}\n",
" optimizer.zero_grad()\n",
" inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask'], 'labels': batch['labels']}\n",
" outputs = model(**inputs)\n",
" loss = outputs.loss\n",
" total_loss += loss.item()\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" average_loss = total_loss / len(train_loader)\n",
" train_losses.append(average_loss)\n",
"\n",
" # Evaluate on the evaluation set\n",
" eval_accuracy = evaluate_accuracy(model, eval_loader, tokenizer, device)\n",
" eval_accuracies.append(eval_accuracy)\n",
" print(f\"Epoch {epoch+1}/{epochs}, Train Loss: {average_loss:.4f}, Eval Accuracy: {eval_accuracy:.4f}\")\n",
"\n",
" # Early stopping and checkpointing\n",
" if eval_accuracy > best_eval_accuracy:\n",
" best_eval_accuracy = eval_accuracy\n",
" best_epoch = epoch\n",
" patience_counter = 0\n",
" else:\n",
" patience_counter += 1\n",
" if patience_counter >= early_stopping_patience:\n",
" print(\"Early stopping triggered.\")\n",
" break\n",
" \n",
" save_directory = f\"{save_path}/{epoch}\"\n",
" model.save_pretrained(save_directory)\n",
" \n",
" # Plotting the training loss and evaluation accuracy\n",
" plt.figure(figsize=(12, 5))\n",
" plt.subplot(1, 2, 1)\n",
" plt.plot(train_losses, label='Training Loss')\n",
" plt.title('Training Loss')\n",
" plt.xlabel('Epoch')\n",
" plt.ylabel('Loss')\n",
" plt.legend()\n",
"\n",
" plt.subplot(1, 2, 2)\n",
" plt.plot(eval_accuracies, label='Evaluation Accuracy')\n",
" plt.title('Evaluation Accuracy')\n",
" plt.xlabel('Epoch')\n",
" plt.ylabel('Accuracy')\n",
" plt.legend()\n",
"\n",
" plt.savefig(f\"{save_path}/training_metrics.png\")\n",
" \n",
" return best_epoch\n",
"\n",
"\n",
"def evaluate_accuracy(model, dataloader, tokenizer, device):\n",
" model.eval()\n",
" correct_predictions, total_predictions = 0, 0\n",
"\n",
" for batch in tqdm(dataloader, desc=\"Evaluating\"):\n",
" batch = {k: v.to(device) for k, v in batch.items()}\n",
" with torch.no_grad():\n",
" outputs = model.generate(batch['input_ids'], attention_mask=batch['attention_mask'], max_length=512)\n",
" decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]\n",
"\n",
" # Decode labels with added check for None values\n",
" decoded_labels = []\n",
" for label in batch['labels']:\n",
" label_trimmed = [l for l in label.tolist() if l != tokenizer.pad_token_id and l != -100]\n",
" if label_trimmed:\n",
" decoded_label = tokenizer.decode(label_trimmed, skip_special_tokens=True)\n",
" decoded_labels.append(decoded_label)\n",
" else:\n",
" decoded_labels.append(None) # Append None for invalid/empty labels\n",
"\n",
" # Calculate accuracy\n",
" for output, label in zip(decoded_outputs, decoded_labels):\n",
" if label is not None and output.strip() == label.strip():\n",
" correct_predictions += 1\n",
" if label is not None:\n",
" total_predictions += 1\n",
"\n",
" return correct_predictions / total_predictions if total_predictions > 0 else 0\n",
" \n",
" \n",
"# Read the dataset\n",
"df = pd.read_parquet('../if-statements/dataset/extracted/functions.pq')\n",
"#df = df.head(50)\n",
"\n",
"# Split the dataset into pre-training, fine-tuning, evaluation, and test sets\n",
"pretrain_df, fine_tune_df = train_test_split(df, test_size=0.5, random_state=42)\n",
"eval_df = fine_tune_df.sample(frac=0.1, random_state=42)\n",
"test_df = fine_tune_df.drop(eval_df.index).sample(frac=0.1111, random_state=42)\n",
"fine_tune_df = fine_tune_df.drop(eval_df.index).drop(test_df.index)\n",
"\n",
"assert len(set(eval_df.index).intersection(set(test_df.index))) == 0\n",
"\n",
"\n",
"# Initialize tokenizer and model\n",
"tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')\n",
"model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"print(f'Using device: {device}')\n",
"model.to(device)\n",
" \n",
"# Instantiate the dataset for pre-training\n",
"pretrain_dataset = PythonCodeDataset(tokenizer, pretrain_df)\n",
"\n",
"# Set up the data collator for MLM\n",
"data_collator = DataCollatorForLanguageModeling(\n",
" tokenizer=tokenizer,\n",
" mlm=True,\n",
" mlm_probability=0.15\n",
")\n",
"\n",
"# Create a DataLoader for pre-training\n",
"pretrain_loader = DataLoader(pretrain_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)\n",
"\n",
"# Pre-train the model\n",
"#pretrain(model, pretrain_loader, epochs=1)\n",
"\n",
"\n",
"# Prepare data for fine-tuning and evaluation\n",
"fine_tune_df['masked_code'], fine_tune_df['ground_truth'] = zip(*fine_tune_df['source'].apply(mask_if_condition))\n",
"eval_df['masked_code'], eval_df['ground_truth'] = zip(*eval_df['source'].apply(mask_if_condition))\n",
"fine_tune_df.dropna(subset=['ground_truth'], inplace=True)\n",
"eval_df.dropna(subset=['ground_truth'], inplace=True)\n",
"\n",
"\n",
"fine_tune_dataset = MaskedIfDataset(tokenizer, fine_tune_df)\n",
"eval_dataset = MaskedIfDataset(tokenizer, eval_df)\n",
"\n",
"\n",
"# Dataloaders for fine-tuning and evaluation\n",
"fine_tune_loader = DataLoader(fine_tune_dataset, batch_size=8, shuffle=True)\n",
"eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=False)\n",
"\n",
"\n",
"# Instantiate the datasets for fine-tuning and evaluation\n",
"fine_tune_dataset = MaskedIfDataset(tokenizer, fine_tune_df)\n",
"eval_dataset = MaskedIfDataset(tokenizer, eval_df)\n",
"\n",
"\n",
"best_epoch = 4\n",
"\n",
"# Example of calling the modified function\n",
"save_path = '../if-statements/dataset/extracted/final'\n",
"#best_epoch = fine_tune_with_eval(model, fine_tune_loader, eval_loader, epochs=5, save_path=save_path)\n",
"\n",
"# Define the directory of the best model\n",
"best_model_directory = os.path.join(save_path, str(best_epoch))\n",
"\n",
"# Load the best model and its config\n",
"best_model = T5ForConditionalGeneration.from_pretrained(best_model_directory)\n",
"\n",
"# Optionally, load the model's config\n",
"model_config = best_model.config # This will load the config file associated with the model\n",
"\n",
"best_model.to(device)\n",
"\n",
"# Prepare and evaluate on the test set\n",
"test_df['masked_code'], test_df['ground_truth'] = zip(*test_df['source'].apply(mask_if_condition))\n",
"test_df.dropna(subset=['ground_truth'], inplace=True)\n",
"test_dataset = MaskedIfDataset(tokenizer, test_df)\n",
"test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)\n",
"\n",
"# Evaluate the model on the test set\n",
"test_accuracy = evaluate_accuracy(best_model, test_loader, tokenizer, device)\n",
"print(f\"Test Accuracy: {test_accuracy:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "3eb56941-cd5b-405b-ae37-f15d97a2b22e",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Unnamed: 0 original_method \\\n",
"0 5126 def stream_edit(request, stream_id, response_f... \n",
"1 10859 def _read_and_parse_includes(self):\\n # Map... \n",
"2 10615 def _get_list_key(self, spaces, lines):\\n k... \n",
"3 17853 def search_host(self, search_string):\\n res... \n",
"4 3922 def pop(self, key: Union[str, Enum], default: ... \n",
"\n",
" target_block \n",
"0 if \"cancel\" not in request . POST : \n",
"1 if isinstance ( node , ast . Include ) : \n",
"2 if len ( line . strip ( ) ) == 0 : \n",
"3 if isinstance ( value , int ) : \n",