Final version of the project

History has been rewritten to delete large files in the repo
Claudio Maggioni 2024-01-03 15:25:41 +01:00
commit a4ceee8716
93 changed files with 215857 additions and 0 deletions

464
.gitignore vendored Normal file
@@ -0,0 +1,464 @@
/dataset/download/*.zip
/dataset/functions/*.pq
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
**/latex/
/models/test
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
**/.DS_Store
out/model/*.pt
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb
## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf
## Generated if empty string is given at "Please type another file name for output:"
**/*.pdf
## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml
## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync
## Build tool directories for auxiliary files
# latexrun
latex.out/
## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa
# achemso
acs-*.bib
# amsthm
*.thm
# beamer
*.nav
*.pre
*.snm
*.vrb
# changes
*.soc
# comment
*.cut
# cprotect
*.cpt
# elsarticle (documentclass of Elsevier journals)
*.spl
# endnotes
*.ent
*.lox
# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm
#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R
# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs
*.slg
*.slo
*.sls
# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist
# gnuplot
*.gnuplot
*.table
# gnuplottex
*-gnuplottex-*
# gregoriotex
*.gaux
*.glog
*.gtex
# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref
# hyperref
*.brf
# knitr
*-concordance.tex
# *.tikz
*-tikzDictionary
# listings
*.lol
# luatexja-ruby
*.ltjruby
# makeidx
*.idx
*.ilg
*.ind
# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*
# minted
_minted*
*.pyg
# morewrites
*.mw
# newpax
*.newpax
# nomencl
*.nlg
*.nlo
*.nls
# pax
*.pax
# pdfpcnotes
*.pdfpc
# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd
# scrwfile
*.wrt
# svg
svg-inkscape/
# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/
# pdfcomment
*.upa
*.upb
# pythontex
*.pytxcode
pythontex-files-*/
# tcolorbox
*.listing
# thmtools
*.loe
# TikZ & PGF
*.dpth
*.md5
*.auxlock
# titletoc
*.ptc
# todonotes
*.tdo
# vhistory
*.hst
*.ver
*.lod
# xcolor
*.xcp
# xmpincl
*.xmpi
# xindy
*.xdy
# xypic precompiled matrices and outlines
*.xyc
*.xyd
# endfloat
*.ttt
*.fff
# Latexian
TSWLatexianTemp*
## Editors:
# WinEdt
*.bak
*.sav
# Texpad
.texpadtmp
# LyX
*.lyx~
# Kile
*.backup
# gummi
.*.swp
# KBibTeX
*~[0-9]*
# TeXnicCenter
*.tps
# auto folder when using emacs and auctex
./auto/*
*.el
# expex forward references with \gathertags
*-tags.tex
# standalone packages
*.sta
# Makeindex log files
*.lpz
# xwatermark package
*.xwm
# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are then stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib

88
README.md Normal file
@@ -0,0 +1,88 @@
# Assignment 2: If statements
**Group 2: Baris Aksakal, Edoardo Riggio, Claudio Maggioni**

## Repository Structure

- `/dataset`: code and data related to scraping repositories from GitHub;
- `/models`:
  - `/baris`: code and persisted model of the original architecture built by
    Baris. `model_0.1.ipynb` and `test_model.ipynb` are respectively an
    earlier and a later iteration of the code used to train this model;
  - `/final`: persisted model for the final architecture with training and
    test evaluation statistics;
    - `/test_outputs.csv`: CSV deliverable for the evaluation on the test
      set we extracted;
    - `/test_usi_outputs.csv`: CSV deliverable for the evaluation on the
      provided test set.
- `/test`: unit tests for the model training scripts;
- `/train`: dependencies of the main model training script;
- `/train_model.py`: main model training script;
- `/plot_acc.py`: accuracy statistics plotting script.

## Environment Setup

In order to execute both the scraping and training scripts, Python 3.10 or
greater is required. Dependencies can be installed through a virtual env by
running:

```shell
python3 -m venv .env
source .env/bin/activate
pip install -r requirements.txt
```

## Dataset Extraction

Please refer to [the README.md file in `/dataset`](dataset/README.md) for
documentation on the dataset extraction process.

## Model Training

Model training can be performed by running the script:

```shell
python3 train_model.py
```

The script is able to resume fine-tuning if the pretraining phase was completed
by a previous execution, and it is able to skip directly to model evaluation on
the two test sets if fine-tuning was already completed.

The persisted pretrained model is located in `/models/final/pretrain`. Each
epoch of the fine-tuning process is persisted at `/models/final/<N>`, where
`<N>` is the epoch number starting from 0. The epoch number selected by the
early stopping process is stored in `/models/final/best.txt`.
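
As a rough illustration of this resume behaviour, the sketch below checks for
the persisted artifacts described above (the `resume_state` helper is
hypothetical; the actual logic lives in `/train_model.py` and may differ in
detail):

```python
# Hypothetical sketch of the resume behaviour, not the actual implementation.
import os

MODELS_DIR = "models/final"


def resume_state(models_dir: str = MODELS_DIR) -> str:
    """Return the phase the training script would start from."""
    if os.path.isfile(os.path.join(models_dir, "best.txt")):
        # Early stopping already selected an epoch: skip to test set evaluation.
        return "evaluate"
    if os.path.isdir(os.path.join(models_dir, "pretrain")):
        # Pretraining artifacts exist: resume from fine-tuning.
        return "fine-tune"
    return "pretrain"


if __name__ == "__main__":
    print(resume_state())
```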

`/models/final/stats.csv` stores the training and validation loss and accuracy
statistics collected during training. `/models/final/test_outputs.csv` is the
CSV deliverable for the evaluation on the test set we extracted, while
`/models/final/test_usi_outputs.csv` is the CSV deliverable for the evaluation
on the provided test set.

The stdout of the training script can be found in
`/models/final/train_log.txt`.

### Plots

The train and validation loss and accuracy plots can be generated from
`/models/final/stats.csv` with the following command:

```shell
python3 plot_acc.py
```

The output is stored in `/models/final/training_metrics.png`.
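
For reference, a minimal sketch of such a plotting step is shown below; the
column names `train_loss`, `eval_loss` and `eval_acc` are assumptions used
only for illustration, and the actual implementation is in `plot_acc.py`:

```python
# Sketch only: column names are assumed; see plot_acc.py for the real script.
import pandas as pd
import matplotlib.pyplot as plt

stats = pd.read_csv("models/final/stats.csv")

fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 5))
ax_loss.plot(stats["train_loss"], label="training loss")    # assumed column
ax_loss.plot(stats["eval_loss"], label="validation loss")   # assumed column
ax_loss.set_xlabel("Epoch")
ax_loss.legend()
ax_acc.plot(stats["eval_acc"], label="validation accuracy")  # assumed column
ax_acc.set_xlabel("Epoch")
ax_acc.legend()
fig.savefig("models/final/training_metrics.png")
```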

# Report

To compile the report run:

```shell
cd report
pdflatex -interaction=nonstopmode -output-directory=. main.tex
pdflatex -interaction=nonstopmode -output-directory=. main.tex
```

The report is then located in `report/main.pdf`.

78
dataset/README.md Normal file
@@ -0,0 +1,78 @@
# Dataset Download Instructions

## Project .zip Export

We scraped GitHub repositories using the download tool https://seart-ghs.si.usi.ch/ to generate the `results.csv` file
under this directory. In addition to the default constraints applied by the `seart-ghs` crawler, we used the following
criteria:

- lines of code: >=10000
- language: `Python`

We found 21269 results. We then downloaded a `.zip` archive of the main branch of each repository using the following
command. We started the download process on 2023-11-13 at 12:00.

```shell
mkdir download || true
cat results.csv | \
  awk -F, 'NR>1 { print "wget -O " $2 ".zip https://github.com/" $2 "/archive/refs/heads/" $6 ".zip" }' | \
  sed 's#\/#-#;s#\"##g' > download/to_download.sh
cd download
bash to_download.sh
```
### Manually Excluded Repos
We manually excluded the following repositories from our scraped dataset ("404" means that the repository was
inaccessible and could not be downloaded):
- `thorn-lab/coronavirus_structural_task_force` (too large, more than 6GiB)
- `feeicn/security-ppt` (too large, more than 9GiB)
- `salesforce/ai-economist` (404)
- `agiliumtrade/ai-metaapi-python-sdk` (404)
- `pokemonchw/dieloli` (harmful content)
- `thesnowguru/pytrader-python-mt4-mt5-trading-api-connector-drag-n-drop` (DMCA takedown)
- `objectiv/objectiv-analytics` (404)
- `aws/solutions-aws-security-hub-automated-response-and-remediation` (404)
- `openunited/product-factory-backend` (404)
- `ibm-epbl/ibm-project-43602-1660718377` (404)
- `ibm-epbl/ibm-project-1392-1658386621` (404)
- `potatolondon/django-gcloud-connectors` (404)
- `fortwoone/oracle-project` (404)
- `iperov/deepxtools` (404)
- `frequenz/floss-frequenz-sdk-python` (404)
### Check Archive Health
The following script was used to check the integrity of each downloaded `.zip` file.
```shell
cd download
find . -name '*.zip' \
-exec bash -c 'echo $0 $(unzip -l "$0" 2>/dev/null 1>/dev/null && echo "1" || echo "0")' \{\} \; \
> archive_health.txt
```

## Function Extraction

The following command builds a dataset from the archives saved in the `/download` subdirectory:

```shell
python3 ./extract.py
```

Functions are extracted with the Python `ast` module, which discards comments (but not docstrings). The script generates
one Parquet archive per project in the `/functions` directory.

As the dataset was large, this script was terminated early. At termination, 70 million functions had been extracted. Due
to computing power limitations for model training, we sampled only 500,000 of the extracted functions to build the
training set. The sampling process reads the archives in `/functions` and stores the selected functions in the Parquet
file `extracted/functions.pq`. The sampling script can be invoked with the command:

```shell
python3 sample.py
```

The extraction and sampling process guarantees that the functions in the final dataset have valid syntax for Python
3.10+ and that the code of each function contains only ASCII characters.
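
To illustrate these two guarantees, a filter along the lines of the sketch below (the `is_valid_sample` helper is
illustrative only; the actual checks live in `extract.py` and `sample.py`) accepts a function only if it parses under
the running Python interpreter and contains only ASCII characters:

```python
# Sketch of the validity filter described above; the real checks are in
# extract.py (syntax, via ast.parse) and sample.py (ASCII-only source).
import ast


def is_valid_sample(source: str) -> bool:
    """Accept only ASCII-only sources that parse under the running Python (3.10+ here)."""
    if not source.isascii():
        return False
    try:
        ast.parse(source)
    except SyntaxError:
        return False
    return True


print(is_valid_sample("def f(x):\n    return x + 1\n"))   # True
print(is_valid_sample("def f(x):\n    return 'héllo'\n"))  # False: non-ASCII character
```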

90
dataset/extract.py Normal file
@@ -0,0 +1,90 @@
import ast
import os.path
import typing
import zipfile
from typing import Optional

import pandas as pd
from tqdm import tqdm
from fastparquet import write
import multiprocessing

PWD = os.path.dirname(__file__)
IN_DIR = os.path.join(PWD, "download")
OUT_DIR = os.path.join(PWD, "functions")


def read_functions(content, filename: str, zip_name: str) -> Optional[pd.DataFrame]:
    # Parse one Python source file and collect a record per function definition.
    records = []
    try:
        tree = ast.parse(content.decode('utf-8'), filename=filename)
        for node in ast.walk(tree):
            if isinstance(node, ast.FunctionDef):
                f_source: str = ast.unparse(typing.cast(typing.Any, node))
                records.append({
                    "zip_filename": zip_name,
                    "py_filename": filename,
                    "source": f_source,
                    "success": True,
                    "error": None,
                })
    except Exception as e:
        print(f"project '{zip_name}': error parsing '{filename}': {e}")
        records.append({
            "zip_filename": zip_name,
            "py_filename": filename,
            "source": "",
            "success": False,
            "error": str(e)
        })
    return pd.DataFrame.from_records(records)


def read_zip_file(zip_file: str):
    # Extract functions from every .py file in one repository archive and
    # persist them as a per-project Parquet file.
    out_path = os.path.join(OUT_DIR, os.path.basename(zip_file) + ".pq")
    df = pd.DataFrame(columns=["zip_filename", "py_filename", "source"])
    try:
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            info_list = [info for info in zip_ref.infolist() if info.filename.endswith('.py')]
            for info in tqdm(info_list, desc=os.path.basename(zip_file), ncols=0, position=None, leave=True):
                content = zip_ref.read(info.filename)
                df_file = read_functions(content, info.filename, zip_file)
                if df_file is not None:
                    df = pd.concat([df, df_file], ignore_index=True)
        write(out_path, df, compression='GZIP')
        return zip_file
    except Exception as e:
        print(e)


def read_clones(zip_dir: str):
    # Collect the archives that have not been processed yet and extract them in parallel.
    zip_files = []
    for a_file in tqdm(os.listdir(zip_dir), desc="Scan dir"):
        path = os.path.join(zip_dir, a_file)
        out_path = os.path.join(OUT_DIR, os.path.basename(path) + ".pq")
        if zipfile.is_zipfile(path) and not os.path.isfile(out_path):
            zip_files.append(path)

    num_processes = 192
    with multiprocessing.Manager():
        with multiprocessing.Pool(processes=num_processes) as pool:
            for _ in tqdm(pool.imap_unordered(read_zip_file, zip_files), desc="Read ZIPs",
                          unit="item", total=len(zip_files), position=None, leave=True):
                pass  # dummy iteration to consume multiprocessing iterator, needed to launch processes


def main():
    if not os.path.isdir(OUT_DIR):
        os.makedirs(OUT_DIR)
    read_clones(IN_DIR)


if __name__ == "__main__":
    main()

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:923ad850a4eed1d026b26fedbd5ecd64cf2e4e0f4648108d4732ac0e8fe70eb8
size 72966215

File diff suppressed because it is too large

21270
dataset/results.csv Normal file

File diff suppressed because one or more lines are too long

68
dataset/sample.py Normal file
@@ -0,0 +1,68 @@
import os
import random

import pandas as pd
from fastparquet import write
from tqdm import tqdm

PWD = os.path.dirname(__file__)
IN_DIR = os.path.join(PWD, "functions")
OUT_FILE = os.path.join(PWD, "extracted", "functions.pq")
OUT_SIZE = 500_000


def main():
    out_dir = os.path.dirname(OUT_FILE)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    files = [f for f in os.listdir(IN_DIR) if f.endswith('.pq')]
    chosen: set[tuple[str, int]] = set()
    df = None

    with tqdm(desc="Sampling", total=OUT_SIZE) as p:
        while df is None or len(df.index) < OUT_SIZE:
            filename = random.choice(files)
            path = os.path.join(IN_DIR, filename)
            df_file = pd.read_parquet(path, engine='fastparquet')
            df_len = len(df_file.index)
            if df_len == 0:
                continue
            for _ in range(min(1000, df_len)):
                index = random.randrange(0, df_len)
                error_message = df_file.iloc[index]["error"]
                if error_message is not None and len(error_message) > 0:
                    continue
                source = df_file.iloc[index]["source"]
                if not source.isascii():
                    continue
                choice = (filename, index)
                if choice not in chosen:
                    chosen.add(choice)
                    if df is None:
                        df = df_file
                    else:
                        df = pd.concat([df, df_file.iloc[[index], :]], axis=0, ignore_index=True)
                    if len(df.index) % 1000 == 0:
                        write(OUT_FILE, df, compression='GZIP')
                    p.update(1)

    # Fix success column, which is computed wrong in a previous iteration of extract.py
    df["success"] = df["error"].apply(lambda e: e is None or len(e) == 0)
    write(OUT_FILE, df, compression='GZIP')


if __name__ == "__main__":
    main()

111
environment.yml Normal file
@@ -0,0 +1,111 @@
name: SA
channels:
- defaults
dependencies:
- abseil-cpp=20230802.0
- aiohttp=3.8.5
- aiosignal=1.2.0
- arrow-cpp=11.0.0
- async-timeout=4.0.2
- attrs=23.1.0
- aws-c-common=0.6.8
- aws-c-event-stream=0.1.6
- aws-checksums=0.1.11
- aws-sdk-cpp=1.8.185
- blas=1.0
- boost-cpp=1.82.0
- bottleneck=1.3.5
- brotli=1.0.9
- brotli-bin=1.0.9
- brotli-python=1.0.9
- bzip2=1.0.8
- c-ares=1.19.1
- ca-certificates=2023.08.22
- certifi=2023.11.17
- cffi=1.16.0
- charset-normalizer=2.0.4
- cramjam=2.6.2
- cryptography=41.0.3
- datasets=2.12.0
- dill=0.3.6
- fastparquet=2023.8.0
- filelock=3.13.1
- frozenlist=1.4.0
- fsspec=2023.9.2
- gflags=2.2.2
- glog=0.5.0
- grpc-cpp=1.48.2
- gtest=1.14.0
- huggingface_hub=0.17.3
- icu=73.1
- idna=3.4
- importlib-metadata=6.0.0
- krb5=1.20.1
- libboost=1.82.0
- libbrotlicommon=1.0.9
- libbrotlidec=1.0.9
- libbrotlienc=1.0.9
- libcurl=8.4.0
- libcxx=14.0.6
- libedit=3.1.20221030
- libev=4.33
- libevent=2.1.12
- libffi=3.4.4
- libgfortran=5.0.0
- libgfortran5=11.3.0
- libiconv=1.16
- libnghttp2=1.57.0
- libopenblas=0.3.21
- libprotobuf=3.20.3
- libssh2=1.10.0
- libthrift=0.15.0
- llvm-openmp=14.0.6
- lz4-c=1.9.4
- multidict=6.0.2
- multiprocess=0.70.14
- ncurses=6.4
- numexpr=2.8.7
- numpy=1.26.0
- numpy-base=1.26.0
- openssl=3.0.12
- orc=1.7.4
- packaging=23.1
- pandas=2.1.1
- pip=23.3.1
- pyarrow=11.0.0
- pycparser=2.21
- pyopenssl=23.2.0
- pysocks=1.7.1
- python=3.11.5
- python-dateutil=2.8.2
- python-tzdata=2023.3
- python-xxhash=2.0.2
- pytz=2023.3.post1
- pyyaml=6.0.1
- re2=2022.04.01
- readline=8.2
- regex=2023.10.3
- requests=2.31.0
- responses=0.13.3
- safetensors=0.4.0
- setuptools=68.0.0
- six=1.16.0
- snappy=1.1.9
- sqlite=3.41.2
- tk=8.6.12
- tokenizers=0.13.2
- tqdm=4.65.0
- transformers=4.32.1
- typing-extensions=4.7.1
- typing_extensions=4.7.1
- tzdata=2023c
- urllib3=1.26.18
- utf8proc=2.6.1
- wheel=0.41.2
- xxhash=0.8.0
- xz=5.4.2
- yaml=0.2.5
- yarl=1.8.1
- zipp=3.11.0
- zlib=1.2.13
- zstd=1.5.5

0
models/.gitkeep Normal file

@@ -0,0 +1,69 @@
{
"_name_or_path": "Salesforce/codet5-small",
"architectures": [
"T5ForConditionalGeneration"
],
"bos_token_id": 1,
"classifier_dropout": 0.0,
"d_ff": 2048,
"d_kv": 64,
"d_model": 512,
"decoder_start_token_id": 0,
"dense_act_fn": "relu",
"dropout_rate": 0.1,
"eos_token_id": 2,
"feed_forward_proj": "relu",
"gradient_checkpointing": false,
"id2label": {
"0": "LABEL_0"
},
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": false,
"label2id": {
"LABEL_0": 0
},
"layer_norm_epsilon": 1e-06,
"model_type": "t5",
"n_positions": 512,
"num_decoder_layers": 6,
"num_heads": 8,
"num_layers": 6,
"output_past": true,
"pad_token_id": 0,
"relative_attention_max_distance": 128,
"relative_attention_num_buckets": 32,
"task_specific_params": {
"summarization": {
"early_stopping": true,
"length_penalty": 2.0,
"max_length": 200,
"min_length": 30,
"no_repeat_ngram_size": 3,
"num_beams": 4,
"prefix": "summarize: "
},
"translation_en_to_de": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to German: "
},
"translation_en_to_fr": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to French: "
},
"translation_en_to_ro": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to Romanian: "
}
},
"torch_dtype": "float32",
"transformers_version": "4.34.0",
"use_cache": true,
"vocab_size": 32100
}

@@ -0,0 +1,8 @@
{
"_from_model_config": true,
"bos_token_id": 1,
"decoder_start_token_id": 0,
"eos_token_id": 2,
"pad_token_id": 0,
"transformers_version": "4.34.0"
}

3250
models/baris/model_0.1.ipynb Normal file

File diff suppressed because it is too large

@@ -0,0 +1,449 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "5f7ad96a-6b01-4b63-93b6-4008597a0e9e",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using device: cuda\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating: 100%|███████████████████████████| 1092/1092 [04:02<00:00, 4.50it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Test Accuracy: 0.3642\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import re\n",
"import random\n",
"import torch\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from transformers import RobertaTokenizer, T5ForConditionalGeneration, DataCollatorForLanguageModeling\n",
"from sklearn.model_selection import train_test_split\n",
"from tqdm import tqdm\n",
"import os\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"# Dataset class for pre-training\n",
"class PythonCodeDataset(Dataset):\n",
" def __init__(self, tokenizer, dataframe, max_len=512):\n",
" self.tokenizer = tokenizer\n",
" self.data = dataframe\n",
" self.max_len = max_len\n",
"\n",
" def __len__(self):\n",
" return len(self.data)\n",
"\n",
" def __getitem__(self, index):\n",
" code = self.data.iloc[index]['source']\n",
" inputs = self.tokenizer.encode_plus(code, None, add_special_tokens=True, max_length=self.max_len, padding='max_length', truncation=True)\n",
" return {'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long), 'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long)}\n",
"\n",
"# Function to mask if conditions\n",
"def mask_if_condition(code_snippet):\n",
" if_conditions = re.findall(r'(if\\s+.*?:)', code_snippet)\n",
" masked_snippet = code_snippet.replace(if_conditions[0], '<mask>', 1) if if_conditions else code_snippet\n",
" return masked_snippet, if_conditions[0] if if_conditions else None\n",
"\n",
"# Fine-tuning and evaluation dataset classes\n",
"class MaskedIfDataset(PythonCodeDataset):\n",
" def __getitem__(self, index):\n",
" masked_code = self.data.iloc[index]['masked_code']\n",
" ground_truth = self.data.iloc[index]['ground_truth']\n",
" inputs = self.tokenizer(masked_code, max_length=self.max_len, padding='max_length', truncation=True, return_tensors=\"pt\")\n",
" labels = self.tokenizer(ground_truth, max_length=self.max_len, padding='max_length', truncation=True, return_tensors=\"pt\").input_ids\n",
" labels[labels == self.tokenizer.pad_token_id] = -100\n",
" return {'input_ids': inputs.input_ids.squeeze(), 'attention_mask': inputs.attention_mask.squeeze(), 'labels': labels.squeeze()}\n",
"\n",
"# Define the pre-training loop\n",
"def pretrain(model, dataloader, epochs, print_every=10):\n",
" model.train()\n",
" optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)\n",
" global_step = 0 # Initialize a counter for the global training step\n",
"\n",
" for epoch in range(epochs):\n",
" for batch in dataloader:\n",
" batch = {k: v.to(device) for k, v in batch.items()}\n",
" optimizer.zero_grad()\n",
" inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']}\n",
" outputs = model(**inputs, labels=batch['input_ids'])\n",
" loss = outputs.loss\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" if global_step % print_every == 0: # Print every steps\n",
" print(f\"Step {global_step}, Loss: {loss.item()}\")\n",
"\n",
" global_step += 1 # Increment the step counter\n",
"\n",
" print(f\"Epoch {epoch+1}/{epochs} completed.\")\n",
" \n",
"\n",
"def fine_tune_with_eval(model, train_loader, eval_loader, epochs, save_path, print_every=10, early_stopping_patience=3):\n",
" optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)\n",
" best_epoch = 0\n",
" best_eval_accuracy = 0\n",
" patience_counter = 0\n",
" train_losses, eval_accuracies = [], []\n",
"\n",
" for epoch in range(epochs):\n",
" model.train()\n",
" total_loss = 0\n",
"\n",
" # Training loop with tqdm for progress tracking\n",
" for batch in tqdm(train_loader, desc=f\"Training Epoch {epoch+1}/{epochs}\"):\n",
" batch = {k: v.to(device) for k, v in batch.items()}\n",
" optimizer.zero_grad()\n",
" inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask'], 'labels': batch['labels']}\n",
" outputs = model(**inputs)\n",
" loss = outputs.loss\n",
" total_loss += loss.item()\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" average_loss = total_loss / len(train_loader)\n",
" train_losses.append(average_loss)\n",
"\n",
" # Evaluate on the evaluation set\n",
" eval_accuracy = evaluate_accuracy(model, eval_loader, tokenizer, device)\n",
" eval_accuracies.append(eval_accuracy)\n",
" print(f\"Epoch {epoch+1}/{epochs}, Train Loss: {average_loss:.4f}, Eval Accuracy: {eval_accuracy:.4f}\")\n",
"\n",
" # Early stopping and checkpointing\n",
" if eval_accuracy > best_eval_accuracy:\n",
" best_eval_accuracy = eval_accuracy\n",
" best_epoch = epoch\n",
" patience_counter = 0\n",
" else:\n",
" patience_counter += 1\n",
" if patience_counter >= early_stopping_patience:\n",
" print(\"Early stopping triggered.\")\n",
" break\n",
" \n",
" save_directory = f\"{save_path}/{epoch}\"\n",
" model.save_pretrained(save_directory)\n",
" \n",
" # Plotting the training loss and evaluation accuracy\n",
" plt.figure(figsize=(12, 5))\n",
" plt.subplot(1, 2, 1)\n",
" plt.plot(train_losses, label='Training Loss')\n",
" plt.title('Training Loss')\n",
" plt.xlabel('Epoch')\n",
" plt.ylabel('Loss')\n",
" plt.legend()\n",
"\n",
" plt.subplot(1, 2, 2)\n",
" plt.plot(eval_accuracies, label='Evaluation Accuracy')\n",
" plt.title('Evaluation Accuracy')\n",
" plt.xlabel('Epoch')\n",
" plt.ylabel('Accuracy')\n",
" plt.legend()\n",
"\n",
" plt.savefig(f\"{save_path}/training_metrics.png\")\n",
" \n",
" return best_epoch\n",
"\n",
"\n",
"def evaluate_accuracy(model, dataloader, tokenizer, device):\n",
" model.eval()\n",
" correct_predictions, total_predictions = 0, 0\n",
"\n",
" for batch in tqdm(dataloader, desc=\"Evaluating\"):\n",
" batch = {k: v.to(device) for k, v in batch.items()}\n",
" with torch.no_grad():\n",
" outputs = model.generate(batch['input_ids'], attention_mask=batch['attention_mask'], max_length=512)\n",
" decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]\n",
"\n",
" # Decode labels with added check for None values\n",
" decoded_labels = []\n",
" for label in batch['labels']:\n",
" label_trimmed = [l for l in label.tolist() if l != tokenizer.pad_token_id and l != -100]\n",
" if label_trimmed:\n",
" decoded_label = tokenizer.decode(label_trimmed, skip_special_tokens=True)\n",
" decoded_labels.append(decoded_label)\n",
" else:\n",
" decoded_labels.append(None) # Append None for invalid/empty labels\n",
"\n",
" # Calculate accuracy\n",
" for output, label in zip(decoded_outputs, decoded_labels):\n",
" if label is not None and output.strip() == label.strip():\n",
" correct_predictions += 1\n",
" if label is not None:\n",
" total_predictions += 1\n",
"\n",
" return correct_predictions / total_predictions if total_predictions > 0 else 0\n",
" \n",
" \n",
"# Read the dataset\n",
"df = pd.read_parquet('../if-statements/dataset/extracted/functions.pq')\n",
"#df = df.head(50)\n",
"\n",
"# Split the dataset into pre-training, fine-tuning, evaluation, and test sets\n",
"pretrain_df, fine_tune_df = train_test_split(df, test_size=0.5, random_state=42)\n",
"eval_df = fine_tune_df.sample(frac=0.1, random_state=42)\n",
"test_df = fine_tune_df.drop(eval_df.index).sample(frac=0.1111, random_state=42)\n",
"fine_tune_df = fine_tune_df.drop(eval_df.index).drop(test_df.index)\n",
"\n",
"assert len(set(eval_df.index).intersection(set(test_df.index))) == 0\n",
"\n",
"\n",
"# Initialize tokenizer and model\n",
"tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')\n",
"model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"print(f'Using device: {device}')\n",
"model.to(device)\n",
" \n",
"# Instantiate the dataset for pre-training\n",
"pretrain_dataset = PythonCodeDataset(tokenizer, pretrain_df)\n",
"\n",
"# Set up the data collator for MLM\n",
"data_collator = DataCollatorForLanguageModeling(\n",
" tokenizer=tokenizer,\n",
" mlm=True,\n",
" mlm_probability=0.15\n",
")\n",
"\n",
"# Create a DataLoader for pre-training\n",
"pretrain_loader = DataLoader(pretrain_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)\n",
"\n",
"# Pre-train the model\n",
"#pretrain(model, pretrain_loader, epochs=1)\n",
"\n",
"\n",
"# Prepare data for fine-tuning and evaluation\n",
"fine_tune_df['masked_code'], fine_tune_df['ground_truth'] = zip(*fine_tune_df['source'].apply(mask_if_condition))\n",
"eval_df['masked_code'], eval_df['ground_truth'] = zip(*eval_df['source'].apply(mask_if_condition))\n",
"fine_tune_df.dropna(subset=['ground_truth'], inplace=True)\n",
"eval_df.dropna(subset=['ground_truth'], inplace=True)\n",
"\n",
"\n",
"fine_tune_dataset = MaskedIfDataset(tokenizer, fine_tune_df)\n",
"eval_dataset = MaskedIfDataset(tokenizer, eval_df)\n",
"\n",
"\n",
"# Dataloaders for fine-tuning and evaluation\n",
"fine_tune_loader = DataLoader(fine_tune_dataset, batch_size=8, shuffle=True)\n",
"eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=False)\n",
"\n",
"\n",
"# Instantiate the datasets for fine-tuning and evaluation\n",
"fine_tune_dataset = MaskedIfDataset(tokenizer, fine_tune_df)\n",
"eval_dataset = MaskedIfDataset(tokenizer, eval_df)\n",
"\n",
"\n",
"best_epoch = 4\n",
"\n",
"# Example of calling the modified function\n",
"save_path = '../if-statements/dataset/extracted/final'\n",
"#best_epoch = fine_tune_with_eval(model, fine_tune_loader, eval_loader, epochs=5, save_path=save_path)\n",
"\n",
"# Define the directory of the best model\n",
"best_model_directory = os.path.join(save_path, str(best_epoch))\n",
"\n",
"# Load the best model and its config\n",
"best_model = T5ForConditionalGeneration.from_pretrained(best_model_directory)\n",
"\n",
"# Optionally, load the model's config\n",
"model_config = best_model.config # This will load the config file associated with the model\n",
"\n",
"best_model.to(device)\n",
"\n",
"# Prepare and evaluate on the test set\n",
"test_df['masked_code'], test_df['ground_truth'] = zip(*test_df['source'].apply(mask_if_condition))\n",
"test_df.dropna(subset=['ground_truth'], inplace=True)\n",
"test_dataset = MaskedIfDataset(tokenizer, test_df)\n",
"test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)\n",
"\n",
"# Evaluate the model on the test set\n",
"test_accuracy = evaluate_accuracy(best_model, test_loader, tokenizer, device)\n",
"print(f\"Test Accuracy: {test_accuracy:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "3eb56941-cd5b-405b-ae37-f15d97a2b22e",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Unnamed: 0 original_method \\\n",
"0 5126 def stream_edit(request, stream_id, response_f... \n",
"1 10859 def _read_and_parse_includes(self):\\n # Map... \n",
"2 10615 def _get_list_key(self, spaces, lines):\\n k... \n",
"3 17853 def search_host(self, search_string):\\n res... \n",
"4 3922 def pop(self, key: Union[str, Enum], default: ... \n",
"\n",
" target_block \n",
"0 if \"cancel\" not in request . POST : \n",
"1 if isinstance ( node , ast . Include ) : \n",
"2 if len ( line . strip ( ) ) == 0 : \n",
"3 if isinstance ( value , int ) : \n",