76 lines
3 KiB
Python
76 lines
3 KiB
Python
import os
|
|
|
|
import pandas as pd
|
|
|
|
# Expose only the GPU with index 2 to this process.
# NOTE(review): presumably placed ahead of `import torch` on purpose so the
# device mask is already in the environment before CUDA can be initialised;
# keep this statement above the torch import.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
|
|
|
|
import torch
|
|
from transformers import T5ForConditionalGeneration
|
|
|
|
from train.evaluate import evaluate_accuracy
|
|
from train.finetune import fine_tune_with_eval
|
|
from train.dataset import build_pretrain_dataloader, build_fine_tune_dataloader
|
|
from train.pretrain import pretrain
|
|
from train.load import DataSet
|
|
|
|
# All data and model locations are resolved relative to this script's directory.
_HERE: str = os.path.dirname(__file__)
_EXTRACTED_DIR: str = os.path.join(_HERE, 'dataset', 'extracted')

# Input files handed to DataSet.load() in train().
IN_PATH: str = os.path.join(_EXTRACTED_DIR, 'functions.pq')
IN_PATH_USI: str = os.path.join(_EXTRACTED_DIR, 'test_set_usi.csv')

# Root directory for checkpoints, the best-epoch marker and the prediction CSVs.
OUT_PATH: str = os.path.join(_HERE, 'models', 'final')

# Third argument to DataSet.load(); presumably the random seed - confirm in train.load.
RANDOM_STATE: int = 42
|
|
|
|
|
|
def _load_or_pretrain(dataset, device, pretrain_dir):
    """Return the pre-trained model on *device*, reusing a saved checkpoint if one exists.

    A checkpoint is considered present when ``pretrain_dir/config.json`` is a
    file. Otherwise the ``Salesforce/codet5-small`` weights are loaded and
    passed through ``pretrain`` before being returned.
    """
    if os.path.isfile(os.path.join(pretrain_dir, "config.json")):
        # Load the pretrained model if it exists.
        model = T5ForConditionalGeneration.from_pretrained(pretrain_dir)
        model.to(device)
        return model

    # Pre-train the model.
    model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')
    model.to(device)
    pretrain_loader = build_pretrain_dataloader(dataset.pretrain_df)
    # NOTE(review): the literal 1 is presumably the number of pre-training
    # epochs, and pretrain() is presumably what populates pretrain_dir - confirm
    # in train.pretrain.
    pretrain(model, pretrain_loader, device, 1, pretrain_dir)
    return model


def _fine_tune(dataset, device, pretrain_dir, best_epoch_file):
    """Fine-tune the pre-trained model and record the best epoch in *best_epoch_file*."""
    model = _load_or_pretrain(dataset, device, pretrain_dir)

    # Dataloaders for fine-tuning and validation.
    fine_tune_loader = build_fine_tune_dataloader(dataset.fine_tune_train_df, 'train')
    eval_loader = build_fine_tune_dataloader(dataset.fine_tune_val_df, 'val')

    # NOTE(review): 20 is presumably the maximum number of fine-tuning epochs -
    # confirm in train.finetune.
    best_epoch = fine_tune_with_eval(model, device, fine_tune_loader, eval_loader, 20, OUT_PATH)

    # Make sure the marker file can be written even if nothing created OUT_PATH
    # yet; failing here would throw away the whole fine-tuning run.
    os.makedirs(OUT_PATH, exist_ok=True)
    with open(best_epoch_file, "w", encoding="utf-8") as f:
        f.write(f"{best_epoch}\n")


def _evaluate_and_export(model, loader, device, csv_name, label):
    """Evaluate *model* on *loader*, save the predictions as CSV and print the accuracy.

    The CSV is written to ``OUT_PATH/csv_name``; *label* prefixes the printed
    accuracy line.
    """
    accuracy, _, outputs = evaluate_accuracy(model, loader, device, track_predictions=True)
    pd.DataFrame.from_records(outputs).to_csv(os.path.join(OUT_PATH, csv_name))
    print(f"{label} Accuracy: {accuracy * 100:02.02f}%")


def train():
    """Run the full pipeline: (pre-train ->) fine-tune -> evaluate on both test sets.

    Work from earlier runs is reused where possible:

    * If ``OUT_PATH/best.txt`` exists, fine-tuning is skipped entirely. The
      pre-trained model is then neither loaded nor rebuilt, since only the
      fine-tuning step consumes it.
    * Otherwise the pre-trained checkpoint in ``OUT_PATH/pretrain`` is reused
      when present, or created first, and fine-tuning writes ``best.txt``.

    The checkpoint for the recorded best epoch is then loaded from
    ``OUT_PATH/<best_epoch>`` (presumably saved there by
    ``fine_tune_with_eval``) and evaluated on the regular and the USI test
    sets. Predictions go to ``test_outputs.csv`` and ``test_usi_outputs.csv``
    under ``OUT_PATH`` and both accuracies are printed.
    """
    dataset = DataSet.load(IN_PATH, IN_PATH_USI, RANDOM_STATE)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Using device: {device}')

    pretrain_dir = os.path.join(OUT_PATH, "pretrain")
    best_epoch_file = os.path.join(OUT_PATH, "best.txt")

    if not os.path.isfile(best_epoch_file):
        # The pre-trained model lives only inside this call, so it is no longer
        # referenced once the best checkpoint is loaded for evaluation below.
        _fine_tune(dataset, device, pretrain_dir, best_epoch_file)

    # Load model for best epoch.
    with open(best_epoch_file, "r", encoding="utf-8") as f:
        best_epoch = int(f.read().strip())
    best_model_directory = os.path.join(OUT_PATH, str(best_epoch))
    best_model = T5ForConditionalGeneration.from_pretrained(best_model_directory)
    best_model.to(device)

    test_loader = build_fine_tune_dataloader(dataset.fine_tune_test_df, 'test')
    test_usi_loader = build_fine_tune_dataloader(dataset.usi_test_df, 'test_usi')

    # Evaluate the model on the test set.
    _evaluate_and_export(best_model, test_loader, device, 'test_outputs.csv', 'Test')

    # Evaluate the model on the usi test set.
    _evaluate_and_export(best_model, test_usi_loader, device, 'test_usi_outputs.csv', 'USI Test')
|
|
|
|
|
|
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    train()
|