This repository has been archived on 2024-10-22. You can view files and clone it, but cannot push or open issues or pull requests.
soft-analytics-02/models/baris/test_model.ipynb
Claudio Maggioni a4ceee8716 Final version of the project
History has been rewritten to delete large files in repo
2024-01-03 15:28:43 +01:00

449 lines
18 KiB
Text

{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "5f7ad96a-6b01-4b63-93b6-4008597a0e9e",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using device: cuda\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating: 100%|███████████████████████████| 1092/1092 [04:02<00:00, 4.50it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Test Accuracy: 0.3642\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import re\n",
"import random\n",
"import torch\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from transformers import RobertaTokenizer, T5ForConditionalGeneration, DataCollatorForLanguageModeling\n",
"from sklearn.model_selection import train_test_split\n",
"from tqdm import tqdm\n",
"import os\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"# Dataset class for pre-training\n",
"class PythonCodeDataset(Dataset):\n",
"    \"\"\"Map-style dataset that tokenizes the 'source' column of a dataframe.\n",
"\n",
"    Each item is padded or truncated to ``max_len`` tokens and returned as a\n",
"    dict with 'input_ids' and 'attention_mask' tensors of dtype ``torch.long``.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, tokenizer, dataframe, max_len=512):\n",
"        self.tokenizer, self.data, self.max_len = tokenizer, dataframe, max_len\n",
"\n",
"    def __len__(self):\n",
"        # One item per dataframe row.\n",
"        return len(self.data)\n",
"\n",
"    def __getitem__(self, index):\n",
"        row = self.data.iloc[index]\n",
"        encoded = self.tokenizer.encode_plus(\n",
"            row['source'],\n",
"            None,\n",
"            add_special_tokens=True,\n",
"            max_length=self.max_len,\n",
"            padding='max_length',\n",
"            truncation=True,\n",
"        )\n",
"        return {\n",
"            field: torch.tensor(encoded[field], dtype=torch.long)\n",
"            for field in ('input_ids', 'attention_mask')\n",
"        }\n",
"\n",
"# Function to mask if conditions\n",
"def mask_if_condition(code_snippet):\n",
"    \"\"\"Mask the first ``if ...:`` header found in ``code_snippet``.\n",
"\n",
"    Args:\n",
"        code_snippet: source code as a single string.\n",
"\n",
"    Returns:\n",
"        A ``(masked_snippet, condition)`` tuple. ``condition`` is the matched\n",
"        text (from ``if`` up to the first colon on that line) and\n",
"        ``masked_snippet`` is the input with exactly that occurrence replaced\n",
"        by ``<mask>``. When nothing matches, the snippet is returned unchanged\n",
"        together with ``None``.\n",
"\n",
"    Limitations: the match is purely textual. It ends at the first colon on\n",
"    the line, so a condition that itself contains ':' is cut short, and an\n",
"    ``if`` inside a comment or string literal is matched like any other.\n",
"    \"\"\"\n",
"    # The leading \\b stops the pattern from firing inside identifiers or\n",
"    # keywords that merely end in \"if\" (e.g. \"for motif in motifs:\", \"elif x:\").\n",
"    match = re.search(r'\\bif\\s+.*?:', code_snippet)\n",
"    if match is None:\n",
"        return code_snippet, None\n",
"\n",
"    # Splice by position so the matched occurrence itself is masked, not an\n",
"    # earlier piece of text that happens to look the same.\n",
"    start, end = match.span()\n",
"    masked_snippet = code_snippet[:start] + '<mask>' + code_snippet[end:]\n",
"    return masked_snippet, match.group(0)\n",
"\n",
"# Fine-tuning and evaluation dataset classes\n",
"class MaskedIfDataset(PythonCodeDataset):\n",
"    \"\"\"Dataset pairing masked code with the condition that was masked out.\n",
"\n",
"    Reads the 'masked_code' and 'ground_truth' columns of the wrapped\n",
"    dataframe. Both are tokenized to ``max_len`` tokens; pad positions in the\n",
"    labels are overwritten with -100.\n",
"    \"\"\"\n",
"\n",
"    def __getitem__(self, index):\n",
"        row = self.data.iloc[index]\n",
"        encode_options = dict(max_length=self.max_len, padding='max_length', truncation=True, return_tensors=\"pt\")\n",
"\n",
"        encoded_input = self.tokenizer(row['masked_code'], **encode_options)\n",
"        target_ids = self.tokenizer(row['ground_truth'], **encode_options).input_ids\n",
"        # Presumably -100 is used so the loss skips padded target positions -- TODO confirm.\n",
"        target_ids[target_ids == self.tokenizer.pad_token_id] = -100\n",
"\n",
"        # squeeze() removes the batch axis added by return_tensors=\"pt\".\n",
"        return {\n",
"            'input_ids': encoded_input.input_ids.squeeze(),\n",
"            'attention_mask': encoded_input.attention_mask.squeeze(),\n",
"            'labels': target_ids.squeeze(),\n",
"        }\n",
"\n",
"# Define the pre-training loop\n",
"def pretrain(model, dataloader, epochs, print_every=10):\n",
"    \"\"\"Run a plain pre-training loop over ``dataloader``.\n",
"\n",
"    Args:\n",
"        model: module called with ``input_ids``, ``attention_mask`` and\n",
"            ``labels``; its output must expose a ``loss`` attribute.\n",
"        dataloader: iterable of dict batches of tensors holding 'input_ids' and\n",
"            'attention_mask', and optionally 'labels'.\n",
"        epochs: number of passes over ``dataloader``.\n",
"        print_every: the loss is printed every ``print_every`` optimizer steps.\n",
"\n",
"    Batches are moved to the module-level ``device``. Nothing is returned; the\n",
"    model is updated in place.\n",
"    \"\"\"\n",
"    model.train()\n",
"    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)\n",
"    global_step = 0  # optimizer steps taken so far, across epochs\n",
"\n",
"    for epoch in range(epochs):\n",
"        for batch in dataloader:\n",
"            batch = {k: v.to(device) for k, v in batch.items()}\n",
"            optimizer.zero_grad()\n",
"\n",
"            # Use the targets prepared by the collate function when present.\n",
"            # Feeding the (possibly masked) input_ids back as labels would train\n",
"            # the model to reproduce the corrupted input. Batches without a\n",
"            # 'labels' entry keep the previous behaviour.\n",
"            labels = batch.get('labels', batch['input_ids'])\n",
"            outputs = model(\n",
"                input_ids=batch['input_ids'],\n",
"                attention_mask=batch['attention_mask'],\n",
"                labels=labels,\n",
"            )\n",
"            loss = outputs.loss\n",
"            loss.backward()\n",
"            optimizer.step()\n",
"\n",
"            if global_step % print_every == 0:\n",
"                print(f\"Step {global_step}, Loss: {loss.item()}\")\n",
"\n",
"            global_step += 1\n",
"\n",
"        print(f\"Epoch {epoch+1}/{epochs} completed.\")\n",
" \n",
"\n",
"def fine_tune_with_eval(model, train_loader, eval_loader, epochs, save_path, print_every=10, early_stopping_patience=3):\n",
"    \"\"\"Fine-tune ``model`` and score it on ``eval_loader`` after every epoch.\n",
"\n",
"    Args:\n",
"        model: model to train; must provide ``save_pretrained``.\n",
"        train_loader: batches with 'input_ids', 'attention_mask' and 'labels'.\n",
"        eval_loader: batches handed to ``evaluate_accuracy``.\n",
"        epochs: maximum number of epochs.\n",
"        save_path: directory receiving one checkpoint sub-directory per epoch\n",
"            and the ``training_metrics.png`` figure.\n",
"        print_every: accepted for interface compatibility; not used here.\n",
"        early_stopping_patience: number of consecutive epochs without a new\n",
"            best accuracy after which training stops.\n",
"\n",
"    Returns:\n",
"        Zero-based index of the epoch with the best evaluation accuracy.\n",
"\n",
"    Relies on the module-level ``tokenizer`` and ``device``.\n",
"    \"\"\"\n",
"    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)\n",
"    history = {'loss': [], 'accuracy': []}\n",
"    best_epoch, best_eval_accuracy = 0, 0\n",
"    stale_epochs = 0\n",
"\n",
"    for epoch in range(epochs):\n",
"        model.train()\n",
"        loss_sum = 0\n",
"\n",
"        for batch in tqdm(train_loader, desc=f\"Training Epoch {epoch+1}/{epochs}\"):\n",
"            batch = {name: tensor.to(device) for name, tensor in batch.items()}\n",
"            optimizer.zero_grad()\n",
"            outputs = model(\n",
"                input_ids=batch['input_ids'],\n",
"                attention_mask=batch['attention_mask'],\n",
"                labels=batch['labels'],\n",
"            )\n",
"            loss = outputs.loss\n",
"            loss_sum += loss.item()\n",
"            loss.backward()\n",
"            optimizer.step()\n",
"\n",
"        average_loss = loss_sum / len(train_loader)\n",
"        history['loss'].append(average_loss)\n",
"\n",
"        eval_accuracy = evaluate_accuracy(model, eval_loader, tokenizer, device)\n",
"        history['accuracy'].append(eval_accuracy)\n",
"        print(f\"Epoch {epoch+1}/{epochs}, Train Loss: {average_loss:.4f}, Eval Accuracy: {eval_accuracy:.4f}\")\n",
"\n",
"        # Track the best epoch; give up after too many epochs without progress.\n",
"        if eval_accuracy > best_eval_accuracy:\n",
"            best_eval_accuracy, best_epoch, stale_epochs = eval_accuracy, epoch, 0\n",
"        else:\n",
"            stale_epochs += 1\n",
"            if stale_epochs >= early_stopping_patience:\n",
"                print(\"Early stopping triggered.\")\n",
"                break\n",
"\n",
"        # One checkpoint per completed epoch, named after the epoch index.\n",
"        model.save_pretrained(f\"{save_path}/{epoch}\")\n",
"\n",
"    # Side-by-side curves: training loss (left), evaluation accuracy (right).\n",
"    panels = (\n",
"        (history['loss'], 'Training Loss', 'Loss'),\n",
"        (history['accuracy'], 'Evaluation Accuracy', 'Accuracy'),\n",
"    )\n",
"    plt.figure(figsize=(12, 5))\n",
"    for position, (series, title, y_label) in enumerate(panels, start=1):\n",
"        plt.subplot(1, 2, position)\n",
"        plt.plot(series, label=title)\n",
"        plt.title(title)\n",
"        plt.xlabel('Epoch')\n",
"        plt.ylabel(y_label)\n",
"        plt.legend()\n",
"\n",
"    plt.savefig(f\"{save_path}/training_metrics.png\")\n",
"\n",
"    return best_epoch\n",
"\n",
"\n",
"def evaluate_accuracy(model, dataloader, tokenizer, device):\n",
"    \"\"\"Exact-match accuracy of generated text against the decoded labels.\n",
"\n",
"    Args:\n",
"        model: model exposing ``eval()`` and ``generate()``.\n",
"        dataloader: batches with 'input_ids', 'attention_mask' and 'labels'.\n",
"        tokenizer: supplies ``decode`` and ``pad_token_id``.\n",
"        device: device every batch tensor is moved to.\n",
"\n",
"    Returns:\n",
"        Fraction of examples whose stripped prediction equals the stripped\n",
"        label text. Examples whose label is empty once pad ids and -100 are\n",
"        removed are not counted; 0 is returned when nothing was counted.\n",
"    \"\"\"\n",
"    model.eval()\n",
"    hits = scored = 0\n",
"    ignored_ids = (tokenizer.pad_token_id, -100)\n",
"\n",
"    for batch in tqdm(dataloader, desc=\"Evaluating\"):\n",
"        batch = {name: tensor.to(device) for name, tensor in batch.items()}\n",
"        with torch.no_grad():\n",
"            generated = model.generate(batch['input_ids'], attention_mask=batch['attention_mask'], max_length=512)\n",
"        predictions = [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in generated]\n",
"\n",
"        for prediction, label_row in zip(predictions, batch['labels']):\n",
"            kept_ids = [token_id for token_id in label_row.tolist() if token_id not in ignored_ids]\n",
"            if not kept_ids:\n",
"                # Nothing left to compare against: leave this example out.\n",
"                continue\n",
"            scored += 1\n",
"            reference = tokenizer.decode(kept_ids, skip_special_tokens=True)\n",
"            if prediction.strip() == reference.strip():\n",
"                hits += 1\n",
"\n",
"    return hits / scored if scored > 0 else 0\n",
" \n",
" \n",
"# Read the dataset\n",
"df = pd.read_parquet('../if-statements/dataset/extracted/functions.pq')\n",
"#df = df.head(50)\n",
"\n",
"# Split the dataset into pre-training, fine-tuning, evaluation, and test sets.\n",
"# Half of the rows go to pre-training. From the other half, 10% is sampled for\n",
"# evaluation, then 11.11% of the remaining rows (about 10% of that half) for\n",
"# testing; whatever is left is used for fine-tuning. All splits are seeded.\n",
"pretrain_df, fine_tune_df = train_test_split(df, test_size=0.5, random_state=42)\n",
"eval_df = fine_tune_df.sample(frac=0.1, random_state=42)\n",
"test_df = fine_tune_df.drop(eval_df.index).sample(frac=0.1111, random_state=42)\n",
"fine_tune_df = fine_tune_df.drop(eval_df.index).drop(test_df.index)\n",
"\n",
"# Sanity check: the evaluation and test sets must not share any row.\n",
"assert len(set(eval_df.index).intersection(set(test_df.index))) == 0\n",
"\n",
"\n",
"# Initialize tokenizer and model.\n",
"# NOTE: `model` is only passed to the commented-out pretrain/fine-tune calls\n",
"# below; the accuracy printed at the end comes from `best_model`.\n",
"tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')\n",
"model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"print(f'Using device: {device}')\n",
"model.to(device)\n",
" \n",
"# Instantiate the dataset for pre-training\n",
"pretrain_dataset = PythonCodeDataset(tokenizer, pretrain_df)\n",
"\n",
"# Set up the data collator for MLM (15% masking probability)\n",
"data_collator = DataCollatorForLanguageModeling(\n",
" tokenizer=tokenizer,\n",
" mlm=True,\n",
" mlm_probability=0.15\n",
")\n",
"\n",
"# Create a DataLoader for pre-training.\n",
"# NOTE: unused while the pretrain() call below stays commented out.\n",
"pretrain_loader = DataLoader(pretrain_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)\n",
"\n",
"# Pre-train the model\n",
"#pretrain(model, pretrain_loader, epochs=1)\n",
"\n",
"\n",
"# Prepare data for fine-tuning and evaluation.\n",
"# mask_if_condition returns (masked_code, ground_truth); ground_truth is None\n",
"# when no if-condition was found, and those rows are dropped.\n",
"fine_tune_df['masked_code'], fine_tune_df['ground_truth'] = zip(*fine_tune_df['source'].apply(mask_if_condition))\n",
"eval_df['masked_code'], eval_df['ground_truth'] = zip(*eval_df['source'].apply(mask_if_condition))\n",
"fine_tune_df.dropna(subset=['ground_truth'], inplace=True)\n",
"eval_df.dropna(subset=['ground_truth'], inplace=True)\n",
"\n",
"\n",
"fine_tune_dataset = MaskedIfDataset(tokenizer, fine_tune_df)\n",
"eval_dataset = MaskedIfDataset(tokenizer, eval_df)\n",
"\n",
"\n",
"# Dataloaders for fine-tuning and evaluation\n",
"fine_tune_loader = DataLoader(fine_tune_dataset, batch_size=8, shuffle=True)\n",
"eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=False)\n",
"\n",
"\n",
"# Instantiate the datasets for fine-tuning and evaluation.\n",
"# NOTE(review): this repeats the instantiation a few lines above; the loaders\n",
"# already hold the earlier instances, so these two objects are not used by them.\n",
"fine_tune_dataset = MaskedIfDataset(tokenizer, fine_tune_df)\n",
"eval_dataset = MaskedIfDataset(tokenizer, eval_df)\n",
"\n",
"\n",
"# Hard-coded epoch index, used because the fine-tuning call that would return\n",
"# it is commented out below.\n",
"best_epoch = 4\n",
"\n",
"# Checkpoints are read from <save_path>/<epoch index>\n",
"save_path = '../if-statements/dataset/extracted/final'\n",
"#best_epoch = fine_tune_with_eval(model, fine_tune_loader, eval_loader, epochs=5, save_path=save_path)\n",
"\n",
"# Define the directory of the best model\n",
"best_model_directory = os.path.join(save_path, str(best_epoch))\n",
"\n",
"# Load the best model and its config\n",
"best_model = T5ForConditionalGeneration.from_pretrained(best_model_directory)\n",
"\n",
"# Keep a handle on the config object attached to the loaded model\n",
"model_config = best_model.config # This will load the config file associated with the model\n",
"\n",
"best_model.to(device)\n",
"\n",
"# Prepare and evaluate on the test set (same masking and filtering as above)\n",
"test_df['masked_code'], test_df['ground_truth'] = zip(*test_df['source'].apply(mask_if_condition))\n",
"test_df.dropna(subset=['ground_truth'], inplace=True)\n",
"test_dataset = MaskedIfDataset(tokenizer, test_df)\n",
"test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)\n",
"\n",
"# Evaluate the model on the test set (exact-match accuracy)\n",
"test_accuracy = evaluate_accuracy(best_model, test_loader, tokenizer, device)\n",
"print(f\"Test Accuracy: {test_accuracy:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "3eb56941-cd5b-405b-ae37-f15d97a2b22e",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Unnamed: 0 original_method \\\n",
"0 5126 def stream_edit(request, stream_id, response_f... \n",
"1 10859 def _read_and_parse_includes(self):\\n # Map... \n",
"2 10615 def _get_list_key(self, spaces, lines):\\n k... \n",
"3 17853 def search_host(self, search_string):\\n res... \n",
"4 3922 def pop(self, key: Union[str, Enum], default: ... \n",
"\n",
" target_block \n",
"0 if \"cancel\" not in request . POST : \n",
"1 if isinstance ( node , ast . Include ) : \n",
"2 if len ( line . strip ( ) ) == 0 : \n",
"3 if isinstance ( value , int ) : \n",
"4 if self . _get_flag ( \"struct\" ) : \n"
]
}
],
"source": [
"# Read the new dataset from its CSV file\n",
"new_df = pd.read_csv('../if-statements/dataset/extracted/test_set_usi.csv')\n",
"\n",
"# Discard, in place, the two columns this notebook does not use\n",
"for unused_column in (\"input_method\", \"tokens_in_method\"):\n",
"    new_df.drop(unused_column, axis=1, inplace=True)\n",
"\n",
"print(new_df.head())\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "08a9c76f-32da-4871-b0af-d5afafa50ae0",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Unnamed: 0 original_method \\\n",
"0 5126 def stream_edit(request, stream_id, response_f... \n",
"1 10859 def _read_and_parse_includes(self):\\n # Map... \n",
"2 10615 def _get_list_key(self, spaces, lines):\\n k... \n",
"3 17853 def search_host(self, search_string):\\n res... \n",
"4 3922 def pop(self, key: Union[str, Enum], default: ... \n",
"\n",
" target_block \\\n",
"0 if \"cancel\" not in request . POST : \n",
"1 if isinstance ( node , ast . Include ) : \n",
"2 if len ( line . strip ( ) ) == 0 : \n",
"3 if isinstance ( value , int ) : \n",
"4 if self . _get_flag ( \"struct\" ) : \n",
"\n",
" masked_code \\\n",
"0 def stream_edit(request, stream_id, response_f... \n",
"1 def _read_and_parse_includes(self):\\n # Map... \n",
"2 def _get_list_key(self, spaces, lines):\\n k... \n",
"3 def search_host(self, search_string):\\n res... \n",
"4 def pop(self, key: Union[str, Enum], default: ... \n",
"\n",
" ground_truth \n",
"0 if not request.user.profile.has_permission(str... \n",
"1 if isinstance(node, ast.Include): \n",
"2 if len(line.strip()) == 0: \n",
"3 if host_entry.get(\"type\") != \"entry\": \n",
"4 if self._get_flag(\"readonly\"): \n"
]
}
],
"source": [
"# Preprocessing helper for the new dataframe\n",
"def preprocess_new_df(df):\n",
"    \"\"\"Add 'masked_code' and 'ground_truth' columns to ``df`` in place.\n",
"\n",
"    Both columns come from applying ``mask_if_condition`` to the\n",
"    'original_method' column. Rows for which no if-condition was found\n",
"    (``ground_truth`` is None) are dropped. Nothing is returned.\n",
"\n",
"    NOTE(review): the masked statement is whichever one mask_if_condition\n",
"    picks, which need not be the one recorded in the 'target_block' column --\n",
"    confirm this is the intended comparison.\n",
"    \"\"\"\n",
"    masked_pairs = df['original_method'].apply(mask_if_condition)\n",
"    df['masked_code'], df['ground_truth'] = zip(*masked_pairs)\n",
"    # Remove rows that had nothing to mask\n",
"    df.dropna(subset=['ground_truth'], inplace=True)\n",
"\n",
"# Run the preprocessing on the new dataframe (mutates new_df)\n",
"preprocess_new_df(new_df)\n",
"\n",
"# Show the first rows, including the two added columns\n",
"print(new_df.head())\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "c36c9144-64b2-46dd-b597-5528ff57b10a",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating: 100%|█████████████████████████████| 624/624 [02:29<00:00, 4.17it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"New Dataset Accuracy: 0.2841\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# Wrap the preprocessed dataframe in the masked-if dataset\n",
"new_dataset = MaskedIfDataset(tokenizer, new_df)\n",
"\n",
"# Batch it without shuffling\n",
"new_loader = DataLoader(\n",
"    new_dataset,\n",
"    batch_size=8,\n",
"    shuffle=False,\n",
")\n",
"\n",
"# Score the loaded checkpoint on the new dataset\n",
"new_accuracy = evaluate_accuracy(best_model, new_loader, tokenizer, device)\n",
"print(f\"New Dataset Accuracy: {new_accuracy:.4f}\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}