{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5f7ad96a-6b01-4b63-93b6-4008597a0e9e",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using device: cuda\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Evaluating: 100%|███████████████████████████| 1092/1092 [04:02<00:00, 4.50it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test Accuracy: 0.3642\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import re\n",
    "import random\n",
    "import torch\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "from transformers import RobertaTokenizer, T5ForConditionalGeneration, DataCollatorForLanguageModeling\n",
    "from sklearn.model_selection import train_test_split\n",
    "from tqdm import tqdm\n",
    "import os\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "\n",
    "# Run configuration: everything a reader may want to tune lives here\n",
    "SEED = 42\n",
    "MAX_ROWS = None  # e.g. 50 for a quick smoke test; None uses the full dataset\n",
    "RUN_PRETRAINING = False  # True repeats the masked-language-modelling pass\n",
    "RUN_FINE_TUNING = False  # True re-trains; False reuses a checkpoint saved by an earlier run\n",
    "PREVIOUS_BEST_EPOCH = 4  # 0-based epoch directory under save_path used when RUN_FINE_TUNING is False\n",
    "\n",
    "# Seed the RNGs so DataLoader shuffling is repeatable after a kernel restart\n",
    "random.seed(SEED)\n",
    "np.random.seed(SEED)\n",
    "torch.manual_seed(SEED)\n",
    "\n",
    "\n",
    "# Dataset class for pre-training\n",
    "class PythonCodeDataset(Dataset):\n",
    "    \"\"\"Tokenized view over the 'source' column of a dataframe.\n",
    "\n",
    "    Every item is padded/truncated to `max_len` tokens and returned as a dict with\n",
    "    'input_ids' and 'attention_mask' long tensors.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, tokenizer, dataframe, max_len=512):\n",
    "        self.tokenizer = tokenizer\n",
    "        self.data = dataframe\n",
    "        self.max_len = max_len\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.data)\n",
    "\n",
    "    def __getitem__(self, index):\n",
    "        # Positional lookup, so the dataframe index does not need to be 0..n-1\n",
    "        code = self.data.iloc[index]['source']\n",
    "        inputs = self.tokenizer.encode_plus(code, None, add_special_tokens=True, max_length=self.max_len, padding='max_length', truncation=True)\n",
    "        return {'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long), 'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long)}\n",
    "\n",
    "\n",
    "# First single-line `if ...:` header. The leading \\b keeps the pattern from firing inside\n",
    "# `elif` or inside identifiers that merely end in \"if\".\n",
    "IF_CONDITION_PATTERN = re.compile(r'\\bif\\s+.*?:')\n",
    "\n",
    "\n",
    "# Function to mask if conditions\n",
    "def mask_if_condition(code_snippet):\n",
    "    \"\"\"Cut the first `if ...:` header out of a code snippet.\n",
    "\n",
    "    Returns a `(masked_snippet, condition)` tuple. When the snippet holds no match the\n",
    "    snippet is returned unchanged and `condition` is None.\n",
    "\n",
    "    Limitations (the match is purely textual):\n",
    "    - it ends at the first ':' after `if`, so a condition containing a colon\n",
    "      (slice, dict literal, lambda) is cut short;\n",
    "    - '.' does not cross newlines, so a header spread over several lines is skipped;\n",
    "    - an `if ...:` inside a comment or string literal is matched like real code.\n",
    "    \"\"\"\n",
    "    match = IF_CONDITION_PATTERN.search(code_snippet)\n",
    "    if match is None:\n",
    "        return code_snippet, None\n",
    "    # Slice at the match position instead of str.replace: replace() would remove the first\n",
    "    # textual occurrence, which may sit inside an earlier `elif`.\n",
    "    masked_snippet = code_snippet[:match.start()] + code_snippet[match.end():]\n",
    "    return masked_snippet, match.group(0)\n",
    "\n",
    "\n",
    "def add_masked_columns(dataframe, source_column='source'):\n",
    "    \"\"\"Return a copy of `dataframe` with 'masked_code' and 'ground_truth' columns added.\n",
    "\n",
    "    Both columns come from applying mask_if_condition to `source_column`. Rows for which\n",
    "    no condition was found are dropped. The input frame itself is left untouched, and an\n",
    "    empty frame is handled without error.\n",
    "    \"\"\"\n",
    "    pairs = [mask_if_condition(code) for code in dataframe[source_column]]\n",
    "    result = dataframe.copy()\n",
    "    result['masked_code'] = [masked for masked, _ in pairs]\n",
    "    result['ground_truth'] = [condition for _, condition in pairs]\n",
    "    return result.dropna(subset=['ground_truth'])\n",
    "\n",
    "\n",
    "# Fine-tuning and evaluation dataset classes\n",
    "class MaskedIfDataset(PythonCodeDataset):\n",
    "    \"\"\"Pairs of (code with the condition removed, removed condition) for seq2seq training.\n",
    "\n",
    "    Reads the 'masked_code' and 'ground_truth' columns. Padding positions in the labels\n",
    "    are set to -100 before being returned.\n",
    "    \"\"\"\n",
    "\n",
    "    def __getitem__(self, index):\n",
    "        masked_code = self.data.iloc[index]['masked_code']\n",
    "        ground_truth = self.data.iloc[index]['ground_truth']\n",
    "        inputs = self.tokenizer(masked_code, max_length=self.max_len, padding='max_length', truncation=True, return_tensors=\"pt\")\n",
    "        labels = self.tokenizer(ground_truth, max_length=self.max_len, padding='max_length', truncation=True, return_tensors=\"pt\").input_ids\n",
    "        labels[labels == self.tokenizer.pad_token_id] = -100\n",
    "        # return_tensors=\"pt\" yields a leading batch axis of size 1; squeeze it away so the\n",
    "        # DataLoader can stack items itself\n",
    "        return {'input_ids': inputs.input_ids.squeeze(), 'attention_mask': inputs.attention_mask.squeeze(), 'labels': labels.squeeze()}\n",
    "\n",
    "\n",
    "# Define the pre-training loop\n",
    "def pretrain(model, dataloader, epochs, print_every=10):\n",
    "    \"\"\"Run the pre-training loop for `epochs` passes over `dataloader`.\n",
    "\n",
    "    Each batch must provide 'input_ids' and 'attention_mask'. When the batch also carries\n",
    "    'labels' (as batches built by DataCollatorForLanguageModeling do) those are the\n",
    "    training target; otherwise the input ids are used as the target. The loss is printed\n",
    "    every `print_every` optimizer steps.\n",
    "    \"\"\"\n",
    "    model.train()\n",
    "    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)\n",
    "    # Follow the model's own device instead of depending on a notebook-level global\n",
    "    model_device = next(model.parameters()).device\n",
    "    global_step = 0  # Counts optimizer steps across all epochs\n",
    "\n",
    "    for epoch in range(epochs):\n",
    "        for batch in dataloader:\n",
    "            batch = {k: v.to(model_device) for k, v in batch.items()}\n",
    "            optimizer.zero_grad()\n",
    "            # The collator has already corrupted input_ids; using them as the target would\n",
    "            # train the model to reproduce the corrupted sequence, so prefer its labels.\n",
    "            labels = batch.get('labels', batch['input_ids'])\n",
    "            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=labels)\n",
    "            loss = outputs.loss\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "\n",
    "            if global_step % print_every == 0:\n",
    "                print(f\"Step {global_step}, Loss: {loss.item()}\")\n",
    "\n",
    "            global_step += 1\n",
    "\n",
    "        print(f\"Epoch {epoch+1}/{epochs} completed.\")\n",
    "\n",
    "\n",
    "def fine_tune_with_eval(model, train_loader, eval_loader, epochs, save_path, print_every=10, early_stopping_patience=3):\n",
    "    \"\"\"Fine-tune `model`, checkpointing each epoch and plotting the training curves.\n",
    "\n",
    "    After every epoch the model is scored with evaluate_accuracy on `eval_loader` and\n",
    "    saved to '{save_path}/{epoch}' (0-based epoch). Training stops once the evaluation\n",
    "    accuracy has failed to improve `early_stopping_patience` times in a row; the epoch\n",
    "    that triggers the stop is not saved. A loss/accuracy figure is written to\n",
    "    '{save_path}/training_metrics.png'.\n",
    "\n",
    "    Uses the notebook-level `tokenizer` for decoding during evaluation. `print_every` is\n",
    "    kept for signature compatibility and is not used.\n",
    "\n",
    "    Returns the 0-based index of the epoch with the highest evaluation accuracy.\n",
    "    \"\"\"\n",
    "    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)\n",
    "    model_device = next(model.parameters()).device\n",
    "    best_epoch = 0\n",
    "    best_eval_accuracy = 0\n",
    "    patience_counter = 0\n",
    "    train_losses, eval_accuracies = [], []\n",
    "\n",
    "    for epoch in range(epochs):\n",
    "        model.train()  # evaluate_accuracy switches the model to eval mode\n",
    "        total_loss = 0\n",
    "\n",
    "        # Training loop with tqdm for progress tracking\n",
    "        for batch in tqdm(train_loader, desc=f\"Training Epoch {epoch+1}/{epochs}\"):\n",
    "            batch = {k: v.to(model_device) for k, v in batch.items()}\n",
    "            optimizer.zero_grad()\n",
    "            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])\n",
    "            loss = outputs.loss\n",
    "            total_loss += loss.item()\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "\n",
    "        # max(..., 1) avoids a ZeroDivisionError when the loader yields no batches\n",
    "        average_loss = total_loss / max(len(train_loader), 1)\n",
    "        train_losses.append(average_loss)\n",
    "\n",
    "        # Evaluate on the evaluation set\n",
    "        eval_accuracy = evaluate_accuracy(model, eval_loader, tokenizer, model_device)\n",
    "        eval_accuracies.append(eval_accuracy)\n",
    "        print(f\"Epoch {epoch+1}/{epochs}, Train Loss: {average_loss:.4f}, Eval Accuracy: {eval_accuracy:.4f}\")\n",
    "\n",
    "        # Early stopping and checkpointing\n",
    "        if eval_accuracy > best_eval_accuracy:\n",
    "            best_eval_accuracy = eval_accuracy\n",
    "            best_epoch = epoch\n",
    "            patience_counter = 0\n",
    "        else:\n",
    "            patience_counter += 1\n",
    "            if patience_counter >= early_stopping_patience:\n",
    "                print(\"Early stopping triggered.\")\n",
    "                break\n",
    "\n",
    "        save_directory = f\"{save_path}/{epoch}\"\n",
    "        model.save_pretrained(save_directory)\n",
    "\n",
    "    # Plotting the training loss and evaluation accuracy.\n",
    "    # The directory is missing if no checkpoint was written (e.g. epochs == 0).\n",
    "    os.makedirs(save_path, exist_ok=True)\n",
    "    fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 5))\n",
    "    loss_ax.plot(train_losses, label='Training Loss')\n",
    "    loss_ax.set(title='Training Loss', xlabel='Epoch', ylabel='Loss')\n",
    "    loss_ax.legend()\n",
    "\n",
    "    accuracy_ax.plot(eval_accuracies, label='Evaluation Accuracy')\n",
    "    accuracy_ax.set(title='Evaluation Accuracy', xlabel='Epoch', ylabel='Accuracy')\n",
    "    accuracy_ax.legend()\n",
    "\n",
    "    fig.savefig(f\"{save_path}/training_metrics.png\")\n",
    "\n",
    "    return best_epoch\n",
    "\n",
    "\n",
    "def evaluate_accuracy(model, dataloader, tokenizer, device):\n",
    "    \"\"\"Exact-match accuracy of generated text against the reference labels.\n",
    "\n",
    "    Predictions and references are decoded with special tokens skipped and compared\n",
    "    after stripping surrounding whitespace. Examples whose label contains nothing but\n",
    "    padding / -100 are excluded from both the numerator and the denominator.\n",
    "    Returns 0 when no example was scored.\n",
    "    \"\"\"\n",
    "    model.eval()\n",
    "    correct_predictions, total_predictions = 0, 0\n",
    "    ignored_ids = (tokenizer.pad_token_id, -100)\n",
    "\n",
    "    for batch in tqdm(dataloader, desc=\"Evaluating\"):\n",
    "        batch = {k: v.to(device) for k, v in batch.items()}\n",
    "        with torch.no_grad():\n",
    "            outputs = model.generate(batch['input_ids'], attention_mask=batch['attention_mask'], max_length=512)\n",
    "        decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]\n",
    "\n",
    "        for prediction, label in zip(decoded_outputs, batch['labels']):\n",
    "            # -100 is not a vocabulary id, so it has to be removed before decoding\n",
    "            label_ids = [token_id for token_id in label.tolist() if token_id not in ignored_ids]\n",
    "            if not label_ids:\n",
    "                continue  # nothing to compare against\n",
    "            reference = tokenizer.decode(label_ids, skip_special_tokens=True)\n",
    "            total_predictions += 1\n",
    "            if prediction.strip() == reference.strip():\n",
    "                correct_predictions += 1\n",
    "\n",
    "    return correct_predictions / total_predictions if total_predictions > 0 else 0\n",
    "\n",
    "\n",
    "# Read the dataset\n",
    "df = pd.read_parquet('../if-statements/dataset/extracted/functions.pq')\n",
    "if MAX_ROWS is not None:\n",
    "    df = df.head(MAX_ROWS)\n",
    "\n",
    "# Split the dataset into pre-training, fine-tuning, evaluation, and test sets\n",
    "pretrain_df, fine_tune_df = train_test_split(df, test_size=0.5, random_state=SEED)\n",
    "eval_df = fine_tune_df.sample(frac=0.1, random_state=SEED)\n",
    "test_df = fine_tune_df.drop(eval_df.index).sample(frac=0.1111, random_state=SEED)\n",
    "fine_tune_df = fine_tune_df.drop(eval_df.index).drop(test_df.index)\n",
    "\n",
    "# The three fine-tuning splits must not share rows\n",
    "assert len(set(eval_df.index).intersection(set(test_df.index))) == 0\n",
    "assert set(fine_tune_df.index).isdisjoint(eval_df.index)\n",
    "assert set(fine_tune_df.index).isdisjoint(test_df.index)\n",
    "\n",
    "\n",
    "# Initialize tokenizer and model\n",
    "tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')\n",
    "model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')\n",
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "print(f'Using device: {device}')\n",
    "model.to(device)\n",
    "\n",
    "# Instantiate the dataset for pre-training\n",
    "pretrain_dataset = PythonCodeDataset(tokenizer, pretrain_df)\n",
    "\n",
    "# Set up the data collator for MLM\n",
    "data_collator = DataCollatorForLanguageModeling(\n",
    "    tokenizer=tokenizer,\n",
    "    mlm=True,\n",
    "    mlm_probability=0.15\n",
    ")\n",
    "\n",
    "# Create a DataLoader for pre-training\n",
    "pretrain_loader = DataLoader(pretrain_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)\n",
    "\n",
    "# Pre-train the model (skipped unless enabled in the run configuration)\n",
    "if RUN_PRETRAINING:\n",
    "    pretrain(model, pretrain_loader, epochs=1)\n",
    "\n",
    "\n",
    "# Prepare data for fine-tuning and evaluation\n",
    "fine_tune_df = add_masked_columns(fine_tune_df)\n",
    "eval_df = add_masked_columns(eval_df)\n",
    "\n",
    "# Instantiate the datasets for fine-tuning and evaluation\n",
    "fine_tune_dataset = MaskedIfDataset(tokenizer, fine_tune_df)\n",
    "eval_dataset = MaskedIfDataset(tokenizer, eval_df)\n",
    "\n",
    "# Dataloaders for fine-tuning and evaluation\n",
    "fine_tune_loader = DataLoader(fine_tune_dataset, batch_size=8, shuffle=True)\n",
    "eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=False)\n",
    "\n",
    "\n",
    "# Fine-tune, or reuse the checkpoint written by an earlier run\n",
    "save_path = '../if-statements/dataset/extracted/final'\n",
    "if RUN_FINE_TUNING:\n",
    "    best_epoch = fine_tune_with_eval(model, fine_tune_loader, eval_loader, epochs=5, save_path=save_path)\n",
    "else:\n",
    "    best_epoch = PREVIOUS_BEST_EPOCH\n",
    "\n",
    "# Define the directory of the best model\n",
    "best_model_directory = os.path.join(save_path, str(best_epoch))\n",
    "\n",
    "# Load the best model and its config\n",
    "best_model = T5ForConditionalGeneration.from_pretrained(best_model_directory)\n",
    "\n",
    "# The config that was loaded together with the model\n",
    "model_config = best_model.config\n",
    "\n",
    "best_model.to(device)\n",
    "\n",
    "# Prepare and evaluate on the test set\n",
    "test_df = add_masked_columns(test_df)\n",
    "test_dataset = MaskedIfDataset(tokenizer, test_df)\n",
    "test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)\n",
    "\n",
    "# Evaluate the model on the test set\n",
    "test_accuracy = evaluate_accuracy(best_model, test_loader, tokenizer, device)\n",
    "print(f\"Test Accuracy: {test_accuracy:.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "3eb56941-cd5b-405b-ae37-f15d97a2b22e",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Unnamed: 0 original_method \\\n",
      "0 5126 def stream_edit(request, stream_id, response_f... \n",
      "1 10859 def _read_and_parse_includes(self):\\n # Map... \n",
      "2 10615 def _get_list_key(self, spaces, lines):\\n k... \n",
      "3 17853 def search_host(self, search_string):\\n res... \n",
      "4 3922 def pop(self, key: Union[str, Enum], default: ... \n",
      "\n",
      " target_block \n",
      "0 if \"cancel\" not in request . POST : \n",
      "1 if isinstance ( node , ast . Include ) : \n",
      "2 if len ( line . strip ( ) ) == 0 : \n",
      "3 if isinstance ( value , int ) : \n",
      "4 if self . _get_flag ( \"struct\" ) : \n"
     ]
    }
   ],
   "source": [
    "# Load the new dataset, leaving out the two columns this notebook does not use\n",
    "new_df = pd.read_csv('../if-statements/dataset/extracted/test_set_usi.csv').drop(\n",
    "    columns=[\"input_method\", \"tokens_in_method\"]\n",
    ")\n",
    "\n",
    "print(new_df.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "08a9c76f-32da-4871-b0af-d5afafa50ae0",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Unnamed: 0 original_method \\\n",
      "0 5126 def stream_edit(request, stream_id, response_f... \n",
      "1 10859 def _read_and_parse_includes(self):\\n # Map... \n",
      "2 10615 def _get_list_key(self, spaces, lines):\\n k... \n",
      "3 17853 def search_host(self, search_string):\\n res... \n",
      "4 3922 def pop(self, key: Union[str, Enum], default: ... \n",
      "\n",
      " target_block \\\n",
      "0 if \"cancel\" not in request . POST : \n",
      "1 if isinstance ( node , ast . Include ) : \n",
      "2 if len ( line . strip ( ) ) == 0 : \n",
      "3 if isinstance ( value , int ) : \n",
      "4 if self . _get_flag ( \"struct\" ) : \n",
      "\n",
      " masked_code \\\n",
      "0 def stream_edit(request, stream_id, response_f... \n",
      "1 def _read_and_parse_includes(self):\\n # Map... \n",
      "2 def _get_list_key(self, spaces, lines):\\n k... \n",
      "3 def search_host(self, search_string):\\n res... \n",
      "4 def pop(self, key: Union[str, Enum], default: ... \n",
      "\n",
      " ground_truth \n",
      "0 if not request.user.profile.has_permission(str... \n",
      "1 if isinstance(node, ast.Include): \n",
      "2 if len(line.strip()) == 0: \n",
      "3 if host_entry.get(\"type\") != \"entry\": \n",
      "4 if self._get_flag(\"readonly\"): \n"
     ]
    }
   ],
   "source": [
    "# Function to preprocess the new dataframe\n",
    "def preprocess_new_df(df):\n",
    "    \"\"\"Add 'masked_code' and 'ground_truth' columns to `df` in place.\n",
    "\n",
    "    Both columns come from applying mask_if_condition to 'original_method'. Rows in\n",
    "    which no `if` condition is found are dropped. Returns None.\n",
    "\n",
    "    NOTE(review): the 'target_block' column is not consulted. The ground truth is the\n",
    "    first `if` found in the method, which differs from 'target_block' in several of the\n",
    "    rows printed below (e.g. rows 0, 3 and 4) - confirm which condition the evaluation\n",
    "    is meant to score.\n",
    "    \"\"\"\n",
    "    # Build both columns from one pass; unlike unpacking zip(*...), this also works when\n",
    "    # the frame has no rows\n",
    "    pairs = [mask_if_condition(code) for code in df['original_method']]\n",
    "    df['masked_code'] = [masked for masked, _ in pairs]\n",
    "    df['ground_truth'] = [condition for _, condition in pairs]\n",
    "    # Drop rows where ground truth (if statement) is None\n",
    "    df.dropna(subset=['ground_truth'], inplace=True)\n",
    "\n",
    "# Preprocess the new dataframe\n",
    "preprocess_new_df(new_df)\n",
    "\n",
    "# Check the first few rows\n",
    "print(new_df.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "c36c9144-64b2-46dd-b597-5528ff57b10a",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Evaluating: 100%|█████████████████████████████| 624/624 [02:29<00:00, 4.17it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "New Dataset Accuracy: 0.2841\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Create dataset for the new dataframe (reads its 'masked_code' / 'ground_truth' columns)\n",
    "new_dataset = MaskedIfDataset(tokenizer, new_df)\n",
    "\n",
    "# Create DataLoader for the new dataset; no shuffling is needed for scoring\n",
    "new_loader = DataLoader(new_dataset, batch_size=8, shuffle=False)\n",
    "\n",
    "# Evaluate the fine-tuned checkpoint loaded in the first cell on the new dataset\n",
    "new_accuracy = evaluate_accuracy(best_model, new_loader, tokenizer, device)\n",
    "print(f\"New Dataset Accuracy: {new_accuracy:.4f}\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}