449 lines
18 KiB
Text
449 lines
18 KiB
Text
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "5f7ad96a-6b01-4b63-93b6-4008597a0e9e",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Using device: cuda\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Evaluating: 100%|███████████████████████████| 1092/1092 [04:02<00:00, 4.50it/s]"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Test Accuracy: 0.3642\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import re\n",
|
|
"import random\n",
|
|
"import torch\n",
|
|
"from torch.utils.data import Dataset, DataLoader\n",
|
|
"from transformers import RobertaTokenizer, T5ForConditionalGeneration, DataCollatorForLanguageModeling\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"from tqdm import tqdm\n",
|
|
"import os\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"\n",
|
|
"\n",
|
|
"# Dataset class for pre-training\n",
|
|
"class PythonCodeDataset(Dataset):\n",
"    \"\"\"Torch dataset that tokenizes the 'source' column of a dataframe.\n",
"\n",
"    Each item is a dict of long tensors ('input_ids', 'attention_mask'); the\n",
"    tokenizer is asked to pad or truncate every sample to ``max_len`` tokens.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, tokenizer, dataframe, max_len=512):\n",
"        self.tokenizer, self.data, self.max_len = tokenizer, dataframe, max_len\n",
"\n",
"    def __len__(self):\n",
"        return len(self.data)\n",
"\n",
"    def __getitem__(self, index):\n",
"        row = self.data.iloc[index]\n",
"        encoded = self.tokenizer.encode_plus(\n",
"            row['source'],\n",
"            None,\n",
"            add_special_tokens=True,\n",
"            max_length=self.max_len,\n",
"            padding='max_length',\n",
"            truncation=True,\n",
"        )\n",
"        return {\n",
"            field: torch.tensor(encoded[field], dtype=torch.long)\n",
"            for field in ('input_ids', 'attention_mask')\n",
"        }\n",
|
|
"\n",
|
|
"# Function to mask if conditions\n",
|
|
"def mask_if_condition(code_snippet):\n",
"    \"\"\"Mask the first ``if ...:`` header found in a code snippet.\n",
"\n",
"    Args:\n",
"        code_snippet: Source text to search.\n",
"\n",
"    Returns:\n",
"        A ``(masked_snippet, condition)`` tuple. ``condition`` is the matched text,\n",
"        from the ``if`` keyword through the first ':' on the same line, and\n",
"        ``masked_snippet`` is the input with exactly that span replaced by\n",
"        '<mask>'. When nothing matches, or the input is not a string, the input\n",
"        is returned unchanged together with ``None``.\n",
"\n",
"    Note:\n",
"        Matching is purely textual: an ``if`` inside a string, a comment or a\n",
"        conditional expression can still match, and a condition that itself\n",
"        contains ':' (slice, dict literal, lambda) is cut at that colon.\n",
"    \"\"\"\n",
"    if not isinstance(code_snippet, str):\n",
"        # e.g. a NaN dataframe cell; callers drop rows whose condition is None\n",
"        return code_snippet, None\n",
"    # \\b keeps the keyword from matching inside `elif` or identifiers such as `notif`\n",
"    found = re.search(r'\\bif\\s+.*?:', code_snippet)\n",
"    if found is None:\n",
"        return code_snippet, None\n",
"    # Splice by position so the masked span is the matched one, not an earlier\n",
"    # occurrence of the same text (for instance inside a preceding `elif`)\n",
"    masked_snippet = code_snippet[:found.start()] + '<mask>' + code_snippet[found.end():]\n",
"    return masked_snippet, found.group(0)\n",
|
|
"\n",
|
|
"# Fine-tuning and evaluation dataset classes\n",
|
|
"class MaskedIfDataset(PythonCodeDataset):\n",
"    \"\"\"Sequence-to-sequence pairs: masked code as input, the masked-out condition as target.\"\"\"\n",
"\n",
"    def __getitem__(self, index):\n",
"        row = self.data.iloc[index]\n",
"        # Both sides share the same padding/truncation settings\n",
"        shared = dict(max_length=self.max_len, padding='max_length', truncation=True, return_tensors=\"pt\")\n",
"        encoded = self.tokenizer(row['masked_code'], **shared)\n",
"        target_ids = self.tokenizer(row['ground_truth'], **shared).input_ids\n",
"        # Swap padding for -100 (the default ignore_index of torch's cross-entropy loss)\n",
"        target_ids[target_ids == self.tokenizer.pad_token_id] = -100\n",
"        return {\n",
"            'input_ids': encoded.input_ids.squeeze(),\n",
"            'attention_mask': encoded.attention_mask.squeeze(),\n",
"            'labels': target_ids.squeeze(),\n",
"        }\n",
|
|
"\n",
|
|
"# Define the pre-training loop\n",
|
|
"def pretrain(model, dataloader, epochs, print_every=10):\n",
"    \"\"\"Run a plain AdamW (lr=5e-5) training loop over ``dataloader``.\n",
"\n",
"    Args:\n",
"        model: Module called as ``model(input_ids=..., attention_mask=..., labels=...)``\n",
"            whose output exposes ``.loss``. Trained in place.\n",
"        dataloader: Iterable of dict batches of tensors holding at least\n",
"            'input_ids' and 'attention_mask', and normally 'labels'.\n",
"        epochs: Number of passes over ``dataloader``.\n",
"        print_every: Print the loss every this many optimizer steps.\n",
"    \"\"\"\n",
"    model.train()\n",
"    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)\n",
"    # Follow the model's own placement instead of relying on a notebook-global `device`\n",
"    target_device = next(model.parameters()).device\n",
"    global_step = 0\n",
"\n",
"    for epoch in range(epochs):\n",
"        for batch in dataloader:\n",
"            batch = {k: v.to(target_device) for k, v in batch.items()}\n",
"            optimizer.zero_grad()\n",
"            # A masking collator corrupts input_ids and puts the real targets in\n",
"            # 'labels'; training against input_ids would teach the model to echo\n",
"            # the corrupted input. Fall back to input_ids only when the loader\n",
"            # supplies no labels.\n",
"            labels = batch.get('labels', batch['input_ids'])\n",
"            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=labels)\n",
"            loss = outputs.loss\n",
"            loss.backward()\n",
"            optimizer.step()\n",
"\n",
"            if global_step % print_every == 0:\n",
"                print(f\"Step {global_step}, Loss: {loss.item()}\")\n",
"            global_step += 1\n",
"\n",
"        print(f\"Epoch {epoch+1}/{epochs} completed.\")\n",
|
|
" \n",
|
|
"\n",
|
|
"def fine_tune_with_eval(model, train_loader, eval_loader, epochs, save_path, print_every=10, early_stopping_patience=3):\n",
"    \"\"\"Fine-tune ``model`` with AdamW (lr=1e-5), scoring and checkpointing every epoch.\n",
"\n",
"    Uses the notebook-level ``tokenizer`` and ``device`` and the sibling\n",
"    ``evaluate_accuracy`` function.\n",
"\n",
"    Args:\n",
"        model: Model trained in place; must offer ``save_pretrained``.\n",
"        train_loader: Batches with 'input_ids', 'attention_mask' and 'labels'.\n",
"        eval_loader: Batches handed to ``evaluate_accuracy`` after each epoch.\n",
"        epochs: Maximum number of epochs.\n",
"        save_path: Directory that receives one sub-directory per epoch (named by\n",
"            the zero-based epoch index) and ``training_metrics.png``.\n",
"        print_every: Currently unused; kept so existing calls keep working.\n",
"        early_stopping_patience: Stop after this many consecutive epochs without\n",
"            a new best evaluation accuracy.\n",
"\n",
"    Returns:\n",
"        Zero-based index of the epoch with the best evaluation accuracy.\n",
"    \"\"\"\n",
"    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)\n",
"    best_epoch = 0\n",
"    best_eval_accuracy = 0\n",
"    patience_counter = 0\n",
"    train_losses, eval_accuracies = [], []\n",
"\n",
"    # The metrics figure is written here even when no checkpoint is (epochs == 0)\n",
"    os.makedirs(save_path, exist_ok=True)\n",
"\n",
"    for epoch in range(epochs):\n",
"        model.train()\n",
"        total_loss = 0\n",
"\n",
"        # Training loop with tqdm for progress tracking\n",
"        for batch in tqdm(train_loader, desc=f\"Training Epoch {epoch+1}/{epochs}\"):\n",
"            batch = {k: v.to(device) for k, v in batch.items()}\n",
"            optimizer.zero_grad()\n",
"            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])\n",
"            loss = outputs.loss\n",
"            total_loss += loss.item()\n",
"            loss.backward()\n",
"            optimizer.step()\n",
"\n",
"        average_loss = total_loss / len(train_loader)\n",
"        train_losses.append(average_loss)\n",
"\n",
"        # Evaluate on the evaluation set\n",
"        eval_accuracy = evaluate_accuracy(model, eval_loader, tokenizer, device)\n",
"        eval_accuracies.append(eval_accuracy)\n",
"        print(f\"Epoch {epoch+1}/{epochs}, Train Loss: {average_loss:.4f}, Eval Accuracy: {eval_accuracy:.4f}\")\n",
"\n",
"        # Checkpoint before the early-stopping check: every evaluated epoch is then\n",
"        # on disk, so the returned best_epoch has a matching directory whenever at\n",
"        # least one epoch ran.\n",
"        save_directory = f\"{save_path}/{epoch}\"\n",
"        model.save_pretrained(save_directory)\n",
"\n",
"        # Early stopping bookkeeping\n",
"        if eval_accuracy > best_eval_accuracy:\n",
"            best_eval_accuracy = eval_accuracy\n",
"            best_epoch = epoch\n",
"            patience_counter = 0\n",
"        else:\n",
"            patience_counter += 1\n",
"            if patience_counter >= early_stopping_patience:\n",
"                print(\"Early stopping triggered.\")\n",
"                break\n",
"\n",
"    # Plotting the training loss and evaluation accuracy\n",
"    plt.figure(figsize=(12, 5))\n",
"    plt.subplot(1, 2, 1)\n",
"    plt.plot(train_losses, label='Training Loss')\n",
"    plt.title('Training Loss')\n",
"    plt.xlabel('Epoch')\n",
"    plt.ylabel('Loss')\n",
"    plt.legend()\n",
"\n",
"    plt.subplot(1, 2, 2)\n",
"    plt.plot(eval_accuracies, label='Evaluation Accuracy')\n",
"    plt.title('Evaluation Accuracy')\n",
"    plt.xlabel('Epoch')\n",
"    plt.ylabel('Accuracy')\n",
"    plt.legend()\n",
"\n",
"    plt.savefig(f\"{save_path}/training_metrics.png\")\n",
"\n",
"    return best_epoch\n",
|
|
"\n",
|
|
"\n",
|
|
"def evaluate_accuracy(model, dataloader, tokenizer, device):\n",
"    \"\"\"Exact-match accuracy of generated sequences against the batch labels.\n",
"\n",
"    Predictions and labels are decoded with special tokens skipped and compared\n",
"    after stripping surrounding whitespace. A label that is empty once padding\n",
"    and -100 entries are removed is left out of the count. Returns 0 when no\n",
"    label was counted.\n",
"    \"\"\"\n",
"    model.eval()\n",
"    hits = 0\n",
"    scored = 0\n",
"\n",
"    for batch in tqdm(dataloader, desc=\"Evaluating\"):\n",
"        batch = {name: tensor.to(device) for name, tensor in batch.items()}\n",
"        with torch.no_grad():\n",
"            generated = model.generate(batch['input_ids'], attention_mask=batch['attention_mask'], max_length=512)\n",
"        predictions = [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in generated]\n",
"\n",
"        # Token ids that carry no label content\n",
"        ignored = (tokenizer.pad_token_id, -100)\n",
"        for prediction, label in zip(predictions, batch['labels']):\n",
"            kept = [token for token in label.tolist() if token not in ignored]\n",
"            if not kept:\n",
"                # Nothing left to compare against; skip without counting\n",
"                continue\n",
"            scored += 1\n",
"            reference = tokenizer.decode(kept, skip_special_tokens=True)\n",
"            if prediction.strip() == reference.strip():\n",
"                hits += 1\n",
"\n",
"    if scored > 0:\n",
"        return hits / scored\n",
"    return 0\n",
|
|
" \n",
|
|
" \n",
|
|
"# Read the dataset\n",
"df = pd.read_parquet('../if-statements/dataset/extracted/functions.pq')\n",
"#df = df.head(50)  # uncomment for a quick smoke run on a small sample\n",
"\n",
"# The splits below are carved out with label-based drop(), which is only correct\n",
"# when every row has its own index label.\n",
"assert df.index.is_unique, 'dataset index must be unique for label-based splitting'\n",
"\n",
"# Split the dataset into pre-training, fine-tuning, evaluation, and test sets.\n",
"# Half goes to pre-training; of the other half, 10% becomes the evaluation set and\n",
"# 0.1111 of the remaining 90% (again about 10% of that half) becomes the test set.\n",
"pretrain_df, fine_tune_df = train_test_split(df, test_size=0.5, random_state=42)\n",
"eval_df = fine_tune_df.sample(frac=0.1, random_state=42)\n",
"test_df = fine_tune_df.drop(eval_df.index).sample(frac=0.1111, random_state=42)\n",
"fine_tune_df = fine_tune_df.drop(eval_df.index).drop(test_df.index)\n",
"\n",
"# No row may appear in more than one split, and no row may be lost\n",
"assert len(set(eval_df.index).intersection(set(test_df.index))) == 0\n",
"split_indexes = [set(part.index) for part in (pretrain_df, fine_tune_df, eval_df, test_df)]\n",
"assert sum(len(labels) for labels in split_indexes) == len(set().union(*split_indexes)) == len(df)\n",
|
|
"\n",
|
|
"\n",
|
|
"# Load the CodeT5-small tokenizer and weights, then move the model to the GPU when one is available\n",
"checkpoint_name = 'Salesforce/codet5-small'\n",
"tokenizer = RobertaTokenizer.from_pretrained(checkpoint_name)\n",
"model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)\n",
"if torch.cuda.is_available():\n",
"    device = torch.device('cuda')\n",
"else:\n",
"    device = torch.device('cpu')\n",
"print(f'Using device: {device}')\n",
"model.to(device)\n",
|
|
" \n",
|
|
"# Pre-training data: the raw functions, batched through a masked-language-modelling\n",
"# collator configured with mlm_probability=0.15\n",
"pretrain_dataset = PythonCodeDataset(tokenizer, pretrain_df)\n",
"data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)\n",
"pretrain_loader = DataLoader(\n",
"    pretrain_dataset,\n",
"    batch_size=8,\n",
"    shuffle=True,\n",
"    collate_fn=data_collator,\n",
")\n",
"\n",
"# Pre-training is switched off for this run; uncomment the call to enable it\n",
"#pretrain(model, pretrain_loader, epochs=1)\n",
|
|
"\n",
|
|
"\n",
|
|
"# Prepare data for fine-tuning and evaluation: mask the first `if` header of every\n",
"# function and keep only the rows where one was found\n",
"fine_tune_df['masked_code'], fine_tune_df['ground_truth'] = zip(*fine_tune_df['source'].apply(mask_if_condition))\n",
"eval_df['masked_code'], eval_df['ground_truth'] = zip(*eval_df['source'].apply(mask_if_condition))\n",
"fine_tune_df.dropna(subset=['ground_truth'], inplace=True)\n",
"eval_df.dropna(subset=['ground_truth'], inplace=True)\n",
"\n",
"# Build each dataset once, so the names below refer to the very objects the\n",
"# loaders iterate over\n",
"fine_tune_dataset = MaskedIfDataset(tokenizer, fine_tune_df)\n",
"eval_dataset = MaskedIfDataset(tokenizer, eval_df)\n",
"\n",
"# Dataloaders for fine-tuning and evaluation\n",
"fine_tune_loader = DataLoader(fine_tune_dataset, batch_size=8, shuffle=True)\n",
"eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=False)\n",
|
|
"\n",
|
|
"\n",
|
|
"# Checkpoint selection. The fine-tuning call below is switched off, so this cell\n",
"# reuses a checkpoint that must already exist on disk.\n",
"# NOTE(review): 4 is presumably the epoch index an earlier run reported - confirm.\n",
"best_epoch = 4\n",
"\n",
"save_path = '../if-statements/dataset/extracted/final'\n",
"#best_epoch = fine_tune_with_eval(model, fine_tune_loader, eval_loader, epochs=5, save_path=save_path)\n",
"\n",
"# Define the directory of the best model\n",
"best_model_directory = os.path.join(save_path, str(best_epoch))\n",
"\n",
"# Fail early with a clear message instead of handing a missing path to from_pretrained\n",
"if not os.path.isdir(best_model_directory):\n",
"    raise FileNotFoundError(\n",
"        f\"No checkpoint directory at {best_model_directory!r}; run fine_tune_with_eval first or adjust best_epoch/save_path.\"\n",
"    )\n",
"\n",
"# Load the best model; its config is read as part of this call\n",
"best_model = T5ForConditionalGeneration.from_pretrained(best_model_directory)\n",
"\n",
"# Keep a handle on the already-loaded config (no additional file is read here)\n",
"model_config = best_model.config\n",
"\n",
"best_model.to(device)\n",
|
|
"\n",
|
|
"# Held-out test split: mask each function, discard the ones without an `if`,\n",
"# then score the reloaded checkpoint\n",
"masked_column, truth_column = zip(*test_df['source'].apply(mask_if_condition))\n",
"test_df['masked_code'] = masked_column\n",
"test_df['ground_truth'] = truth_column\n",
"test_df.dropna(subset=['ground_truth'], inplace=True)\n",
"\n",
"test_dataset = MaskedIfDataset(tokenizer, test_df)\n",
"test_loader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)\n",
"\n",
"test_accuracy = evaluate_accuracy(model=best_model, dataloader=test_loader, tokenizer=tokenizer, device=device)\n",
"print(f\"Test Accuracy: {test_accuracy:.4f}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "3eb56941-cd5b-405b-ae37-f15d97a2b22e",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" Unnamed: 0 original_method \\\n",
|
|
"0 5126 def stream_edit(request, stream_id, response_f... \n",
|
|
"1 10859 def _read_and_parse_includes(self):\\n # Map... \n",
|
|
"2 10615 def _get_list_key(self, spaces, lines):\\n k... \n",
|
|
"3 17853 def search_host(self, search_string):\\n res... \n",
|
|
"4 3922 def pop(self, key: Union[str, Enum], default: ... \n",
|
|
"\n",
|
|
" target_block \n",
|
|
"0 if \"cancel\" not in request . POST : \n",
|
|
"1 if isinstance ( node , ast . Include ) : \n",
|
|
"2 if len ( line . strip ( ) ) == 0 : \n",
|
|
"3 if isinstance ( value , int ) : \n",
|
|
"4 if self . _get_flag ( \"struct\" ) : \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Load the new dataset and discard the two columns that are not used afterwards\n",
"unused_columns = [\"input_method\", \"tokens_in_method\"]\n",
"new_df = pd.read_csv('../if-statements/dataset/extracted/test_set_usi.csv')\n",
"new_df.drop(columns=unused_columns, inplace=True)\n",
"\n",
"print(new_df.head())\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "08a9c76f-32da-4871-b0af-d5afafa50ae0",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" Unnamed: 0 original_method \\\n",
|
|
"0 5126 def stream_edit(request, stream_id, response_f... \n",
|
|
"1 10859 def _read_and_parse_includes(self):\\n # Map... \n",
|
|
"2 10615 def _get_list_key(self, spaces, lines):\\n k... \n",
|
|
"3 17853 def search_host(self, search_string):\\n res... \n",
|
|
"4 3922 def pop(self, key: Union[str, Enum], default: ... \n",
|
|
"\n",
|
|
" target_block \\\n",
|
|
"0 if \"cancel\" not in request . POST : \n",
|
|
"1 if isinstance ( node , ast . Include ) : \n",
|
|
"2 if len ( line . strip ( ) ) == 0 : \n",
|
|
"3 if isinstance ( value , int ) : \n",
|
|
"4 if self . _get_flag ( \"struct\" ) : \n",
|
|
"\n",
|
|
" masked_code \\\n",
|
|
"0 def stream_edit(request, stream_id, response_f... \n",
|
|
"1 def _read_and_parse_includes(self):\\n # Map... \n",
|
|
"2 def _get_list_key(self, spaces, lines):\\n k... \n",
|
|
"3 def search_host(self, search_string):\\n res... \n",
|
|
"4 def pop(self, key: Union[str, Enum], default: ... \n",
|
|
"\n",
|
|
" ground_truth \n",
|
|
"0 if not request.user.profile.has_permission(str... \n",
|
|
"1 if isinstance(node, ast.Include): \n",
|
|
"2 if len(line.strip()) == 0: \n",
|
|
"3 if host_entry.get(\"type\") != \"entry\": \n",
|
|
"4 if self._get_flag(\"readonly\"): \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Function to preprocess the new dataframe\n",
|
|
"def preprocess_new_df(df, source_col='original_method', target_col='target_block'):\n",
"    \"\"\"Add 'masked_code' and 'ground_truth' columns to ``df`` in place.\n",
"\n",
"    When ``df`` has a ``target_col`` column, the condition named there is the one\n",
"    that gets masked, so the evaluation scores the dataset's own target instead\n",
"    of whichever ``if`` happens to come first in the method. Rows whose target\n",
"    cannot be located fall back to ``mask_if_condition``. Rows that end up\n",
"    without a ground truth are dropped.\n",
"\n",
"    NOTE(review): assumes ``target_col`` holds the condition to predict as\n",
"    whitespace-separated tokens (e.g. 'if len ( x ) == 0 :') - confirm against\n",
"    the dataset description.\n",
"\n",
"    Args:\n",
"        df: Frame to modify in place.\n",
"        source_col: Column holding the method source text.\n",
"        target_col: Column holding the tokenized target condition, if present.\n",
"\n",
"    Returns:\n",
"        None. ``df`` is modified in place.\n",
"    \"\"\"\n",
"    def _mask_target(code, target):\n",
"        # Locate the tokenized target in the source, ignoring spacing differences\n",
"        if isinstance(code, str) and isinstance(target, str) and target.split():\n",
"            pattern = r'(?<!\\w)' + r'\\s*'.join(re.escape(tok) for tok in target.split())\n",
"            found = re.search(pattern, code)\n",
"            if found is not None:\n",
"                return code[:found.start()] + '<mask>' + code[found.end():], found.group(0)\n",
"        return mask_if_condition(code)\n",
"\n",
"    sources = df[source_col].tolist()\n",
"    targets = df[target_col].tolist() if target_col in df.columns else [None] * len(sources)\n",
"    # Built as lists rather than zip(*...) so an empty frame does not raise\n",
"    pairs = [_mask_target(code, target) for code, target in zip(sources, targets)]\n",
"    df['masked_code'] = [masked for masked, _ in pairs]\n",
"    df['ground_truth'] = [truth for _, truth in pairs]\n",
"    # Drop rows where no condition could be masked\n",
"    df.dropna(subset=['ground_truth'], inplace=True)\n",
|
|
"\n",
|
|
"# Mask the new dataframe in place, then look at the first few rows\n",
"preprocess_new_df(new_df)\n",
"\n",
"preview = new_df.head()\n",
"print(preview)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"id": "c36c9144-64b2-46dd-b597-5528ff57b10a",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Evaluating: 100%|█████████████████████████████| 624/624 [02:29<00:00, 4.17it/s]"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"New Dataset Accuracy: 0.2841\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Wrap the new dataframe in a dataset and an ordered (unshuffled) loader\n",
"new_dataset = MaskedIfDataset(tokenizer, new_df)\n",
"new_loader = DataLoader(dataset=new_dataset, batch_size=8, shuffle=False)\n",
"\n",
"# Score the reloaded checkpoint on the new dataset\n",
"new_accuracy = evaluate_accuracy(model=best_model, dataloader=new_loader, tokenizer=tokenizer, device=device)\n",
"print(f\"New Dataset Accuracy: {new_accuracy:.4f}\")\n"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.5"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|